In [None]:
import ast
import os
import json

import pandas as pd

from tqdm import tqdm
from transformers import pipeline

2023-12-15 10:43:40.020762: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Parent of the current working directory; the `data` folder used by this notebook sits under it.
root_path = os.path.abspath(os.pardir)

## Data

Note that data preprocessing is done in [preprocess.ipynb](preprocess.ipynb). That notebook cleans and merges the data, then saves it to `data/processed`.

In [None]:
# Resolve the input through `root_path` so that loading and saving share the same base directory.
df = pd.read_csv(os.path.join(root_path, 'data', 'processed', 'preprocessed.csv'), index_col=0)
# The CSV stores each `genres` value as text (presumably the repr of a Python list of strings);
# `ast.literal_eval` safely parses that text back into a real Python object.
df['genres'] = df['genres'].apply(ast.literal_eval)
print(df.shape)
df.head(5)

In [None]:
# Candidate labels for the zero-shot classifier, kept as two separate groups.
genre_labels = [
    'Drama',
    'Comedy',
    'Action',
    'Horror',
    'Romance',
    'Thriller',
    'Adventure',
    'Documentary',
]
theme_labels = [
    'Science-Fiction',
    'Crime',
    'Fantasy',
    'War',
    'Mystery',
    'History',
    'Western',
    'Biographical',
    'Family',
]

In [None]:
# Zero-shot classification pipeline backed by the BART-large MNLI checkpoint.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

#### Classification experiment

To verify our assumption that splitting the labels into genres and themes improves the performance of the model, we run a classification experiment. We will use the genres, the themes, and both of them as labels and compare the results.

In [None]:
# First year of the decade to classify; the selection below is the half-open range [decade, decade + 10).
decade = 1910
release_year = df['release_year']
decade_df = df.loc[(release_year >= decade) & (release_year < decade + 10)]

In [None]:
# Maps each movie's `wikipedia_id` to {'genres': ..., 'themes': ...}, where each value is the
# classifier result for that label group with the echoed input text removed.
probabilities = {}
hypothesis_template = 'The genre of this movie is {}.'

for _index, row in tqdm(decade_df.iterrows(), total=decade_df.shape[0]):
    plot = row['plot']
    movie_probs = {}
    probabilities[row.wikipedia_id] = movie_probs

    # The keyword must be spelled `multi_label`: the pipeline silently ignores unknown keyword
    # arguments, so the previous `multilabel=True` had no effect and the scores were
    # softmax-normalised across the candidate labels instead of being scored independently.
    genre_probs = classifier(plot, genre_labels, multi_label=True, hypothesis_template=hypothesis_template)
    genre_probs.pop('sequence')  # drop the copy of the plot text from the result
    movie_probs['genres'] = genre_probs

    theme_probs = classifier(plot, theme_labels, multi_label=True, hypothesis_template=hypothesis_template)
    theme_probs.pop('sequence')  # drop the copy of the plot text from the result
    movie_probs['themes'] = theme_probs

In [None]:
# Write the scores for the selected decade to <root_path>/data/classification/plots/plots_<decade>.json.
output_dir = os.path.join(root_path, 'data', 'classification', 'plots')
# Create the target folder if it is missing; opening a file for writing inside a
# non-existent directory raises FileNotFoundError.
os.makedirs(output_dir, exist_ok=True)

with open(os.path.join(output_dir, f'plots_{decade}.json'), "w") as outfile:
    json.dump(probabilities, outfile)