In [13]:
import ast
import os
import json

import pandas as pd

from tqdm import tqdm
from transformers import pipeline

In [16]:
# Project root: the parent of the current working directory.
notebook_dir = os.path.abspath("")
root_path = os.path.dirname(notebook_dir)

## Data

Note that data preprocessing is done in [preprocess.ipynb](preprocess.ipynb). That notebook cleans and merges the data, then saves the result to `data/processed`.

In [2]:
# Load the preprocessed movie table, using the first CSV column as the index.
df = pd.read_csv('../data/processed/preprocessed.csv', index_col=0)
# Parse each genres cell from its string form into a Python object with
# ast.literal_eval (the preview below shows the parsed values as lists of genre names).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Print the table dimensions, then preview the first five rows.
print(df.shape)
df.head(5)

(22394, 10)


Unnamed: 0,name,release_year,rating,num_votes,wikipedia_id,plot,freebase_id,languages,countries,genres
0,Miss Jerry,1894.0,5.3,207,10109752,After finding out that her father is suffering...,/m/02q23xk,['Silent film'],['United States of America'],"[Short Film, Silent film, Indie, Black-and-white]"
1,The Corbett-Fitzsimmons Fight,1897.0,5.3,484,28703057,The film no longer exists in its entirety; how...,/m/0czdh_n,[],[],[Sports]
2,The Story of the Kelly Gang,1906.0,6.0,855,142995,The Story of the Kelly Gangs tone is of sorrow...,/m/0120y4,['English Language'],['Australia'],"[Crime Fiction, Silent film, Biography, Indie,..."
3,Robbery Under Arms,1907.0,4.3,25,32986669,Key scenes of the film included the branding o...,/m/04p7yxx,['Silent film'],['Australia'],"[Silent film, Drama]"
4,"Captain Midnight, the Bush King",1911.0,5.4,18,32987200,Edgar Dalimore is the son of wealthy station o...,/m/0h569x9,['Silent film'],[],[Silent film]


In [3]:
# Candidate labels handed to the zero-shot classifier, kept as two separate
# groups so each group can be scored on its own.
genre_labels = [
    'Drama',
    'Comedy',
    'Action',
    'Horror',
    'Romance',
    'Thriller',
    'Adventure',
    'Documentary',
]
theme_labels = [
    'Science-Fiction',
    'Crime',
    'Fantasy',
    'War',
    'Mystery',
    'History',
    'Western',
    'Biographical',
    'Family',
]

In [4]:
# Build the zero-shot classification pipeline used to score plots against the
# label lists, backed by the "facebook/bart-large-mnli" checkpoint.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

#### Classification experiment

To verify our assumption that splitting the labels into genres and themes improves the performance of the model, we run a classification experiment. We will use the genres, the themes, and both of them as labels and compare the results.

In [9]:
# Restrict the experiment to a single decade: release years in [decade, decade + 10).
decade = 1910
decade_end = decade + 10
released_in_decade = df.release_year.ge(decade) & df.release_year.lt(decade_end)
decade_df = df[released_in_decade]

In [23]:
# Score every plot of the selected decade against the genre labels and the theme
# labels separately. Results are collected per movie, keyed by its Wikipedia id:
#   probabilities[wikipedia_id] = {'genres': {...}, 'themes': {...}}
probabilities = {}
# The same hypothesis sentence is used for both label groups.
hypothesis_template = 'The genre of this movie is {}.'

for _, row in tqdm(decade_df.iterrows(), total=decade_df.shape[0]):

    # The zero-shot pipeline's keyword is `multi_label`. The former spelling
    # `multilabel` is not one of its parameters, so multi-label scoring (each
    # label judged independently) was never actually requested.
    genre_probs = classifier(row['plot'], genre_labels, multi_label=True, hypothesis_template=hypothesis_template)
    # Drop the echoed input text so the plot is not duplicated in the saved results.
    genre_probs.pop('sequence')

    theme_probs = classifier(row['plot'], theme_labels, multi_label=True, hypothesis_template=hypothesis_template)
    theme_probs.pop('sequence')

    # Store the entry only once both classifications have succeeded.
    probabilities[row.wikipedia_id] = {'genres': genre_probs, 'themes': theme_probs}

100%|██████████| 2/2 [00:47<00:00, 23.81s/it]


In [24]:
# Persist the per-movie probabilities of this decade as JSON under
# <root_path>/data/classification/plots/plots_<decade>.json.
output_dir = os.path.join(root_path, 'data', 'classification', 'plots')
# Create the target directory if it is missing; open() in write mode would
# otherwise raise FileNotFoundError on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

with open(os.path.join(output_dir, f'plots_{decade}.json'), "w") as outfile:
    json.dump(probabilities, outfile)