In [None]:
import sys
sys.path.append("../../")
import pandas as pd

from tqdm import tqdm
from bechdelai.data.tmdb import search_movie_from_query
from analyzer import TextAnalyzer
from bechdelai.data.opensubtitles import search
from bechdelai.data.opensubtitles import get_subtitle_link
from bechdelai.data.opensubtitles import download_subtitle_from_url
import plotly.express as px

# Create Dataset

## Metadata

In [None]:
# First table of the French Wikipedia list of the biggest box-office hits in Canada and the US
movies = pd.read_html(
    "https://fr.wikipedia.org/wiki/Liste_des_plus_gros_succ%C3%A8s_du_box-office_au_Canada_et_aux_%C3%89tats-Unis"
)[0]

In [None]:
def get_tmdb_metadata(query):
    """Return the first TMDB search hit for ``query``.

    Parameters
    ----------
    query
        Search text forwarded unchanged to ``search_movie_from_query``.

    Returns
    -------
    The first element of the response's ``"results"`` list, or ``None`` when
    that key is absent or the list is empty.
    """
    hits = search_movie_from_query(query).get("results", [])
    if not hits:
        return None
    return hits[0]

In [None]:
# One TMDB lookup per Wikipedia title; entries are None where the search found nothing
metadatas = [get_tmdb_metadata(title) for title in movies["Titre"]]

In [None]:
# `metadatas` is already a plain list, so it has no `.to_list()` method.
# Movies without a TMDB hit are None; swap them for an empty record so each
# movie still gets exactly one (all-NaN) row and stays aligned with `movies`.
metadatas_df = pd.json_normalize(
    [metadata if metadata is not None else {} for metadata in metadatas]
)

In [None]:
# Put the TMDB columns next to the Wikipedia columns, matching rows by index
movies_with_metadata = pd.concat((movies, metadatas_df), axis="columns")

In [None]:
# Persist the combined table; it is reloaded from this file in the Analyze section
movies_with_metadata.to_csv(path_or_buf="box_office_NA_movies.csv", index=False)

## Subtitles

In [None]:
def get_open_subtitles(movie_name, language_code="eng"):
    """Download the subtitles of the first search result for a movie.

    Parameters
    ----------
    movie_name
        Query passed to the OpenSubtitles ``search`` helper.
    language_code
        Language code passed to ``search`` (default ``"eng"``).

    Returns
    -------
    Whatever ``download_subtitle_from_url`` returns for the selected subtitle.

    Raises
    ------
    IndexError
        If the search returns no result for ``movie_name``.
    """
    search_results = search(movie_name, language_code)
    candidates = list(search_results.keys())
    if not candidates:
        # Same exception type as the bare `[0]` lookup, but with a usable message
        raise IndexError(
            f"No {language_code!r} subtitles found for {movie_name!r}"
        )
    # Always take the first entry returned by the search
    subtitle_url = get_subtitle_link(search_results[candidates[0]])
    return download_subtitle_from_url(subtitle_url)

# Analyze

In [None]:
# Reload the dataset written in the Metadata section
movies_with_metadata = pd.read_csv(filepath_or_buffer="box_office_NA_movies.csv")

In [None]:
# Single analyzer instance shared by every movie processed below
analyzer = TextAnalyzer(
    output_directory="predictions/",
    input_file="subs.txt",
)

In [None]:
# Five independent, initially empty accumulators filled by the processing loop
agents, entities, patients, posseses, modifiers = (pd.DataFrame() for _ in range(5))

In [None]:
# WIP - issues with analyzer not being reinitialized after each movie

def postprocess(dataframes):
    """Append the analyzer's current tables to the running result frames.

    Reads two notebook-level globals rather than parameters: ``analyzer``
    (its ``agent``, ``entities``, ``patient``, ``possess`` and ``modifiers``
    tables) and ``movie_name`` (stored in a ``movie_name`` column on every
    appended row).

    Parameters
    ----------
    dataframes
        Exactly five accumulated frames, ordered
        ``agents, entities, patients, posseses, modifiers``.

    Returns
    -------
    tuple
        Five new frames in the same order, each the input frame followed by
        the tagged copy of the matching analyzer table, with a fresh index.
    """
    agents, entities, patients, posseses, modifiers = dataframes
    totals = [agents, entities, patients, posseses, modifiers]
    # Analyzer attribute feeding each accumulator, in positional order
    analyzer_tables = ("agent", "entities", "patient", "possess", "modifiers")

    for position, attribute in enumerate(analyzer_tables):
        # Work on a copy so the analyzer's own table is not modified
        latest = getattr(analyzer, attribute).copy()
        latest["movie_name"] = movie_name
        totals[position] = pd.concat([totals[position], latest], ignore_index=True)

    return tuple(totals)

In [None]:
# WIP - rate limit issues

# Rows whose title is missing (NaN after the CSV round trip) cannot be searched; skip them.
titles = movies_with_metadata["title"].dropna()

# `movie_name` stays a notebook global on purpose: postprocess() reads it.
for movie_name in tqdm(titles, total=len(titles)):
    try:
        analyzer.analyze(movie_name, get_open_subtitles(movie_name))
        agents, entities, patients, posseses, modifiers = postprocess([agents, entities, patients, posseses, modifiers])
    except Exception as e:
        # Best effort: keep processing the other movies, but report the failure
        # (tqdm.write prints without breaking the progress bar).
        tqdm.write(f"Skipping {movie_name!r}: {e!r}")
        continue

In [None]:
# The row index is only a positional counter (postprocess concatenates with
# ignore_index=True), so it is left out, as for box_office_NA_movies.csv.
agents.to_csv("agents.csv", index=False)
entities.to_csv("entities.csv", index=False)
patients.to_csv("patients.csv", index=False)
posseses.to_csv("posseses.csv", index=False)
modifiers.to_csv("modifiers.csv", index=False)

# Analysis

## Agents

In [None]:
# Titled so the figure can be read on its own when the notebook is skimmed
px.histogram(agents, x="referential_gender", title="Agents by referential gender")

In [None]:
# Total `cnt` per `attr` for agents tagged he/him/his, largest first
(
    agents.loc[agents["referential_gender"] == "he/him/his"]
    .groupby(["attr"])["cnt"]
    .sum()
    .reset_index()
    .sort_values(by="cnt", ascending=False)
)

In [None]:
# Total `cnt` per `attr` for agents tagged she/her, largest first
(
    agents.loc[agents["referential_gender"] == "she/her"]
    .groupby(["attr"])["cnt"]
    .sum()
    .reset_index()
    .sort_values(by="cnt", ascending=False)
)

In [None]:
# Total `cnt` per `attr` for agents tagged they/them/their, largest first
(
    agents.loc[agents["referential_gender"] == "they/them/their"]
    .groupby(["attr"])["cnt"]
    .sum()
    .reset_index()
    .sort_values(by="cnt", ascending=False)
)

In [None]:
# Titled so the figure can be read on its own when the notebook is skimmed
px.histogram(entities, x="cat", nbins=20, title="Entity counts by cat")

In [None]:
# Titled so the figure can be read on its own when the notebook is skimmed
px.histogram(patients, x="referential_gender", title="Patients by referential gender")

In [None]:
# Titled so the figure can be read on its own when the notebook is skimmed
px.histogram(posseses, x="referential_gender", title="Possess relations by referential gender")

In [None]:
# Titled so the figure can be read on its own when the notebook is skimmed
px.histogram(modifiers, x="referential_gender", title="Modifiers by referential gender")