In [101]:
import sys
import pandas as pd
import re
from tqdm import tqdm
sys.path.append("../../")
from bechdelai.data.opensubtitles import search, get_subtitle_link, download_subtitle_from_url
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import plotly.express as px

In [102]:
# Hugging Face checkpoint used for named-entity recognition.
ner_model_path = "Davlan/bert-base-multilingual-cased-ner-hrl"

# Load the tokenizer and the token-classification weights from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(ner_model_path)
model = AutoModelForTokenClassification.from_pretrained(ner_model_path)

# aggregation_strategy="average" makes the pipeline return grouped entities;
# the extraction loop later reads their "entity_group" and "word" fields.
ner = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="average",
)

In [49]:
# Hugging Face checkpoint used for sentiment classification; the same
# identifier provides both the model weights and the tokenizer.
model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)

In [50]:
def remove_html(text):
    """Return ``text`` with every ``<...>`` tag removed.

    A tag is any ``<`` followed by zero or more non-``>`` characters and a
    closing ``>``. The text between tags is left untouched.
    """
    tag_pattern = re.compile(r'<[^>]*>')
    return tag_pattern.sub('', text)

In [51]:
# Query parameters passed to the OpenSubtitles search below.
movie_name = "OSS 117 - Le caire nid d'espions"
# Subtitle language code — presumably "fre" selects French; confirm against `search`.
language_code = "fre"

In [52]:
# Despite its name, `search_url` is a dict mapping each matching movie title
# to its OpenSubtitles page URL (see the printed output).
search_url = search(movie_name, language_code)
print(search_url)

{'OSS 117: Cairo, Nest of Spies (2006)': 'https://www.opensubtitles.org/en/subtitles/7481711/oss-117-cairo-nest-of-spies-fr'}


In [53]:
# Key of the search result to use; it must match a title printed by the search cell exactly.
wanted_movie = 'OSS 117: Cairo, Nest of Spies (2006)'

In [54]:
# Resolve the chosen movie's subtitle page URL into a subtitle link
# (presumably the direct download URL — confirm against `get_subtitle_link`).
subtitle_url = get_subtitle_link(search_url[wanted_movie])

In [55]:
# Download the subtitles; the returned object's `.text` attribute is read by the extraction loop.
subs = download_subtitle_from_url(subtitle_url)

In [106]:
# One record per subtitle fragment: cleaned text, the named entities found in
# it, and its sentiment label.
replicas = []

# Split the subtitle text into sentence-like fragments on '.', '!' and '?'.
for replica in tqdm(re.split(r'[.!?]', subs.text)):
    # Drop markup, flatten line breaks and dashes, then trim. Without the
    # trim, fragments made only of whitespace pass the emptiness check and are
    # needlessly sent through both models and counted in the sentiment plot.
    replica = remove_html(replica).replace("\n", " ").replace("-", " ").strip()
    if not replica:
        continue
    try:
        # Keep only the entity type and its surface form.
        entities = [
            (entity.get("entity_group"), entity.get("word"))
            for entity in ner(replica)
        ]
    except TypeError:  # some issues with aggregation strategy sometimes
        # Best effort: a fragment the NER pipeline cannot aggregate is skipped entirely.
        continue
    replicas.append({
        "text": replica,
        "entities": entities,
        # The pipeline returns a list of results; keep the label of the first one.
        "sentiment": sentiment(replica)[0].get("label")
    })

100%|██████████| 1680/1680 [03:16<00:00,  8.56it/s]


In [130]:
# Tabulate the per-replica records (columns: text, entities, sentiment).
replicas_df = pd.json_normalize(replicas)

# One row per (type, label) tuple; replicas without entities become NaN and
# are dropped. The original replica index is kept on every row.
exploded_entities = replicas_df["entities"].explode().dropna()

# First tuple element is the entity type, second is the entity text.
entity_types = exploded_entities.apply(lambda pair: pair[0])
entity_labels = exploded_entities.apply(lambda pair: pair[1])

In [108]:
# Number of replicas per sentiment label, with the bars in a fixed order.
sentiment_order = [ "Negative", "Neutral", "Positive"]
fig = px.histogram(
    replicas_df,
    x="sentiment",
    category_orders={"sentiment": sentiment_order},
)
fig.show()

In [109]:
# Number of mentions per entity type, most frequent type first.
# update_xaxes returns the figure itself, so the calls can be chained.
fig = px.histogram(x=entity_types).update_xaxes(categoryorder='total descending')
fig.show()

In [110]:
# Number of mentions per entity text, most frequent entity first.
# update_xaxes returns the figure itself, so the calls can be chained.
fig = px.histogram(x=entity_labels).update_xaxes(categoryorder='total descending')
fig.show()

In [162]:
# The ten most frequently mentioned entities (label -> mention count).
top_entities = entity_labels.value_counts().nlargest(10)

# Restrict the mentions to those top entities. Previously `top_entities` was
# computed but never used, so every entity label ended up in the plot.
top_entity_mentions = entity_labels[entity_labels.isin(top_entities.index)]

# After reset_index the "index" column is the replica's row number in
# replicas_df, so the x axis follows the order of the subtitles; nbins=20
# slices that order into segments and each bar is coloured by entity.
fig = px.histogram(top_entity_mentions.reset_index(),
              x="index",  color='entities', nbins=20)
fig.show()