In [1]:
# mutliple outputs in cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# cell width
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import pandas as pd
import os
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import numpy as np
import joblib

PATH_IN = './data/'

In [3]:
# Loading NLP data
tokens_fname = os.path.join(PATH_IN, 'df_tokens.parquet')
dependencies_fname = os.path.join(PATH_IN, 'df_dependencies.parquet')
coref_fname = os.path.join(PATH_IN, 'df_coreference.parquet')
entities_fname= os.path.join(PATH_IN, 'df_entities.parquet')

tokens = pd.read_parquet(tokens_fname)
dependencies = pd.read_parquet(dependencies_fname)
coreference = pd.read_parquet(coref_fname)
entities = pd.read_parquet(entities_fname)

In [4]:
tokens['token_id'] = tokens.groupby(['Wikipedia_movie_id', 'Sentence_id']).cumcount() + 1 # adding token id, not present by default in 4.5.5
tokens = tokens.drop(['Sentiment', 'COB', 'COE'], axis=1) # can use Sentiment later

In [5]:
# 15M            15M               1.8M               2.2M
len(tokens), len(dependencies), len(coreference), len(entities)

(15046378, 15376339, 1879672, 2220379)

### Preprocessing
Running on such big dataframes will take too much time. To solve this, we will first filter on movie plots that contain a character (from the entities dataframe check for `Entity_type == PERSON` and a non NaN `Optional_probability`) and on also filter on the dependencies (from the dependencies dataframe check that the `Dependency_type` satisfies an agent verb, patient verb or attribute).

In [6]:
%%time
agent_verbs = ["agent", "nsubj"]
patient_verbs = ["dobj", "nsubjpass", "iobj"] # no prep_ using coreNLP4.5.5
attributes_av = ["nsubj", "appos"]
attributes_pv = ["nsubj", "appos", "amod", "nn"]

def is_matching_dependency(dep_type):
    if dep_type in agent_verbs or dep_type in patient_verbs or dep_type in attributes_av or dep_type in attributes_pv:
        return True
    else: return False

tokens_with_character = tokens[tokens["NER"] == "PERSON"]["Wikipedia_movie_id"].tolist()
entities_with_character = entities[(entities['Entity_type'] == 'PERSON') & (entities['Optional_probability'].notna())]["Wikipedia_movie_id"].tolist()
prefiltered_entities = entities[(entities['Entity_type'] == 'PERSON') & (entities['Optional_probability'].notna())].copy()
dependencies_with_dep = dependencies[dependencies['Dependency_type'].apply(is_matching_dependency)]["Wikipedia_movie_id"].tolist()
prefiltered_dependencies = dependencies[dependencies['Dependency_type'].apply(is_matching_dependency)].copy()

set_tokens = set(tokens_with_character)
set_entities = set(entities_with_character)
set_dependencies = set(dependencies_with_dep)

intersection_set = set_tokens & set_entities & set_dependencies

filtered_tokens = tokens[tokens["Wikipedia_movie_id"].isin(intersection_set)]
filtered_entities = prefiltered_entities[prefiltered_entities["Wikipedia_movie_id"].isin(intersection_set)]
filtered_dependencies = prefiltered_dependencies[prefiltered_dependencies["Wikipedia_movie_id"].isin(intersection_set)]
filtered_coreference = coreference[coreference["Wikipedia_movie_id"].isin(intersection_set)] # complex to use, maybe for P3

CPU times: total: 12.7 s
Wall time: 13.2 s


In [7]:
len(filtered_tokens), len(filtered_dependencies), len(filtered_coreference), len(filtered_entities)

(14714763, 3603836, 1856122, 848088)

In [8]:
%%time
word_to_character = {}
for _, row in filtered_entities.iterrows():
    character_name = row['Word']
    movie_id = row['Wikipedia_movie_id']
    sentence_id = row['Sentence_id']
    for word in character_name.split():
        key = (movie_id, sentence_id, word)
        word_to_character[key] = character_name

filtered_tokens_copy = filtered_tokens.copy()

def map_word_to_character(row):
    key = (row['Wikipedia_movie_id'], row['Sentence_id'], row['Word'])
    return word_to_character.get(key, '')

filtered_tokens_copy['Character'] = filtered_tokens_copy.apply(map_word_to_character, axis=1)

CPU times: total: 1min 49s
Wall time: 1min 56s


In [9]:
%%time
merge1 = pd.merge(filtered_tokens_copy, filtered_dependencies, left_on=['Wikipedia_movie_id', 'Sentence_id', 'token_id'], right_on=['Wikipedia_movie_id', 'Sentence_id', 'Word_1_idx'])

merge2 = pd.merge(filtered_tokens_copy, filtered_dependencies, left_on=['Wikipedia_movie_id', 'Sentence_id', 'token_id'], right_on=['Wikipedia_movie_id', 'Sentence_id', 'Word_2_idx'])

final_merged_df = pd.concat([merge1, merge2], ignore_index=True)

CPU times: total: 19.1 s
Wall time: 20.2 s


In [10]:
final_merged_df

Unnamed: 0,Wikipedia_movie_id,Sentence_id,Word,POS,Lemma,NER,token_id,Character,Dependency_type,Word_1,Word_1_idx,Word_2,Word_2_idx
0,11784534,1,Bergman,NNP,Bergman,PERSON,7,Ingrid Bergman,nn,Bergman,7,Ingrid,6
1,11784534,1,Sanders,NNP,Sanders,PERSON,10,George Sanders,nn,Sanders,10,George,9
2,11784534,1,couple,NN,couple,O,15,,nsubj,couple,15,Joyces,2
3,11784534,1,couple,NN,couple,O,15,,amod,couple,15,English,14
4,11784534,1,gone,VBN,go,O,18,,nsubj,gone,18,who,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7207667,9990262,48,min,NNP,min,O,10,,nsubj,asks,12,min,10
7207668,9990262,48,min,NNP,min,O,10,,nsubj,get,14,min,10
7207669,9990262,48,head,NN,head,O,16,,nn,surgery,22,head,16
7207670,9990262,48,toe,NN,toe,O,20,,nn,plastic,21,toe,20


In [11]:
%load_ext memory_profiler
%memit
import gc

del coref_fname
del coreference
del dependencies_fname
del filtered_coreference
del filtered_entities
del filtered_tokens_copy
del filtered_dependencies
del filtered_tokens
del dependencies
del dependencies_with_dep
del entities
del entities_fname
del entities_with_character
del intersection_set
del merge1
del merge2 
del prefiltered_dependencies
del prefiltered_entities
del set_dependencies        
del set_entities              
del set_tokens
del tokens
del tokens_fname
del tokens_with_character
gc.collect()
%memit

peak memory: 9712.51 MiB, increment: 0.61 MiB


0

peak memory: 6410.87 MiB, increment: 0.02 MiB


### Adding the POS and lemma of dependencies to the merged dataframe before running extractor

In [12]:
def get_dependency_pos_lemma(row):
    if row['Character']:
        dependency_idx = row['Word_2_idx'] if row['Word'] == row['Word_1'] else row['Word_1_idx']

        pos_lemma = final_merged_df.loc[(final_merged_df['Wikipedia_movie_id'] == row['Wikipedia_movie_id']) &
                                        (final_merged_df['Sentence_id'] == row['Sentence_id']) &
                                        (final_merged_df['token_id'] == dependency_idx), ['POS', 'Lemma']].iloc[0]
        return pos_lemma.POS, pos_lemma.Lemma
    return '', ''

In [13]:
%%time
def process_chunk(chunk):
    return chunk.apply(get_dependency_pos_lemma, axis=1)

num_partitions = joblib.cpu_count()
chunk_size = int(np.ceil(final_merged_df.shape[0] / num_partitions))
chunks = [final_merged_df.iloc[i:i + chunk_size] for i in range(0, final_merged_df.shape[0], chunk_size)]

results = Parallel(n_jobs=num_partitions)(delayed(process_chunk)(chunk) for chunk in chunks)

pos_lemma_df = pd.concat(results, ignore_index=True)

final_merged_df = pd.concat([final_merged_df.reset_index(drop=True), pos_lemma_df], axis=1)

final_merged_df_filtered = final_merged_df[final_merged_df['Character'] != '']



CPU times: total: 10min 54s
Wall time: 2h 8min 16s


In [None]:
final_merged_df_filtered = final_merged_df_filtered.copy()

final_merged_df_filtered.loc[:, 'Dependency_POS'] = final_merged_df_filtered[0].apply(lambda x: x[0])
final_merged_df_filtered.loc[:, 'Dependency_Lemma'] = final_merged_df_filtered[0].apply(lambda x: x[1])

final_merged_df_filtered = final_merged_df_filtered.drop(columns=[0])

final_merged_df_filtered

In [29]:
del final_merged_df
gc.collect()

2594

In [38]:
final_merged_df_filtered

Unnamed: 0,Wikipedia_movie_id,Sentence_id,Word,POS,Lemma,NER,token_id,Character,Dependency_type,Word_1,Word_1_idx,Word_2,Word_2_idx,Dependency_POS,Dependency_Lemma
0,11784534,1,Bergman,NNP,Bergman,PERSON,7,Ingrid Bergman,nn,Bergman,7,Ingrid,6,NNP,Ingrid
1,11784534,1,Sanders,NNP,Sanders,PERSON,10,George Sanders,nn,Sanders,10,George,9,NNP,George
18,10131263,1,Coyote,NNP,Coyote,PERSON,5,Wile E. Coyote,nn,Coyote,5,Wile,3,NNP,Wile
19,10131263,1,Coyote,NNP,Coyote,PERSON,5,Wile E. Coyote,nn,Coyote,5,E.,4,NNP,E.
189,10131263,41,Coyote,NNP,Coyote,PERSON,5,Coyote,amod,Coyote,5,charred,4,JJ,charred
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7207599,9990262,38,Jenny,NNP,Jenny,PERSON,7,Jenny,dobj,encourages,4,Jenny,7,VBZ,encourage
7207600,9990262,38,Jenny,NNP,Jenny,PERSON,7,Jenny,nsubj,do,9,Jenny,7,VB,do
7207605,9990262,39,Jenny,NNP,Jenny,PERSON,5,Jenny,nsubj,sing,8,Jenny,5,VB,sing
7207611,9990262,40,Jenny,NNP,Jenny,PERSON,8,Jenny,nsubj,fake,12,Jenny,8,JJ,fake


### Extractor

In [15]:
characters = pd.DataFrame(columns=['Wikipedia_movie_id', 'Character', 'AV', 'PV', 'Att']) # dataframe where we will store the character, its actions (agent and patient) and attributes
verb_pos_tags = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]

In [47]:
%%time
def process_chunk(chunk):
    temp_data = defaultdict(lambda: {'AV': [], 'PV': [], 'Att': []})
    for _, row in chunk.iterrows():
        result = process_row(row)
        key = (result['Wikipedia_movie_id'], result['Character'])
        temp_data[key]['AV'].extend(result['AV'])
        temp_data[key]['PV'].extend(result['PV'])
        temp_data[key]['Att'].extend(result['Att'])
    return temp_data

# Divide final_merged_df_filtered into chunks
num_partitions = joblib.cpu_count()
chunk_size = int(np.ceil(final_merged_df_filtered.shape[0] / num_partitions))
chunks = [final_merged_df_filtered.iloc[i:i + chunk_size] for i in range(0, final_merged_df_filtered.shape[0], chunk_size)]

# Process each chunk in parallel
chunk_results = Parallel(n_jobs=num_partitions)(delayed(process_chunk)(chunk) for chunk in chunks)

# Combine the results from all chunks
combined_results = defaultdict(lambda: {'AV': [], 'PV': [], 'Att': []})
for chunk_result in chunk_results:
    for key, values in chunk_result.items():
        combined_results[key]['AV'].extend(values['AV'])
        combined_results[key]['PV'].extend(values['PV'])
        combined_results[key]['Att'].extend(values['Att'])

# Convert combined_results to DataFrame
character_data = []
for (movie_id, character), values in combined_results.items():
    character_data.append({
        'Wikipedia_movie_id': movie_id,
        'Character': character,
        'AV': values['AV'],
        'PV': values['PV'],
        'Att': values['Att']
    })

characters = pd.DataFrame(character_data)

CPU times: total: 7 s
Wall time: 9.21 s


In [49]:
characters

Unnamed: 0,Wikipedia_movie_id,Character,AV,PV,Att
0,11784534,Ingrid Bergman,[],[],"[Ingrid, Bergman]"
1,11784534,George Sanders,[],[],"[George, Sanders]"
2,10131263,Wile E. Coyote,[cook],[],"[Wile, E., Coyote, Coyote]"
3,10131263,Coyote,[walk],[],"[charred, gullible]"
4,1067527,Lau,[handle],[],"[one, inspector]"
...,...,...,...,...,...
229415,999394,Bootstrap,[tell],[],[]
229416,999394,Dalma,"[tell, say]",[],[]
229417,999394,Gibbs,"[encounter, hire]",[tell],[]
229418,999394,Barbossa,[],[],[captain]


In [54]:
characters.to_parquet("characters_new_annotations.parquet", compression="Brotli")

### Generate bag of words
A bag is a tupple of $(r,w)$, where $r$ is of {agent verb, patient verb, attribute} and $w$ is the lemma of the word.

In [51]:
def generate_bags_of_words(characters_df: pd.DataFrame):
    bags_of_words = []

    for _, row in tqdm(characters_df.iterrows()):
        movie_id = row['Wikipedia_movie_id']
        character_name = row['Character']

        av = row['AV'] if isinstance(row['AV'], list) else []
        pv = row['PV'] if isinstance(row['PV'], list) else []
        att = row['Att'] if isinstance(row['Att'], list) else []

        for verb in av:
            bags_of_words.append((movie_id, character_name, 'agent_verb', verb))

        for verb in pv:
            bags_of_words.append((movie_id, character_name, 'patient_verb', verb))

        for attribute in att:
            bags_of_words.append((movie_id, character_name, 'attribute', attribute))

    return bags_of_words

bags_of_words = generate_bags_of_words(characters)

0it [00:00, ?it/s]

CPU times: total: 2.77 s
Wall time: 7.37 s


In [55]:
bags_df = pd.DataFrame(bags_of_words, columns=['movie_id', 'character_name', 'type', 'word'])
bags_df.to_parquet('bags_new_annotations.parquet', compression='Brotli')

### From tupples to topics using LDA

In [None]:
old_anno_results_path=os.path.join(PATH_IN,'new_annotations_results')
bags_df_path = os.path.join(old_anno_results_path, 'bags_new_annotations.parquet')
bags_df = pd.read_parquet(bags_df_path)
bags_of_words=bags_df.values

bags_df.head(10)

In [None]:
from collections import defaultdict

# global char version
character_docs = defaultdict(list)
for _, character, _, word in bags_of_words:
    character_docs[character].append(word)
    
for character in character_docs:
    character_docs[character] = " ".join(character_docs[character])

In [None]:
documents = list(character_docs.values()) # can probably remove it, and directly use character_docs.values()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

n_topics = 50
lda = LatentDirichletAllocation(n_components=n_topics, verbose=2, max_iter=10, random_state = 0)
lda.fit(X)

In [None]:
joblib.dump(lda, 'lda_model_new_annotations.pkl')

In [None]:
import joblib

#Load LDA trained model
lda = joblib.load(os.path.join(PATH_IN,'old_annotations_results/lda_model_new_annotations.pkl'))

In [None]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = f"Topic #{topic_idx}: "
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
        
print_top_words(lda, vectorizer.get_feature_names_out(), 10)

In [None]:
character_topic = lda.transform(X)
character_names = list(character_docs.keys())

for i, topic_dist in enumerate(character_topic):
    topic_most_pr = topic_dist.argmax()
    print(f"{character_names[i]}: Topic {topic_most_pr}")

In [None]:
# add plot for a char => cluster result