In [None]:
import pandas as pd
import os
import sys
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
import joblib
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
%matplotlib inline

sys.path.append('..')
from helpers.readers import read_dataframe

DATA_PATH = '../generated/annotations_2013/'

### How to get the data
1. Download `tokens.parquet` and `dependencies.parquet` from [here](https://drive.google.com/drive/folders/1qEoAM8HcLksss1-gfKOTN_6GDi6-MsTT?usp=sharing) (they are generated by [`XML to Dataframes (2013).ipynb`](https://github.com/epfl-ada/ada-2023-project-crunchychicken/blob/main/pipelines/XML%20to%20Dataframes%20(2013).ipynb)).
2. Place the files inside `annotations_2013/`:
```
project_root/
│
├── P2.ipynb
│
├── generated/
│   ├── annotations_2013/
```

In [None]:
# Load the 2013 token-level and dependency-level annotation tables through the project's
# `read_dataframe` helper (imported from helpers.readers in the first cell).
# NOTE(review): the 'cmu/...' keys are resolved inside the helper; presumably they map to the
# parquet files described in the instructions above — confirm against helpers/readers.py.
tokens = read_dataframe('cmu/tokens_2013')
dependencies = read_dataframe('cmu/dependencies_2013')

### Preprocessing
Find the movies that contain at least one character (i.e. whose tokens include at least one row with `NER == "PERSON"`), and restrict the dependencies to the types we are looking for, as in *Learning Latent Personas of Film Characters* by David Bamman, Brendan O’Connor, and Noah A. Smith.

![dep_needed](https://i.postimg.cc/sfbxBMV6/image-2023-11-15-113351627.png)

In [None]:
# Tokens tagged as a person by the NER annotator. These are the older annotations, so NER can
# miss some characters (for example by labelling them ORGANIZATION).
is_person = tokens["NER"] == "PERSON"

# Only the last expression of a cell is rendered, so the preview has to be displayed explicitly.
display(tokens[is_person].head(3))

movies_with_atleast_one_person = tokens.loc[is_person, "movie_id"].unique().tolist()
del is_person  # one boolean per token row; not needed once the ids are extracted
len(movies_with_atleast_one_person)

In [None]:
%%time
# Dependency relations that tie a character to its verbs and attributes
# (the relation sets referenced in the Preprocessing section above).
agent_verbs = ["agent", "nsubj"]
patient_verbs = ["dobj", "nsubjpass", "iobj"]  # + prep_* (handled separately)
attributes_av = ["nsubj", "appos"]
attributes_pv = ["nsubj", "appos", "amod", "nn"]

is_collapsed = dependencies["dependency_class"] == "collapsed-ccprocessed"
is_listed_type = dependencies["dependency_type"].isin(
    agent_verbs + patient_verbs + attributes_av + attributes_pv
)
# na=False: a missing dependency_type counts as "not a prep_* relation" instead of
# injecting NaN into the boolean mask.
is_prep = dependencies["dependency_type"].str.startswith("prep_", na=False)

dependencies_needed = dependencies[is_collapsed & (is_listed_type | is_prep)].copy()
del is_collapsed, is_listed_type, is_prep  # row-length masks, no longer needed

# Only the last expression of a cell is rendered, so the preview has to be displayed explicitly.
display(dependencies_needed.head(3))

movies_with_atleast_one_dep = dependencies_needed["movie_id"].unique().tolist()

len(movies_with_atleast_one_dep)

In [None]:
%%time
# Keep only the movies that have BOTH a PERSON token and at least one relevant dependency.
set_movies_with_person = set(movies_with_atleast_one_person)
set_movies_with_dep = set(movies_with_atleast_one_dep)
common_movies = set_movies_with_person & set_movies_with_dep
common_movies_list = [*common_movies]

len(common_movies_list)

In [None]:
# Restrict both tables to the movies in `common_movies_list` and drop the columns that are
# not used afterwards. `drop` already returns a new frame, so no explicit `.copy()` of these
# large tables is needed in between.
filtered_tokens = (
    tokens[tokens["movie_id"].isin(common_movies_list)]
    .drop(columns=["COB", "COE"])
)
filtered_dependencies = (
    dependencies_needed[dependencies_needed["movie_id"].isin(common_movies_list)]
    .drop(columns=["dependency_class"])
)

In [None]:
%%time
# Attach each token's annotations to every dependency it takes part in:
# once where the token is the governor, once where it is the dependent.
sentence_keys = ['movie_id', 'sentence_id']

merge1 = filtered_tokens.merge(
    filtered_dependencies,
    how='inner',
    left_on=[*sentence_keys, 'token_id'],
    right_on=[*sentence_keys, 'governor_id'],
)

merge2 = filtered_tokens.merge(
    filtered_dependencies,
    how='inner',
    left_on=[*sentence_keys, 'token_id'],
    right_on=[*sentence_keys, 'dependent_id'],
)

# Stack both directions, remove exact duplicate rows and renumber the index.
both_directions = [merge1, merge2]
merged_data = pd.concat(both_directions).drop_duplicates().reset_index(drop=True)
merged_data

In [None]:
# Sanity check: in every merged row the token must be either the governor or the dependent.
# Assert it so a violation stops the run instead of relying on someone reading the output.
unmatched_rows = merged_data[(merged_data["token_id"] != merged_data["governor_id"]) &
                             (merged_data["token_id"] != merged_data["dependent_id"])]
assert unmatched_rows.empty, (
    f"{len(unmatched_rows)} merged rows match neither governor_id nor dependent_id"
)
unmatched_rows  # should be empty ✅

In [None]:
# !pip install memory_profiler

# install and load memory_profiler to use %memit, use %whos to see what's in memory
%load_ext memory_profiler 
%memit
import gc

del common_movies
del set_movies_with_dep
del set_movies_with_person
del common_movies_list
del movies_with_atleast_one_dep
del movies_with_atleast_one_person

del merge1
del merge2

del filtered_tokens
del filtered_dependencies

del dependencies_needed
del dependencies
del tokens

gc.collect()
%memit

### For each plot:
1. Extract the characters (NER=PERSON)
2. For each character extract (dependencies + coreference)
    - agent verbs
    - patient verbs
    - attributes

In [None]:
# Result table: one row per character with its agent verbs (AV), patient verbs (PV)
# and attributes (Att).
character_columns = ['movie_id', 'character', 'AV', 'PV', 'Att']
characters = pd.DataFrame(columns=character_columns)

# POS tags treated as verbs; anything else attached to a character counts as an attribute.
verb_pos_tags = [
    "VB", "VBD", "VBG",
    "VBN", "VBP", "VBZ",
]

### Extractor using parallelization

In [None]:
%%time
# there are more precise ways to do it, but as we are using not very accurate annotations data, we will spend more time enhancing the new annotations.

def _token_lookup(movie_id):
    """Map (sentence_id, token_id) -> (POS, lemma) for one movie of the global `merged_data`.

    When several rows share the same (sentence_id, token_id), the first one in
    `merged_data` order wins.
    """
    movie_tokens = merged_data.loc[merged_data['movie_id'] == movie_id,
                                   ['sentence_id', 'token_id', 'POS', 'lemma']]
    lookup = {}
    for sentence_id, token_id, pos, lemma in movie_tokens.itertuples(index=False, name=None):
        lookup.setdefault((sentence_id, token_id), (pos, lemma))
    return lookup


def process_group(chunk):
    """Collect agent verbs, patient verbs and attributes for every character in `chunk`.

    Reads the module-level `merged_data`, `verb_pos_tags`, `agent_verbs` and `patient_verbs`.

    Parameters
    ----------
    chunk : pd.DataFrame
        Rows of `merged_data` (one per token/dependency pair) for PERSON tokens. Rows are
        grouped by ('movie_id', 'word'); each group is treated as one character.

    Returns
    -------
    list[dict]
        One dict per (movie_id, word) group with keys 'movie_id', 'character', 'AV', 'PV'
        and 'Att'; the last three are lists of lemmas.

    Classification of the token at the other end of each dependency:
    - verb (POS in `verb_pos_tags`) with a relation in `agent_verbs`              -> AV
    - verb with a relation in `patient_verbs` or a collapsed `prep_*` relation    -> PV
    - verb with any other relation                                                -> ignored
    - non-verb                                                                    -> Att
    """
    temp_data = []
    lookups = {}  # holds the lookup of the movie currently being processed only

    # groupby sorts by key, so all groups of one movie are consecutive.
    for (movie_id, character), group in chunk.groupby(['movie_id', 'word']):
        if movie_id not in lookups:
            # Built once per movie instead of scanning all of `merged_data` for every row.
            lookups = {movie_id: _token_lookup(movie_id)}
        lookup = lookups[movie_id]

        AV, PV, Att = [], [], []

        for _, row in group.iterrows():
            # Pick the token at the *other* end of the dependency.
            if row['governor_word'] == character:
                other_id = row['dependent_id']
            elif row['dependent_word'] == character:
                other_id = row['governor_id']
            else:
                # Neither end matches the character: skip the row rather than carrying on
                # with an id left over from the previous iteration (or none at all).
                print(f"Skipping dependency not involving {character!r} "
                      f"(movie {movie_id}, sentence {row['sentence_id']})")
                continue

            found = lookup.get((row['sentence_id'], other_id))
            if found is None:
                # The other token has no row in `merged_data`; nothing to classify.
                continue
            pos, lemma = found

            relation = row['dependency_type']
            if pos in verb_pos_tags:
                if relation in agent_verbs:
                    AV.append(lemma)
                elif relation in patient_verbs or relation.startswith("prep_"):
                    # prep_* arguments are patient relations (see the `patient_verbs` comment).
                    PV.append(lemma)
            else:
                Att.append(lemma)

        temp_data.append({
            'movie_id': movie_id,
            'character': character,
            'AV': AV,
            'PV': PV,
            'Att': Att
        })
    return temp_data

character_data = merged_data[merged_data["NER"] == "PERSON"]

num_partitions = joblib.cpu_count()

# Partition by movie, not by row position. `process_group` groups on (movie_id, word), and the
# rows of one character are not contiguous in `merged_data` (it is a concat of two merges), so
# slicing raw rows would spread a character over several chunks and emit it more than once.
movie_ids = character_data["movie_id"].unique()
# Movies per chunk: ceiling division, and never 0 (a step of 0 would make `range` raise).
chunk_size = max(1, -(-len(movie_ids) // num_partitions))
chunks = [
    character_data[character_data["movie_id"].isin(movie_ids[i:i + chunk_size])]
    for i in range(0, len(movie_ids), chunk_size)
]

results = Parallel(n_jobs=num_partitions)(delayed(process_group)(chunk) for chunk in tqdm(chunks))

flattened_results = [item for sublist in results for item in sublist]
# Explicit columns keep the schema stable even when no character was found.
characters = pd.DataFrame(flattened_results, columns=['movie_id', 'character', 'AV', 'PV', 'Att'])

In [None]:
# Persist the per-character table as a brotli-compressed parquet file under DATA_PATH.
characters.to_parquet(os.path.join(DATA_PATH, "characters.parquet"), compression= "brotli")

In [None]:
characters # author's note: the PV column looks empty in this preview, but it is not empty overall

### Generate bag of words
A bag is a tuple $(r, w)$, where $r \in \{\text{agent verb}, \text{patient verb}, \text{attribute}\}$ is the role and $w$ is the lemma of the word.

In [None]:
def generate_bags_of_words(characters_df: pd.DataFrame):
    """Flatten per-character word lists into (movie_id, character, role, word) tuples.

    Parameters
    ----------
    characters_df : pd.DataFrame
        Needs the columns 'movie_id', 'character', 'AV', 'PV' and 'Att'. The last three hold
        a sequence of words per row.

    Returns
    -------
    list[tuple]
        For every row, in order: one ('agent_verb') tuple per AV word, then one
        ('patient_verb') tuple per PV word, then one ('attribute') tuple per Att word.
    """
    def _as_words(cell):
        # Lists/tuples are used as they are.
        if isinstance(cell, (list, tuple)):
            return cell
        # 1-D array-likes (e.g. numpy arrays, which list columns may become once the frame
        # has been written to and read back from parquet) are accepted as well.
        if getattr(cell, 'ndim', 0) == 1:
            return cell.tolist()
        # Anything else (None, NaN, scalars) contributes no words.
        return []

    roles = (('AV', 'agent_verb'), ('PV', 'patient_verb'), ('Att', 'attribute'))
    bags_of_words = []

    # Iterating the columns in lockstep avoids building a Series per row as iterrows does.
    rows = zip(
        characters_df['movie_id'],
        characters_df['character'],
        *(characters_df[column] for column, _ in roles),
    )
    # `total` lets tqdm show real progress instead of a bare counter.
    for movie_id, character_name, *cells in tqdm(rows, total=len(characters_df)):
        for (_, role), cell in zip(roles, cells):
            for word in _as_words(cell):
                bags_of_words.append((movie_id, character_name, role, word))

    return bags_of_words

bags_of_words = generate_bags_of_words(characters)

In [None]:
# One row per (movie, character, role, word) tuple, saved next to the other generated files.
bag_columns = ['movie_id', 'character_name', 'type', 'word']
bags_df = pd.DataFrame(data=bags_of_words, columns=bag_columns)

bags_path = os.path.join(DATA_PATH, "bags.parquet")
bags_df.to_parquet(bags_path, compression="brotli")

### From tuples to topics using LDA

In [None]:
# Reload the bags through the project reader and expose them as a plain 2-D array of
# (movie_id, character_name, type, word) rows for the loop in the next cell.
# NOTE(review): presumably 'cmu/bags_2013' resolves to the file saved above — confirm in helpers/readers.py.
bags_df = read_dataframe('cmu/bags_2013')
bags_of_words = bags_df.to_numpy()

bags_df.head(3)

In [None]:
# Global-character version for now: the same name appearing in different movies is treated
# as one single character.
character_docs = defaultdict(list)
for record in bags_of_words:
    _, character, _, word = record
    character_docs[character].append(word)

# Turn each character's word list into one whitespace-separated "document".
for character, words in character_docs.items():
    character_docs[character] = " ".join(words)

# Bag-of-words counts per character document.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(character_docs.values())

n_topics = 50
lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=10,
    random_state=0,
    verbose=2,
)
lda.fit(X)

In [None]:
# Persist the fitted LDA model under DATA_PATH, gzip-compressed at the maximum level (9).
joblib.dump(lda, os.path.join(DATA_PATH, "lda_model.gz"), compress=('gzip', 9))

In [None]:
# Reload the model saved by the previous cell.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only load model
# files that you produced yourself or otherwise trust.
lda = joblib.load(os.path.join(DATA_PATH, "lda_model.gz"))

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print one line per topic listing its highest-weighted feature names.

    Parameters
    ----------
    model : object with a `components_` attribute
        Iterated row by row; each row holds one weight per feature.
    feature_names : sequence
        Name of each feature, indexed like the columns of `components_`.
    n_top_words : int
        How many names to print per topic, from the largest weight downwards.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the first n_top_words indices.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[i] for i in ranked]
        print(f"Topic {topic_idx}: " + " ".join(top_terms))

# Show the 10 strongest vocabulary terms of every topic.
# `vectorizer` comes from the cell that fits the model, so that cell must have run in this kernel.
n_topic_words = 10
vocabulary = vectorizer.get_feature_names_out()
print_top_words(lda, vocabulary, n_topic_words)

In [None]:
# Per-character topic distribution (one row per document in X) and the matching names,
# in the insertion order of `character_docs`.
character_topic = lda.transform(X)
character_names = list(character_docs)

# Hard assignment: each character gets the topic with the largest weight in its row;
# the full distribution is kept alongside it.
most_probable_topics = character_topic.argmax(axis=1)
character_classification = list(zip(character_names, most_probable_topics, character_topic))

classification_columns = ['character_name', 'topic', 'topic_dist']
character_classification_df = pd.DataFrame(character_classification, columns=classification_columns)
character_classification_df.head(3)

In [None]:
# Persist the per-character topic assignment as a brotli-compressed parquet file under DATA_PATH.
character_classification_df.to_parquet(os.path.join(DATA_PATH, "character_classification.parquet"), compression= "brotli")

In [None]:
character = 'Batman'

# Topic distribution of the first row whose name matches `character`.
is_character = character_classification_df['character_name'] == character
character_topics = character_classification_df.loc[is_character, 'topic_dist'].iloc[0]

topics = range(len(character_topics))

# Bar chart of the distribution; the log y-axis keeps low-probability topics visible.
fig, ax = plt.subplots(figsize=(16, 6))
ax.bar(topics, character_topics)
ax.set_xlabel('Topic')
ax.set_ylabel('Probability (log scale)')
ax.set_title(f'Topic Distribution for {character}')
ax.set_yscale('log')
ax.set_xticks(topics)
plt.show()

In [None]:
# Number of characters whose most probable topic is each topic, ordered by topic id.
topic_counts = character_classification_df['topic'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(16, 6))
ax.bar(topic_counts.index, topic_counts.values)
ax.set_xlabel('Topic')
ax.set_ylabel('Number of Characters')
ax.set_title('Distribution of Characters Across Topics')
ax.set_xticks(topic_counts.index)
plt.show()