<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Data-Loading" data-toc-modified-id="Data-Loading-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data Loading</a></span></li><li><span><a href="#Pipeline" data-toc-modified-id="Pipeline-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Pipeline</a></span></li><li><span><a href="#clustering-interaactions-in-different-topics" data-toc-modified-id="clustering-interaactions-in-different-topics-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>clustering interaactions in different topics</a></span></li><li><span><a href="#Graph-drawing" data-toc-modified-id="Graph-drawing-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Graph drawing</a></span></li></ul></div>

# Data Loading

In [None]:
import math
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import os
import re
import spacy
from spacy.tokens import Token
from helper import *
import ast
from fastcoref import FCoref
import tqdm

# --- File locations -----------------------------------------------------------
# `parent_folder` is the parent of the directory the notebook is run from
# (abspath resolves "alt2.ipynb" against the current working directory).
# Path components are joined one by one so the separators are correct on every OS.
parent_folder = os.path.dirname(os.path.dirname(os.path.abspath("alt2.ipynb")))
data_dir = os.path.join(parent_folder, "Data")
char_data_path = os.path.join(data_dir, "character.metadata.tsv")
plot_data_path = os.path.join(data_dir, "resolved_texts_fastcoref.csv")
movie_data_path = os.path.join(data_dir, "movie.metadata.tsv")
PLOT_SUMMARY_PATH = os.path.join("..", "data", "plot_summaries.txt")

# Human-readable names of the 13 columns of the character metadata file.
# Kept for backward compatibility; the frames built below use snake_case names.
ind={0:"Wikipedia movie ID", 1:"Freebase movie ID", 2:"Movie release date", 3:"Character name", 4:"Actor date of birth", 5:"Actor gender", 6:"Actor height", 7:"Actor ethnicity", 8:"Actor name", 9:"Actor age at movie release", 10:"Freebase character/actor map ID", 11:"Freebase character ID", 12:"Freebase actor ID"}

# --- Plot summaries: one row per movie, tab-separated, no header --------------
plot_summaries = pd.read_csv(PLOT_SUMMARY_PATH, sep='\t', header=None)
plot_summaries.columns = ['wiki_id', 'plot']

# --- Movie metadata -----------------------------------------------------------
movies = pd.read_csv(movie_data_path, sep='\t', header=None)
movies.columns = [
    'Wikipedia movie ID',
    'freebase_movie_id',
    'movie_name',
    'movie_release_date',
    'movie_box_office_revenue',
    'movie_runtime',
    'movie_languages',
    'movie_countries',
    'movie_genres'
]
# The release date may be a full date or only a year: keep the first 4-digit run.
# (raw string: "\d" in a normal string literal is an invalid escape sequence)
movies['year'] = movies['movie_release_date'].str.extract(r'(\d{4})', expand=False)
movies['year'] = pd.to_numeric(movies['year'], downcast='integer')
movies['movie_box_office_revenue'] = pd.to_numeric(movies['movie_box_office_revenue'], errors='coerce')
movies['movie_runtime'] = pd.to_numeric(movies['movie_runtime'], errors='coerce')

# Languages: list of the words that directly precede " Language" inside quotes.
movies['movie_languages'] = movies['movie_languages'].str.findall(r'"(\w+) Language"')
# Countries: keep only the FIRST quoted value ('' when there is none).
# The match is non-greedy so that a movie with several countries yields the
# first country name instead of everything between the first and last quote.
movies['movie_countries'] = movies['movie_countries'].str.extract(r': "(.+?)"', expand=False).fillna('')
# Genres: list of every quoted value.
movies['movie_genres'] = movies['movie_genres'].str.findall(r': "(.+?)"')

# --- Character metadata (read once) -------------------------------------------
characters_df_full = pd.read_csv(char_data_path, sep='\t', header=None)
characters_df_full.columns = [
    'Wikipedia movie ID',
    'freebase_movie_id',
    'movie_release_date',
    'character_name',
    'actor_dob',
    'actor_gender',
    'actor_height',
    'actor_ethnicity',
    'actor_name',
    'actor_age',
    'freebase_character_map_1',
    'freebase_character_map_2',
    'freebase_character_map_3'
]
# Working copy without the movie-level columns (freebase id, release date),
# which are already available in `movies`.
characters_df = characters_df_full[['Wikipedia movie ID','character_name', 'actor_dob','actor_gender','actor_height', 'actor_ethnicity', 'actor_name', 'actor_age', 'freebase_character_map_1', 'freebase_character_map_2','freebase_character_map_3']]

# Pipeline

This code snippet runs coreference resolution on the plot summaries and saves the resolved texts to `resolved_texts_fastcoref.csv`.

In [None]:
# Pronouns that are replaced by the character name found earlier in their cluster.
PRONOUNS = {"he", "him", "his", "she", "her", "hers", "they", "them", "their"}

resolved_texts = []
model = FCoref(device='cuda:0')

# All distinct character names joined into ONE string. The `in` test below is
# therefore a substring test: a mention counts as a character as soon as it
# occurs anywhere inside that string (e.g. a first name inside a full name).
characters = characters_df['character_name'].unique().tolist()
characters = [name for name in characters if isinstance(name, str)]
characters = ' '.join(characters)

for idx, row in plot_summaries.iterrows():
    # Tokenise once; the model gets its own copy so `tokens` stays untouched.
    tokens = row['plot'].split()
    resolved_tokens = list(tokens)
    preds = model.predict(
        texts=list(tokens),
        is_split_into_words=True
    )
    # Each cluster is a sequence of (start, end) offsets used here as token
    # indices into `tokens` (end exclusive).
    for cluster in preds.get_clusters(as_strings=False):
        character_name = None
        for token_offset in cluster:
            start, end = token_offset[0], token_offset[1]
            mention = ' '.join(tokens[start:end])
            is_pronoun = mention.lower() in PRONOUNS
            if not is_pronoun and mention in characters:
                # Remember the latest named mention of this cluster.
                character_name = mention
            elif is_pronoun and character_name is not None:
                # Pronouns seen before any named mention are left as they are.
                resolved_tokens[start] = character_name
    resolved_texts.append(' '.join(resolved_tokens))

wiki_ids = plot_summaries['wiki_id'].tolist()
resolved_df = pd.DataFrame({'wiki_id': wiki_ids, 'resolved_text': resolved_texts})
# Write to the very path that is read back on the next line, so that the
# round trip cannot silently pick up a different (stale) file.
resolved_df.to_csv(plot_data_path, index=False)
plots_df = pd.read_csv(plot_data_path, delimiter=',')
plots_df = plots_df.rename(columns={"wiki_id":"Wikipedia movie ID", "resolved_text":"Plot Summary"})
plots_df=plots_df.set_index("Wikipedia movie ID")

This code snippet was used to create `Verb_Subject_Object.csv` and `character_list.csv`.

In [None]:
VSO_COLUMNS = ["Wikipedia movie ID", "Sentence", "Verb", "Subject", "Object"]
CHAR_COLUMNS = ["Wikipedia movie ID", "characters"]
# Dependency labels of a verb's children that are collected for each verb.
DEP_LABELS = ("nsubj", "nsubjpass", "dobj", "agent", "ccomp")
# A partial result is written to disk every CHECKPOINT_EVERY movies.
CHECKPOINT_EVERY = 5000


def _concat_vso(frames):
    """Concatenate per-movie frames; return an empty, correctly-shaped frame when there are none."""
    if not frames:
        return pd.DataFrame(columns=VSO_COLUMNS)
    return pd.concat(frames, ignore_index=True)


nlp = spacy.load("en_core_web_lg")

# Rows are accumulated in plain Python lists and turned into DataFrames only
# when they are written: growing a DataFrame row by row is quadratic.
vso_frames = []
char_records = []

for i, movie_id in enumerate(plots_df.index):
    doc = nlp(plots_df.iloc[i]["Plot Summary"])
    # get_characters / get_all_children come from the project's `helper` module.
    char_records.append({"Wikipedia movie ID": movie_id, "characters": get_characters(doc)})

    verb_records = []
    for sent_num, sent in enumerate(doc.sents, start=1):
        for token in sent:
            if token.pos_ != "VERB":
                continue
            values = {"Wikipedia movie ID": movie_id, "Sentence": sent_num, "Verb": token.lemma_}
            for child in token.children:
                if child.dep_ in DEP_LABELS:
                    values[child.dep_] = get_all_children(child)
            verb_records.append(values)

    if verb_records:
        # Missing dependency labels become NaN, which combine_first fills in order.
        sent_df = pd.DataFrame(verb_records, columns=["Wikipedia movie ID", "Sentence", "Verb", *DEP_LABELS])
        # Object: direct object, else passive subject, else clausal complement.
        sent_df["Object"] = sent_df["dobj"].combine_first(sent_df["nsubjpass"]).combine_first(sent_df["ccomp"])
        # Subject: nominal subject, else the agent child.
        sent_df["Subject"] = sent_df["nsubj"].combine_first(sent_df["agent"])
        vso_frames.append(sent_df[VSO_COLUMNS])

    if (i + 1) % CHECKPOINT_EVERY == 0:
        # Flush what was gathered since the previous checkpoint, then start over.
        _concat_vso(vso_frames).to_csv(f'Verb_Subject_Object_{i}.csv', index=False)
        pd.DataFrame(char_records, columns=CHAR_COLUMNS).to_csv(f'characters_{i}.csv', index=False)
        vso_frames = []
        char_records = []
    print("Done with movie : ", i)

# Whatever is left after the last checkpoint.
char_df = pd.DataFrame(char_records, columns=CHAR_COLUMNS)
final_df = _concat_vso(vso_frames)

# Make sure the output folder exists so a long run cannot fail on the last step.
os.makedirs('Data', exist_ok=True)
char_df.to_csv(os.path.join('Data', 'characters_end.csv'), index=False)
final_df.to_csv(os.path.join('Data', 'Verb_Subject_Object_end.csv'), index=False)

This code snippet was used to create `charA_action_charB.csv` and `graph_df.csv`.

In [None]:
# NOTE(review): `df` is not defined in the visible part of this notebook. It is
# expected to have list-valued "Subject", "Object" and "characters" columns;
# confirm where it is loaded before running this cell on a fresh kernel.
def _mentions_character(row):
    """Return True when the row's Subject or Object list contains one of the row's characters."""
    for column in ("Subject", "Object"):
        words = row[column]
        if isinstance(words, list) and any(word in row["characters"] for word in words):
            return True
    return False


# Keep only the rows that involve at least one character, renumbered from 0.
filtered_df = df[df.apply(_mentions_character, axis=1)].reset_index(drop=True)

def transform_row(row):
    """Turn one verb/subject/object row into a (char A, action, char B) record.

    Parameters
    ----------
    row : pandas.Series
        Must provide "Wikipedia movie ID", "Verb", "Subject", "Object" and
        "characters". "Subject", "Object" and "characters" are used only when
        they are lists; any other value is treated as an empty list.

    Returns
    -------
    pandas.Series
        "Wikipedia movie ID": copied from the row.
        "char A": the distinct subject words that are characters.
        "char B": the distinct object words that are characters.
        "action": the verb followed by the object words, skipping words that
        are in "char A" and stopping at the first other character.
        Both character lists keep first-occurrence order, so the result is
        the same on every run.
    """
    verb = [row["Verb"]] if pd.notna(row["Verb"]) else []
    subject = row["Subject"] if isinstance(row["Subject"], list) else []
    obj = row["Object"] if isinstance(row["Object"], list) else []
    characters = row["characters"] if isinstance(row["characters"], list) else []

    # dict.fromkeys removes duplicates while preserving order (a set would make
    # the order depend on string hashing and differ between runs).
    char_A = list(dict.fromkeys(word for word in subject if word in characters))
    char_B = list(dict.fromkeys(word for word in obj if word in characters))

    subject_chars = set(char_A)
    action = []
    for word in verb + obj:
        if word in subject_chars:
            # The acting character itself is not part of the action.
            continue
        if word in characters:
            # The action ends where the first other character starts.
            break
        action.append(word)

    return pd.Series({"Wikipedia movie ID": row["Wikipedia movie ID"], "char A": char_A, "action": action, "char B": char_B})

transformed_df = filtered_df.apply(transform_row, axis=1)

# Keep rows that have a character on both sides and whose two character lists differ.
has_char_a = transformed_df["char A"].map(len) > 0
has_char_b = transformed_df["char B"].map(len) > 0
sides_differ = transformed_df["char A"] != transformed_df["char B"]
graph_df = transformed_df[has_char_a & has_char_b & sides_differ]

# One row per (char A, char B) pair, then drop the pairs made of the same character.
graph_df = graph_df.explode('char B').explode('char A')
graph_df = graph_df[graph_df['char A'] != graph_df['char B']].reset_index(drop=True)
graph_df['combined_action'] = graph_df['action'].apply(' '.join)

# Build the paths from components (portable separators) and make sure the
# output folder exists before writing.
os.makedirs('Data', exist_ok=True)
transformed_df.to_csv(os.path.join('Data', 'charA_action_charB.csv'), index=False)
graph_df.to_csv(os.path.join('Data', 'graph_df.csv'), index=False)

This code snippet is used to create `final_merged_df.csv`, which is the main dataframe we used for the analysis.

In [None]:
nlp = spacy.load("en_core_web_lg")

# Every distinct word used in an action. Rows with an empty action explode to
# NaN, which must be dropped before calling string methods on the words.
vocab_raw = set(graph_df['combined_action'].str.split().explode().dropna())

# Lower-cased purely alphabetic words; all-uppercase words are left out.
vocab_filtered = {word.lower() for word in vocab_raw if word.isalpha() and not word.isupper()}

# word -> lemma. When spaCy splits a word into several tokens, the lemma of
# the last token wins.
lemmatizer = {word: token.lemma_ for word in vocab_filtered for token in nlp(word)}

# NOTE(review): `vocab` was referenced without ever being defined. The lemmas
# are used here because the topic columns are later matched against
# 'lemmatized_action'; confirm this is the intended vocabulary.
vocab = set(lemmatizer.values())

# lemma -> embedding vector; sorted so the dictionary order is the same on every run.
word_vectors = {word: nlp(word).vector for word in sorted(vocab)}
def replace_chars(row):
    """Replace the two character mentions of a row by full names from the cast list.

    The cast is every non-null ``character_name`` in the module-level
    ``characters_df`` whose "Wikipedia movie ID" equals the row's. A mention is
    resolved to the first cast name that contains it as a substring.

    Parameters
    ----------
    row : pandas.Series
        Must provide "Wikipedia movie ID", "char A" and "char B".

    Returns
    -------
    list
        ``[name_A, name_B]``; a mention that matches no cast name (or a movie
        without cast) is returned unchanged.
    """
    mention_a, mention_b = row['char A'], row['char B']
    same_movie = characters_df['Wikipedia movie ID'] == row['Wikipedia movie ID']
    cast = characters_df.loc[same_movie, 'character_name'].dropna().tolist()

    # The two mentions are tracked independently: a second match for one of
    # them must never be mistaken for a match of the other.
    name_a = name_b = None
    for full_name in cast:
        if name_a is None and mention_a in full_name:
            name_a = full_name
        if name_b is None and mention_b in full_name:
            name_b = full_name
        if name_a is not None and name_b is not None:
            break

    return [
        mention_a if name_a is None else name_a,
        mention_b if name_b is None else name_b,
    ]

# Lemmatize the words of an action
def lemmatize_action(action):
    """Map every word of `action` to its lemma.

    Words are looked up as-is in the module-level `lemmatizer` dictionary;
    a word that is not a key is kept unchanged. Returns a new list.
    """
    lemmas = []
    for word in action:
        lemmas.append(lemmatizer.get(word, word))
    return lemmas

# Apply the lemmatizer function to create a new column 'lemmatized_action'
graph_df['lemmatized_action'] = graph_df['action'].apply(lemmatize_action)

# Resolve both mentions of every row to full character names. replace_chars
# returns a [name_A, name_B] list per row; the lists are unpacked in one go.
resolved_names = graph_df.apply(replace_chars, axis=1)
graph_df[['char A', 'char B']] = pd.DataFrame(resolved_names.tolist(), index=graph_df.index, columns=['char A', 'char B'])

# Movie-level metadata.
graph_df = graph_df.merge(movies, how='left', on='Wikipedia movie ID', suffixes=('_graph', '_movies'))

# Character/actor metadata, once per side. After each merge the freshly added
# character columns get a side suffix so the second merge cannot clash with
# the first.
character_cols = [col for col in characters_df.columns if col != 'Wikipedia movie ID']
for side, role in (('char A', 'charA'), ('char B', 'charB')):
    graph_df = graph_df.merge(
        characters_df,
        how='left',
        left_on=['Wikipedia movie ID', side],
        right_on=['Wikipedia movie ID', 'character_name'],
        suffixes=('', f'_{role}'),
    )
    graph_df = graph_df.rename(columns={col: f'{col}_{role}' for col in character_cols})

# A character name can appear several times in one movie's cast, which
# multiplies rows in the merges above; keep one row per interaction.
graph_df = graph_df.drop_duplicates(['Wikipedia movie ID', 'char A', 'combined_action', 'char B'])

# Portable path, and make sure the output folder exists before writing.
os.makedirs('Data', exist_ok=True)
graph_df.to_csv(os.path.join('Data', 'final_merged_df.csv'), index=False)

# clustering interaactions in different topics

In this section, we cluster the vocabulary of our interactions, using GloVe embeddings through spaCy, to obtain different interaction topics.

We start by defining our topics (the centroids of our clusters), then we compute, for each topic, the set of words whose cosine similarity with the centroid exceeds a fixed threshold (0.5 here).
We then add one binary column per topic, set to 1 for every row whose action contains at least one word of that topic's cluster.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

topics=['fight', 'crime', 'care', 'fear', 'love', 'reject', 'sadness', 'collaborate', 'rivalry', 'mentor']

min_similarity = 0.5 # You can adjust this value based on your needs

# The vocabulary and its vectors do not depend on the topic: build them once.
vocab_words = list(word_vectors.keys())
vocab_vectors = list(word_vectors.values())

for topic in topics:
    centroid_vector = nlp(topic).vector
    # Cosine similarity between the topic vector and every vocabulary word
    # (row 0 of the 1 x len(vocab) result).
    similarity_scores = cosine_similarity([centroid_vector], vocab_vectors)[0]
    # Indices of the words that are strictly above the threshold
    close_words_indices = np.where(similarity_scores > min_similarity)[0]
    close_words_similarity = [(vocab_words[i], similarity_scores[i]) for i in close_words_indices]
    # Sort the words based on similarity, from most similar to least
    close_words_similarity.sort(key=lambda pair: pair[1], reverse=True)
    close_words = [close_word for close_word, _ in close_words_similarity]
    print(f"{len(close_words)} similar to '{topic}': {close_words}")
    # Binary topic column: 1 when the lemmatized action contains a close word.
    graph_df[topic] = graph_df['lemmatized_action'].apply(
        lambda action_words: 1 if any(close_word in action_words for close_word in close_words) else 0
    )

# Graph drawing

In [None]:
# Wikipedia ID of the movie whose interaction graph is drawn.
# Defaults to the first movie of graph_df; set it to any other ID to plot that movie.
movie_id = graph_df['Wikipedia movie ID'].iloc[0]
data = graph_df[graph_df['Wikipedia movie ID'] == movie_id]
plt.figure(figsize=(15, 15))
# Directed multigraph: two characters can be linked by several actions.
G = nx.MultiDiGraph()

def edge_center_coordinates(edge, curvature, pos):
    """Return the (x, y) midpoint of the curved edge between two nodes.

    The edge is modelled as a quadratic Bezier curve from ``pos[edge[0]]`` to
    ``pos[edge[1]]``. Its control point is the midpoint of the two ends moved
    perpendicular to the segment by ``curvature`` times the segment length.

    edge      -- sequence whose first two items are the start and end node
    curvature -- signed bend factor of the edge
    pos       -- mapping node -> (x, y)
    """
    (x0, y0), (x1, y1) = pos[edge[0]], pos[edge[1]]

    # Control point: segment midpoint shifted along the segment's normal.
    ctrl_x = 0.5 * (x0 + x1) + curvature * (y1 - y0)
    ctrl_y = 0.5 * (y0 + y1) - curvature * (x1 - x0)

    def bezier(p_start, p_ctrl, p_end, t=0.5):
        # Quadratic Bezier evaluated at t (t = 0.5 is the middle of the curve).
        return (1 - t)**2 * p_start + 2 * (1 - t) * t * p_ctrl + t**2 * p_end

    return bezier(x0, ctrl_x, x1), bezier(y0, ctrl_y, y1)

# Build the graph: one node per acting character and one edge per action.
# The k-th outgoing edge of a character gets curvature 0.1, 0.3, 0.5, ... so
# that several edges leaving the same node do not overlap.
for source in data["char A"].unique():
    G.add_node(source, color="lightblue", node_size=2000)
    outgoing = data.loc[data["char A"] == source, ["combined_action", "char B"]]
    for k, (action_text, target) in enumerate(outgoing.itertuples(index=False)):
        G.add_edge(source, target, label=action_text, curvature=(2 * k + 1) / 10)

pos = nx.circular_layout(G)

# Nodes first, then their labels.
nx.draw_networkx_nodes(G, pos, node_size=2000, node_color='lightblue')
nx.draw_networkx_labels(G, pos, font_size=10, font_color='black')

# Each edge is drawn on its own so it can carry its own curvature, and its
# action text is placed at the middle of the curve.
for start_node, end_node, attrs in G.edges(data=True):
    action_label = attrs['label']
    bend = attrs.get('curvature', 0.1)  # fall back to 0.1 when no curvature was stored
    nx.draw_networkx_edges(
        G,
        pos,
        edgelist=[(start_node, end_node)],
        connectionstyle=f'arc3,rad={bend}',
        edge_color='black',
        width=2,
        alpha=0.7,
        label=action_label,
        arrows=True,
        arrowsize=50,
        arrowstyle='-|>',
    )
    label_x, label_y = edge_center_coordinates((start_node, end_node), bend, pos)
    plt.text(
        label_x,
        label_y,
        action_label,
        color='red',
        fontsize=8,
        ha='center',
        va='center',
        bbox={'facecolor': 'white', 'alpha': 0.7, 'edgecolor': 'none'},
    )

# Show the plot
plt.show()