In [6]:
import math
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import os
import spacy
import spacy_experimental
from spacy.tokens import Token
from helper import *

def get_all_children(token):
    """Return `token` and all of its descendants, ordered by their `.i` attribute.

    The subtree is walked through each node's `.children`; the collected nodes
    (the starting token included) are returned as a list sorted on `.i`.
    """
    def _walk(node):
        # Pre-order: the node itself first, then each child's subtree in turn.
        yield node
        for descendant in node.children:
            yield from _walk(descendant)

    # A single stable sort over the pre-order sequence.
    return sorted(_walk(token), key=lambda node: node.i)

# Function to replace tokens with their references
def replace_tokens_with_refs(tokens):
    """Convert a list of tokens to text, substituting resolved pronouns.

    Parameters
    ----------
    tokens : list or any
        A list of token-like objects exposing `.text`, `.pos_` and a `._`
        extension namespace. Items that are already plain strings are kept
        as they are. Any non-list value (e.g. NaN for a missing cell) is
        returned unchanged.

    Returns
    -------
    list
        One entry per input token, except that a pronoun (`pos_ == "PRON"`)
        carrying a non-empty `._.ref` is replaced by the entries of that
        reference. A pronoun without a usable reference keeps its own text
        instead of disappearing from the result.
    """
    if not isinstance(tokens, list):
        return tokens

    updated_tokens = []
    for token in tokens:
        if isinstance(token, str):
            # Already converted (e.g. the function was applied twice).
            updated_tokens.append(token)
            continue

        ref = getattr(token._, "ref", None) if token.pos_ == "PRON" else None
        if isinstance(ref, str):
            # A bare string must not be exploded into single characters.
            ref = [ref]
        elif isinstance(ref, (set, frozenset)):
            # Sets have no stable order; sort so repeated runs agree.
            ref = sorted(ref, key=str)

        if ref:
            updated_tokens.extend(ref)
        else:
            # No reference, or an empty one: keep the original word.
            updated_tokens.append(token.text)
    return updated_tokens

# Extracting dataframe paths.
# abspath() resolves "alt2.ipynb" against the current working directory, so
# parent_folder is the parent of the directory the kernel was started in.
parent_folder = os.path.dirname(os.path.dirname(os.path.abspath("alt2.ipynb")))
# Each path component is passed separately: a literal "Data\..." contains
# invalid escape sequences (\c, \p) and only forms a directory path on Windows.
char_data_path = os.path.join(parent_folder, "Data", "character.metadata.tsv")
plot_data_path = os.path.join(parent_folder, "Data", "plot_summaries.txt")

# Building character metadata dataframe.
# The file is read without a header row; `ind` maps column positions to names.
ind = {
    0: "Wikipedia movie ID",
    1: "Freebase movie ID",
    2: "Movie release date",
    3: "Character name",
    4: "Actor date of birth",
    5: "Actor gender",
    6: "Actor height",
    7: "Actor ethnicity",
    8: "Actor name",
    9: "Actor age at movie release",
    10: "Freebase character/actor map ID",
    11: "Freebase character ID",
    12: "Freebase actor ID",
}
characters_df = pd.read_csv(char_data_path, delimiter="\t", header=None)
characters_df = characters_df.rename(columns=ind)

# Building plot summaries dataframe, indexed by Wikipedia movie ID.
plots_df = pd.read_csv(plot_data_path, delimiter='\t', header=None)
plots_df = plots_df.rename(columns={0: "Wikipedia movie ID", 1: "Plot Summary"})
plots_df = plots_df.set_index("Wikipedia movie ID")


In [19]:
# Dependency labels recorded for every verb (one column each while building).
DEP_COLUMNS = ["nsubj", "nsubjpass", "dobj", "agent", "ccomp"]
SENT_COLUMNS = ["Wikipedia movie ID", "Sentence", "Verb", "Subject", "Object"] + DEP_COLUMNS
# Positional indices into plots_df of the movies to process.
MOVIE_INDICES = [1]

nlp = spacy.load("en_core_web_md")

# Rows are collected as dicts and turned into a frame once, instead of growing
# the DataFrame one row at a time.
records = []
for movie_idx in MOVIE_INDICES:
    plot = plots_df.iloc[movie_idx]["Plot Summary"]
    movie_id = plots_df.index[movie_idx]
    charSex = (
        characters_df
        .loc[characters_df["Wikipedia movie ID"] == movie_id, ["Character name", "Actor gender"]]
        .set_index("Character name")
    )
    doc = nlp(plot)
    # clean_character_list / replace_pronouns are not defined in this notebook;
    # presumably they come from `helper` (star import) - verify there.
    characters = clean_character_list(get_characters(doc), doc)
    doc = replace_pronouns(doc, charSex, characters)
    for sent_num, sent in enumerate(doc.sents, start=1):
        for token in sent:
            if token.pos_ != "VERB":
                continue
            # One row per verb: its lemma plus the full subtree of each child
            # whose dependency label is one of DEP_COLUMNS.
            values = {"Wikipedia movie ID": movie_id, "Sentence": sent_num, "Verb": token.lemma_}
            for child in token.children:
                if child.dep_ in DEP_COLUMNS:
                    values[child.dep_] = get_all_children(child)
            records.append(values)
    print("Done with movie : ", movie_idx)

# dtype=object keeps list-valued cells and missing values in the same columns.
sent_df = pd.DataFrame(records, columns=SENT_COLUMNS, dtype=object)

# Object: direct object, else passive subject, else clausal complement.
sent_df["Object"] = sent_df["dobj"]
for fallback in ("nsubjpass", "ccomp"):
    missing = sent_df["Object"].isna()
    sent_df.loc[missing, "Object"] = sent_df.loc[missing, fallback]

# Subject: nominal subject, else the agent.
sent_df["Subject"] = sent_df["nsubj"]
missing = sent_df["Subject"].isna()
sent_df.loc[missing, "Subject"] = sent_df.loc[missing, "agent"]

sent_df = sent_df.drop(columns=DEP_COLUMNS)

# Convert token lists to text, substituting pronouns that carry a reference.
sent_df['Subject'] = sent_df['Subject'].apply(replace_tokens_with_refs)
sent_df['Object'] = sent_df['Object'].apply(replace_tokens_with_refs)

sent_df.to_csv('Verb_Subject_Object.csv', index=False)
sent_df



Done with movie :  1


Unnamed: 0,Wikipedia movie ID,Sentence,Verb,Subject,Object
0,31186339,1,consist,"[The, nation, of, Panem]",
1,31186339,2,provide,"[each, district]","[a, boy, and, girl, ]"
2,31186339,2,select,"[by, lottery, , for, the, annual, Hunger, Games]",
3,31186339,3,fight,"[The, tributes]",
4,31186339,3,reward,,"[the, sole, survivor]"
...,...,...,...,...,...
126,31186339,50,proclaim,,"[the, victors, of, the, 74th, Hunger, Games]"
127,31186339,51,warn,[Haymitch],[Katniss]
128,31186339,51,make,[Katniss],"[powerful, enemies]"
129,31186339,52,lock,,[Crane]


In [14]:
# Show every span group of `doc` whose key marks it as a coref head cluster.
coref_keys = [name for name in doc.spans if name.startswith("coref_head_clusters_")]
for key in coref_keys:
    print(key, doc.spans[key])

coref_head_clusters_1 [her, Everdeen, Her]
coref_head_clusters_2 [sister, her, Katniss, she, Katniss, She, she, Katniss, Katniss, her, Katniss, her, Katniss, Katniss, she, Katniss, Katniss, she, her, She, Katniss, she, Katniss, She, she, Katniss, She, herself, her, Katniss, she, her, her, her, Katniss, Katniss]
coref_head_clusters_3 [District, District]
coref_head_clusters_4 [Mellark, Peeta, Peeta, his, Peeta, he, Peeta, Peeta, Crane, Crane, Crane, Crane]
coref_head_clusters_5 [Capitol, Capitol]
coref_head_clusters_6 [Katniss, their, them, Katniss, Katniss, they, Peeta, they, they, She]
coref_head_clusters_7 [mentor, He, Haymitch]
coref_head_clusters_8 [reveals, it]
coref_head_clusters_9 [Games, Games, Games, Games]
coref_head_clusters_10 [tributes, tributes]
coref_head_clusters_11 [Rue, her, Rue, Rue, Rue, Rue, her, Rue, him, Rue, Rue, Rue, Rue, Rue, Katniss, Peeta, Katniss, Katniss, Katniss, she, Katniss, she, her]
coref_head_clusters_12 [nest, it]
coref_head_clusters_13 [besiegers, 

In [20]:
# For each coref head cluster, print the set of texts of the mentions whose
# first token is tagged as a proper noun (PROPN).
for key in doc.spans.keys():
    if not key.startswith("coref_head_clusters_"):
        continue
    cluster_heads = {
        doc[mention.start].text
        for mention in doc.spans[key]
        if doc[mention.start].pos_ == "PROPN"
    }
    print(cluster_heads)

{'Everdeen'}
{'Katniss'}
{'District'}
{'Mellark', 'Crane', 'Peeta'}
{'Capitol'}
{'Peeta', 'Katniss'}
{'Haymitch'}
set()
{'Games'}
set()
{'Rue', 'Peeta', 'Katniss'}
set()
set()
set()
set()
set()
set()
set()
{'Snow'}
{'District'}
set()
{'Peeta'}
set()
{'Clove'}
{'Thresh'}
set()
set()
set()
{'Cato'}


In [77]:
# Register (or re-register) the custom `ref` attribute; it defaults to None.
Token.set_extension("ref", default=None, force=True)

# For every coref head cluster, attach the texts of its non-pronoun mentions
# to each pronoun mention of the same cluster.
for key in doc.spans.keys():
    if not key.startswith("coref_head_clusters_"):
        continue
    mention_tokens = [doc[mention.start] for mention in doc.spans[key]]
    cluster_heads = {tok.text for tok in mention_tokens if tok.pos_ != "PRON"}
    # A cluster made only of pronouns has nothing to resolve to. Store None
    # rather than an empty set, so code that checks `ref is not None` does not
    # treat the pronoun as resolved to nothing.
    ref = cluster_heads or None
    for tok in mention_tokens:
        if tok.pos_ == "PRON":
            tok._.set("ref", ref)

In [17]:
def remove_stopwords_from_df(df):
    """Strip stop words from the "Subject" and "Object" columns of `df`.

    Both columns of `df` are overwritten in place. The returned frame is `df`
    with every row containing a missing value removed.
    """
    def remove_stopwords(tokens_or_strings):
        # Anything that is not a list becomes None (and is dropped later).
        if not isinstance(tokens_or_strings, list):
            return None
        kept = []
        for entry in tokens_or_strings:
            if isinstance(entry, spacy.tokens.Token):
                # spaCy token: keep its text unless it is a stop word.
                if not entry.is_stop:
                    kept.append(entry.text)
            elif isinstance(entry, str):
                # Plain string: run it through `nlp` and keep the non-stop pieces.
                kept += [piece.text for piece in nlp(entry) if not piece.is_stop]
            else:
                # Unknown item type: pass through untouched.
                kept.append(entry)
        return kept

    for column in ("Subject", "Object"):
        df[column] = df[column].apply(remove_stopwords)
    return df.dropna(axis=0, how="any")

# remove_stopwords_from_df already returns the frame without incomplete rows,
# so its result is displayed directly. Note that it also rewrites the
# "Subject"/"Object" columns of sent_df in place.
remove_stopwords_from_df(sent_df)

Unnamed: 0,Wikipedia movie ID,Sentence,Verb,Subject,Object
1,31186339,2,provide,[district],"[boy, girl, ]"
7,31186339,5,take,"[older, sister, Katniss, volunteers]","[Katniss, place]"
8,31186339,6,give,[Peeta],[bread]
12,31186339,8,warn,[Peeta],"[Katniss, Peeta]"
15,31186339,9,reveal,[Peeta],"[Peeta, love, Katniss]"
18,31186339,10,provide,"["", sponsors, ""]","[-, Games, gifts, food, ,, medicine, ,, tools]"
19,31186339,11,discover,[Katniss],"[Peeta, meant, Peeta, said]"
20,31186339,11,mean,[Peeta],"[Peeta, said]"
21,31186339,11,say,[Peeta],[]
25,31186339,12,survive,[Katniss],"[televised, Games, begin, half, tributes, kill..."


In [5]:
def get_characters(doc):
    """Return the distinct texts of the title-cased PERSON entities in `doc`.

    An entity of `doc.ents` is kept when its label is "PERSON" and its text
    passes `str.istitle()`. Duplicates are removed; the order of the returned
    list is not defined.
    """
    unique_names = {
        entity.text
        for entity in doc.ents
        if entity.label_ == "PERSON" and entity.text.istitle()
    }
    return list(unique_names)

# Distinct title-cased PERSON entity texts found in `doc`.
# NOTE(review): `doc` is not created in this cell; it must already exist in the
# kernel. This assignment also replaces any earlier value of `characters`.
characters = get_characters(doc)


def contains_character(elem, characters):
    """Tell whether a list of tokens/strings mentions one of `characters`.

    Parameters
    ----------
    elem : list or any
        A list whose items are strings or objects exposing `.text`. Any
        non-list value (e.g. NaN, a plain string) yields False.
    characters : container of str
        Names to look for; membership is tested with `in`.

    Returns
    -------
    bool
        True if at least one item's text is in `characters`, otherwise False.
        A list without a match returns False rather than None.
    """
    if not isinstance(elem, list):
        return False
    for token in elem:
        # Items that are neither strings nor token-like are skipped, not fatal.
        text = token if isinstance(token, str) else getattr(token, "text", None)
        if text is not None and text in characters:
            return True
    return False

# Keep the rows whose Subject or Object mentions a known character.
# The two columns are named explicitly instead of slicing the row by position,
# so the filter does not depend on the column order of sent_df.
row_mentions_character = [
    bool(contains_character(subject, characters) or contains_character(obj, characters))
    for subject, obj in zip(sent_df["Subject"], sent_df["Object"])
]
sent_df.loc[row_mentions_character, ["Sentence", "Verb", "Subject", "Object"]]

Unnamed: 0,Sentence,Verb,Subject,Object
7,5,take,"[Her, older, sister, Katniss, volunteers]","[Katniss, place]"
8,6,give,[Peeta],[bread]
9,6,starve,[Katniss],
10,7,take,,"[Katniss, and, Peeta]"
11,7,accompany,"[by, Katniss, Peeta, frequently, drunk, mentor...",
...,...,...,...,...
124,49,give,[Katniss],"[half, of, the, nightlock]"
125,50,commit,"[Peeta, Katniss]",[suicide]
127,51,warn,[Haymitch],[Katniss]
128,51,make,[Katniss],"[powerful, enemies]"
