In [2]:
import math
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import os
import spacy
import spacy_experimental
from spacy.tokens import Token
from helper import *

# Collect the dependency subtree of a token, in document order
def get_all_children(token):
    """Return the dependency subtree rooted at ``token`` in document order.

    Every token of the subtree contributes itself, except a pronoun
    (``pos_ == "PRON"``) whose custom ``_.ref`` attribute is truthy: it
    contributes that ``ref`` value instead. The substitute keeps the position
    of the pronoun it replaces, so it does not need to be a token itself.

    Parameters
    ----------
    token
        Root of the subtree. Must expose ``pos_``, ``i`` and ``children``;
        pronouns must additionally expose the ``_.ref`` extension.

    Returns
    -------
    list
        The subtree items, ordered by the index ``i`` of the token each item
        stands for.
    """
    def _collect(node, positioned):
        # Record each item together with the index of the token it stands for.
        use_ref = node.pos_ == "PRON" and node._.ref
        positioned.append((node.i, node._.ref if use_ref else node))
        for child in node.children:
            _collect(child, positioned)
        return positioned

    positioned = _collect(token, [])
    # Sort once, and only on the recorded position: the items themselves may
    # be non-token reference values that have no ``i`` attribute.
    positioned.sort(key=lambda pair: pair[0])
    return [item for _, item in positioned]

# Locate the data files relative to the folder above the notebook's folder.
# abspath() resolves "alt2.ipynb" against the current working directory.
parent_folder = os.path.dirname(os.path.dirname(os.path.abspath("alt2.ipynb")))
# Pass each path component separately so the separator is chosen by the OS and
# no backslash sequences end up inside a string literal.
char_data_path = os.path.join(parent_folder, "Data", "character.metadata.tsv")
plot_data_path = os.path.join(parent_folder, "Data", "plot_summaries.txt")

# Build the character metadata dataframe (the file is read without a header
# row, so the positional columns are renamed afterwards).
ind = {
    0: "Wikipedia movie ID",
    1: "Freebase movie ID",
    2: "Movie release date",
    3: "Character name",
    4: "Actor date of birth",
    5: "Actor gender",
    6: "Actor height",
    7: "Actor ethnicity",
    8: "Actor name",
    9: "Actor age at movie release",
    10: "Freebase character/actor map ID",
    11: "Freebase character ID",
    12: "Freebase actor ID",
}
characters_df = pd.read_csv(char_data_path, delimiter="\t", header=None)
characters_df = characters_df.rename(columns=ind)

# Build the plot summaries dataframe, indexed by Wikipedia movie ID.
plots_df = pd.read_csv(plot_data_path, delimiter="\t", header=None)
plots_df = plots_df.rename(columns={0: "Wikipedia movie ID", 1: "Plot Summary"})
plots_df = plots_df.set_index("Wikipedia movie ID")


In [9]:
# Dependency labels harvested from the direct children of every verb.
DEP_COLUMNS = ["nsubj", "nsubjpass", "dobj", "agent", "ccomp"]
# Row positions (into plots_df) of the plot summaries to process.
MOVIE_POSITIONS = [1]

# get_all_children reads token._.ref for pronouns, so the extension must exist
# before any token is inspected; registering it here lets this cell run on a
# fresh kernel.
# NOTE(review): `ref` values are only assigned by a later cell that walks
# doc.spans, so during this pass every pronoun still carries the default None.
if not Token.has_extension("ref"):
    Token.set_extension("ref", default=None)

# Base English pipeline extended with the coreference components.
nlp = spacy.load("en_core_web_md")
nlp_coref = spacy.load("en_coreference_web_trf")
nlp_coref.replace_listeners("transformer", "coref", ["model.tok2vec"])
nlp_coref.replace_listeners("transformer", "span_resolver", ["model.tok2vec"])
nlp.add_pipe("coref", source=nlp_coref)
nlp.add_pipe("span_resolver", source=nlp_coref)

# One record per verb; the frame is built once at the end instead of being
# grown row by row.
records = []
for movie_pos in MOVIE_POSITIONS:
    plot = plots_df.iloc[movie_pos]["Plot Summary"]
    movie_id = plots_df.index[movie_pos]
    doc = nlp(plot)  # `doc` is deliberately left in scope for the cells below
    for sent_num, sent in enumerate(doc.sents, start=1):
        for token in sent:
            if token.pos_ != "VERB":
                continue
            values = {"Wikipedia movie ID": movie_id, "Sentence": sent_num, "Verb": token.lemma_}
            for child in token.children:
                # A later child with the same label overwrites an earlier one.
                if child.dep_ in DEP_COLUMNS:
                    values[child.dep_] = get_all_children(child)
            records.append(values)
    print("Done with movie : ", movie_pos)

sent_df = pd.DataFrame(
    records,
    columns=["Wikipedia movie ID", "Sentence", "Verb", "Subject", "Object"] + DEP_COLUMNS,
)

# Object: direct object, else passive subject, else clausal complement.
sent_df["Object"] = (
    sent_df["dobj"]
    .combine_first(sent_df["nsubjpass"])
    .combine_first(sent_df["ccomp"])
)
# Subject: nominal subject, else the agent of a passive construction.
sent_df["Subject"] = sent_df["nsubj"].combine_first(sent_df["agent"])
sent_df = sent_df.drop(columns=DEP_COLUMNS)

sent_df.to_csv('Verb_Subject_Object.csv', index=False)
sent_df



Done with movie :  1


Unnamed: 0,Wikipedia movie ID,Sentence,Verb,Subject,Object
0,31186339,1,consist,"[The, nation, of, Panem]",
1,31186339,2,provide,"[each, district]","[a, boy, and, girl, ]"
2,31186339,2,select,"[by, lottery, , for, the, annual, Hunger, Games]",
3,31186339,3,fight,"[The, tributes]",
4,31186339,3,reward,,"[the, sole, survivor]"
...,...,...,...,...,...
126,31186339,50,proclaim,,"[the, victors, of, the, 74th, Hunger, Games]"
127,31186339,51,warn,[Haymitch],[Katniss]
128,31186339,51,make,[she],"[powerful, enemies]"
129,31186339,52,lock,,[Crane]


In [67]:
# Print every coreference head cluster stored on the parsed document.
head_cluster_keys = [name for name in doc.spans if name.startswith("coref_head_clusters_")]
for name in head_cluster_keys:
    print(name, doc.spans[name])

coref_head_clusters_1 [her, Everdeen, Her]
coref_head_clusters_2 [sister, her, Katniss, she, Katniss, She, she, Katniss, Katniss, her, Katniss, her, Katniss, Katniss, she, Katniss, Katniss, she, her, She, Katniss, she, Katniss, She, she, Katniss, She, herself, her, Katniss, she, her, her, her, Katniss, Katniss]
coref_head_clusters_3 [District, District]
coref_head_clusters_4 [Mellark, Peeta, Peeta, his, Peeta, he, Peeta, Peeta, Crane, Crane, Crane, Crane]
coref_head_clusters_5 [Capitol, Capitol]
coref_head_clusters_6 [Katniss, their, them, Katniss, Katniss, they, Peeta, they, they, She]
coref_head_clusters_7 [mentor, He, Haymitch]
coref_head_clusters_8 [reveals, it]
coref_head_clusters_9 [Games, Games, Games, Games]
coref_head_clusters_10 [tributes, tributes]
coref_head_clusters_11 [Rue, her, Rue, Rue, Rue, Rue, her, Rue, him, Rue, Rue, Rue, Rue, Rue, Katniss, Peeta, Katniss, Katniss, Katniss, she, Katniss, she, her]
coref_head_clusters_12 [nest, it]
coref_head_clusters_13 [besiegers, 

In [68]:
# For each coreference head cluster, print the distinct texts of the mentions
# whose first token is not a pronoun.
for span_key, cluster in doc.spans.items():
    if not span_key.startswith("coref_head_clusters_"):
        continue
    first_tokens = (doc[mention.start] for mention in cluster)
    cluster_heads = {tok.text for tok in first_tokens if tok.pos_ != "PRON"}
    print(cluster_heads)

{'Everdeen'}
{'Katniss', 'sister'}
{'District'}
{'Crane', 'Peeta', 'Mellark'}
{'Capitol'}
{'Peeta', 'Katniss'}
{'Haymitch', 'mentor'}
{'reveals'}
{'Games'}
{'tributes'}
{'Katniss', 'Peeta', 'Rue'}
{'nest'}
{'besiegers'}
{'alliance'}
{'supplies'}
{'stockpile', 'pile'}
{'spear'}
set()
{'Snow'}
{'district', 'District'}
{'announced'}
{'Peeta'}
{'feast'}
{'Clove'}
{'Thresh'}
{'Foxface'}
{'berries'}
{'pack'}
{'Cato'}


In [66]:
# (Re-)register the custom `ref` token attribute; it defaults to None.
Token.set_extension("ref", default=None, force=True)

for span_key in doc.spans.keys():
    if not span_key.startswith("coref_head_clusters_"):
        continue
    # First token of every mention in this cluster.
    mention_tokens = [doc[mention.start] for mention in doc.spans[span_key]]
    # Texts of the non-pronoun mentions of the cluster.
    cluster_heads = {tok.text for tok in mention_tokens if tok.pos_ != "PRON"}
    # Every pronoun of the cluster points at the same set object. The stored
    # value is a set of strings, not a Token, so consumers must not assume
    # token attributes on it.
    for tok in mention_tokens:
        if tok.pos_ == "PRON":
            tok._.set("ref", cluster_heads)

In [15]:
def remove_stopwords_from_df(df):
    """Drop stop-word tokens from the "Subject" and "Object" columns of ``df``.

    List cells are replaced by a new list holding only the items whose
    ``is_stop`` attribute is falsy; any cell that is not a list becomes
    ``None``. The frame is modified in place and also returned.
    """
    def _without_stopwords(cell):
        # Anything that is not a list (e.g. a missing value) maps to None.
        if not isinstance(cell, list):
            return None
        return list(filter(lambda item: not item.is_stop, cell))

    for column in ("Subject", "Object"):
        df[column] = df[column].apply(_without_stopwords)
    return df

# Strip stop words from Subject/Object (this rewrites sent_df itself), then
# display only the rows in which every column is populated.
stopword_free_df = remove_stopwords_from_df(sent_df)
stopword_free_df.dropna(axis=0, how="any")

Unnamed: 0,Wikipedia movie ID,Sentence,Verb,Subject,Object
2,31186339,2,provide,[district],"[boy, girl, ]"
9,31186339,5,take,"[older, sister, Katniss, volunteers]",[place]
10,31186339,6,give,[],[bread]
14,31186339,8,warn,[],[]
17,31186339,9,reveal,[Peeta],"[love, Katniss]"
21,31186339,10,provide,"["", sponsors, ""]","[-, Games, gifts, food, ,, medicine, ,, tools]"
22,31186339,11,discover,[],"[Peeta, meant, said]"
23,31186339,11,mean,[Peeta],[said]
24,31186339,11,say,[],[]
28,31186339,12,survive,[Katniss],"[televised, Games, begin, half, tributes, kill..."


In [16]:
def get_characters(doc):
    """Return the distinct title-cased PERSON entity texts of ``doc``.

    Parameters
    ----------
    doc
        Parsed document exposing ``ents``; each entity must provide
        ``label_`` and ``text``.

    Returns
    -------
    list of str
        Entity texts labelled "PERSON" for which ``str.istitle()`` is true,
        without duplicates, in order of first appearance.
    """
    person_names = (
        ent.text
        for ent in doc.ents
        if ent.label_ == "PERSON" and ent.text.istitle()
    )
    # dict.fromkeys de-duplicates while keeping first-appearance order, which
    # makes the result reproducible from run to run (a set of str is not).
    return list(dict.fromkeys(person_names))

# Character names found in `doc`, the document left in scope by the
# extraction loop above.
characters = get_characters(doc)


def contains_character(elem, characters):
    """Tell whether a Subject/Object cell mentions one of ``characters``.

    A cell mentions a character when the text of one of its items equals one
    of the whitespace-separated words of a character name. Matching on whole
    words lets a single token such as "Katniss" match the multi-word name
    "Katniss Everdeen", and stops a short name from matching inside a longer,
    unrelated token.

    Parameters
    ----------
    elem
        Cell value. Only lists are inspected; items without a ``text``
        attribute are ignored.
    characters
        Iterable of character names (strings).

    Returns
    -------
    bool
        True if any item of ``elem`` matches, False otherwise (including when
        ``elem`` is not a list).
    """
    if not isinstance(elem, list):
        return False
    # Build the lookup once so each item costs a single set membership test.
    name_words = {word for name in characters for word in name.split()}
    return any(getattr(item, "text", None) in name_words for item in elem)

# Keep the rows whose Subject or Object mentions a known character. The two
# columns are selected by name so the filter does not depend on column order.
mentions_character = sent_df[["Subject", "Object"]].apply(
    lambda row: any(contains_character(cell, characters) for cell in row),
    axis=1,
)
sent_df.loc[mentions_character, ["Sentence", "Verb", "Subject", "Object"]]

Unnamed: 0,Sentence,Verb,Subject,Object
9,5,take,"[older, sister, Katniss, volunteers]",[place]
12,7,take,,"[Katniss, Peeta]"
17,9,reveal,[Peeta],"[love, Katniss]"
22,11,discover,[],"[Peeta, meant, said]"
23,11,mean,[Peeta],[said]
28,12,survive,[Katniss],"[televised, Games, begin, half, tributes, kill..."
29,12,ignore,,"[Haymitch, advice, run, away, melee, tempting,..."
33,13,form,[Peeta],"[uneasy, alliance, Careers]"
34,14,find,[],[Katniss]
37,15,draw,[Rue],[attention]
