In [24]:
import math
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import os
import spacy
from spacy import displacy
from spacy.tokens import Token
from helper import *

#iterate over sentences
def get_all_children(token):
    children = [token]
    for child in token.children:
        children.extend(get_all_children(child))
    children=sorted(children, key=lambda x: x.i)
    return children

#extracting dataframe paths
parent_folder = os.path.dirname(os.path.dirname(os.path.abspath("alt2.ipynb")))
char_data_path= os.path.join(parent_folder, "Data\character.metadata.tsv")
plot_data_path= os.path.join(parent_folder, "Data\plot_summaries.txt")

#building character matadata dataframe
ind={0:"Wikipedia movie ID", 1:"Freebase movie ID", 2:"Movie release date", 3:"Character name", 4:"Actor date of birth", 5:"Actor gender", 6:"Actor height", 7:"Actor ethnicity", 8:"Actor name", 9:"Actor age at movie release", 10:"Freebase character/actor map ID", 11:"Freebase character ID", 12:"Freebase actor ID"}
characters_df= pd.read_csv(char_data_path, delimiter="\t", header=None)
characters_df=characters_df.rename(columns=ind)

#building plot summaries dataframe
plots_df = pd.read_csv(plot_data_path, delimiter='\t', header=None)
plots_df = plots_df.rename(columns={0:"Wikipedia movie ID", 1:"Plot Summary"})
plots_df=plots_df.set_index("Wikipedia movie ID")


In [52]:
nlp = spacy.load("en_core_web_md")
sent_df=pd.DataFrame(columns=["Wikipedia movie ID", "Sentence", "Verb", "Subject", "Object", "nsubj", "nsubjpass", "dobj", "agent", "ccomp"])

for i in range(1):
    i=1
    plot=plots_df.iloc[i]["Plot Summary"]
    id=plots_df.index[i]
    doc= nlp(plot)
    sent_num = 0
    for sent in doc.sents:
        # print the verb and all its children and their dependency relations
        sent_num += 1
        for token in sent:
            if token.pos_ == "VERB":
                # Create a dictionary with the values to be assigned
                values = {"Wikipedia movie ID": id,"Sentence": sent_num, "Verb": token.lemma_}
                for child in token.children:
                    if child.dep_ in sent_df.columns:
                        values[child.dep_] = get_all_children(child)
                # Append the dictionary as a new row to sent_df
                sent_df.loc[len(sent_df)] = values
    print("Done with movie : ", i)

sent_df["Object"]=sent_df["dobj"]
sent_df.drop(columns=["dobj"], inplace=True)
ind_obj = sent_df[sent_df["Object"].isna()].index
sent_df.loc[ind_obj, "Object"] = sent_df.loc[ind_obj, "nsubjpass"]
sent_df.drop(columns=["nsubjpass"], inplace=True)
ind_obj = sent_df[sent_df["Object"].isna()].index
sent_df.loc[ind_obj, "Object"] = sent_df.loc[ind_obj, "ccomp"]
sent_df.drop(columns=["ccomp"], inplace=True)

sent_df["Subject"]=sent_df["nsubj"]
sent_df.drop(columns=["nsubj"], inplace=True)
ind_subj = sent_df[sent_df["Subject"].isna()].index
sent_df.loc[ind_subj, "Subject"] = sent_df.loc[ind_subj, "agent"]
sent_df.drop(columns=["agent"], inplace=True)

sent_df.to_csv('Verb_Subject_Object.csv', index=False)
sent_df

Done with movie :  1


Unnamed: 0,Wikipedia movie ID,Sentence,Verb,Subject,Object
0,31186339,1,consist,"[The, nation, of, Panem]",
1,31186339,2,must,,
2,31186339,2,provide,"[each, district]","[a, boy, and, girl, ]"
3,31186339,2,select,"[by, lottery, ]",
4,31186339,3,must,,
...,...,...,...,...,...
134,31186339,51,warn,[Haymitch],[Katniss]
135,31186339,51,make,[she],"[powerful, enemies]"
136,31186339,52,return,"[She, and, Peeta]",
137,31186339,52,lock,,[Crane]


In [8]:
set_verb=set()
for sent in doc.sents:
    #print the verb and all its children and their dependency relations
    for token in sent:
        if token.pos_=="VERB":
            deps=[child.dep_ for child in token.children]
            set_verb=set_verb.union(set(deps))

In [13]:
sent_df=pd.DataFrame(columns=["Sentence", "Verb"] + list(set_verb))
sent_num = 0
for sent in doc.sents:
    # print the verb and all its children and their dependency relations
    sent_num += 1
    for token in sent:
        if token.pos_ == "VERB":
            # Create a dictionary with the values to be assigned
            values = {"Sentence": sent_num, "Verb": token}
            for child in token.children:
                if child.dep_ in sent_df.columns:
                    values[child.dep_] = get_all_children(child)
            # Append the dictionary as a new row to sent_df
            sent_df.loc[len(sent_df)] = values
sent_df

Unnamed: 0,Sentence,Verb,dobj,nsubj,aux,mark,compound,nsubjpass,conj,auxpass,...,xcomp,prt,acomp,punct,cc,prep,agent,advcl,advmod,oprd
0,1,consists,,"[The, nation, of, Panem]",,,,,,,...,,,,[.],,"[of, a, wealthy, Capitol, and, twelve, poorer,...",,,,
1,2,must,,,,,,,,,...,,,,,,,,,,
2,2,provide,"[a, boy, and, girl, ]","[each, district]",[must],,,,,,...,,,,[.],,"[between, the, ages, of, 12, and, 18, selected...",,,,
3,2,selected,,,,,,,,,...,,,,,,"[for, the, annual, Hunger, Games]","[by, lottery, ]",,,
4,3,must,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,51,warns,[Katniss],[Haymitch],,,,,,,...,,,,[.],,,,,,
135,51,made,"[powerful, enemies]",[she],[has],[that],,,,,...,,,,,,"[after, her, display, of, defiance]",,,,
136,52,return,,"[She, and, Peeta]",,,,,"[President, Snow, considers, the, situation, .]",,...,,,,"[,]",[and],"[to, District, 12]",,"[while, Crane, is, locked, in, a, room, with, ...",,
137,52,locked,,,,[while],,[Crane],,[is],...,,,,,,"[in, a, room, with, a, bowl, of, nightlock, be...",,,,


In [12]:
#create an empty dataframe, with columns : Sentence, Subject, Verb, Object, Place, Time
sent_df[sent_df["ccomp"].notna()].dropna(axis=1, how="all")

Unnamed: 0,Sentence,Verb,nsubj,nsubjpass,dobj,ccomp
6,3,rewarded,,"[the, sole, survivor]",,"[The, tributes, must, fight, to, the, death, i..."
18,10,believing,,,,"[it, to, be, a, ploy, to, gain, audience, supp..."
22,11,discovers,[she],,,"[Peeta, meant, what, he, said]"
23,11,meant,[Peeta],,,"[what, he, said]"
28,12,survives,[Katniss],,,"[The, televised, Games, begin, with, half, of,..."
59,24,finds,[She],,,"[Rue, trapped, and, releases, her]"
64,25,causing,,,,"[it, to, stab, Rue, in, the, stomach, instead]"
66,26,shoots,[Katniss],,[him],[dead]
79,31,make,,,,"[a, rule, change, to, avoid, inciting, further..."
82,32,announced,,[It],,"[that, tributes, from, the, same, district, ca..."


In [53]:
def remove_stopwords_from_df(df):
    # Define a function to remove stop words from a list of tokens
    def remove_stopwords(tokens):
        if isinstance(tokens, list):
            return [token for token in tokens if not token.is_stop]
        else:
            return None
    # Apply the function to the "Subject" and "Object" columns
    df['Subject'] = df['Subject'].apply(remove_stopwords)
    df['Object'] = df['Object'].apply(remove_stopwords)
    return df

#sent_df[["Sentence", "Verb", "Subject", "Object"]].dropna(axis=0, how="any").apply(lambda x: stop_words_deletion(x) if x.name in ['Subject', 'Object'] else x, axis=1)
remove_stopwords_from_df(sent_df).dropna(axis=0, how="any")

Unnamed: 0,Wikipedia movie ID,Sentence,Verb,Subject,Object
2,31186339,2,provide,[district],"[boy, girl, ]"
9,31186339,5,take,"[older, sister, Katniss, volunteers]",[place]
10,31186339,6,give,[],[bread]
14,31186339,8,warn,[],[]
17,31186339,9,reveal,[Peeta],"[love, Katniss]"
21,31186339,10,provide,"["", sponsors, ""]","[-, Games, gifts, food, ,, medicine, ,, tools]"
22,31186339,11,discover,[],"[Peeta, meant, said]"
23,31186339,11,mean,[Peeta],[said]
24,31186339,11,say,[],[]
28,31186339,12,survive,[Katniss],"[televised, Games, begin, half, tributes, kill..."


In [51]:
from spacy import displacy

displacy.render(doc, style="ent")

In [49]:
sent_df.head(20)

Unnamed: 0,Wikipedia movie ID,Sentence,Verb,Subject,Object
0,31186339,1,consists,"[The, nation, of, Panem]",
1,31186339,2,must,,
2,31186339,2,provide,"[each, district]","[a, boy, and, girl, ]"
3,31186339,2,selected,"[by, lottery, ]",
4,31186339,3,must,,
5,31186339,3,fight,"[The, tributes]",
6,31186339,3,rewarded,,"[the, sole, survivor]"
7,31186339,4,chosen,,
8,31186339,5,volunteers,,
9,31186339,5,take,"[Her, older, sister, Katniss, volunteers]","[her, place]"


In [None]:
def get_characters(doc):
    characters = []
    for ent in doc.ents:
        if ent.label_ == "PERSON" and ent.text.istitle():
            characters.append(ent.text)
    return list(set(characters))

characters = get_characters(doc)


def contains_character(elem, characters):
    if isinstance(elem, list):
        return any(any(char in token.text for char in characters) for token in elem)
    else:
        return False

# Apply the function to each row
sent_df[sent_df.apply(lambda row: any(contains_character(elem, characters) for elem in row[2:]), axis=1)][["Sentence", "Verb", "Subject", "Object"]]

NameError: name 'doc' is not defined

In [34]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Panem 14 19 ORG
Capitol 42 49 ORG
twelve 54 60 CARDINAL
between the ages of 12 and 18 158 187 DATE
the annual Hunger Games 213 236 EVENT
Reaping 353 360 PERSON
12-year-old 362 373 DATE
Primrose Everdeen 374 391 FAC
12 416 418 CARDINAL
Katniss 437 444 PERSON
Peeta Mellark 475 488 PERSON
baker 492 497 PERSON
Katniss 518 525 PERSON
District 12 568 579 LOC
Katniss 589 596 PERSON
Peeta 601 606 PERSON
Capitol 624 631 FAC
Haymitch Abernathy 691 709 PERSON
Caesar Flickerman 846 863 PERSON
Peeta 865 870 PERSON
Katniss 905 912 PERSON
Peeta 1077 1082 PERSON
half 1134 1138 CARDINAL
the first few minutes 1165 1186 TIME
Katniss 1188 1195 PERSON
Haymitch 1221 1229 PERSON
four 1399 1403 CARDINAL
Katniss 1429 1436 PERSON
Katniss 1571 1578 PERSON
Glimmer 1644 1651 PERSON
Katniss 1726 1733 PERSON
Katniss 1780 1787 PERSON
a couple of days 1792 1808 DATE
Katniss 1896 1903 PERSON
Rue 1908 1911 FAC
Cato 2008 2012 PERSON
Katniss 2052 2059 PERSON
Rue 2091 2094 FAC
Rue 2123 2126 FAC
District 1 2176 2186 LOC
Kat