In [4]:
import math
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import os
import spacy
from spacy import displacy
from spacy.tokens import Token
from helper import *

In [5]:
#extracting dataframe paths
parent_folder = os.path.dirname(os.path.dirname(os.path.abspath("alt2.ipynb")))
char_data_path= os.path.join(parent_folder, "Data\character.metadata.tsv")
plot_data_path= os.path.join(parent_folder, "Data\plot_summaries.txt")

#building character matadata dataframe
ind={0:"Wikipedia movie ID", 1:"Freebase movie ID", 2:"Movie release date", 3:"Character name", 4:"Actor date of birth", 5:"Actor gender", 6:"Actor height", 7:"Actor ethnicity", 8:"Actor name", 9:"Actor age at movie release", 10:"Freebase character/actor map ID", 11:"Freebase character ID", 12:"Freebase actor ID"}
characters_df= pd.read_csv(char_data_path, delimiter="\t", header=None)
characters_df=characters_df.rename(columns=ind)

#building plot summaries dataframe
plots_df = pd.read_csv(plot_data_path, delimiter='\t', header=None)
plots_df = plots_df.rename(columns={0:"Wikipedia movie ID", 1:"Plot Summary"})
plots_df=plots_df.set_index("Wikipedia movie ID")

In [6]:
nlp = spacy.load("en_core_web_md")
i=1
plot=plots_df.loc[plots_df.index[i]]["Plot Summary"]

doc= nlp(plot)

displacy.render(list(doc.sents)[11])

In [7]:
#iterate over sentences
def get_all_children(token):
    children = [token]
    for child in token.children:
        children.extend(get_all_children(child))
    children=sorted(children, key=lambda x: x.i)
    return children

for sent in doc.sents:
    #print the verb and all its children and their dependency relations
    for token in sent:
        if token.pos_=="VERB":
            for child in token.children:
                if child.dep_!="punct":
                    print(token.text, child.dep_, ":", get_all_children(child))

consists nsubj : [The, nation, of, Panem]
consists prep : [of, a, wealthy, Capitol, and, twelve, poorer, districts]
provide prep : [As, punishment, for, a, past, rebellion]
provide nsubj : [each, district]
provide aux : [must]
provide dobj : [a, boy, and, girl,  ]
provide prep : [between, the, ages, of, 12, and, 18, selected, by, lottery,  , for, the, annual, Hunger, Games]
selected agent : [by, lottery,  ]
selected prep : [for, the, annual, Hunger, Games]
fight nsubj : [The, tributes]
fight aux : [must]
fight prep : [to, the, death]
fight prep : [in, an, arena]
rewarded ccomp : [The, tributes, must, fight, to, the, death, in, an, arena]
rewarded nsubjpass : [the, sole, survivor]
rewarded auxpass : [is]
rewarded prep : [with, fame, and, wealth]
chosen prep : [In, her, first, Reaping, ,, 12-year, -, old, Primrose, Everdeen]
chosen auxpass : [is]
chosen prep : [from, District, 12]
volunteers compound : [Katniss]
take nsubj : [Her, older, sister, Katniss, volunteers]
take aux : [to]
take 

In [8]:
#iterate over sentences
def get_all_children(token):
    children = [token]
    for child in token.children:
        children.extend(get_all_children(child))
    children=sorted(children, key=lambda x: x.i)
    return children

set_verb=set()
for sent in doc.sents:
    #print the verb and all its children and their dependency relations
    for token in sent:
        if token.pos_=="VERB":
            deps=[child.dep_ for child in token.children]
            set_verb=set_verb.union(set(deps))

In [9]:
sent_df=pd.DataFrame(columns=["Sentence", "Verb"] + list(set_verb))
sent_num = 0
for sent in doc.sents:
    # print the verb and all its children and their dependency relations
    sent_num += 1
    for token in sent:
        if token.pos_ == "VERB":
            # Create a dictionary with the values to be assigned
            values = {"Sentence": sent_num, "Verb": token}
            for child in token.children:
                values[child.dep_] = get_all_children(child)
            # Append the dictionary as a new row to sent_df
            sent_df.loc[len(sent_df)] = values
sent_df

Unnamed: 0,Sentence,Verb,advmod,advcl,acomp,dative,neg,agent,dobj,prt,...,auxpass,mark,prep,nsubjpass,conj,ccomp,aux,compound,cc,xcomp
0,1,consists,,,,,,,,,...,,,"[of, a, wealthy, Capitol, and, twelve, poorer,...",,,,,,,
1,2,must,,,,,,,,,...,,,,,,,,,,
2,2,provide,,,,,,,"[a, boy, and, girl, ]",,...,,,"[between, the, ages, of, 12, and, 18, selected...",,,,[must],,,
3,2,selected,,,,,,"[by, lottery, ]",,,...,,,"[for, the, annual, Hunger, Games]",,,,,,,
4,3,must,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,51,warns,,,,,,,[Katniss],,...,,,,,,"[that, she, has, made, powerful, enemies, afte...",,,,
135,51,made,,,,,,,"[powerful, enemies]",,...,,[that],"[after, her, display, of, defiance]",,,,[has],,,
136,52,return,,"[while, Crane, is, locked, in, a, room, with, ...",,,,,,,...,,,"[to, District, 12]",,"[President, Snow, considers, the, situation, .]",,,,[and],
137,52,locked,,,,,,,,,...,[is],[while],"[in, a, room, with, a, bowl, of, nightlock, be...",[Crane],,,,,,


In [22]:
def process_df(df):
    # drop the punct column if present
    if "punct" in df.columns:
        df = df.drop(columns="punct")
    
    # drop aux column if present
    if "aux" in df.columns:
        df = df.drop(columns="aux")
    
    # drop auxpass column if present
    if "auxpass" in df.columns:
        df = df.drop(columns="auxpass")
    
    # drop mark column if present
    if "mark" in df.columns:
        df = df.drop(columns="mark")
    
    # drop advmod column if present
    if "advmod" in df.columns:
        df = df.drop(columns="advmod")
    
    # drop cc column if present
    if "cc" in df.columns:
        df = df.drop(columns="cc")
    
    # drop npadvmod column if present
    if "npadvmod" in df.columns:
        df = df.drop(columns="npadvmod")
    
    # drop oprd column if present
    if "oprd" in df.columns:
        df = df.drop(columns="oprd")
    
    return df

temp_df=process_df(sent_df)
temp_df

Unnamed: 0,Sentence,Verb,advcl,acomp,dative,neg,agent,dobj,prt,nsubj,prep,nsubjpass,conj,ccomp,compound,xcomp
0,1,consists,,,,,,,,"[The, nation, of, Panem]","[of, a, wealthy, Capitol, and, twelve, poorer,...",,,,,
1,2,must,,,,,,,,,,,,,,
2,2,provide,,,,,,"[a, boy, and, girl, ]",,"[each, district]","[between, the, ages, of, 12, and, 18, selected...",,,,,
3,2,selected,,,,,"[by, lottery, ]",,,,"[for, the, annual, Hunger, Games]",,,,,
4,3,must,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,51,warns,,,,,,[Katniss],,[Haymitch],,,,"[that, she, has, made, powerful, enemies, afte...",,
135,51,made,,,,,,"[powerful, enemies]",,[she],"[after, her, display, of, defiance]",,,,,
136,52,return,"[while, Crane, is, locked, in, a, room, with, ...",,,,,,,"[She, and, Peeta]","[to, District, 12]",,"[President, Snow, considers, the, situation, .]",,,
137,52,locked,,,,,,,,,"[in, a, room, with, a, bowl, of, nightlock, be...",[Crane],,,,


In [23]:
#create an empty dataframe, with columns : Sentence, Subject, Verb, Object, Place, Time
temp_df[temp_df["nsubj"].isna()].dropna(axis=1, how="all")

Unnamed: 0,Sentence,Verb,advcl,dative,neg,agent,dobj,prt,prep,nsubjpass,ccomp,compound,xcomp
1,2,must,,,,,,,,,,,
3,2,selected,,,,"[by, lottery, ]",,,"[for, the, annual, Hunger, Games]",,,,
4,3,must,,,,,,,,,,,
6,3,rewarded,,,,,,,"[with, fame, and, wealth]","[the, sole, survivor]","[The, tributes, must, fight, to, the, death, i...",,
7,4,chosen,,,,,,,"[from, District, 12]",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,47,revoked,,,,,,,,"[two, winners]",,,
129,48,shoot,,,,,[him],,,,,,
131,50,can,,,,,,,,,,,
133,50,proclaimed,"[before, they, can, commit, suicide]",,,,"[the, victors, of, the, 74th, Hunger, Games]",,,[they],,,


In [25]:
temp_df["Subject"]=temp_df["nsubj"]
temp_df["Object"]=temp_df["dobj"]
ind_subj = temp_df[temp_df["Subject"].isna()].index
ind_obj = temp_df[temp_df["Object"].isna()].index
temp_df.loc[ind_obj, "Object"] = temp_df.loc[ind_obj, "nsubjpass"]
temp_df.loc[ind_subj, "Subject"] = temp_df.loc[ind_subj, "agent"]
temp_df[["Sentence", "Verb", "Subject", "Object"]].dropna(axis=0, how="any")

Unnamed: 0,Sentence,Verb,Subject,Object
2,2,provide,"[each, district]","[a, boy, and, girl, ]"
9,5,take,"[Her, older, sister, Katniss, volunteers]","[her, place]"
10,6,gave,[who],[bread]
14,8,warns,[He],[them]
17,9,reveals,[Peeta],"[his, love, for, Katniss]"
21,10,provide,"["", sponsors, ""]","[in, -, Games, gifts, of, food, ,, medicine, ,..."
24,11,said,[he],[what]
33,13,forms,[Peeta],"[an, uneasy, alliance, with, the, four, Careers]"
34,14,find,[They],[Katniss]
37,15,draws,[Rue],"[her, attention]"


In [None]:
def get_characters(doc):
    characters = []
    for ent in doc.ents:
        if ent.label_ == "PERSON" and ent.text.istitle():
            characters.append(ent.text)
    return list(set(characters))

characters = get_characters(doc)


def contains_character(elem, characters):
    if isinstance(elem, list):
        return any(any(char in token.text for char in characters) for token in elem)
    else:
        return False

# Apply the function to each row
fsent_df[sent_df.apply(lambda row: any(contains_character(elem, characters) for elem in row[2:]), axis=1)]

Unnamed: 0,Sentence,Verb,ccomp,advmod,punct,prep,npadvmod,acomp,xcomp,dative,...,nsubj,agent,nsubjpass,aux,neg,cc,compound,oprd,advcl,dobj
8,5,volunteers,,,,,,,,,...,,,,,,,[Katniss],,,
10,6,gave,,[once],,,,,,,...,[who],,,,,,,,"[when, she, was, starving]","[Katniss, bread]"
12,7,taken,,,[.],"[to, the, Capitol]",,,,,...,,,"[Katniss, and, Peeta]",,,,,,"[accompanied, by, their, frequently, drunk, me...",
13,7,accompanied,,,,"[past, victor, Haymitch, Abernathy]",,,,,...,,"[by, their, frequently, drunk, mentor, ,]",,,,,,,,
17,9,reveals,,[unexpectedly],[.],"[During, a, TV, interview, with, Caesar, Flick...",,,,,...,[Peeta],,,,,,,,,"[his, love, for, Katniss]"
22,11,discovers,"[Peeta, meant, what, he, said]",[However],[.],,,,,,...,[she],,,,,,,,,
23,11,meant,"[what, he, said]",,,,,,,,...,[Peeta],,,,,,,,,
28,12,survives,"[The, televised, Games, begin, with, half, of,...",[barely],[.],,,,"[ignoring, Haymitch, 's, advice, to, run, away...",,...,[Katniss],,,,,,,,,
29,12,ignoring,,,,,,,,,...,,,,,,,,,,"[Haymitch, 's, advice, to, run, away, from, th..."
33,13,forms,,,[.],,,,,,...,[Peeta],,,,,,,,,"[an, uneasy, alliance, with, the, four, Careers]"


In [None]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Panem ORG
Capitol ORG
twelve CARDINAL
between the ages of 12 and 18 DATE
the annual Hunger Games EVENT
Reaping PERSON
12-year-old DATE
Primrose Everdeen FAC
12 CARDINAL
Katniss PERSON
Peeta Mellark PERSON
baker PERSON
Katniss PERSON
District 12 LOC
Katniss PERSON
Peeta PERSON
Capitol FAC
Haymitch Abernathy PERSON
Caesar Flickerman PERSON
Peeta PERSON
Katniss PERSON
Peeta PERSON
half CARDINAL
the first few minutes TIME
Katniss PERSON
Haymitch PERSON
four CARDINAL
Katniss PERSON
Katniss PERSON
Glimmer PERSON
Katniss PERSON
Katniss PERSON
a couple of days DATE
Katniss PERSON
Rue FAC
Cato PERSON
Katniss PERSON
Rue FAC
Rue FAC
District 1 LOC
Katniss PERSON
Rue FAC
Katniss PERSON
Rue FAC
Rue PERSON
Rue's FAC
11 CARDINAL
Seneca Crane PERSON
Gamemaker GPE
Katniss PERSON
Peeta PERSON
Haymitch PERSON
Crane PERSON
Katniss PERSON
Peeta PERSON
Peeta PERSON
Katniss PERSON
Thresh PERSON
District 11 ORG
Katniss PERSON
Rue FAC
Katniss PERSON
Rue FAC
Foxface ORG
District 5 LOC
Peeta PERSON
late at night