In [1]:
import math
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import os
import spacy
from spacy import displacy
from spacy.tokens import Token
from helper import *
#import neuralcoref

In [2]:
#extracting dataframe paths
parent_folder = os.path.dirname(os.path.dirname(os.path.abspath("alt2.ipynb")))
char_data_path= os.path.join(parent_folder, "Data\character.metadata.tsv")
plot_data_path= os.path.join(parent_folder, "Data\plot_summaries.txt")

#building character matadata dataframe
ind={0:"Wikipedia movie ID", 1:"Freebase movie ID", 2:"Movie release date", 3:"Character name", 4:"Actor date of birth", 5:"Actor gender", 6:"Actor height", 7:"Actor ethnicity", 8:"Actor name", 9:"Actor age at movie release", 10:"Freebase character/actor map ID", 11:"Freebase character ID", 12:"Freebase actor ID"}
characters_df= pd.read_csv(char_data_path, delimiter="\t", header=None)
characters_df=characters_df.rename(columns=ind)

#building plot summaries dataframe
plots_df = pd.read_csv(plot_data_path, delimiter='\t', header=None)
plots_df = plots_df.rename(columns={0:"Wikipedia movie ID", 1:"Plot Summary"})
plots_df=plots_df.set_index("Wikipedia movie ID")

In [3]:
nlp = spacy.load("en_core_web_sm")
i=1
plot=plots_df.loc[plots_df.index[i]]["Plot Summary"]

doc= nlp(plot)

displacy.render(list(doc.sents)[51])

In [4]:
#iterate over sentences
def get_all_children(token):
    children = [token]
    for child in token.children:
        children.extend(get_all_children(child))
    children=sorted(children, key=lambda x: x.i)
    return children

for sent in doc.sents:
    #print the verb and all its children and their dependency relations
    for token in sent:
        if token.pos_=="VERB":
            for child in token.children:
                if child.dep_!="punct":
                    print(token.text, child.dep_, ":", get_all_children(child))

consists nsubj : [The, nation, of, Panem]
consists prep : [of, a, wealthy, Capitol, and, twelve, poorer, districts]
provide prep : [As, punishment, for, a, past, rebellion]
provide nsubj : [each, district]
provide aux : [must]
provide dobj : [a, boy, and, girl,  , between, the, ages, of, 12, and, 18, selected, by, lottery,  , for, the, annual, Hunger, Games]
selected agent : [by, lottery,  ]
selected prep : [for, the, annual, Hunger, Games]
fight nsubj : [The, tributes]
fight aux : [must]
fight prep : [to, the, death]
fight prep : [in, an, arena]
rewarded ccomp : [The, tributes, must, fight, to, the, death, in, an, arena]
rewarded nsubjpass : [the, sole, survivor]
rewarded auxpass : [is]
rewarded prep : [with, fame, and, wealth]
chosen prep : [In, her, first, Reaping]
chosen nsubjpass : [12-year, -, old, Primrose, Everdeen]
chosen auxpass : [is]
chosen prep : [from, District, 12]
volunteers compound : [Katniss]
take aux : [to]
take dobj : [her, place]
gave nsubj : [who]
gave advmod : [

In [6]:
#iterate over sentences
def get_all_children(token):
    children = [token]
    for child in token.children:
        children.extend(get_all_children(child))
    children=sorted(children, key=lambda x: x.i)
    return children

set_verb=set()
for sent in doc.sents:
    #print the verb and all its children and their dependency relations
    for token in sent:
        if token.pos_=="VERB":
            deps=[child.dep_ for child in token.children]
            set_verb=set_verb.union(set(deps))

In [7]:
sent_df=pd.DataFrame(columns=["Sentence", "Verb"] + list(set_verb))
sent_num = 0
for sent in doc.sents:
    # print the verb and all its children and their dependency relations
    sent_num += 1
    for token in sent:
        if token.pos_ == "VERB":
            # Create a dictionary with the values to be assigned
            values = {"Sentence": sent_num, "Verb": token.text}
            for child in token.children:
                values[child.dep_] = get_all_children(child)
            # Append the dictionary as a new row to sent_df
            sent_df.loc[len(sent_df)] = values
sent_df

Unnamed: 0,Sentence,Verb,prep,prt,acomp,aux,nsubj,punct,npadvmod,xcomp,...,oprd,agent,dative,compound,advcl,auxpass,nsubjpass,dobj,advmod,neg
0,1,consists,"[of, a, wealthy, Capitol, and, twelve, poorer,...",,,,"[The, nation, of, Panem]",[.],,,...,,,,,,,,,,
1,2,must,,,,,,,,,...,,,,,,,,,,
2,2,provide,"[As, punishment, for, a, past, rebellion]",,,[must],"[each, district]",[.],,,...,,,,,,,,"[a, boy, and, girl, , between, the, ages, of,...",,
3,2,selected,"[for, the, annual, Hunger, Games]",,,,,,,,...,,"[by, lottery, ]",,,,,,,,
4,3,must,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,51,warns,,,,,[Haymitch],[.],,,...,,,,,,,,[Katniss],,
135,51,made,"[after, her, display, of, defiance]",,,[has],[she],,,,...,,,,,,,,"[powerful, enemies]",,
136,52,return,"[to, District, 12]",,,,"[She, and, Peeta]","[,]",,,...,,,,,"[while, Crane, is, locked, in, a, room, with, ...",,,,,
137,52,locked,"[in, a, room, with, a, bowl, of, nightlock, be...",,,,,,,,...,,,,,,[is],[Crane],,,


In [8]:
def process_df(df):
    # drop the punct column if present
    if "punct" in df.columns:
        df = df.drop(columns="punct")
    
    # drop aux column if present
    if "aux" in df.columns:
        df = df.drop(columns="aux")
    
    # drop auxpass column if present
    if "auxpass" in df.columns:
        df = df.drop(columns="auxpass")
    
    # drop mark column if present
    if "mark" in df.columns:
        df = df.drop(columns="mark")
    
    return df

temp_df=process_df(sent_df)
temp_df

Unnamed: 0,Sentence,Verb,prep,prt,acomp,nsubj,npadvmod,xcomp,ccomp,preconj,...,cc,oprd,agent,dative,compound,advcl,nsubjpass,dobj,advmod,neg
0,1,consists,"[of, a, wealthy, Capitol, and, twelve, poorer,...",,,"[The, nation, of, Panem]",,,,,...,,,,,,,,,,
1,2,must,,,,,,,,,...,,,,,,,,,,
2,2,provide,"[As, punishment, for, a, past, rebellion]",,,"[each, district]",,,,,...,,,,,,,,"[a, boy, and, girl, , between, the, ages, of,...",,
3,2,selected,"[for, the, annual, Hunger, Games]",,,,,,,,...,,,"[by, lottery, ]",,,,,,,
4,3,must,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,51,warns,,,,[Haymitch],,,"[that, she, has, made, powerful, enemies, afte...",,...,,,,,,,,[Katniss],,
135,51,made,"[after, her, display, of, defiance]",,,[she],,,,,...,,,,,,,,"[powerful, enemies]",,
136,52,return,"[to, District, 12]",,,"[She, and, Peeta]",,,,,...,[and],,,,,"[while, Crane, is, locked, in, a, room, with, ...",,,,
137,52,locked,"[in, a, room, with, a, bowl, of, nightlock, be...",,,,,,,,...,,,,,,,[Crane],,,


In [46]:
#create an empty dataframe, with columns : Sentence, Subject, Verb, Object, Place, Time
temp_df.columns
temp_df[temp_df["prt"].notna()].dropna(axis=1, how="all")

Unnamed: 0,Sentence,Verb,dobj,nsubj,prt
48,21,draw,[them],[Rue],[off]
50,21,setting,"[the, mines, planted, around, it]",,[off]
74,30,turning,,"[the, Games]",[out]
101,38,pins,[her],,[down]
116,43,speed,[things],,[up]


In [45]:
def get_characters(doc):
    characters = []
    for ent in doc.ents:
        if ent.label_ == "PERSON" and ent.text.istitle():
            characters.append(ent.text)
    return list(set(characters))

characters = get_characters(doc)


def contains_character(row, df, characters):
    for col in df.columns[2:]:
        if isinstance(row[col], list) and (any(char in row[col] for char in characters)):
            return True
    return False

# Apply the function to each row
sent_df[sent_df.apply(lambda row: contains_character(row, sent_df, characters), axis=1)]

TypeError: Argument 'other' has incorrect type (expected spacy.tokens.token.Token, got str)

In [17]:
characters

['Caesar Flickerman',
 'Katniss',
 'Peeta Mellark',
 'Rue',
 'Clove',
 'Seneca Crane',
 'Primrose Everdeen',
 'Snow',
 'Haymitch',
 'Glimmer',
 'Haymitch Abernathy',
 'Peeta']