In [5]:
import math
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import os
import spacy
import spacy_experimental
from spacy.tokens import Token
from helper import *

def get_all_children(token):
    """Return `token` together with all of its descendants, ordered by `.i`.

    Args:
        token: A tree node exposing `.children` (an iterable of child nodes)
            and `.i` (the value the result is sorted by). In this notebook
            the nodes are spaCy tokens.

    Returns:
        list: `token` plus every node reachable through `.children`, sorted
        by ascending `.i`.
    """
    # Walk the tree with an explicit stack so that a very deep dependency
    # chain cannot hit Python's recursion limit.
    collected = []
    pending = [token]
    while pending:
        node = pending.pop()
        collected.append(node)
        pending.extend(node.children)
    # Sort once at the end instead of re-sorting the partial result at every
    # level of the tree.
    collected.sort(key=lambda node: node.i)
    return collected

# Function to replace tokens with their references
def replace_tokens_with_refs(tokens):
    """Flatten a token list to strings, expanding pronouns that carry a reference.

    Args:
        tokens: Expected to be a list of token-like objects with `.pos_`,
            `.text` and an extension namespace `._`. Anything that is not a
            list is handed back untouched.

    Returns:
        For a list input, a new list in which each token contributes its
        `.text`, except tokens tagged "PRON" whose `._.ref` exists and is not
        None: those contribute the elements of `._.ref` instead.
        For any other input, the input itself.
    """
    if not isinstance(tokens, list):
        return tokens

    def _expand(token):
        # Only pronouns are ever looked up; other tokens never touch `._`.
        if token.pos_ == "PRON":
            referent = getattr(token._, 'ref', None)
            if referent is not None:
                return list(referent)
        return [token.text]

    return [piece for token in tokens for piece in _expand(token)]

# Locate the data files relative to the folder above the notebook's folder.
# NOTE(review): abspath() resolves "alt2.ipynb" against the current working
# directory, so this assumes the kernel runs from the notebook's folder.
parent_folder = os.path.dirname(os.path.dirname(os.path.abspath("alt2.ipynb")))
# Path components are passed separately so os.path.join picks the separator;
# a hard-coded "\\" only works on Windows.
char_data_path = os.path.join(parent_folder, "Data", "character.metadata.tsv")
plot_data_path = os.path.join(parent_folder, "Data", "resolved_texts_fastcoref.csv")

# Build the character metadata dataframe. The file is read without a header
# row, so the positional column numbers are mapped to readable names.
ind={0:"Wikipedia movie ID", 1:"Freebase movie ID", 2:"Movie release date", 3:"Character name", 4:"Actor date of birth", 5:"Actor gender", 6:"Actor height", 7:"Actor ethnicity", 8:"Actor name", 9:"Actor age at movie release", 10:"Freebase character/actor map ID", 11:"Freebase character ID", 12:"Freebase actor ID"}
characters_df = pd.read_csv(char_data_path, delimiter="\t", header=None).rename(columns=ind)

# Build the plot summaries dataframe, indexed by Wikipedia movie ID.
plots_df = (
    pd.read_csv(plot_data_path, delimiter=',')
    .rename(columns={'wiki_id':"Wikipedia movie ID", 'resolved_text':"Plot Summary"})
    .set_index("Wikipedia movie ID")
)
plots_df

Unnamed: 0_level_0,Plot Summary
Wikipedia movie ID,Unnamed: 1_level_1
23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
31186339,The nation of Panem consists of a wealthy Capi...
20663735,Poovalli Induchoodan is sentenced for six year...
2231378,"The Lemon Drop Kid , a New York City swindler,..."
595909,Seventh-day Adventist Church pastor Michael Ch...
...,...
34808485,"The story is about Reema , a young Muslim scho..."
1096473,"In 1928 Hollywood, director Leo Andreyev looks..."
35102018,American Luthier focuses on Randy Parsons’ tra...
8628195,"Abdur Rehman Khan , a middle-aged dry fruit se..."


In [12]:
# Dependency labels recorded for every verb, and the full raw column layout.
DEP_COLUMNS = ["nsubj", "nsubjpass", "dobj", "agent", "ccomp"]
RAW_COLUMNS = ["Wikipedia movie ID", "Sentence", "Verb", "Subject", "Object"] + DEP_COLUMNS
START_MOVIE = 11000       # first row position of plots_df to process in this run
CHECKPOINT_EVERY = 4000   # flush to CSV whenever the row position is a multiple of this


def extract_verb_rows(doc, movie_id):
    """Collect one record per verb token found in `doc`.

    Args:
        doc: Parsed spaCy document for a single plot summary.
        movie_id: Value stored under "Wikipedia movie ID" in every record.

    Returns:
        list[dict]: Each dict holds the movie ID, the 1-based sentence number,
        the verb lemma and, for each direct child of the verb whose dependency
        label is in DEP_COLUMNS, the child's full subtree as returned by
        get_all_children. If several children share a label, the last one wins.
    """
    rows = []
    for sent_num, sent in enumerate(doc.sents, start=1):
        for token in sent:
            if token.pos_ != "VERB":
                continue
            row = {"Wikipedia movie ID": movie_id, "Sentence": sent_num, "Verb": token.lemma_}
            for child in token.children:
                if child.dep_ in DEP_COLUMNS:
                    row[child.dep_] = get_all_children(child)
            rows.append(row)
    return rows


def collapse_roles(raw_df):
    """Fold the per-dependency columns into "Subject" and "Object".

    Object is taken from dobj, falling back to nsubjpass and then ccomp.
    Subject is taken from nsubj, falling back to agent.

    Args:
        raw_df: Frame with the RAW_COLUMNS layout. It is not modified.

    Returns:
        pandas.DataFrame: A new frame without the DEP_COLUMNS.
    """
    out = raw_df.copy()
    out["Object"] = out["dobj"]
    for fallback in ("nsubjpass", "ccomp"):
        missing = out["Object"].isna()
        out.loc[missing, "Object"] = out.loc[missing, fallback]
    out["Subject"] = out["nsubj"]
    missing = out["Subject"].isna()
    out.loc[missing, "Subject"] = out.loc[missing, "agent"]
    return out.drop(columns=DEP_COLUMNS)


nlp = spacy.load("en_core_web_md")

# Records are accumulated in a plain list and turned into a DataFrame only
# when a checkpoint is written; growing a DataFrame row by row is quadratic.
pending_rows = []
for i in range(START_MOVIE, len(plots_df)):
    movie_id = plots_df.index[i]
    doc = nlp(plots_df.iloc[i]["Plot Summary"])
    movie_rows = extract_verb_rows(doc, movie_id)
    # Per-movie frame with the raw layout; later cells read the last one.
    sent_df = pd.DataFrame(movie_rows, columns=RAW_COLUMNS)
    pending_rows.extend(movie_rows)
    if i % CHECKPOINT_EVERY == 0:
        # The checkpoint includes movie i itself, then the buffer starts over.
        collapse_roles(pd.DataFrame(pending_rows, columns=RAW_COLUMNS)).to_csv(f'Verb_Subject_Object_{i}.csv', index=False)
        pending_rows = []
    print("Done with movie : ", i)

# Whatever has accumulated since the last checkpoint goes into the final file.
final_df = collapse_roles(pd.DataFrame(pending_rows, columns=RAW_COLUMNS))
final_df.to_csv('Verb_Subject_Object_end.csv', index=False)
final_df

Done with movie :  11000
Done with movie :  11001
Done with movie :  11002
Done with movie :  11003
Done with movie :  11004
Done with movie :  11005
Done with movie :  11006
Done with movie :  11007
Done with movie :  11008
Done with movie :  11009
Done with movie :  11010
Done with movie :  11011
Done with movie :  11012
Done with movie :  11013
Done with movie :  11014
Done with movie :  11015
Done with movie :  11016
Done with movie :  11017
Done with movie :  11018
Done with movie :  11019
Done with movie :  11020
Done with movie :  11021
Done with movie :  11022
Done with movie :  11023
Done with movie :  11024
Done with movie :  11025
Done with movie :  11026
Done with movie :  11027
Done with movie :  11028
Done with movie :  11029
Done with movie :  11030
Done with movie :  11031
Done with movie :  11032
Done with movie :  11033
Done with movie :  11034
Done with movie :  11035
Done with movie :  11036
Done with movie :  11037
Done with movie :  11038
Done with movie :  11039


Unnamed: 0,Wikipedia movie ID,Sentence,Verb,Subject,Object
0,187616,1,take,"[the, USS, Nimitz]","[a, civilian, observer, ,, Warren, Lasky, ,]"
1,187616,1,depart,[it],"[Pearl, Harbor]"
2,187616,2,encounter,"[the, ship]","[a, strange, storm, -, like, vortex, which, di..."
3,187616,2,disappear,[which],
4,187616,2,pass,"[the, ship]",
...,...,...,...,...,...
110200,6040782,31,surrender,"[The, German, intruders]",
110201,6040782,32,become,"[Mainwaring, and, his, men]",
110202,6040782,33,look,"[Mainwaring, and, the, Home, Guard]",
110203,6040782,34,change,"[The, weather]",


In [20]:
# Suffixes of the checkpoint files under <parent_folder>/temp, in stacking order.
checkpoint_suffixes = ["11423", "12000", "16000", "20000", "24000", "28000", "32000", "36000", "40000", "end"]
# Path components are passed separately so os.path.join picks the separator;
# a hard-coded "\\" only works on Windows.
checkpoint_paths = [
    os.path.join(parent_folder, "temp", f"Verb_Subject_Object_{suffix}.csv")
    for suffix in checkpoint_suffixes
]
first_df, second_df, *later_dfs = [pd.read_csv(path, delimiter=',') for path in checkpoint_paths]

# Keep only the rows of the first file that come before the movie at row
# position 11000 of plots_df.
# NOTE(review): presumably the later files already cover that movie onwards
# because the extraction was restarted there — confirm.
resume_movie_id = plots_df.index[11000]
resume_rows = first_df.index[first_df['Wikipedia movie ID'] == resume_movie_id]
if len(resume_rows) == 0:
    raise ValueError(f"movie {resume_movie_id} not found in {checkpoint_paths[0]}")
first_df = first_df.loc[:resume_rows[0] - 1]

# Concatenate the dataframes and reset the index
df = pd.concat([first_df, second_df, *later_dfs], ignore_index=True)
df.to_csv('Verb_Subject_Object.csv', index=False)


In [24]:
# Read the merged verb/subject/object table back from disk (comma is the
# default separator) and display it.
df = pd.read_csv('Verb_Subject_Object.csv')
df

Unnamed: 0,Wikipedia movie ID,Sentence,Verb,Subject,Object
0,23890098,1,work,,
1,23890098,1,develop,"[Shlykov, ,, a, hard, -, working, taxi, driver...","[a, bizarre, love, -, hate, relationship]"
2,23890098,1,realize,,"[they, are, n't, so, different, after, all]"
3,31186339,1,consist,"[The, nation, of, Panem]",
4,31186339,2,provide,"[each, district]","[a, boy, and, girl]"
...,...,...,...,...,...
2013704,6040782,31,surrender,"[The, German, intruders]",
2013705,6040782,32,become,"[Mainwaring, and, his, men]",
2013706,6040782,33,look,"[Mainwaring, and, the, Home, Guard]",
2013707,6040782,34,change,"[The, weather]",


In [19]:
# Inspect the second checkpoint frame (contents of Verb_Subject_Object_12000.csv).
# NOTE(review): relies on `second_df` still being in the kernel from the merge cell.
second_df

Unnamed: 0,Wikipedia movie ID,Sentence,Verb,Subject,Object
0,780889,1,discover,"[Rugged, mining, engineer, Rian, Mitchell]","[a, lost, emerald, mine, in, the, highlands, o..."
1,780889,1,lose,,
2,780889,1,operate,"[by, the, Spanish, conquistadors]",[which]
3,780889,2,consume,"[by, the, quest, for, wealth]",
4,780889,3,have,[he],
...,...,...,...,...,...
49762,5413972,9,marry,,
49763,5413972,10,connive,,
49764,5413972,10,force,,[her]
49765,5413972,11,take,"[his, dastardly, plan]","[full, effect]"


In [17]:
def remove_stopwords_from_df(df):
    """Return a copy of `df` with stop words stripped from "Subject" and "Object".

    The input frame is left untouched, so the cell can be re-run safely.

    Args:
        df: DataFrame with "Subject" and "Object" columns whose cells are
            lists of spaCy tokens and/or strings. Cells that are not lists
            are treated as missing.

    Returns:
        pandas.DataFrame: New frame with cleaned "Subject"/"Object" lists,
        after dropping every row that has a missing value in any column.
    """
    def remove_stopwords(tokens_or_strings):
        """Return the non-stop-word texts of a list, or None for non-lists."""
        if not isinstance(tokens_or_strings, list):
            return None
        cleaned_list = []
        for item in tokens_or_strings:
            if isinstance(item, spacy.tokens.Token):
                if not item.is_stop:
                    cleaned_list.append(item.text)
            elif isinstance(item, str):
                # Tokenising is enough here: the stop-word flag is a lexical
                # attribute, so the tagger/parser/NER do not need to run.
                cleaned_list.extend(
                    token.text for token in nlp.make_doc(item) if not token.is_stop
                )
            else:
                # Unknown item types are passed through unchanged.
                cleaned_list.append(item)
        return cleaned_list

    # assign() builds a new frame instead of overwriting the caller's columns.
    cleaned = df.assign(
        Subject=df['Subject'].apply(remove_stopwords),
        Object=df['Object'].apply(remove_stopwords),
    )
    return cleaned.dropna(axis=0, how="any")

# remove_stopwords_from_df already drops rows with missing values before
# returning, so its result is displayed directly.
remove_stopwords_from_df(sent_df)

Unnamed: 0,Wikipedia movie ID,Sentence,Verb,Subject,Object
1,31186339,2,provide,[district],"[boy, girl, ]"
7,31186339,5,take,"[older, sister, Katniss, volunteers]","[Katniss, place]"
8,31186339,6,give,[Peeta],[bread]
12,31186339,8,warn,[Peeta],"[Katniss, Peeta]"
15,31186339,9,reveal,[Peeta],"[Peeta, love, Katniss]"
18,31186339,10,provide,"["", sponsors, ""]","[-, Games, gifts, food, ,, medicine, ,, tools]"
19,31186339,11,discover,[Katniss],"[Peeta, meant, Peeta, said]"
20,31186339,11,mean,[Peeta],"[Peeta, said]"
21,31186339,11,say,[Peeta],[]
25,31186339,12,survive,[Katniss],"[televised, Games, begin, half, tributes, kill..."


In [5]:
def get_characters(doc):
    """List the distinct title-cased PERSON entity texts found in `doc`.

    Args:
        doc: Object exposing `.ents`, each entity having `.label_` and `.text`.

    Returns:
        list[str]: Unique entity texts whose label is "PERSON" and for which
        `str.istitle()` is true. The order is unspecified.
    """
    names = {
        entity.text
        for entity in doc.ents
        if entity.label_ == "PERSON" and entity.text.istitle()
    }
    return list(names)

# NOTE(review): `doc` is not defined in this cell; it is whatever document a
# previously executed cell left in the kernel — confirm it is the intended movie.
characters = get_characters(doc)


def contains_character(elem, characters):
    """Tell whether a list of tokens/strings mentions any known character.

    Args:
        elem: A cell value. Only lists are inspected; their items may be
            strings or objects with a `.text` attribute.
        characters: Container of character names to test membership against.

    Returns:
        bool: True if any item's text is in `characters`; False otherwise,
        including when `elem` is not a list or is an empty list.
    """
    if not isinstance(elem, list):
        return False
    # any() always yields a real bool, so a list without a match gives False
    # rather than falling off the end of the function and returning None.
    return any(
        (token if isinstance(token, str) else token.text) in characters
        for token in elem
    )

# Flag rows in which any cell from the third column onwards mentions a
# character, then show the sentence-level columns of those rows.
mentions_character = sent_df.apply(
    lambda row: any(contains_character(elem, characters) for elem in row.iloc[2:]),
    axis=1,
)
sent_df[mentions_character][["Sentence", "Verb", "Subject", "Object"]]

Unnamed: 0,Sentence,Verb,Subject,Object
7,5,take,"[Her, older, sister, Katniss, volunteers]","[Katniss, place]"
8,6,give,[Peeta],[bread]
9,6,starve,[Katniss],
10,7,take,,"[Katniss, and, Peeta]"
11,7,accompany,"[by, Katniss, Peeta, frequently, drunk, mentor...",
...,...,...,...,...
124,49,give,[Katniss],"[half, of, the, nightlock]"
125,50,commit,"[Peeta, Katniss]",[suicide]
127,51,warn,[Haymitch],[Katniss]
128,51,make,[Katniss],"[powerful, enemies]"
