In [1]:
import os 
import pandas as pd
import numpy as np
import stanza

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
stanza.download('en') # download English model

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 16.4MB/s]
2022-11-11 10:21:07 INFO: Downloading default packages for language: en (English) ...
2022-11-11 10:21:09 INFO: File exists: C:\Users\nerea\stanza_resources\en\default.zip
2022-11-11 10:21:21 INFO: Finished downloading models and saved to C:\Users\nerea\stanza_resources.


In [3]:
#text = "".join(x.strip() for x in text.split("-"))

In [4]:
nlp = stanza.Pipeline(lang='en', processors='tokenize, mwt, pos, lemma, depparse, ner')

2022-11-11 10:21:21 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 23.7MB/s]
2022-11-11 10:21:24 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| ner       | ontonotes |

2022-11-11 10:21:24 INFO: Use device: cpu
2022-11-11 10:21:24 INFO: Loading: tokenize
2022-11-11 10:21:24 INFO: Loading: pos
2022-11-11 10:21:25 INFO: Loading: lemma
2022-11-11 10:21:25 INFO: Loading: depparse
2022-11-11 10:21:25 INFO: Loading: ner
2022-11-11 10:21:27 INFO: Done loading processors!


In [5]:
text = '''The nation of Panem consists of a wealthy Capitol and twelve poorer districts. As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games. The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth. In her first Reaping, 12-year-old Primrose Everdeen is chosen from District 12. Her older sister Katniss volunteers to take her place. Peeta Mellark, a baker's son who once gave Katniss bread when she was starving, is the other District 12 tribute. Katniss and Peeta are taken to the Capitol, accompanied by their frequently drunk mentor, past victor Haymitch Abernathy. He warns them about the "Career" tributes who train intensively at special academies and almost always win. During a TV interview with Caesar Flickerman, Peeta unexpectedly reveals his love for Katniss.'''

In [6]:
#doc = nlp(text) # run annotation over a sentence
#print(*[f'id: {word.id}\tword: {word.text}\thead id: {word.head}\thead: {sent.words[word.head-1].text if word.head > 0 else "root"}\tdeprel: {word.deprel}' for sent in doc.sentences for word in sent.words], sep='\n')

In [7]:
def get_characters(doc):
    characters = []
    characters_name = []
    for sent in doc.sentences:
        for word in sent.ents:
            if word.type == 'PERSON' and word.text not in characters:
                characters.append([word.text])
                characters_name.append([word.text.split(' ')[0]])
    characters = list(np.unique(characters))
    characters_name = list(np.unique(characters_name))
    return characters, characters_name

In [8]:
''' This function finds attributes recursively, by first checking the words in 
the sentence which are not roots (main verb), and then checking all adjectives
and conjunctions related to those words'''

def recursive_find_adjs(root, sentence):
    children = [w for w in sentence.words if w.head == root.id]
    if not children:
        pass 
    filtered_child = [w for w in children if (w.deprel == "conj" or w.deprel == "compound" or w.deprel == "nsubj") and (w.pos == "ADJ"or w.pos == 'NOUN')] #or w.pos == 'NOUN'
    results = [w for w in filtered_child if not any(sub.head == w.id and sub.upos == "NOUN" for sub in sentence.words)]
    for w in children:
        results += recursive_find_adjs(w, sentence)
    return results

In [9]:
def char_attributes(doc):
    names = []
    names_2 = []
    attributes = []
    attributes_2 = []
    for sent in doc.sentences:
        nouns = [w for w in sent.words if w.pos == "PROPN"]
        for noun in nouns:
            if noun.text in get_characters(doc)[1]:
                # Find constructions in the form of "The car is beautiful"
                # In this scenario, the adjective is the parent of the noun
                adj0 = sent.words[noun.head-1] #adjective directly related
                adjs = [adj0] + recursive_find_adjs(adj0, sent) if adj0.pos == "ADJ" or adj0.pos == "NOUN" else []
                #The recursive function finds adjectives related to the first one found,
                #and hence also linked to the target noun
                mod_adjs = [w for w in sent.words if w.head == noun.id and (w.pos == "ADJ")]
                # This should only be one element because conjunctions are hierarchical
                if mod_adjs:
                    mod_adj = mod_adjs[0]
                    adjs.extend([mod_adj] + recursive_find_adjs(mod_adj, sent))
                if adjs:
                    unique_adjs = []
                    unique_ids = set()
                    for adj in adjs:
                        if adj.id not in unique_ids:
                            unique_adjs.append(adj)
                            unique_ids.add(adj.id)
                    names.append(noun.text)
                    attributes.append(" ".join([adj.text for adj in unique_adjs]))
    char_attributes = pd.DataFrame()
    char_attributes['Character Names'] = names
    char_attributes['Character Attributes'] = attributes
    char_attributes['Total Attributes'] = char_attributes.groupby('Character Names')['Character Attributes'].transform(lambda x: ' '.join(x))
    char_attributes= char_attributes[['Character Names','Total Attributes']]
    return (char_attributes.drop_duplicates().reset_index())

In [10]:
def agent_patient_verbs(doc):
    agent_verbs = {'id': [], 'word': [], 'head_id': [], 'agent_verbs': []}
    patient_verbs = {'id': [], 'word': [], 'head_id': [], 'patient_verbs': []}
    for sentence in doc.sentences:
        for word in sentence.words:
            if word.deprel == "nsubj" or word.deprel == "acl:relcl":
                agent_verbs['id'].append(word.id)
                agent_verbs['word'].append(word.text)
                agent_verbs['head_id'].append(word.head)
                agent_verbs['agent_verbs'].append(sentence.words[word.head-1].text)
            elif word.deprel == "nsubj:pass" or word.deprel == "dobj" or word.deprel == "iobj":
                patient_verbs['id'].append(word.id)
                patient_verbs['word'].append(word.text)
                patient_verbs['head_id'].append(word.head)
                patient_verbs['patient_verbs'].append(sentence.words[word.head-1].text)

    return (pd.DataFrame(data=agent_verbs), pd.DataFrame(data=patient_verbs))


In [11]:
def create_table_dependencies(plot):
    doc = nlp(plot)
    attrs_table = char_attributes(doc)
    agent_verbs = agent_patient_verbs(doc)[0] 
    patient_verbs = agent_patient_verbs(doc)[1] 
    attrs_table['Agent Verbs'] = np.zeros(len(attrs_table['Character Names']))
    attrs_table['Patient Verbs'] = np.zeros(len(attrs_table['Character Names']))
    for idx, char in enumerate(attrs_table['Character Names']):
        av = []
        for idx2, w in enumerate(agent_verbs['word']):
            if (w in attrs_table['Total Attributes'][idx] or w == char):
                av.append(agent_verbs['agent_verbs'][idx2])
                attrs_table['Agent Verbs'][idx] = av
        pv = []
        for idx2, w in enumerate(patient_verbs['word']):
            if (w in attrs_table['Total Attributes'][idx] or w == char):
                pv.append(patient_verbs['patient_verbs'][idx2])
                attrs_table['Patient Verbs'][idx] = pv
    return attrs_table


In [12]:
#1.1: Loading the data:
data_folder = './data/'
df_plots = pd.read_csv(data_folder + "plot_summaries.txt", sep='\t', header=None)
df_plots.columns = ('Wikipedia movie ID', 'Summary')
df_plots.head()

Unnamed: 0,Wikipedia movie ID,Summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [13]:
def Analyse_Plots(df_plots):
    plot_analysis = pd.DataFrame()
    chars = []
    movies = []
    averbs = []
    pverbs = []
    attrs = []
    for i, summ in enumerate(df_plots['Summary']):
        print('Plot analysed ', i, ' out of ', len(df_plots['Summary']))
        male_gaze = create_table_dependencies(summ)
        for j in range(len(male_gaze)):
            movies.append(df_plots['Wikipedia movie ID'][i])
            chars.append(male_gaze['Character Names'][j])
            averbs.append(male_gaze['Agent Verbs'][j])
            pverbs.append(male_gaze['Patient Verbs'][j])
            attrs.append(male_gaze['Total Attributes'][j])
    plot_analysis['Wikipedia movie ID'] = movies
    plot_analysis['Character_Name'] = chars
    plot_analysis['Agent Verbs'] = averbs
    plot_analysis['Patient Verbs'] = pverbs
    plot_analysis['Attributes'] = attrs
    return plot_analysis

In [None]:
df_NLP = Analyse_Plots(df_plots)  
df_NLP.head(20)

Plot analysed  0  out of  42303
Plot analysed  1  out of  42303


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attrs_table['Patient Verbs'][idx] = pv
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attrs_table['Agent Verbs'][idx] = av


Plot analysed  2  out of  42303


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attrs_table['Agent Verbs'][idx] = av
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attrs_table['Patient Verbs'][idx] = pv


Plot analysed  3  out of  42303


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attrs_table['Agent Verbs'][idx] = av
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attrs_table['Patient Verbs'][idx] = pv


Plot analysed  4  out of  42303


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attrs_table['Agent Verbs'][idx] = av
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attrs_table['Patient Verbs'][idx] = pv


Plot analysed  5  out of  42303


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attrs_table['Patient Verbs'][idx] = pv
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attrs_table['Agent Verbs'][idx] = av


Plot analysed  6  out of  42303


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attrs_table['Agent Verbs'][idx] = av
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attrs_table['Patient Verbs'][idx] = pv


Plot analysed  7  out of  42303


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attrs_table['Agent Verbs'][idx] = av


Plot analysed  8  out of  42303


In [None]:
df_NLP.to_csv(data_folder + "Plot_NLP_Analysis.csv", sep='\t', header=None)

- problem with words separated by "-", does not recognize it as one single word...
- people-people attribute duplicated (people attribute-people)