In [10]:
import pandas as pd
import numpy as np
import stanza


In [11]:
# Install CoreNLP into the directory named by the CORENLP_HOME environment
# variable when it is set (and non-empty); otherwise fall back to the location
# this notebook has always used, so existing setups keep working unchanged.
import os

stanza.install_corenlp(dir=os.environ.get("CORENLP_HOME") or "C:/Users/nerea/CoreNLP")



In [12]:
from stanza.server import CoreNLPClient

In [None]:
import os

# Keep a CORENLP_HOME that is already configured outside the notebook; only
# fall back to the original hard-coded location when nothing is set.
os.environ.setdefault("CORENLP_HOME", "C:/Users/nerea/CoreNLP")
# Cap the JVM heap at 512 MB for the Java processes started from this kernel.
os.environ["_JAVA_OPTIONS"] = '-Xmx512M'

In [None]:
text = "Chris Manning is a nice person. Chris wrote a simple sentence. He also gives oranges to people."

# No cell in this notebook ever created `client`, so the original call raised
# NameError. Start a CoreNLP server for the duration of the request and shut it
# down again when the block exits. The heap size mirrors the -Xmx512M cap set
# through _JAVA_OPTIONS.
# NOTE(review): the annotator list is an assumption -- adjust it to what the
# analysis actually needs.
with CoreNLPClient(
        annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'depparse'],
        timeout=30000,
        memory='512M') as client:
    ann = client.annotate(text)

In [None]:

''' Use the NER processor's output to collect the entities tagged "PERSON" in a plot summary.
Both the full mention and its first name (the first space-separated token) are returned. '''

def get_characters(doc):
    """Collect the distinct PERSON entities mentioned in a parsed document.

    Parameters
    ----------
    doc
        Parsed document exposing ``sentences``; each sentence exposes ``ents``,
        whose items carry ``type`` and ``text`` attributes.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(characters, characters_name)``: the sorted unique mention texts and
        the sorted unique first tokens of those mentions (split on a single
        space). The two lists are de-duplicated independently, so they are not
        aligned element by element.
    """
    # Sets de-duplicate while collecting. The previous membership test compared
    # a string against a list of one-element lists and therefore never matched.
    characters = set()
    characters_name = set()
    for sent in doc.sentences:
        for ent in sent.ents:
            if ent.type == 'PERSON':
                characters.add(ent.text)
                characters_name.add(ent.text.split(' ')[0])
    # Sorted output keeps the ordering np.unique used to provide, but yields
    # plain Python strings instead of NumPy scalars.
    return sorted(characters), sorted(characters_name)

'''For each character, we look at the immediate verb governors and attribute syntactic dependencies of all of the entity’s mention headwords, extracted from the typed dependency tuples produced by the parser:
    + Agent verbs. Verbs for which the entity is an agent argument (nsubj or agent).
    + Patient verbs. Verbs for which the entity is the patient, theme or other argument (dobj, nsubjpass, iobj, or any prepositional argument prep_*). In the code below the passive subject is matched as "nsubj:pass".
    + Attributes. Adjectives and common nouns that relate to the mention as adjectival modifiers, noun-noun compounds, appositives, or copulas (nsubj or appos governors, or nsubj, appos, amod, nn dependents of an entity mention).

    We end up with a dataframe containing the Agent Verbs, Patient Verbs and Attributes corresponding to each character in the plot. '''

''' This function finds attributes recursively: it first inspects the direct
dependents of the given word, keeping the conjoined, compound and subject words
with a suitable part of speech, and then repeats the search on every dependent. '''

def recursive_find_adjs(root, sentence):
    """Collect attribute words found anywhere below ``root`` in the dependency tree.

    A dependent is kept when its relation is ``conj``, ``compound`` or
    ``nsubj``, its part of speech is one of ADJ, NOUN, ADV, CCONJ, AUX or ADP,
    and no NOUN in the sentence has it as head. The search then descends into
    every dependent of ``root``, kept or not.

    Parameters
    ----------
    root
        Word whose dependents are inspected; only ``root.id`` is read.
    sentence
        Sentence exposing ``words``; each word provides ``id``, ``head``,
        ``deprel``, ``pos`` and ``upos``.

    Returns
    -------
    list
        Matching words: the direct dependents first, followed by the matches
        found below each dependent in sentence order.
    """
    related_deprels = ("conj", "compound", "nsubj")
    attribute_pos = ("ADJ", "NOUN", "ADV", "CCONJ", "AUX", "ADP")

    children = [w for w in sentence.words if w.head == root.id]
    if not children:
        return []
    # The part-of-speech test must look at the candidate itself. The previous
    # version referenced `adj0`, a name that does not exist in this function,
    # and raised NameError for any candidate that was not ADJ or NOUN.
    filtered_child = [w for w in children
                      if w.deprel in related_deprels and w.pos in attribute_pos]
    # Skip a candidate that itself governs a noun.
    results = [w for w in filtered_child
               if not any(sub.head == w.id and sub.upos == "NOUN" for sub in sentence.words)]
    for w in children:
        results += recursive_find_adjs(w, sentence)
    return results

''' The following function uses the recursive search of attributes and outputs a dataframe with the character name and its attributes'''

def char_attributes(doc):
    """Build a table of attribute words for every character mentioned in ``doc``.

    For each proper noun whose text is one of the first names returned by
    ``get_characters``, two sources of attributes are combined:

    * the word governing the noun (e.g. "The car is beautiful", where the
      adjective is the head of the noun), when its part of speech is ADJ,
      NOUN, ADV, CCONJ, AUX or ADP, plus what ``recursive_find_adjs`` finds
      below it;
    * the first ADJ that depends directly on the noun, plus what
      ``recursive_find_adjs`` finds below it.

    Parameters
    ----------
    doc
        Parsed document exposing ``sentences``; each sentence exposes ``words``
        with ``id``, ``head``, ``pos`` and ``text``.

    Returns
    -------
    pandas.DataFrame
        One row per character with the columns ``index`` (row position of the
        character's first mention before de-duplication, as produced by
        ``reset_index()``), ``Character Names`` and ``Total Attributes`` (all
        attribute words of the character joined with spaces).
    """
    head_pos = ("ADJ", "NOUN", "ADV", "CCONJ", "AUX", "ADP")
    # Loop-invariant: extract the character names once per document instead of
    # once per proper noun.
    character_names = set(get_characters(doc)[1])

    names = []
    attributes = []
    for sent in doc.sentences:
        for noun in sent.words:
            if noun.pos != "PROPN" or noun.text not in character_names:
                continue

            adjs = []
            # A head of 0 marks the sentence root, which has no governor.
            # Without this guard `sent.words[noun.head - 1]` wraps around to
            # the last token of the sentence.
            if noun.head > 0:
                adj0 = sent.words[noun.head - 1]  # word directly governing the noun
                if adj0.pos in head_pos:
                    adjs = [adj0] + recursive_find_adjs(adj0, sent)

            mod_adjs = [w for w in sent.words if w.head == noun.id and w.pos == "ADJ"]
            # Only the first modifier is used as an entry point; conjoined
            # adjectives hang below it and are found by the recursive search.
            if mod_adjs:
                mod_adj = mod_adjs[0]
                adjs.extend([mod_adj] + recursive_find_adjs(mod_adj, sent))

            if not adjs:
                continue

            # De-duplicate by word id while preserving discovery order.
            unique_adjs = []
            unique_ids = set()
            for adj in adjs:
                if adj.id not in unique_ids:
                    unique_adjs.append(adj)
                    unique_ids.add(adj.id)
            names.append(noun.text)
            attributes.append(" ".join(adj.text for adj in unique_adjs))

    if not names:
        # Nothing found: return the usual columns without running
        # groupby/transform on an empty frame.
        return pd.DataFrame(columns=['index', 'Character Names', 'Total Attributes'])

    table = pd.DataFrame({'Character Names': names, 'Character Attributes': attributes})
    # Merge the attributes of repeated mentions of the same character.
    table['Total Attributes'] = (
        table.groupby('Character Names')['Character Attributes']
        .transform(lambda x: ' '.join(x))
    )
    table = table[['Character Names', 'Total Attributes']]
    return table.drop_duplicates().reset_index()

''' This function finds agent and patient verbs using the deprel output of the 
depparse processor'''

def agent_patient_verbs(doc):
    """Pair words with their governing word according to their dependency relation.

    Words whose relation is ``nsubj`` or ``acl:relcl`` go to the agent table;
    words whose relation is ``nsubj:pass``, ``obj``, ``dobj`` or ``iobj`` go to
    the patient table. Each row records the word and the text of the word it
    depends on.

    Parameters
    ----------
    doc
        Parsed document exposing ``sentences``; each sentence exposes ``words``
        with ``id``, ``text``, ``head`` and ``deprel``.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame]
        ``(agent_verbs, patient_verbs)`` with the columns ``id``, ``word``,
        ``head_id`` and ``agent_verbs`` / ``patient_verbs`` respectively.
    """
    # NOTE(review): for "acl:relcl" the recorded "verb" is whatever word the
    # relation's dependent hangs from -- confirm this is the intended pairing.
    agent_relations = ("nsubj", "acl:relcl")
    # Universal Dependencies v2 labels the direct object "obj"; the older
    # "dobj" label is kept so input that uses it is still matched.
    patient_relations = ("nsubj:pass", "obj", "dobj", "iobj")

    agent_rows = []
    patient_rows = []
    for sentence in doc.sentences:
        for word in sentence.words:
            if word.deprel in agent_relations:
                rows = agent_rows
            elif word.deprel in patient_relations:
                rows = patient_rows
            else:
                continue
            # Word ids are 1-based, so the governor sits at index head - 1.
            governor = sentence.words[word.head - 1].text
            rows.append((word.id, word.text, word.head, governor))

    agent_verbs = pd.DataFrame(agent_rows, columns=['id', 'word', 'head_id', 'agent_verbs'])
    patient_verbs = pd.DataFrame(patient_rows, columns=['id', 'word', 'head_id', 'patient_verbs'])
    return (agent_verbs, patient_verbs)


''' Here we implement the NLP analysis, using the previous functions to get the verbs and attributes related to the found characters in a plot summary'''


def create_table_dependencies(plot, nlp):
    """Analyse one plot summary and attach agent/patient verbs to each character.

    Parameters
    ----------
    plot : str
        Text of the plot summary.
    nlp : callable
        Called as ``nlp(plot)`` to obtain the parsed document.

    Returns
    -------
    pandas.DataFrame
        The table produced by ``char_attributes`` with two extra columns,
        ``Agent Verbs`` and ``Patient Verbs``. Each cell holds the list of
        verbs whose argument is either the character's name or one of the
        character's attribute words; the list is empty when nothing matched.
    """
    doc = nlp(plot)
    attrs_table = char_attributes(doc)
    # Extract both tables in a single pass over the document.
    agent_verbs, patient_verbs = agent_patient_verbs(doc)

    agent_column = []
    patient_column = []
    for char, total_attributes in zip(attrs_table['Character Names'],
                                      attrs_table['Total Attributes']):
        # Compare whole tokens: a substring test against the joined attribute
        # string would let a short word such as "he" match inside "brother".
        attribute_words = set(total_attributes.split())
        agent_column.append(
            _verbs_for_character(char, attribute_words, agent_verbs, 'agent_verbs'))
        patient_column.append(
            _verbs_for_character(char, attribute_words, patient_verbs, 'patient_verbs'))

    # Assign whole columns at once rather than writing lists element by element
    # into a float column through chained indexing.
    attrs_table['Agent Verbs'] = agent_column
    attrs_table['Patient Verbs'] = patient_column
    return attrs_table


def _verbs_for_character(char, attribute_words, verbs_table, verb_column):
    """Return the verbs in ``verbs_table`` whose argument is ``char`` or one of its attribute words."""
    return [verb
            for word, verb in zip(verbs_table['word'], verbs_table[verb_column])
            if word == char or word in attribute_words]

''' Here is the main function, which loops through the whole dataset and creates a new one containing movie IDs, character's first name, attributes, agent verbs and patient verbs'''

def Analyse_Plots(df_plots, nlp):
    """Run the character analysis on every plot summary of a dataset.

    Parameters
    ----------
    df_plots : pandas.DataFrame
        Must contain the columns ``Plot Summary`` and ``Wikipedia movie ID``.
    nlp : callable
        Passed through to ``create_table_dependencies``.

    Returns
    -------
    pandas.DataFrame
        One row per (movie, character) with the columns ``Wikipedia movie ID``,
        ``Character_Name``, ``Agent Verbs``, ``Patient Verbs`` and
        ``Attributes``.
    """
    columns = ['Wikipedia movie ID', 'Character_Name', 'Agent Verbs',
               'Patient Verbs', 'Attributes']
    total = len(df_plots['Plot Summary'])
    rows = []
    # Walk both columns in parallel so each summary stays paired with its own
    # movie ID. Looking the ID up by label with the loop counter only works
    # when the frame has a default 0..n-1 index.
    plots = zip(df_plots['Wikipedia movie ID'], df_plots['Plot Summary'])
    for i, (movie_id, summ) in enumerate(plots):
        print('Plot analysed ', i, ' out of ', total)
        table = create_table_dependencies(summ, nlp)
        for char, averbs, pverbs, attrs in zip(table['Character Names'],
                                               table['Agent Verbs'],
                                               table['Patient Verbs'],
                                               table['Total Attributes']):
            rows.append((movie_id, char, averbs, pverbs, attrs))
    # Build the frame once from the collected records.
    return pd.DataFrame(rows, columns=columns)

In [2]:
# Fetch the default English models; the log below shows an already downloaded
# archive being reused.
stanza.download('en') # download English model
# Build the pipeline used by the helper functions: `ner` supplies the entities
# read by get_characters, `pos` and `depparse` supply the pos/head/deprel fields
# read by the attribute and verb extraction.
nlp = stanza.Pipeline(lang='en', processors='tokenize, pos, lemma, depparse, ner')


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2022-12-14 15:35:20 INFO: Downloading default packages for language: en (English) ...
2022-12-14 15:35:22 INFO: File exists: C:\Users\nerea\stanza_resources\en\default.zip
2022-12-14 15:35:26 INFO: Finished downloading models and saved to C:\Users\nerea\stanza_resources.
2022-12-14 15:35:26 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2022-12-14 15:35:28 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| ner       | ontonotes |

2022-12-14 15:35:28 INFO: Use device: cpu
2022-12-14 15:35:28 INFO: Loading: tokenize
2022-12-14 15:35:28 INFO: Loading: pos
2022-12-14 15:35:28 INFO: Loading: lemma
2022-12-14 15:35:28 INFO: Loading: depparse
2022-12-14 15:35:28 INFO: Loading: ner
2022-12-14 15:35:29 INFO: Done loading processors!


In [3]:
text = '''Jackson Maine, a famous country music singer privately battling an alcohol and drug addiction, plays a concert in California. His main support is Bobby, his manager and older half-brother. After the show, Jackson visits a drag bar where he witnesses a performance by Ally, a waitress and singer-songwriter. Jackson is amazed by her performance, and they spend the night speaking to each other, where Ally discloses to him the troubles she has faced in pursuing a professional music career. Jackson invites Ally to his next show. Despite her initial refusal she attends and, with Jackson's encouragement, sings on stage with him. Jackson invites Ally to go on tour with him, and they form a romantic relationship. In Arizona, Ally and Jackson visit the ranch where Jackson grew up and where his father is buried, only to discover that Bobby sold the land. Angered at his betrayal, Jackson punches Bobby, who subsequently quits as his manager. Before doing so, Bobby reveals that he did inform Jackson about the sale, but the latter was too inebriated to notice.'''

In [4]:
text

"Jackson Maine, a famous country music singer privately battling an alcohol and drug addiction, plays a concert in California. His main support is Bobby, his manager and older half-brother. After the show, Jackson visits a drag bar where he witnesses a performance by Ally, a waitress and singer-songwriter. Jackson is amazed by her performance, and they spend the night speaking to each other, where Ally discloses to him the troubles she has faced in pursuing a professional music career. Jackson invites Ally to his next show. Despite her initial refusal she attends and, with Jackson's encouragement, sings on stage with him. Jackson invites Ally to go on tour with him, and they form a romantic relationship. In Arizona, Ally and Jackson visit the ranch where Jackson grew up and where his father is buried, only to discover that Bobby sold the land. Angered at his betrayal, Jackson punches Bobby, who subsequently quits as his manager. Before doing so, Bobby reveals that he did inform Jackson

In [5]:
# Parse the sample synopsis once; `doc` is reused by the cells that follow.
doc = nlp(text)
# `chars` is the (full names, first names) pair returned by get_characters.
chars = get_characters(doc)

In [6]:
chars[1]

['Ally', 'Bobby', 'Jackson']

In [7]:
attr = char_attributes(doc)

In [8]:
attr

Unnamed: 0,index,Character Names,Total Attributes
0,0,Ally,performance waitress singer
1,1,Jackson,encouragement


In [9]:
def recursive_find_adjs(root, sent):
    """Gather the conjoined adjectives found anywhere below ``root``.

    A direct dependent is kept when its relation is ``conj``, its universal POS
    is ``ADJ`` and no NOUN in the sentence has it as head. The search is then
    repeated on every direct dependent of ``root``, kept or not.

    Returns the kept dependents of ``root`` first, followed by the matches
    found below each dependent in sentence order; an empty list when ``root``
    has no dependents.
    """
    def governs_noun(candidate):
        # True when some NOUN in the sentence has `candidate` as its head.
        for other in sent.words:
            if other.head == candidate.id and other.upos == "NOUN":
                return True
        return False

    dependents = [tok for tok in sent.words if tok.head == root.id]

    found = []
    for tok in dependents:
        if tok.deprel == "conj" and tok.upos == "ADJ" and not governs_noun(tok):
            found.append(tok)

    for tok in dependents:
        found.extend(recursive_find_adjs(tok, sent))

    return found

# For every sentence, print a mapping from each common noun to the adjectives
# attached to it, joined with spaces.
for sent in doc.sentences:
    noun_adj_pairs = {}
    for noun in sent.words:
        if noun.upos != "NOUN":
            continue

        adjs = []
        # Predicative construction ("The car is beautiful"): the adjective is
        # the head of the noun. A head of 0 marks the sentence root, which has
        # no governor; without this guard `sent.words[noun.head - 1]` wraps
        # around to the last token of the sentence.
        if noun.head > 0:
            cop_root = sent.words[noun.head - 1]
            if cop_root.upos == "ADJ":
                adjs = [cop_root] + recursive_find_adjs(cop_root, sent)

        # Attributive construction ("The intelligent and beautiful woman"):
        # the adjectives are descendants of the noun.
        mod_adjs = [w for w in sent.words if w.head == noun.id and w.upos == "ADJ"]
        # Only the first modifier is used as an entry point; conjoined
        # adjectives hang below it and are found by the recursive search.
        if mod_adjs:
            mod_adj = mod_adjs[0]
            adjs.extend([mod_adj] + recursive_find_adjs(mod_adj, sent))

        if adjs:
            # De-duplicate by word id while preserving discovery order.
            unique_adjs = []
            unique_ids = set()
            for adj in adjs:
                if adj.id not in unique_ids:
                    unique_adjs.append(adj)
                    unique_ids.add(adj.id)

            noun_adj_pairs[noun.text] = " ".join([adj.text for adj in unique_adjs])

    print(noun_adj_pairs)

{'singer': 'famous'}
{'support': 'main', 'brother': 'older'}
{}
{'career': 'professional'}
{}
{'show': 'next'}
{'refusal': 'initial'}
{}
{'relationship': 'romantic'}
{}
{}
{'latter': 'inebriated'}


In [10]:
# Map every word to the text of the word it depends on.
heads_w = {}

for sentence in doc.sentences:
    for word in sentence.words:
        # `head` is the 1-based id of the governing word; 0 marks the root,
        # which has no head word and is therefore left out of the mapping.
        # The previous version built the list of head words separately, so the
        # missing entry for the root shifted every later word onto the wrong
        # head (and the last word of each sentence was dropped to compensate).
        if word.head > 0:
            heads_w[word.text] = sentence.words[word.head - 1].text


<built-in method values of dict object at 0x000001D601FF9400>


In [11]:
import spacy
import neuralcoref
# NOTE(review): this rebinds `nlp`, replacing the stanza pipeline created earlier
# in the notebook; cells run afterwards that call `nlp(...)` get the spaCy model.
nlp = spacy.load('en_core_web_lg')
# NOTE(review): the recorded output for this cell is
# "ModuleNotFoundError: No module named 'neuralcoref'", so this line has not
# run successfully in the saved notebook.
neuralcoref.add_to_pipe(nlp)



ModuleNotFoundError: No module named 'neuralcoref'

In [1]:
import neuralcoref