In [1]:
import os 
import pandas as pd
import numpy as np
import stanza

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the default English model package (stored under the local stanza_resources folder).
stanza.download(lang='en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 40.5MB/s]
2022-11-09 16:16:56 INFO: Downloading default packages for language: en (English) ...
Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/default.zip: 100%|█| 561M/561M [00:44<00
2022-11-09 16:17:55 INFO: Finished downloading models and saved to C:\Users\nerea\stanza_resources.


In [3]:
# English sample passage; it is later annotated as a whole via `nlp(text)`.
text = '''The nation of Panem consists of a wealthy Capitol and twelve poorer districts.
As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games. 
The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth. 
In her first Reaping, 12-year-old Primrose Everdeen is chosen from District 12. 
Her older sister Katniss volunteers to take her place. 
Peeta Mellark, a baker's son who once gave Katniss bread when she was starving, is the other District 12 tribute. 
Katniss and Peeta are taken to the Capitol, accompanied by their frequently drunk mentor, past victor Haymitch Abernathy. 
He warns them about the "Career" tributes who train intensively at special academies and almost always win. 
During a TV interview with Caesar Flickerman, Peeta unexpectedly reveals his love for Katniss.'''

In [4]:
# Build the English pipeline with an explicit processor list; the dependency
# parser (depparse) is what the cells below rely on for head/deprel values.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize, mwt, pos, lemma, depparse, ner',
)

2022-11-09 16:17:55 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 8.50MB/s]
2022-11-09 16:17:57 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| ner       | ontonotes |

2022-11-09 16:17:57 INFO: Use device: cpu
2022-11-09 16:17:57 INFO: Loading: tokenize
2022-11-09 16:17:57 INFO: Loading: pos
2022-11-09 16:17:58 INFO: Loading: lemma
2022-11-09 16:17:58 INFO: Loading: depparse
2022-11-09 16:17:59 INFO: Loading: ner
2022-11-09 16:18:00 INFO: Done loading processors!


In [5]:
# Annotate the whole passage once; every later cell reads from `doc`.
doc = nlp(text)

# One tab-separated line per word: its id, its text, the id and text of its
# syntactic head (or "root" when head id is 0) and the dependency relation.
dependency_rows = []
for parsed_sentence in doc.sentences:
    for token in parsed_sentence.words:
        # Head ids are 1-based positions inside the same sentence.
        head_text = parsed_sentence.words[token.head - 1].text if token.head > 0 else "root"
        dependency_rows.append(
            f'id: {token.id}\tword: {token.text}\thead id: {token.head}\thead: {head_text}\tdeprel: {token.deprel}'
        )
print('\n'.join(dependency_rows))

id: 1	word: The	head id: 2	head: nation	deprel: det
id: 2	word: nation	head id: 5	head: consists	deprel: nsubj
id: 3	word: of	head id: 4	head: Panem	deprel: case
id: 4	word: Panem	head id: 2	head: nation	deprel: nmod
id: 5	word: consists	head id: 0	head: root	deprel: root
id: 6	word: of	head id: 9	head: Capitol	deprel: case
id: 7	word: a	head id: 9	head: Capitol	deprel: det
id: 8	word: wealthy	head id: 9	head: Capitol	deprel: amod
id: 9	word: Capitol	head id: 5	head: consists	deprel: obl
id: 10	word: and	head id: 13	head: districts	deprel: cc
id: 11	word: twelve	head id: 13	head: districts	deprel: nummod
id: 12	word: poorer	head id: 13	head: districts	deprel: amod
id: 13	word: districts	head id: 9	head: Capitol	deprel: conj
id: 14	word: .	head id: 5	head: consists	deprel: punct
id: 1	word: As	head id: 2	head: punishment	deprel: case
id: 2	word: punishment	head id: 11	head: provide	deprel: obl
id: 3	word: for	head id: 6	head: rebellion	deprel: case
id: 4	word: a	head id: 6	head: rebelli

In [79]:
# Three column-oriented collectors. Each stores the dependent word's id and
# text, the id of its head, and (in the column named after the collector) the
# text of that head.
agent_verbs = {column: [] for column in ('id', 'word', 'head_id', 'agent_verbs')}
patient_verbs = {column: [] for column in ('id', 'word', 'head_id', 'patient_verbs')}
attributes = {column: [] for column in ('id', 'word', 'head_id', 'attributes')}

# Dependency relation -> (collector, name of the column holding the head text).
collectors_by_deprel = {
    "nsubj": (agent_verbs, 'agent_verbs'),
    "nsubj:pass": (patient_verbs, 'patient_verbs'),
    "amod": (attributes, 'attributes'),
}

for sentence in doc.sentences:
    for word in sentence.words:
        match = collectors_by_deprel.get(word.deprel)
        if match is None:
            # Relation is none of the three we collect.
            continue
        collector, head_column = match
        collector['id'].append(word.id)
        collector['word'].append(word.text)
        collector['head_id'].append(word.head)
        # Head ids are 1-based positions within the sentence.
        collector[head_column].append(sentence.words[word.head - 1].text)

In [81]:
# Tabulate the nsubj pairs (subject word -> governing word); the raw dict is
# echoed as well so the table can be checked against it.
agents = pd.DataFrame(agent_verbs)
print(agent_verbs)
agents

{'id': [2, 9, 2, 3, 1, 8, 14, 1, 10, 9], 'word': ['nation', 'district', 'tributes', 'sister', 'Peeta', 'who', 'she', 'He', 'who', 'Peeta'], 'head_id': [5, 11, 4, 5, 23, 10, 16, 2, 11, 11], 'agent_verbs': ['consists', 'provide', 'fight', 'volunteers', 'tribute', 'gave', 'starving', 'warns', 'train', 'reveals']}


Unnamed: 0,id,word,head_id,agent_verbs
0,2,nation,5,consists
1,9,district,11,provide
2,2,tributes,4,fight
3,3,sister,5,volunteers
4,1,Peeta,23,tribute
5,8,who,10,gave
6,14,she,16,starving
7,1,He,2,warns
8,10,who,11,train
9,9,Peeta,11,reveals


In [82]:
# Tabulate the nsubj:pass pairs (passive subject -> governing word); the raw
# dict is echoed as well so the table can be checked against it.
patients = pd.DataFrame(patient_verbs)
print(patient_verbs)
patients

{'id': [14, 11, 1], 'word': ['survivor', 'Primrose', 'Katniss'], 'head_id': [16, 14, 5], 'patient_verbs': ['rewarded', 'chosen', 'taken']}


Unnamed: 0,id,word,head_id,patient_verbs
0,14,survivor,16,rewarded
1,11,Primrose,14,chosen
2,1,Katniss,5,taken


In [83]:
# Echo the collected amod pairs (modifier -> modified word), then tabulate them.
# NOTE: `attributes` is rebound here from the collector dict to a DataFrame, so
# running this cell a second time prints the DataFrame instead of the dict.
print(attributes)
attributes = pd.DataFrame(attributes)
attributes

{'id': [8, 12, 5, 28, 13, 3, 10, 2, 20, 14, 17, 14], 'word': ['wealthy', 'poorer', 'past', 'annual', 'sole', 'first', 'old', 'older', 'other', 'drunk', 'past', 'special'], 'head_id': [9, 13, 6, 30, 14, 4, 11, 3, 23, 15, 18, 15], 'attributes': ['Capitol', 'districts', 'rebellion', 'Games', 'survivor', 'Reaping', 'Primrose', 'sister', 'tribute', 'mentor', 'victor', 'academies']}


Unnamed: 0,id,word,head_id,attributes
0,8,wealthy,9,Capitol
1,12,poorer,13,districts
2,5,past,6,rebellion
3,28,annual,30,Games
4,13,sole,14,survivor
5,3,first,4,Reaping
6,10,old,11,Primrose
7,2,older,3,sister
8,20,other,23,tribute
9,14,drunk,15,mentor


In [71]:
# Show every word next to its lemma (base form, e.g. "consists" -> "consist")
# and its part-of-speech tag (e.g. DET, NOUN, VERB).
# The label 'position' in the printed line refers to that part-of-speech tag.
for token in (w for parsed in doc.sentences for w in parsed.words):
    print('text: ', token.text, ', ', 'lemma: ', token.lemma, ', ', 'position: ', token.pos)

text:  The ,  lemma:  the ,  position:  DET
text:  nation ,  lemma:  nation ,  position:  NOUN
text:  of ,  lemma:  of ,  position:  ADP
text:  Panem ,  lemma:  Panem ,  position:  PROPN
text:  consists ,  lemma:  consist ,  position:  VERB
text:  of ,  lemma:  of ,  position:  ADP
text:  a ,  lemma:  a ,  position:  DET
text:  wealthy ,  lemma:  wealthy ,  position:  ADJ
text:  Capitol ,  lemma:  Capitol ,  position:  PROPN
text:  and ,  lemma:  and ,  position:  CCONJ
text:  twelve ,  lemma:  twelve ,  position:  NUM
text:  poorer ,  lemma:  poorer ,  position:  ADJ
text:  districts ,  lemma:  district ,  position:  NOUN
text:  . ,  lemma:  . ,  position:  PUNCT
text:  As ,  lemma:  as ,  position:  ADP
text:  punishment ,  lemma:  punishment ,  position:  NOUN
text:  for ,  lemma:  for ,  position:  ADP
text:  a ,  lemma:  a ,  position:  DET
text:  past ,  lemma:  past ,  position:  ADJ
text:  rebellion ,  lemma:  rebellion ,  position:  NOUN
text:  , ,  lemma:  , ,  position:  PUN

In [72]:
# Dump the raw dependency triples of each sentence: (head word, relation, dependent word).
for parsed in doc.sentences:
    print(parsed.dependencies)

[({
  "id": 2,
  "text": "nation",
  "lemma": "nation",
  "upos": "NOUN",
  "xpos": "NN",
  "feats": "Number=Sing",
  "head": 5,
  "deprel": "nsubj",
  "start_char": 4,
  "end_char": 10
}, 'det', {
  "id": 1,
  "text": "The",
  "lemma": "the",
  "upos": "DET",
  "xpos": "DT",
  "feats": "Definite=Def|PronType=Art",
  "head": 2,
  "deprel": "det",
  "start_char": 0,
  "end_char": 3
}), ({
  "id": 5,
  "text": "consists",
  "lemma": "consist",
  "upos": "VERB",
  "xpos": "VBZ",
  "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
  "head": 0,
  "deprel": "root",
  "start_char": 20,
  "end_char": 28
}, 'nsubj', {
  "id": 2,
  "text": "nation",
  "lemma": "nation",
  "upos": "NOUN",
  "xpos": "NN",
  "feats": "Number=Sing",
  "head": 5,
  "deprel": "nsubj",
  "start_char": 4,
  "end_char": 10
}), ({
  "id": 4,
  "text": "Panem",
  "lemma": "Panem",
  "upos": "PROPN",
  "xpos": "NNP",
  "feats": "Number=Sing",
  "head": 2,
  "deprel": "nmod",
  "start_char": 14,
  "end_char":

In [86]:
# Built-in dump of the first sentence as (text, head id, relation) tuples.
doc.sentences[0].print_dependencies()

# The same information as an aligned table, with head ids resolved to words.
row_template = "{:<15} | {:<10} | {:<15} "
print(row_template.format('Token', 'Relation', 'Head'))
print("-" * 50)

# One dict per word of the sentence.
sent_dict = doc.sentences[0].to_dict()

for word in sent_dict:
    head_index = word['head']
    # 'head' is a 1-based position in the sentence; 0 marks the root.
    head_text = sent_dict[head_index - 1]['text'] if head_index > 0 else 'ROOT'
    print(row_template.format(str(word['text']), str(word['deprel']), str(head_text)))


('The', 2, 'det')
('nation', 5, 'nsubj')
('of', 4, 'case')
('Panem', 2, 'nmod')
('consists', 0, 'root')
('of', 9, 'case')
('a', 9, 'det')
('wealthy', 9, 'amod')
('Capitol', 5, 'obl')
('and', 13, 'cc')
('twelve', 13, 'nummod')
('poorer', 13, 'amod')
('districts', 9, 'conj')
('.', 5, 'punct')
Token           | Relation   | Head            
--------------------------------------------------
The             | det        | nation          
nation          | nsubj      | consists        
of              | case       | Panem           
Panem           | nmod       | nation          
consists        | root       | ROOT            
of              | case       | Capitol         
a               | det        | Capitol         
wealthy         | amod       | Capitol         
Capitol         | obl        | consists        
and             | cc         | districts       
twelve          | nummod     | districts       
poorer          | amod       | districts       
districts       | conj       | Ca

In [90]:
# Convert sentence object to dictionary  
sent_dict = doc.sentences[3].to_dict()

# iterate to print the token, relation and head
for word in sent_dict:
  print ("{:<15} | {:<10} | {:<15} "
         .format(str(word['text']),str(word['deprel']), str(sent_dict[word['head']-1]['text'] if word['head'] > 0 else 'ROOT')))

In              | case       | Reaping         
her             | nmod:poss  | Reaping         
first           | amod       | Reaping         
Reaping         | obl        | chosen          
,               | punct      | Reaping         
12              | nummod     | year            
-               | punct      | year            
year            | obl:npmod  | old             
-               | punct      | year            
old             | amod       | Primrose        
Primrose        | nsubj:pass | chosen          
Everdeen        | flat       | Primrose        
is              | aux:pass   | chosen          
chosen          | root       | ROOT            
from            | case       | 12              
District        | compound   | 12              
12              | obl        | chosen          
.               | punct      | chosen          


In [84]:
def recursive_find_adjs(root, sentence):
    """Collect adjectives coordinated (deprel ``conj``) below ``root``.

    Walks the dependency subtree under ``root`` inside ``sentence`` and gathers
    every word that is an adjective attached to its parent with the ``conj``
    relation, e.g. "beautiful" in "intelligent and beautiful".

    Args:
        root: Word whose descendants are searched; only ``root.id`` is read.
        sentence: Object exposing ``words``, an iterable of words that carry
            ``id``, ``head``, ``deprel`` and ``pos`` attributes.

    Returns:
        List of matching words: first the qualifying direct children of
        ``root``, then the matches found under each child in sentence order.
        Empty when ``root`` has no children.
    """
    children = [w for w in sentence.words if w.head == root.id]

    if not children:
        return []

    conj_adjs = [w for w in children if w.deprel == "conj" and w.pos == "ADJ"]
    # Skip an adjective that itself heads a proper noun (presumably it is then
    # the predicate of that other noun rather than a modifier of this one).
    # The lookup uses the `sentence` argument, not a module-level variable, so
    # the function also works when called outside the sentence loop.
    results = [
        adj for adj in conj_adjs
        if not any(sub.head == adj.id and sub.pos == "PROPN" for sub in sentence.words)
    ]
    for child in children:
        results += recursive_find_adjs(child, sentence)

    return results

# For every sentence, pair each proper noun (PROPN) with the adjectives that
# describe it and print the resulting {noun text: "adj1 adj2 ..."} mapping.
for sent in doc.sentences:
    nouns = [w for w in sent.words if w.pos == "PROPN"]
    noun_adj_pairs = {}
    for noun in nouns:
        # Find constructions in the form of "The car is beautiful"
        # In this scenario, the adjective is the parent of the noun
        adjs = []
        # A root noun has head id 0; without this guard the lookup below would
        # index words[-1] and silently inspect the last token of the sentence.
        if noun.head > 0:
            cop_root = sent.words[noun.head - 1]
            if cop_root.pos == "ADJ":
                adjs = [cop_root] + recursive_find_adjs(cop_root, sent)

        # Find constructions in the form of "The intelligent and beautiful woman"
        # Here, the adjectives are descendants of the noun
        mod_adjs = [w for w in sent.words if w.head == noun.id and w.pos == "ADJ"]
        # This should only be one element because conjunctions are hierarchical
        if mod_adjs:
            mod_adj = mod_adjs[0]
            adjs.extend([mod_adj] + recursive_find_adjs(mod_adj, sent))

        if adjs:
            # Drop repeated words (same id) while keeping first-seen order.
            unique_adjs = list({adj.id: adj for adj in adjs}.values())
            noun_adj_pairs[noun.text] = " ".join(adj.text for adj in unique_adjs)

    print(noun_adj_pairs)
print(len(doc.sentences))

{'Capitol': 'wealthy'}
{'Games': 'annual'}
{}
{'Primrose': 'old'}
{}
{}
{}
{}
{}
9


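Known issue: hyphenated compounds such as "12-year-old" are split into separate tokens ("12", "-", "year", "-", "old") instead of being recognized as a single word, so only part of the compound (here "old") ends up attached to the noun.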