In [1]:
import stanza
import pandas as pd
import numpy as np

# UPOS Reference List

    ADJ: adjective
    ADP: adposition
    ADV: adverb
    AUX: auxiliary
    CCONJ: coordinating conjunction
    DET: determiner
    INTJ: interjection
    NOUN: noun
    NUM: numeral
    PART: particle
    PRON: pronoun
    PROPN: proper noun
    PUNCT: punctuation
    SCONJ: subordinating conjunction
    SYM: symbol
    VERB: verb
    X: other


In [2]:
# Fetch the default Stanza model package for English ('en') so the pipeline
# cell further down can load it from the local stanza resources directory.
print('Downloading English model...')
stanza.download('en')

Downloading English model...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 14.9MB/s]                    
2021-02-09 12:36:42 INFO: Downloading default packages for language: en (English)...
2021-02-09 12:36:43 INFO: File exists: /home/neon/stanza_resources/en/default.zip.
2021-02-09 12:36:50 INFO: Finished downloading models and saved to /home/neon/stanza_resources.


In [3]:
# Build the English pipeline with stanza's defaults; the load log printed by
# this cell lists the tokenize, pos, lemma, depparse, sentiment and ner
# processors. `en_nlp` is reused below to annotate the raw text.
print('Building an English pipeline...')
en_nlp = stanza.Pipeline('en')

2021-02-09 12:36:50 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-02-09 12:36:50 INFO: Use device: cpu
2021-02-09 12:36:50 INFO: Loading: tokenize
2021-02-09 12:36:50 INFO: Loading: pos


Building an English pipeline...


2021-02-09 12:36:50 INFO: Loading: lemma
2021-02-09 12:36:50 INFO: Loading: depparse
2021-02-09 12:36:50 INFO: Loading: sentiment
2021-02-09 12:36:51 INFO: Loading: ner
2021-02-09 12:36:51 INFO: Done loading processors!


In [87]:
def get_hier(sent_no_str, sent_loc_ind):
    '''Build a token-level DataFrame for one sentence under a two-level index.

    Parameters
    ----------
    sent_no_str : str
        Label for the sentence, e.g. "Sent 1". It is passed through ``str()``
        and becomes the outer level of the row index.
    sent_loc_ind : list of dict
        One dict per token of the sentence; in this notebook, a single
        element of the list produced by the stanza document's ``to_dict()``.
        May be empty.

    Returns
    -------
    pandas.DataFrame
        One row per token, indexed by ``(label, position)`` where position
        counts from 0 in input order. Columns are the keys of the token
        dicts, minus ``id`` when that key is present.
    '''
    label = str(sent_no_str)

    # from_product also copes with a zero-token sentence, where from_tuples
    # cannot infer the number of index levels from an empty list.
    multi_index = pd.MultiIndex.from_product([[label], range(len(sent_loc_ind))])

    df = pd.DataFrame(sent_loc_ind, index=multi_index)

    # The token 'id' duplicates the positional inner index level, so drop it.
    # errors='ignore' keeps this from raising when the tokens carry no 'id'.
    return df.drop(columns='id', errors='ignore')
    

In [50]:
# Run the raw chapter text through the en_nlp pipeline to get an annotated
# stanza document.
# NOTE(review): the text appears to contain missing spaces ("Spaniardsdeparted",
# "bysquadrons", "tosee") and a likely typo ("armef"); these reach the tokenizer
# as-is. Confirm against the source text before correcting, because the
# character offsets shown in the output below depend on this exact string.
s15 = en_nlp("Chapter Fifteen, of how the Spaniardsdeparted from Itztapalapan to enter Mexico. The Spaniards departed from Itztapalapan all outfitted for war and ordered bysquadrons. Some horsemen went ahead tosee if there was some ambush; they also took the greyhounds ahead. Don Hernando Cortes went in the rear guard with many other Spaniards, all armef and in order. After them went the baggage and the artillery on its carriages. Many Indian warriors went along, with all their arms, many Tlaxcalans and Huexotzinca. In this order they entered Mexico.")

In [53]:
# Convert the stanza document into plain Python containers. As used below,
# s15d[i] is the i-th sentence and is itself a list of per-token dicts
# (text, lemma, upos, ...), which is the shape get_hier turns into a DataFrame.
s15d = s15.to_dict()

# Need to automate this process:

In [79]:
# Build one token-level frame per sentence in a single pass instead of one
# hand-written call per sentence; labels are 'Sent 1', 'Sent 2', ... in
# document order.
sentence_frames = [
    get_hier(f'Sent {number}', tokens)
    for number, tokens in enumerate(s15d, start=1)
]

# Keep the original per-sentence names bound for the cells that still refer
# to them (this text has seven sentences).
adf, bdf, cdf, ddf, edf, fdf, gdf = sentence_frames[:7]

In [80]:
# Stack the token frames of every sentence in the document into one
# DataFrame. Building them straight from s15d (labels 'Sent 1', 'Sent 2', ...)
# means the result covers however many sentences the text has, rather than a
# fixed list of seven hand-named frames.
tldr = pd.concat(
    [get_hier(f'Sent {number}', tokens) for number, tokens in enumerate(s15d, start=1)]
)

In [86]:
# Show the combined token table; the rendered output elides the middle rows.
tldr

Unnamed: 0,Unnamed: 1,text,lemma,upos,xpos,feats,head,deprel,misc,ner
Sent 1,0,Chapter,Chapter,PROPN,NNP,Number=Sing,2,compound,start_char=0|end_char=7,O
Sent 1,1,Fifteen,Fifteen,PROPN,NNP,Number=Sing,0,root,start_char=8|end_char=15,S-CARDINAL
Sent 1,2,",",",",PUNCT,",",,2,punct,start_char=15|end_char=16,O
Sent 1,3,of,of,ADP,IN,,5,case,start_char=17|end_char=19,O
Sent 1,4,how,how,SCONJ,WRB,PronType=Int,2,nmod,start_char=20|end_char=23,O
...,...,...,...,...,...,...,...,...,...,...
Sent 7,2,order,order,NOUN,NN,Number=Sing,5,obl,start_char=516|end_char=521,O
Sent 7,3,they,they,PRON,PRP,Case=Nom|Number=Plur|Person=3|PronType=Prs,5,nsubj,start_char=522|end_char=526,O
Sent 7,4,entered,enter,VERB,VBD,Mood=Ind|Tense=Past|VerbForm=Fin,0,root,start_char=527|end_char=534,O
Sent 7,5,Mexico,Mexico,PROPN,NNP,Number=Sing,5,obj,start_char=535|end_char=541,S-GPE


In [94]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline