## Stanza

https://stanfordnlp.github.io/stanza/

In [None]:
!pip install stanza

In [2]:
import stanza

In [None]:
# Download the English model.
stanza.download(lang='en')
# Initialize the English neural pipeline.
nlp = stanza.Pipeline(lang='en')
# Run annotation over a single sentence; later cells read the result from `doc`.
doc = nlp('Barack Obama was born in Hawaii.')

In [4]:
# Show the full annotated document, then the entity spans found in it,
# each on its own line.
print(doc, doc.entities, sep='\n')

[
  [
    {
      "id": 1,
      "text": "Barack",
      "lemma": "Barack",
      "upos": "PROPN",
      "xpos": "NNP",
      "feats": "Number=Sing",
      "head": 4,
      "deprel": "nsubj:pass",
      "misc": "start_char=0|end_char=6",
      "ner": "B-PERSON"
    },
    {
      "id": 2,
      "text": "Obama",
      "lemma": "Obama",
      "upos": "PROPN",
      "xpos": "NNP",
      "feats": "Number=Sing",
      "head": 1,
      "deprel": "flat",
      "misc": "start_char=7|end_char=12",
      "ner": "E-PERSON"
    },
    {
      "id": 3,
      "text": "was",
      "lemma": "be",
      "upos": "AUX",
      "xpos": "VBD",
      "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
      "head": 4,
      "deprel": "aux:pass",
      "misc": "start_char=13|end_char=16",
      "ner": "O"
    },
    {
      "id": 4,
      "text": "born",
      "lemma": "bear",
      "upos": "VERB",
      "xpos": "VBN",
      "feats": "Tense=Past|VerbForm=Part|Voice=Pass",
      "head": 0,
      

#### Токенизация

In [5]:
# English pipeline restricted to the `tokenize` processor.
nlp = stanza.Pipeline(lang='en', processors='tokenize')
doc = nlp('This is a test sentence for stanza. This is another sentence.')

# For every sentence print a header followed by one "id / text" line per token.
for position, current in enumerate(doc.sentences):
    print(f'====== Sentence {position+1} tokens =======')
    token_lines = (f'id: {tok.id}\ttext: {tok.text}' for tok in current.tokens)
    print('\n'.join(token_lines))

2021-01-11 15:30:41 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |

2021-01-11 15:30:41 INFO: Use device: cpu
2021-01-11 15:30:41 INFO: Loading: tokenize
2021-01-11 15:30:41 INFO: Done loading processors!


id: (1,)	text: This
id: (2,)	text: is
id: (3,)	text: a
id: (4,)	text: test
id: (5,)	text: sentence
id: (6,)	text: for
id: (7,)	text: stanza
id: (8,)	text: .
id: (1,)	text: This
id: (2,)	text: is
id: (3,)	text: another
id: (4,)	text: sentence
id: (5,)	text: .


#### Multi-Word Token (MWT) Expansion

In [7]:
# Download the French model.
stanza.download(lang='fr')
# French pipeline with tokenization plus multi-word token expansion.
nlp = stanza.Pipeline(lang='fr', processors='tokenize,mwt')
doc = nlp('Nous avons atteint la fin du sentier.')

# For each token of the first sentence, list the words it expands to.
for tok in doc.sentences[0].tokens:
    expansion = ", ".join(part.text for part in tok.words)
    print(f'token: {tok.text}\twords: {expansion}')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 18.8MB/s]                    
2021-01-11 15:34:51 INFO: Downloading default packages for language: fr (French)...
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/fr/default.zip: 100%|██████████| 589M/589M [00:42<00:00, 13.7MB/s]
2021-01-11 15:35:45 INFO: Finished downloading models and saved to /root/stanza_resources.
2021-01-11 15:35:45 INFO: Loading these models for language: fr (French):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |

2021-01-11 15:35:45 INFO: Use device: cpu
2021-01-11 15:35:45 INFO: Loading: tokenize
2021-01-11 15:35:45 INFO: Loading: mwt
2021-01-11 15:35:45 INFO: Done loading processors!


token: Nous	words: Nous
token: avons	words: avons
token: atteint	words: atteint
token: la	words: la
token: fin	words: fin
token: du	words: de, le
token: sentier	words: sentier
token: .	words: .


In [8]:
# Walk the words of the first sentence and show which token each one came from.
first_sentence_words = doc.sentences[0].words
for item in first_sentence_words:
    origin = item.parent
    print(f'word: {item.text}\tparent token: {origin.text}')

word: Nous	parent token: Nous
word: avons	parent token: avons
word: atteint	parent token: atteint
word: la	parent token: la
word: fin	parent token: fin
word: de	parent token: du
word: le	parent token: du
word: sentier	parent token: sentier
word: .	parent token: .


#### POS

The `pos` processor annotates each word with its universal POS (UPOS) tag, treebank-specific POS (XPOS) tag, and universal morphological features (UFeats).

In [10]:
# Build an English pipeline that includes the POS tagger right here: the `nlp`
# left over from the MWT section above is a French pipeline that only runs
# tokenize,mwt, so reusing it would not produce POS annotations for this sentence.
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos')
doc = nlp('Barack Obama was born in Hawaii.')
# One line per word: universal tag, treebank-specific tag and morphological
# features ("_" is printed when a word has no features).
print(*[f'word: {word.text}\tupos: {word.upos}\txpos: {word.xpos}\tfeats: {word.feats if word.feats else "_"}' for sent in doc.sentences for word in sent.words], sep='\n')

word: Barack	upos: PROPN	xpos: NNP	feats: Number=Sing
word: Obama	upos: PROPN	xpos: NNP	feats: Number=Sing
word: was	upos: AUX	xpos: VBD	feats: Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin
word: born	upos: VERB	xpos: VBN	feats: Tense=Past|VerbForm=Part|Voice=Pass
word: in	upos: ADP	xpos: IN	feats: _
word: Hawaii	upos: PROPN	xpos: NNP	feats: Number=Sing
word: .	upos: PUNCT	xpos: .	feats: _


#### Лемматизация

In [13]:
# English pipeline up to and including the lemmatizer.
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
doc = nlp('Barack Obama was born in Hawaii.')

# Print every word of every sentence next to its lemma, one word per line.
print(
    *(
        f'word: {wd.text+" "}\tlemma: {wd.lemma}'
        for current in doc.sentences
        for wd in current.words
    ),
    sep='\n',
)

2021-01-11 15:42:37 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |

2021-01-11 15:42:37 INFO: Use device: cpu
2021-01-11 15:42:37 INFO: Loading: tokenize
2021-01-11 15:42:37 INFO: Loading: pos
2021-01-11 15:42:39 INFO: Loading: lemma
2021-01-11 15:42:39 INFO: Done loading processors!


word: Barack 	lemma: Barack
word: Obama 	lemma: Obama
word: was 	lemma: be
word: born 	lemma: bear
word: in 	lemma: in
word: Hawaii 	lemma: Hawaii
word: . 	lemma: .


#### Dependency Parsing 

In [15]:
# French pipeline up to and including the dependency parser.
nlp = stanza.Pipeline(lang='fr', processors='tokenize,mwt,pos,lemma,depparse')
doc = nlp('Nous avons atteint la fin du sentier.')

# One line per word: its id, text, head id, head text and dependency relation.
# Head ids are 1-based indices into the sentence's words; a head id of 0 is
# printed as "root".
print(
    *(
        f'id: {wd.id}\tword: {wd.text}\thead id: {wd.head}\t'
        f'head: {"root" if wd.head <= 0 else current.words[wd.head-1].text}\t'
        f'deprel: {wd.deprel}'
        for current in doc.sentences
        for wd in current.words
    ),
    sep='\n',
)


2021-01-11 15:45:38 INFO: Loading these models for language: fr (French):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |
| depparse  | gsd     |

2021-01-11 15:45:38 INFO: Use device: cpu
2021-01-11 15:45:38 INFO: Loading: tokenize
2021-01-11 15:45:38 INFO: Loading: mwt
2021-01-11 15:45:38 INFO: Loading: pos
2021-01-11 15:45:40 INFO: Loading: lemma
2021-01-11 15:45:40 INFO: Loading: depparse
2021-01-11 15:45:41 INFO: Done loading processors!


id: 1	word: Nous	head id: 3	head: atteint	deprel: nsubj
id: 2	word: avons	head id: 3	head: atteint	deprel: aux:tense
id: 3	word: atteint	head id: 0	head: root	deprel: root
id: 4	word: la	head id: 5	head: fin	deprel: det
id: 5	word: fin	head id: 3	head: atteint	deprel: obj
id: 6	word: de	head id: 8	head: sentier	deprel: case
id: 7	word: le	head id: 8	head: sentier	deprel: det
id: 8	word: sentier	head id: 5	head: fin	deprel: nmod
id: 9	word: .	head id: 3	head: atteint	deprel: punct


In [20]:
# English pipeline up to and including the dependency parser.
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse')
doc = nlp('Colorless green ideas sleep furiously.')

# One line per word: its id, text, head id, head text and dependency relation.
# Head ids are 1-based indices into the sentence's words; a head id of 0 is
# printed as "root".
print(
    *(
        f'id: {wd.id}\tword: {wd.text}\thead id: {wd.head}\t'
        f'head: {"root" if wd.head <= 0 else current.words[wd.head-1].text}\t'
        f'deprel: {wd.deprel}'
        for current in doc.sentences
        for wd in current.words
    ),
    sep='\n',
)

2021-01-11 15:56:06 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |
| depparse  | ewt     |

2021-01-11 15:56:06 INFO: Use device: cpu
2021-01-11 15:56:06 INFO: Loading: tokenize
2021-01-11 15:56:06 INFO: Loading: pos
2021-01-11 15:56:07 INFO: Loading: lemma
2021-01-11 15:56:07 INFO: Loading: depparse
2021-01-11 15:56:08 INFO: Done loading processors!


id: 1	word: Colorless	head id: 3	head: ideas	deprel: amod
id: 2	word: green	head id: 3	head: ideas	deprel: amod
id: 3	word: ideas	head id: 4	head: sleep	deprel: nsubj
id: 4	word: sleep	head id: 0	head: root	deprel: root
id: 5	word: furiously	head id: 4	head: sleep	deprel: advmod
id: 6	word: .	head id: 4	head: sleep	deprel: punct


#### NER

In [16]:
# English pipeline with tokenization and named-entity recognition only.
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')
doc = nlp('Twitter permanently suspends President Donald Trump.')

# List each recognized entity span together with its type, one per line.
entity_lines = (f'entity: {span.text}\ttype: {span.type}' for span in doc.ents)
print('\n'.join(entity_lines))

2021-01-11 15:47:41 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| ner       | ontonotes |

2021-01-11 15:47:41 INFO: Use device: cpu
2021-01-11 15:47:41 INFO: Loading: tokenize
2021-01-11 15:47:41 INFO: Loading: ner
2021-01-11 15:47:42 INFO: Done loading processors!


entity: Donald Trump	type: PERSON


#### Sentiment Analysis

Sentiment is assigned per sentence as an integer label: negative (0), neutral (1), positive (2).

Источник: https://arxiv.org/abs/1408.5882

In [19]:
# English pipeline with tokenization and the sentence-level sentiment classifier.
nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment')
doc = nlp('He hates that they banned him. He is f** furious.')

# Print the zero-based sentence index followed by that sentence's sentiment value.
for position, current in enumerate(doc.sentences):
    label = current.sentiment
    print(position, label)

2021-01-11 15:53:04 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| sentiment | sstplus |

2021-01-11 15:53:04 INFO: Use device: cpu
2021-01-11 15:53:04 INFO: Loading: tokenize
2021-01-11 15:53:04 INFO: Loading: sentiment
2021-01-11 15:53:06 INFO: Done loading processors!


0 1
1 0
