* Почему текст подходит для оценки: разнообразный, много именнованных сущностей
* Тегсет: Universal Dependencies, потому что он универсален и на нем основывается stanza и natasha

In [48]:
with open('panorama_news.txt', encoding='utf-8') as file:
    text = file.read()

### stanza

In [49]:
import stanza

nlp = stanza.Pipeline(lang='ru', processors='tokenize,pos')
doc = nlp(text)

print(*[f'word: {word.text}\tupos: {word.upos}\txpos: {word.xpos}\tfeats: {word.feats if word.feats else "_"}' for sent in doc.sentences for word in sent.words], sep='\n')

2021-10-02 05:18:24 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| pos       | syntagrus |

2021-10-02 05:18:24 INFO: Use device: cpu
2021-10-02 05:18:24 INFO: Loading: tokenize
2021-10-02 05:18:25 INFO: Loading: pos
2021-10-02 05:18:25 INFO: Done loading processors!


word: На	upos: ADP	xpos: None	feats: _
word: торгах	upos: NOUN	xpos: None	feats: Animacy=Inan|Case=Loc|Gender=Masc|Number=Plur
word: 20	upos: NUM	xpos: None	feats: _
word: сентября	upos: NOUN	xpos: None	feats: Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing
word: был	upos: AUX	xpos: None	feats: Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act
word: зафиксирован	upos: VERB	xpos: None	feats: Aspect=Perf|Gender=Masc|Number=Sing|Tense=Past|Variant=Short|VerbForm=Part|Voice=Pass
word: самый	upos: ADJ	xpos: None	feats: Case=Nom|Degree=Pos|Gender=Masc|Number=Sing
word: низкий	upos: ADJ	xpos: None	feats: Case=Nom|Degree=Pos|Gender=Masc|Number=Sing
word: за	upos: ADP	xpos: None	feats: _
word: последние	upos: ADJ	xpos: None	feats: Animacy=Inan|Case=Acc|Degree=Pos|Number=Plur
word: три	upos: NUM	xpos: None	feats: Animacy=Inan|Case=Acc
word: месяца	upos: NOUN	xpos: None	feats: Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing
word: курс	upos: NOUN	xpos: None	feats: Animacy=I

In [50]:
stanza_tagged = []
for sent in doc.sentences:
    for word in sent.words:
        stanza_tagged.append((word.text, word.upos))

In [51]:
stanza_tagged[:5]

[('На', 'ADP'),
 ('торгах', 'NOUN'),
 ('20', 'NUM'),
 ('сентября', 'NOUN'),
 ('был', 'AUX')]

In [52]:
stanza_tagged.append(('', ''))
stanza_tagged.append(('', ''))

### pymorphy2

In [53]:
import pymorphy2
import nltk
morph = pymorphy2.MorphAnalyzer()

In [55]:
tokens = nltk.word_tokenize(text)

In [56]:
pymorphy2_tagged = []

In [57]:
for token in tokens:
    token = morph.parse(token)
    pymorphy2_tagged.append((token[0].normal_form, token[0].tag.POS))
    

In [58]:
pymorphy2_tagged[:5]

[('на', 'PREP'),
 ('торг', 'NOUN'),
 ('20', None),
 ('сентябрь', 'NOUN'),
 ('быть', 'VERB')]

### natasha

In [60]:
from natasha import (
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,

    Doc
)
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
segmenter = Segmenter()

In [61]:
doc = Doc(text)
doc.segment(segmenter)

In [62]:
doc.tag_morph(morph_tagger)

In [63]:
natasha_tagged = [(_.text, _.pos) for _ in doc.tokens]

In [64]:
natasha_tagged[:10]

[('На', 'ADP'),
 ('торгах', 'NOUN'),
 ('20', 'ADJ'),
 ('сентября', 'NOUN'),
 ('был', 'AUX'),
 ('зафиксирован', 'VERB'),
 ('самый', 'ADJ'),
 ('низкий', 'ADJ'),
 ('за', 'ADP'),
 ('последние', 'ADJ')]

Записываем результаты в csv-файлы. 

In [65]:
import csv

In [66]:
with open('natasha_results.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for s in natasha_tagged:
        writer.writerow([s[0], s[1]])

In [67]:
with open('stanza_results.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for s in stanza_tagged:
        writer.writerow([s[0], s[1]])

In [68]:
with open('pymorphy2_results.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for p in pymorphy2_tagged:
        writer.writerow([p[0], p[1]])

### Считаем совпадения в разметках

In [69]:
pymorphy2_dict = {
    'NOUN': 'NOUN',
    'ADJF': 'ADJ',
    'ADJS': 'ADJ',
    'COMP': 'ADJ',
    'VERB': 'VERB',
    'INFN': 'VERB',
    'PRTF': 'ADJ',
    'PRTS': 'ADJ',
    'GRND': 'VERB',
    'NUMR': 'NUM',
    'ADVB': 'ADV',
    'NPRO': 'PRON',
    'PRED': 'ADV',
    'PREP': 'ADP',
    'CONJ': 'CCONJ',
    'PRCL': 'PART',
    'INTJ': 'INTJ',
    '': ''
}

In [70]:
tagged_corp = []

In [71]:
with open('corpus.csv', 'r') as csv_file:
    reader = csv.reader(csv_file, delimiter=',')
    for row in reader:
        if len(row) == 2:
            tagged_corp.append((row[0], row[1]))

In [73]:
def read_corpus(file):
    with open(file, 'r') as csv_file:
        corpus = []
        reader = csv.reader(csv_file, delimiter=',')
        if file == 'pymorphy2_results.csv':
            for row in reader:
                if len(row) == 2:
                    pos_tag = pymorphy2_dict[row[1]]
                    corpus.append((row[0], pos_tag))
        else:
            for row in reader:
                if len(row) == 2:
                    corpus.append((row[0], row[1]))
    return corpus

In [78]:
acc = []

def get_accuracy(file):
    
    corpus = read_corpus(file)
    
    for tc in range(len(tagged_corp)):
        counter = 0
        if tagged_corp[tc][0] == corpus[tc][0]:
            if tagged_corp[tc][1] == corpus[tc][1]:
                acc.append(1)
                counter = 1
            else:
                acc.append(0)
                counter = 1
        else:
            for c in corpus:
                if c[0] == tagged_corp[tc][0]:
                    if c[1] == tagged_corp[tc][1]:
                        acc.append(1)
                        counter = 1
                        break
        if counter == 0:
            acc.append(0)
    return acc

### Оцениваем результаты

In [76]:
print(sum(get_accuracy('stanza_results.csv'))/len(get_accuracy('stanza_results.csv')))

0.691908713692946


In [77]:
print(sum(get_accuracy('pymorphy2_results.csv'))/len(get_accuracy('pymorphy2_results.csv')))

0.6493775933609959


In [79]:
print(sum(get_accuracy('natasha_results.csv'))/len(get_accuracy('natasha_results.csv')))

0.46265560165975106


В результате лучший результат показывает stanza.