**NB:** For all tests, we used *FastText* embeddings downloaded
from [here](http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin).
If you want your model to achieve high scores, use embeddings trained on
text that was not lemmatized, stripped of punctuation, or otherwise
altered by similar preprocessing.

## UPOS

In [1]:
from corpuscula.corpus_utils import download_ud, UniversalDependencies, \
                                    AdjustedForSpeech
import junky
from mordl import UposTagger

# File name of the pretrained FastText binary (see the download link at the
# top of the notebook) and the name under which the UPOS model is stored.
FT_MODEL_FN = 'ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin'
MODEL_FN = 'upos-ft_model'
SEED = 42  # handed to tagger.train() as `seed`

# The model is trained on UD SynTagRus. To try the pipeline on UD Taiga
# instead, uncomment the Taiga line and comment out the SynTagRus one.
#corpus_name = 'UD_Russian-Taiga'
corpus_name = 'UD_Russian-SynTagRus'
# NOTE(review): overwrite=False presumably keeps an already downloaded
# copy of the corpus - confirm against the corpuscula docs.
download_ud(corpus_name, overwrite=False)
corpus = UniversalDependencies(corpus_name)
#corpus = AdjustedForSpeech(corpus)

In [2]:
# If an earlier cell left a `tagger` in the kernel, hand its embeddings to
# the new tagger; otherwise start without any.
tagger = UposTagger(
    embs=None if 'tagger' not in globals() else globals()['tagger'].embs
)
tagger.load_train_corpus(corpus.train)
# The dev split plays the role of the test set during training.
tagger.load_test_corpus(corpus.dev)

_ = tagger.train(
    MODEL_FN,
    seed=SEED,
    device='cuda:11',  # machine-specific GPU index
    word_emb_type='ft',
    word_emb_path=FT_MODEL_FN,
    rnn_emb_dim=300,
    cnn_emb_dim=None,
    cnn_kernels=range(1, 7),
    lstm_layers=1,
)

Train: Load corpus
Corpus has been loaded: 48814 sentences, 871526 tokens
Test: Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


=== UPOS TAGGER TRAINING PIPELINE ===

DATASETS CREATION

MODEL CREATION
Config saved

MODEL TRAINING
Epoch 1: 100%|██████████| 48814/48814 [02:06<00:00, 389.12it/s, train_loss=0.0564]
Epoch 1: 
Losses: train = 0.13044457, test = 0.03624736
Test: accuracy = 0.96886605
Test: precision = 0.89306302
Test: recall = 0.84544437
Test: f1_score = 0.85268954
new maximum score 0.96886605
Saving state_dict... done.
Epoch 2:   0%|          | 0/48814 [00:00<?, ?it/s]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Epoch 2: 100%|██████████| 48814/48814 [02:08<00:00, 354.04it/s, train_loss=0.0387]
Epoch 2: 
Losses: train = 0.04247080, test = 0.02837307
Test: accuracy = 0.97543211
Test: precision = 0.89942434
Test: recall = 0.88743450
Test: f1_score = 0.89237080
new maximum score 0.97543211
Saving state_dict... done.
Epoch 3: 100%|██████████| 48814/48814 [02:08<00:00, 379.30it/s, train_loss=0.031] 
Epoch 3: 
Losses: train = 0.03256159, test = 0.02520218
Test: accuracy = 0.97820032
Test: precision = 0.95854886
Test: recall = 0.91648277
Test: f1_score = 0.92489209
new maximum score 0.97820032
Saving state_dict... done.
Epoch 4: 100%|██████████| 48814/48814 [02:08<00:00, 352.92it/s, train_loss=0.0273]
Epoch 4: 
Losses: train = 0.02773231, test = 0.02305097
Test: accuracy = 0.98009925
Test: precision = 0.94514233
Test: recall = 0.92536437
Test: f1_score = 0.93228473
new maximum score 0.98009925
Saving state_dict... done.
Epoch 5: 100%|██████████| 48814/48814 [02:09<00:00, 340.94it/s, train_loss=0.024] 

In [3]:
# Destination files for the tagged dev and test corpora; the names are
# derived from the model name ('..._model' -> '....conllu').
res_dev = 'corpora/_dev_' + MODEL_FN.replace('_model', '.conllu')
res_test = 'corpora/_test_' + MODEL_FN.replace('_model', '.conllu')

# Build a fresh tagger (reusing embeddings of any `tagger` already in the
# kernel) and restore the saved model into it.
tagger = UposTagger(
    embs=None if 'tagger' not in globals() else globals()['tagger'].embs
)
tagger.load(MODEL_FN)
junky.clear_tqdm()

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [4]:
# Tag the dev split and save the tagged corpus to `res_dev`. The return
# value is bound to `_`, so the notebook does not echo it.
# NOTE(review): the effect of `clone_ds=True` is not visible here - confirm
# against the mordl documentation.
_ = tagger.predict(corpus.dev, clone_ds=True, save_to=res_dev)

Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


In [5]:
# Evaluate on the dev split without a pre-tagged file: per the log below,
# the tagger processes the gold corpus itself before scoring.
_ = tagger.evaluate(corpus.dev)

Evaluating UPOS


Load corpus

Processing corpus
  0%|          | 0/6584 [00:00<?, ?it/s]

Corpus has been loaded: 6584 sentences, 118692 tokens


100%|██████████| 6584/6584 [00:12<00:00, 548.21it/s]
UPOS total: 118488
   correct: 117036
     wrong: 1452
  Accuracy: 0.9877455944905813
[By sentence accuracy: 0.8201701093560145]


In [6]:
# Evaluate the predictions saved in `res_dev` against the gold dev split.
# The figures are expected to match the on-the-fly evaluation above.
_ = tagger.evaluate(corpus.dev, res_dev)

Evaluating UPOS
Load corpus
[> 800                                                            

Load corpus
[> 800                                                            

[=> 1800                                                           

[=> 1800                                                           

[===> 3600                                                         

[===> 3600                                                         

[====> 4500                                                        

[====> 4500                                                        

[=====> 5600                                                       

[=====> 5600                                                       

Corpus has been loaded: 6584 sentences, 118692 tokens
UPOS total: 118488
   correct: 117036
     wrong: 1452
  Accuracy: 0.9877455944905813
[By sentence accuracy: 0.8201701093560145]


Corpus has been loaded: 6584 sentences, 118692 tokens


In [7]:
# Tag the test split and save the tagged corpus to `res_test`.
_ = tagger.predict(corpus.test, save_to=res_test)

Load corpus

Processing corpus
100%|██████████| 6491/6491 [00:11<00:00, 547.56it/s]



Corpus has been loaded: 6491 sentences, 117523 tokens


In [8]:
# Evaluate on the test split, tagging it on the fly.
# NOTE(review): `clone_ds=True` semantics are not visible here - confirm.
_ = tagger.evaluate(corpus.test, clone_ds=True)

Evaluating UPOS


Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


UPOS total: 117329
   correct: 116027
     wrong: 1302
  Accuracy: 0.9889029992584953
[By sentence accuracy: 0.8260668618086582]


In [9]:
# Evaluate the predictions saved in `res_test` against the gold test split.
# The figures are expected to match the on-the-fly evaluation above.
_ = tagger.evaluate(corpus.test, res_test)

Evaluating UPOS
Load corpus
[> 800                                                            

Load corpus
[> 800                                                            

[====> 4300                                                        

[====> 4300                                                        





Corpus has been loaded: 6491 sentences, 117523 tokens
UPOS total: 117329
   correct: 116027
     wrong: 1302
  Accuracy: 0.9889029992584953
[By sentence accuracy: 0.8260668618086582]


Corpus has been loaded: 6491 sentences, 117523 tokens


In [10]:
# Materialise both corpora once: they are evaluated again for every tag.
corp_gold = list(corpus.test())
corp_test = list(tagger._get_corpus(res_test))

# Distinct non-empty UPOS labels of the gold corpus. Every corpus item is
# indexed with [0] to reach its tokens.
tags = sorted({
    token['UPOS']
    for sentence in corp_gold
    for token in sentence[0]
    if token['UPOS']
})

# One "<tag>: <score>" line per label.
for tag in tags:
    print('{}: {}'.format(
        tag,
        tagger.evaluate(corp_gold, corp_test, label=tag, log_file=None),
    ))

Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens
ADJ: 0.9641299510458389
ADP: 0.9987852494577006
ADV: 0.953822988121131
AUX: 0.9293023255813954
CCONJ: 0.9646643109540636
DET: 0.9359692092372288
INTJ: 0.5833333333333334
NOUN: 0.9879952813822774
NUM: 0.9703405346027096
PART: 0.9360210341805434
PRON: 0.9583984375
PROPN: 0.9602237414543194
PUNCT: 1.0
SCONJ: 0.9270588235294117
SYM: 1.0
VERB: 0.9784617734667379
X: 0.7272727272727273


## FEATS

In [11]:
from corpuscula.corpus_utils import download_ud, UniversalDependencies, \
                                    AdjustedForSpeech
import junky
from mordl import FeatsTagger

# File name of the pretrained FastText binary (see the download link at the
# top of the notebook) and the name under which the FEATS model is stored.
FT_MODEL_FN = 'ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin'
MODEL_FN = 'feats-ft_model'
SEED = 42  # handed to tagger.train() as `seed`

# The model is trained on UD SynTagRus. To try the pipeline on UD Taiga
# instead, uncomment the Taiga line and comment out the SynTagRus one.
#corpus_name = 'UD_Russian-Taiga'
corpus_name = 'UD_Russian-SynTagRus'
# NOTE(review): overwrite=False presumably keeps an already downloaded
# copy of the corpus - confirm against the corpuscula docs.
download_ud(corpus_name, overwrite=False)
corpus = UniversalDependencies(corpus_name)
#corpus = AdjustedForSpeech(corpus)

In [12]:
# If an earlier cell left a `tagger` in the kernel, hand its embeddings to
# the new tagger; otherwise start without any.
tagger = FeatsTagger(
    embs=None if 'tagger' not in globals() else globals()['tagger'].embs
)
tagger.load_train_corpus(corpus.train)
# The dev split plays the role of the test set during training.
tagger.load_test_corpus(corpus.dev)

_ = tagger.train(
    MODEL_FN,
    seed=SEED,
    device='cuda:11',  # machine-specific GPU index
    word_emb_type='ft',
    word_emb_path=FT_MODEL_FN,
    rnn_emb_dim=300,
    cnn_emb_dim=None,
    cnn_kernels=range(1, 7),
    upos_emb_dim=200,
    lstm_layers=1,
)

Train: Load corpus
Corpus has been loaded: 48814 sentences, 871526 tokens
Test: Load corpus

=== FEATS TAGGER TRAINING PIPELINE ===

DATASETS CREATION

MODEL CREATION
Config saved

MODEL TRAINING
Epoch 1: 100%|██████████| 48814/48814 [02:13<00:00, 336.69it/s, train_loss=0.183]
Epoch 1: 
Losses: train = 0.39967257, test = 0.10713421
Test: accuracy = 0.91929984
Test: precision = 0.39796246
Test: recall = 0.39602833
Test: f1_score = 0.37890660
new maximum score 0.91929984
Saving state_dict... done.
Epoch 2: 100%|██████████| 48814/48814 [02:14<00:00, 292.85it/s, train_loss=0.0979]
Epoch 2: 
Losses: train = 0.11360265, test = 0.06761854
Test: accuracy = 0.94667814
Test: precision = 0.53970678
Test: recall = 0.52545652
Test: f1_score = 0.51340330
new maximum score 0.94667814
Saving state_dict... 

Corpus has been loaded: 6584 sentences, 118692 tokens
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


done.
Epoch 3: 100%|██████████| 48814/48814 [03:28<00:00, 233.80it/s, train_loss=0.0729]
Epoch 3: 
Losses: train = 0.07926444, test = 0.05386320
Test: accuracy = 0.95716866
Test: precision = 0.62164002
Test: recall = 0.60808066
Test: f1_score = 0.59730130
new maximum score 0.95716866
Saving state_dict... done.
Epoch 4: 100%|██████████| 48814/48814 [02:17<00:00, 344.66it/s, train_loss=0.0623]
Epoch 4: 
Losses: train = 0.06480450, test = 0.04718846
Test: accuracy = 0.96284012
Test: precision = 0.68296218
Test: recall = 0.66734046
Test: f1_score = 0.65840958
new maximum score 0.96284012
Saving state_dict... done.
Epoch 5: 100%|██████████| 48814/48814 [02:16<00:00, 335.33it/s, train_loss=0.0541]
Epoch 5: 
Losses: train = 0.05573969, test = 0.04299810
Test: accuracy = 0.96641010
Test: precision = 0.70598300
Test: recall = 0.68768654
Test: f1_score = 0.68195494
new maximum score 0.96641010
Saving state_dict... done.
Epoch 6: 100%|██████████| 48814/48814 [02:15<00:00, 338.25it/s, train_loss=0

In [13]:
# Destination files for the tagged dev and test corpora; the names are
# derived from the model name ('..._model' -> '....conllu').
res_dev = 'corpora/_dev_' + MODEL_FN.replace('_model', '.conllu')
res_test = 'corpora/_test_' + MODEL_FN.replace('_model', '.conllu')

# Build a fresh tagger (reusing embeddings of any `tagger` already in the
# kernel) and restore the saved model into it.
tagger = FeatsTagger(
    embs=None if 'tagger' not in globals() else globals()['tagger'].embs
)
tagger.load(MODEL_FN)
junky.clear_tqdm()

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [14]:
# Tag the dev split and save the tagged corpus to `res_dev`. The return
# value is bound to `_`, so the notebook does not echo it.
# NOTE(review): the effect of `clone_ds=True` is not visible here - confirm
# against the mordl documentation.
_ = tagger.predict(corpus.dev, clone_ds=True, save_to=res_dev)

Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


In [15]:
# Evaluate on the dev split without a pre-tagged file: per the log below,
# the tagger processes the gold corpus itself before scoring.
_ = tagger.evaluate(corpus.dev)

Evaluating FEATS


Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Processing corpus
100%|██████████| 6584/6584 [00:12<00:00, 510.46it/s]
FEATS total: 118488 tokens, 287136 tags
    correct: 116129 tokens, 283802 tags
      wrong: 2359 tokens, 3334 tags [296 excess / 412 absent / 2626 wrong type]
   Accuracy: 0.9676392718493216 / 0.9883887774434414
[Total accuracy: 0.9800908108838026 / 0.9883887774434414]
[By sentence accuracy: 0.7595686512758202]


In [16]:
# Evaluate the predictions saved in `res_dev` against the gold dev split.
# The figures are expected to match the on-the-fly evaluation above.
_ = tagger.evaluate(corpus.dev, res_dev)

Evaluating FEATS
Load corpus
[> 400                                                            

Load corpus
[> 400                                                            

[> 1000                                                            

[=> 1100                                                           

[=> 1600                                                           

[=> 1700                                                           

[==> 2200                                                          

[==> 2400                                                          

[==> 3000                                                          

[===> 3100                                                         

[===> 3500                                                         

[===> 3900                                                         

[====> 4300                                                        

[====> 4600                                                        

[====> 5000                                                        

[=====> 5300                                                       

[=====> 5800                                                       



Corpus has been loaded: 6584 sentences, 118692 tokens
FEATS total: 118488 tokens, 287136 tags
    correct: 116129 tokens, 283802 tags
      wrong: 2359 tokens, 3334 tags [296 excess / 412 absent / 2626 wrong type]
   Accuracy: 0.9676392718493216 / 0.9883887774434414
[Total accuracy: 0.9800908108838026 / 0.9883887774434414]
[By sentence accuracy: 0.7595686512758202]


Corpus has been loaded: 6584 sentences, 118692 tokens


In [17]:
# Tag the test split and save the tagged corpus to `res_test`.
_ = tagger.predict(corpus.test, save_to=res_test)

Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Processing corpus
100%|██████████| 6491/6491 [00:18<00:00, 355.67it/s]


In [18]:
# Evaluate on the test split, tagging it on the fly.
# NOTE(review): `clone_ds=True` semantics are not visible here - confirm.
_ = tagger.evaluate(corpus.test, clone_ds=True)

Evaluating FEATS


Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


FEATS total: 117329 tokens, 285081 tags
    correct: 115080 tokens, 281630 tags
      wrong: 2249 tokens, 3451 tags [404 excess / 513 absent / 2534 wrong type]
   Accuracy: 0.9688840311021334 / 0.9878946685328029
[Total accuracy: 0.9808316784426697 / 0.9878946685328029]
[By sentence accuracy: 0.7544292096749345]


In [19]:
# Evaluate the predictions saved in `res_test` against the gold test split.
# The figures are expected to match the on-the-fly evaluation above.
_ = tagger.evaluate(corpus.test, res_test)

Evaluating FEATS
Load corpus
[> 500                                                            

Load corpus
[> 500                                                            

[=> 1400                                                           

[=> 1400                                                           

[==> 2400                                                          

[==> 2400                                                          

[==> 3000                                                          

[==> 3000                                                          

[===> 3700                                                         

[===> 3700                                                         

[====> 4300                                                        

[====> 4300                                                        

[====> 4800                                                        

[====> 4800                                                        

[=====> 5500                                                       

[=====> 5600                                                       

Corpus has been loaded: 6491 sentences, 117523 tokens
FEATS total: 117329 tokens, 285081 tags
    correct: 115080 tokens, 281630 tags
      wrong: 2249 tokens, 3451 tags [404 excess / 513 absent / 2534 wrong type]
   Accuracy: 0.9688840311021334 / 0.9878946685328029
[Total accuracy: 0.9808316784426697 / 0.9878946685328029]
[By sentence accuracy: 0.7544292096749345]


Corpus has been loaded: 6491 sentences, 117523 tokens


In [20]:
# Materialise both corpora once: they are evaluated again for every feature.
corp_gold = list(corpus.test())
corp_test = list(tagger._get_corpus(res_test))

# Distinct feature names found in the FEATS mapping of any gold token.
# Every corpus item is indexed with [0] to reach its tokens.
tags = sorted({
    feat_name
    for sentence in corp_gold
    for token in sentence[0]
    for feat_name in token['FEATS'].keys()
})

# One "<feature>: <score>" line per feature name.
for tag in tags:
    print('{}: {}'.format(
        tag,
        tagger.evaluate(corp_gold, corp_test, feats=tag, log_file=None),
    ))

Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens
Animacy: 0.9847106856257919
Aspect: 0.992854565138939
Case: 0.9749034010482421
Degree: 0.9981028042924053
Foreign: 0.7627627627627628
Gender: 0.9855458739108149
Mood: 0.9973451327433628
Number: 0.993028609010194
Person: 0.998149992884588
Polarity: 0.8666666666666667
Tense: 0.9957154333660626
Variant: 0.9950402975821451
VerbForm: 0.9989173583543847
Voice: 0.9896788163118008


## LEMMA

In [21]:
from corpuscula.corpus_utils import download_ud, UniversalDependencies, \
                                    AdjustedForSpeech
import junky
from mordl import LemmaTagger

# File name of the pretrained FastText binary (see the download link at the
# top of the notebook) and the name under which the LEMMA model is stored.
FT_MODEL_FN = 'ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin'
MODEL_FN = 'lemma-ft_model'
SEED = 42  # handed to tagger.train() as `seed`

# The model is trained on UD SynTagRus. To try the pipeline on UD Taiga
# instead, uncomment the Taiga line and comment out the SynTagRus one.
#corpus_name = 'UD_Russian-Taiga'
corpus_name = 'UD_Russian-SynTagRus'
# NOTE(review): overwrite=False presumably keeps an already downloaded
# copy of the corpus - confirm against the corpuscula docs.
download_ud(corpus_name, overwrite=False)
corpus = UniversalDependencies(corpus_name)
#corpus = AdjustedForSpeech(corpus)

In [22]:
# If an earlier cell left a `tagger` in the kernel, hand its embeddings to
# the new tagger; otherwise start without any.
tagger = LemmaTagger(
    embs=None if 'tagger' not in globals() else globals()['tagger'].embs
)
tagger.load_train_corpus(corpus.train)
# The dev split plays the role of the test set during training.
tagger.load_test_corpus(corpus.dev)

_ = tagger.train(
    MODEL_FN,
    seed=SEED,
    device='cuda:11',  # machine-specific GPU index
    word_emb_type='ft',
    word_emb_path=FT_MODEL_FN,
    rnn_emb_dim=300,
    cnn_emb_dim=None,
    cnn_kernels=range(1, 7),
    upos_emb_dim=200,
    lstm_layers=1,
)

Train: Load corpus
Corpus has been loaded: 48814 sentences, 871526 tokens
Test: Load corpus

Parse corpus
[########################################################] 100%    
done: 55398 sentences, 979787 acceptable tokens (plus 575 for YO letters)
Fit corpus dict... done.

Preliminary trainset preparation:
stage 1 of 3... done.
stage 2 of 3... done.
Lengths: [1883 {'allow_replace': True, 'allow_copy': True},
          1881 {'allow_replace': True, 'allow_copy': False},
          1915 {'allow_replace': False, 'allow_copy': True},
          1881 {'allow_replace': False, 'allow_copy': False}]
min = {'allow_replace': True, 'allow_copy': False}
stage 3 of 3... done.

=== LEMMA TAGGER TRAINING PIPELINE ===

DATASETS CREATION

MODEL CREATION
Config saved

MODEL TRAINING
Epoch 1: 100%|██████████| 48814/48814 [02:19<00:00, 350.64it/s, train_loss=0.159]
Epoch 1: 
Losses: train = 0.35587261, test = 0.10482431
Test: accuracy = 0.94818041
Test: precision = 0.21184849
Test: recall = 0.19658773
Test: f1_score = 0.19621228
new maximum score 0.94818041
Saving state_dict... done.
Epoch 2: 100%|█

Corpus has been loaded: 6584 sentences, 118692 tokens


In [23]:
# Destination files for the tagged dev and test corpora; the names are
# derived from the model name ('..._model' -> '....conllu').
res_dev = 'corpora/_dev_' + MODEL_FN.replace('_model', '.conllu')
res_test = 'corpora/_test_' + MODEL_FN.replace('_model', '.conllu')

# Build a fresh tagger (reusing embeddings of any `tagger` already in the
# kernel) and restore the saved model into it.
tagger = LemmaTagger(
    embs=None if 'tagger' not in globals() else globals()['tagger'].embs
)
tagger.load(MODEL_FN)
junky.clear_tqdm()

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [24]:
# Lemmatize the dev split and save the result to `res_dev`. The return
# value is bound to `_`, so the notebook does not echo it.
# NOTE(review): the effect of `clone_ds=True` is not visible here - confirm
# against the mordl documentation.
_ = tagger.predict(corpus.dev, clone_ds=True, save_to=res_dev)

Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


In [25]:
# Evaluate on the dev split without a pre-tagged file: per the log below,
# the tagger processes the gold corpus itself before scoring.
_ = tagger.evaluate(corpus.dev)

Evaluating LEMMA


Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Processing corpus
100%|██████████| 6584/6584 [00:12<00:00, 548.42it/s]
LEMMA total: 118488
    correct: 117809
      wrong: 679
   Accuracy: 0.9942694618864357
[By sentence accuracy: 0.9097812879708383]


In [26]:
# Evaluate the predictions saved in `res_dev` against the gold dev split.
# The figures are expected to match the on-the-fly evaluation above.
_ = tagger.evaluate(corpus.dev, res_dev)

Evaluating LEMMA
Load corpus
[> 500                                                            

Load corpus
[> 800                                                            

[=> 1900                                                           

[=> 1900                                                           

[===> 3300                                                         

[===> 4000                                                         

[====> 5000                                                        

[=====> 5500                                                       

Corpus has been loaded: 6584 sentences, 118692 tokens
LEMMA total: 118488
    correct: 117809
      wrong: 679
   Accuracy: 0.9942694618864357
[By sentence accuracy: 0.9097812879708383]


Corpus has been loaded: 6584 sentences, 118692 tokens


In [27]:
# Lemmatize the test split and save the result to `res_test`.
_ = tagger.predict(corpus.test, save_to=res_test)

Load corpus

Processing corpus
100%|██████████| 6491/6491 [00:12<00:00, 540.34it/s]


Corpus has been loaded: 6491 sentences, 117523 tokens


In [28]:
# Evaluate on the test split, processing it on the fly.
# NOTE(review): `clone_ds=True` semantics are not visible here - confirm.
_ = tagger.evaluate(corpus.test, clone_ds=True)

Evaluating LEMMA


Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


LEMMA total: 117329
    correct: 116642
      wrong: 687
   Accuracy: 0.9941446701156577
[By sentence accuracy: 0.9063318440918194]


In [29]:
# Evaluate the predictions saved in `res_test` against the gold test split.
# The figures are expected to match the on-the-fly evaluation above.
_ = tagger.evaluate(corpus.test, res_test)

Evaluating LEMMA
Load corpus
[> 700                                                            

Load corpus
[> 700                                                            

[==> 2300                                                          

[==> 2300                                                          

[===> 3100                                                         

[===> 3600                                                         

[====> 4100                                                        

[====> 4500                                                        

[=====> 5300                                                       

[=====> 5500                                                       





Corpus has been loaded: 6491 sentences, 117523 tokens
LEMMA total: 117329
    correct: 116642
      wrong: 687
   Accuracy: 0.9941446701156577
[By sentence accuracy: 0.9063318440918194]


Corpus has been loaded: 6491 sentences, 117523 tokens


## CoNLL18 Validation

In [30]:
from corpuscula.corpus_utils import download_ud, get_ud_test_path
import junky
from mordl import UposTagger, FeatsTagger, LemmaTagger, conll18_ud_eval

# Validation runs against UD SynTagRus. To use UD Taiga instead, uncomment
# the Taiga line and comment out the SynTagRus one.
#corpus_name = 'UD_Russian-Taiga'
corpus_name = 'UD_Russian-SynTagRus'
# NOTE(review): overwrite=False presumably keeps an already downloaded
# copy of the corpus - confirm against the corpuscula docs.
download_ud(corpus_name, overwrite=False)

# Gold test file of the corpus, and the destination for the copy of it that
# is re-tagged by our models.
corpus_gold = get_ud_test_path(corpus_name)
corpus_test = 'corpora/_test_tagged.conllu'

In [31]:
# Load the three trained taggers onto one GPU.
#
# The first tagger picks up embeddings from a `tagger` left in the kernel by
# an earlier section, if there is one. Each following tagger then receives
# the embeddings of the previous one, so that on a fresh kernel the taggers
# still share them instead of each starting from `embs=None`.
# NOTE(review): this relies on `load()` registering the loaded embeddings in
# the tagger's `embs` attribute - confirm against the mordl documentation.
tagger_u = UposTagger(embs=globals()['tagger'].embs
                               if 'tagger' in globals() else
                           None)
tagger_u.load('upos-ft_model', device='cuda:10', dataset_device='cuda:10')
tagger_f = FeatsTagger(embs=tagger_u.embs)
tagger_f.load('feats-ft_model', device='cuda:10', dataset_device='cuda:10')
tagger_l = LemmaTagger(embs=tagger_f.embs)
tagger_l.load('lemma-ft_model', device='cuda:10', dataset_device='cuda:10')

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.
Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.
Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [32]:
# Chain the three taggers over the gold test file: UPOS first, then FEATS
# on the UPOS output, then LEMMA on the FEATS output. Only the last stage is
# given `save_to`, so only the final result is written (to `corpus_test`).
_ = tagger_l.predict(
    tagger_f.predict(
        tagger_u.predict(corpus_gold)
    ), save_to=corpus_test
)

Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens
Processing corpus
100%|██████████| 6491/6491 [00:11<00:00, 562.53it/s]
Processing corpus
100%|██████████| 6491/6491 [00:11<00:00, 557.90it/s]
Processing corpus
100%|██████████| 6491/6491 [00:12<00:00, 538.66it/s]


In [33]:
# Score the re-tagged file against the gold test file and print the metric
# table. The tagged file was produced from the gold file by the UPOS, FEATS
# and LEMMA taggers only.
# NOTE(review): the 100.00 rows (XPOS, UAS, LAS, ...) presumably reflect
# columns carried over unchanged from the gold file - confirm.
conll18_ud_eval(corpus_gold, corpus_test)

Metric     | Precision |    Recall |  F1 Score | AligndAcc
-----------+-----------+-----------+-----------+-----------
Tokens     |    100.00 |    100.00 |    100.00 |
Sentences  |    100.00 |    100.00 |    100.00 |
Words      |    100.00 |    100.00 |    100.00 |
UPOS       |     98.89 |     98.89 |     98.89 |     98.89
XPOS       |    100.00 |    100.00 |    100.00 |    100.00
UFeats     |     97.32 |     97.32 |     97.32 |     97.32
AllTags    |     97.06 |     97.06 |     97.06 |     97.06
Lemmas     |     98.51 |     98.51 |     98.51 |     98.51
UAS        |    100.00 |    100.00 |    100.00 |    100.00
LAS        |    100.00 |    100.00 |    100.00 |    100.00
CLAS       |    100.00 |    100.00 |    100.00 |    100.00
MLAS       |     95.44 |     95.44 |     95.44 |     95.44
BLEX       |     97.75 |     97.75 |     97.75 |     97.75


## MISC:NE

Note: The corpora we used are proprietary, so you will need to supply your own named-entity corpora in their place.

In [34]:
from mordl import UposTagger, FeatsTagger

# `tagger` is bound to the same object as `tagger_u`. It is not deleted at
# the end of the cell, so `tagger.embs` stays available to later cells.
tagger = tagger_u = UposTagger(
    embs=globals()['tagger'].embs if 'tagger' in globals() else None
)
tagger_u.load('upos-ft_model', device='cuda:11', dataset_device='cuda:11')
# `tagger` was bound just above, so its embeddings can be passed directly;
# no existence check is needed here.
tagger_f = FeatsTagger(embs=tagger.embs)
tagger_f.load('feats-ft_model', device='cuda:11', dataset_device='cuda:11')

# For every split of the NE corpus, predict UPOS, then FEATS on top of it,
# and save the result under the matching '*_upos_feats.conllu' name.
for corpora in zip(['corpora/ner_train.conllu',
                    'corpora/ner_dev.conllu',
                    'corpora/ner_test.conllu'],
                   ['corpora/ner_train_upos_feats.conllu',
                    'corpora/ner_dev_upos_feats.conllu',
                    'corpora/ner_test_upos_feats.conllu']):
    tagger_f.predict(
        tagger_u.predict(corpora[0]), save_to=corpora[1]
    )

# Drop the two working names; the UPOS tagger remains reachable as `tagger`.
del tagger_u, tagger_f

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.
Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.
Load corpus
Corpus has been loaded: 30390 sentences, 378829 tokens
Processing corpus
100%|██████████| 30390/30390 [00:41<00:00, 735.39it/s]
Processing corpus
100%|██████████| 30390/30390 [00:41<00:00, 738.11it/s]
Load corpus
[====] 3799                                                        
Corpus has been loaded: 3799 sentences, 47280 tokens
Processing corpus
100%|██████████| 3799/3799 [00:04<00:00, 778.19it/s]
Processing corpus
100%|██████████| 3799/3799 [00:04<00:00, 761.69it/s]
Load corpus
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
Processing corpus
100%|██████████| 3798/3798 [00:04<00:00, 769.76it/s]
Processing corpus
100%|██████████| 3798/3798 [00:06<00:00, 621.73it/s]


In [35]:
import junky
from mordl import NeTagger

# File name of the pretrained FastText binary and the name under which the
# MISC:NE model is stored.
FT_MODEL_FN = 'ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin'
MODEL_FN = 'misc-ne-ft_model'
SEED = 42  # handed to tagger.train() as `seed`

# NE corpus splits with UPOS and FEATS already predicted.
corpus_train, corpus_dev, corpus_test = (
    'corpora/ner_train_upos_feats.conllu',
    'corpora/ner_dev_upos_feats.conllu',
    'corpora/ner_test_upos_feats.conllu',
)

In [36]:
# If an earlier cell left a `tagger` in the kernel, hand its embeddings to
# the new tagger; otherwise start without any.
tagger = NeTagger(
    embs=None if 'tagger' not in globals() else globals()['tagger'].embs
)
tagger.load_train_corpus(corpus_train)
# The dev split plays the role of the test set during training.
tagger.load_test_corpus(corpus_dev)

_ = tagger.train(
    MODEL_FN,
    seed=SEED,
    device='cuda:11',  # machine-specific GPU index
    word_emb_type='ft',
    word_emb_path=FT_MODEL_FN,
    rnn_emb_dim=300,
    cnn_emb_dim=None,
    cnn_kernels=range(1, 7),
    upos_emb_dim=200,
    lstm_layers=1,
)

Train: 

Load corpus
Corpus has been loaded: 30390 sentences, 378829 tokens


Test: 

Load corpus
[====] 3799                                                        
Corpus has been loaded: 3799 sentences, 47280 tokens
Parse corpus
[###################################] 100%                         
done: 34189 sentences, 408004 acceptable tokens (plus 0 for YO letters)
Fit corpus dict... done.

=== MISC:NE TAGGER TRAINING PIPELINE ===

DATASETS CREATION

MODEL CREATION
Config saved

MODEL TRAINING
Epoch 1: 100%|██████████| 30390/30390 [01:02<00:00, 486.51it/s, train_loss=0.16] 
Epoch 1: 
Losses: train = 0.21267492, test = 0.12583241
Test: accuracy = 0.90241117
Test: precision = 0.53908552
Test: recall = 0.39512047
Test: f1_score = 0.41888132
new maximum score 0.90241117
Saving state_dict... done.
Epoch 2:   0%|          | 0/30390 [00:00<?, ?it/s, train_loss=0.122]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Epoch 2: 100%|██████████| 30390/30390 [01:02<00:00, 441.97it/s, train_loss=0.123]
Epoch 2: 
Losses: train = 0.13016171, test = 0.10558253
Test: accuracy = 0.91713198
Test: precision = 0.60519670
Test: recall = 0.50108389
Test: f1_score = 0.51433264
new maximum score 0.91713198
Saving state_dict... done.
Epoch 3: 100%|██████████| 30390/30390 [01:02<00:00, 472.94it/s, train_loss=0.108]
Epoch 3: 
Losses: train = 0.11046946, test = 0.09530475
Test: accuracy = 0.92529611
Test: precision = 0.61576420
Test: recall = 0.54284185
Test: f1_score = 0.55905138
new maximum score 0.92529611
Saving state_dict... done.
Epoch 4: 100%|██████████| 30390/30390 [01:02<00:00, 502.91it/s, train_loss=0.0965]
Epoch 4: 
Losses: train = 0.09930579, test = 0.08984704
Test: accuracy = 0.92895516
Test: precision = 0.62719702
Test: recall = 0.55587481
Test: f1_score = 0.57628455
new maximum score 0.92895516
Saving state_dict... done.
Epoch 5: 100%|██████████| 30390/30390 [01:02<00:00, 455.44it/s, train_loss=0.0921]
E

In [37]:
# Destination files for the tagged dev and test corpora; the names are
# derived from the model name ('..._model' -> '....conllu').
res_dev = 'corpora/_dev_' + MODEL_FN.replace('_model', '.conllu')
res_test = 'corpora/_test_' + MODEL_FN.replace('_model', '.conllu')

# Build a fresh tagger (reusing embeddings of any `tagger` already in the
# kernel) and restore the saved model into it.
tagger = NeTagger(
    embs=None if 'tagger' not in globals() else globals()['tagger'].embs
)
tagger.load(MODEL_FN)
junky.clear_tqdm()

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [38]:
# Tag the dev file with named entities and save the result to `res_dev`.
# The return value is bound to `_`, so the notebook does not echo it.
# NOTE(review): the effect of `clone_ds=True` is not visible here - confirm
# against the mordl documentation.
_ = tagger.predict(corpus_dev, clone_ds=True, save_to=res_dev)

Load corpus
[====] 3799                                                        
Corpus has been loaded: 3799 sentences, 47280 tokens


In [39]:
# Evaluate on the dev file without a pre-tagged file: per the log below,
# the tagger processes the gold corpus itself before scoring.
_ = tagger.evaluate(corpus_dev)

Evaluating MISC:NE
Load corpus
[====] 3799                                                        
Corpus has been loaded: 3799 sentences, 47280 tokens
Processing corpus
100%|██████████| 3799/3799 [00:05<00:00, 758.27it/s]
MISC:NE total: 17189
      correct: 14357
        wrong: 2832 [838 excess / 1175 absent / 819 wrong type]
     Accuracy: 0.8352434696608296
[Total accuracy: 0.9401015228426396]
[By sentence accuracy: 0.5993682548038958]


In [40]:
# Evaluate the predictions saved in `res_dev` against the gold dev file.
# The figures are expected to match the on-the-fly evaluation above.
_ = tagger.evaluate(corpus_dev, res_dev)

Evaluating MISC:NE
Load corpus
Load corpus
[====] 3799                                                        
Corpus has been loaded: 3799 sentences, 47280 tokens
[====] 3799                                                        
Corpus has been loaded: 3799 sentences, 47280 tokens
MISC:NE total: 17189
      correct: 14357
        wrong: 2832 [838 excess / 1175 absent / 819 wrong type]
     Accuracy: 0.8352434696608296
[Total accuracy: 0.9401015228426396]
[By sentence accuracy: 0.5993682548038958]


In [41]:
# Tag the test file with named entities and save the result to `res_test`.
_ = tagger.predict(corpus_test, save_to=res_test)

Load corpus
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
Processing corpus
100%|██████████| 3798/3798 [00:04<00:00, 766.24it/s]


In [42]:
# Evaluate on the test file, tagging it on the fly.
# NOTE(review): `clone_ds=True` semantics are not visible here - confirm.
_ = tagger.evaluate(corpus_test, clone_ds=True)

Evaluating MISC:NE
Load corpus
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
MISC:NE total: 16880
      correct: 14092
        wrong: 2788 [945 excess / 1108 absent / 735 wrong type]
     Accuracy: 0.8348341232227489
[Total accuracy: 0.9408394516827229]
[By sentence accuracy: 0.6011058451816745]


In [43]:
# Evaluate the predictions saved in `res_test` against the gold test file.
# The figures are expected to match the on-the-fly evaluation above.
_ = tagger.evaluate(corpus_test, res_test)

Evaluating MISC:NE
Load corpus
Load corpus
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
MISC:NE total: 16880
      correct: 14092
        wrong: 2788 [945 excess / 1108 absent / 735 wrong type]
     Accuracy: 0.8348341232227489
[Total accuracy: 0.9408394516827229]
[By sentence accuracy: 0.6011058451816745]


In [44]:
# Materialise both corpora once: they are evaluated again for every label.
# NOTE(review): the meaning of `asis=True` is not visible here - confirm.
corp_gold = list(tagger._get_corpus(corpus_test, asis=True))
corp_test = list(tagger._get_corpus(res_test))

# Distinct non-empty values of the NE key in the MISC mapping of the gold
# tokens. Every corpus item is indexed with [0] to reach its tokens.
tags = sorted({
    ne_label
    for ne_label in (token['MISC'].get('NE')
                     for sentence in corp_gold
                     for token in sentence[0])
    if ne_label
})

# One "<label>: <score>" line per entity label.
for tag in tags:
    print('{}: {}'.format(
        tag,
        tagger.evaluate(corp_gold, corp_test, label=tag, log_file=None),
    ))

Load corpus
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
Load corpus
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
Address: 0.8321981424148607
City: 0.0
Date: 0.0
Department: 0.7541401273885351
Facility: 0.3015267175572519
Geo: 0.8546671918077984
Goal: 0.4128205128205128
Location: 0.35555555555555557
Organization: 0.8447818751506387
Person: 0.7401574803149606
PersonProperty: 0.7415458937198067
Phone: 0.649746192893401
Time: 0.0
