## UPOS

In [1]:
from corpuscula.corpus_utils import download_ud, UniversalDependencies, \
                                    AdjustedForSpeech
import junky
from mordl import UposTagger

# Seed handed to the tagger's training routine.
SEED = 42

# Pretrained BERT model to tune, and the name the trained tagger is saved
# under / loaded from.
BERT_MODEL_FN = 'bert-base-multilingual-cased'
MODEL_FN = 'upos-bert_model'

# BERT tuning settings (forwarded via `word_emb_tune_params`).
BERT_MAX_LEN = 512
BERT_EPOCHS = 3
BERT_BATCH_SIZE = 8

# SynTagRus is enabled below for real model training. To only try the
# pipeline out on UD Taiga as an example, comment SynTagRus and uncomment
# Taiga.
#corpus_name = 'UD_Russian-Taiga'
corpus_name = 'UD_Russian-SynTagRus'
# NOTE(review): overwrite=False presumably keeps an already downloaded
# copy of the corpus -- confirm against the corpuscula docs.
download_ud(corpus_name, overwrite=False)
corpus = UniversalDependencies(corpus_name)
#corpus = AdjustedForSpeech(corpus)

In [2]:
# Create the UPOS tagger; the train split is used for training and the dev
# split is loaded as the test corpus.
tagger = UposTagger()
tagger.load_train_corpus(corpus.train)
tagger.load_test_corpus(corpus.dev)

# NOTE(review): 'cuda:6' is the device of the machine this notebook was run
# on; change it to a device that exists on yours.
_ = tagger.train(
    MODEL_FN,
    device='cuda:6',
    word_emb_type='bert',
    # Name of the tuned BERT model; with the settings above it evaluates to
    # 'upos_bert-base-multilingual-cased_len512_ep3_bat8_seed42'.
    word_emb_path=(
        MODEL_FN.replace('-bert_model', '_' + BERT_MODEL_FN)
        + '_len{}_ep{}_bat{}_seed{}'.format(
            BERT_MAX_LEN, BERT_EPOCHS, BERT_BATCH_SIZE, SEED
        )
    ),
    word_emb_tune_params={
        'model_name': BERT_MODEL_FN,
        'max_len': BERT_MAX_LEN,
        'epochs': BERT_EPOCHS,
        'batch_size': BERT_BATCH_SIZE
    },
    rnn_emb_dim=None,
    cnn_emb_dim=None,
    cnn_kernels=range(1, 7),
    lstm_layers=3,
    seed=SEED,
    # The reload cell below reads `tagger.embs`; keep_embs=True presumably
    # keeps the embeddings on the tagger for that purpose -- TODO confirm.
    keep_embs=True,
)

Train: Load corpus
Corpus has been loaded: 48814 sentences, 871526 tokens
Test: Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Parse corpus
done: 55398 sentences, 979787 acceptable tokens (plus 575 for YO letters)
Fit corpus dict... done.
=== UPOS TAGGER TRAINING PIPELINE ===

BERT MODEL TUNING 'bert-base-multilingual-cased'. The result's model name will be 'upos_bert-base-multilingual-cased_len512_ep3_bat8_seed42'
Loading tokenizer...
Tokenizer is loaded. Vocab size: 119547
Corpora processing... done.
Loading model 'bert-base-multilingual-cased'... done.
Epoch 1: 100%|██████████| 48814/48814 [12:14<00:00, 66.13it/s, train_loss=0.0717] 
Average train loss: 0.08670417480997196
Dev: accuracy = 0.98225193
Dev: precision = 0.96434810
Dev: recall = 0.88591372
Dev: f1_score = 0.89677961
NB: Scores may be high because of labels stretching
Saving model to upos_bert-base-multilingual-cased_len512_ep3_bat8_seed42
Epoch 2: 100%|██████████| 48814/48814 [12:06<00:00, 67.79it/s, train_loss=0.0087]  
Average train loss: 0.03311146485173351
Dev: accuracy = 0.98572055
Dev: precision = 0.96505439
Dev: recall = 0.93086512
Dev: f

In [3]:
# CoNLL-U files that the dev / test predictions are saved to.
conllu_fn = MODEL_FN.replace('_model', '.conllu')
res_dev = 'corpora/_dev_' + conllu_fn
res_test = 'corpora/_test_' + conllu_fn

# Start from a fresh tagger. When a `tagger` already exists in the notebook
# namespace (e.g. left by the training cell), its embeddings are handed over
# to the new instance; otherwise no embeddings are passed.
if 'tagger' in globals():
    tagger = UposTagger(embs=tagger.embs)
else:
    tagger = UposTagger(embs=None)
tagger.load(MODEL_FN)
# NOTE(review): presumably resets leftover tqdm progress-bar state -- confirm
# against the junky docs.
junky.clear_tqdm()

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [4]:
# Tag the dev split and save the tagged corpus to `res_dev`.
# NOTE(review): the exact effect of clone_ds=True is not visible here --
# confirm against the mordl documentation.
_ = tagger.predict(corpus.dev, clone_ds=True, save_to=res_dev)

Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Vectorizing
100%|██████████| 103/103 [00:10<00:00,  9.80it/s]
Reordering
100%|██████████| 6584/6584 [00:23<00:00, 277.73it/s]


In [5]:
# Evaluate the tagger on the gold dev split. Only the gold corpus is given,
# so the predictions are presumably produced on the fly.
_ = tagger.evaluate(corpus.dev)

Evaluating UPOS


Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Processing corpus
100%|██████████| 6584/6584 [01:09<00:00, 94.83it/s] 
UPOS total: 118488
   correct: 117275
     wrong: 1213
  Accuracy: 0.9897626763891703
[By sentence accuracy: 0.8479647630619684]


In [6]:
# Evaluate the predictions saved in `res_dev` against the gold dev split.
# The scores are expected to match the on-the-fly evaluation.
_ = tagger.evaluate(corpus.dev, res_dev)

Evaluating UPOS
Load corpus
[> 900                                                            

Load corpus
[> 0                                                             [> 100                                                            [> 200                                                            [> 300                                                            [> 400                                                            [> 500                                                            [> 600                                                            [> 700                                                            [> 800                                                            [> 900                                                            

[==> 2300                                                          

[> 1000                                                            [=> 1100                                                           [=> 1200                                                           [=> 1300                                                           [=> 1400                                                           [=> 1500                                                           [=> 1600                                                           [=> 1700                                                           [=> 1800                                                           [=> 1900                                                           [=> 2000                                                           [==> 2100                                                          [==> 2200                                                          [==> 2300                                                          [==> 2400                                      

[===> 3500                                                         

[==> 2800                                                          [==> 2900                                                          [==> 3000                                                          [===> 3100                                                         [===> 3200                                                         [===> 3300                                                         [===> 3400                                                         [===> 3500                                                         [===> 3600                                                         [===> 3700                                                         [===> 3800                                                         [===> 3900                                                         [===> 4000                                                         [====> 4100                                                        [====> 4200                                    

Corpus has been loaded: 6584 sentences, 118692 tokens
UPOS total: 118488
   correct: 117275
     wrong: 1213
  Accuracy: 0.9897626763891703
[By sentence accuracy: 0.8479647630619684]


Corpus has been loaded: 6584 sentences, 118692 tokens


In [7]:
# Tag the test split and save the tagged corpus to `res_test`.
_ = tagger.predict(corpus.test, save_to=res_test)

Load corpus
[> 0                                                             [> 100                                                            [> 200                                                            [> 300                                                            [> 400                                                            [> 500                                                            [> 600                                                            [> 700                                                            [> 800                                                            [> 900                                                            [> 1000                                                            [=> 1100                                                           [=> 1200                                                           [=> 1300                                                           [=> 1400                                      

Processing corpus
  0%|          | 0/6491 [00:00<?, ?it/s]

Corpus has been loaded: 6491 sentences, 117523 tokens


100%|██████████| 6491/6491 [01:07<00:00, 95.98it/s] 


In [8]:
# Evaluate the tagger on the gold test split.
# NOTE(review): the exact effect of clone_ds=True is not visible here --
# confirm against the mordl documentation.
_ = tagger.evaluate(corpus.test, clone_ds=True)

Evaluating UPOS


Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Vectorizing
100%|██████████| 102/102 [00:10<00:00,  9.86it/s]
Reordering
100%|██████████| 6491/6491 [00:23<00:00, 274.18it/s]
UPOS total: 117329
   correct: 116349
     wrong: 980
  Accuracy: 0.9916474187967169
[By sentence accuracy: 0.870127869357572]


In [9]:
# Evaluate the predictions saved in `res_test` against the gold test split.
# The scores are expected to match the on-the-fly evaluation.
_ = tagger.evaluate(corpus.test, res_test)

Evaluating UPOS
Load corpus
[> 0                                                             [> 100                                                            [> 200                                                            [> 300                                                            [> 400                                                            [> 500                                                            [> 600                                                            [> 700                                                            [> 800                                                            [> 900                                                            [> 1000                                                            [=> 1100                                                           [=> 1200                                                           [=> 1300                                                           [=> 1400                      

Load corpus
[> 0                                                             [> 100                                                            [> 200                                                            [> 300                                                            [> 400                                                            [> 500                                                            [> 600                                                            [> 700                                                            [> 800                                                            [> 900                                                            [> 1000                                                            [=> 1100                                                           [=> 1200                                                           [=> 1300                                                           [=> 1400                                      

Corpus has been loaded: 6491 sentences, 117523 tokens
UPOS total: 117329
   correct: 116349
     wrong: 980
  Accuracy: 0.9916474187967169
[By sentence accuracy: 0.870127869357572]


Corpus has been loaded: 6491 sentences, 117523 tokens


In [10]:
# Gold test sentences and the predictions previously saved to `res_test`
# (read back through the tagger's private `_get_corpus` helper).
corp_gold = list(corpus.test())
corp_test = list(tagger._get_corpus(res_test))

# Every non-empty UPOS label occurring in the gold corpus. The first element
# of each corpus item is iterated as the sentence's tokens.
tags = sorted({
    token['UPOS']
    for sentence in corp_gold
    for token in sentence[0]
    if token['UPOS']
})

# Score each label on its own.
# NOTE(review): log_file=None presumably silences the evaluation report --
# confirm against the mordl documentation.
for tag in tags:
    tag_score = tagger.evaluate(corp_gold, corp_test,
                                label=tag, log_file=None)
    print('{}: {}'.format(tag, tag_score))

Load corpus
[> 0                                                             [> 100                                                            [> 200                                                            [> 300                                                            [> 400                                                            [> 500                                                            [> 600                                                            [> 700                                                            [> 800                                                            [> 900                                                            [> 1000                                                            [=> 1100                                                           [=> 1200                                                           [=> 1300                                                           [=> 1400                                      

Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Corpus has been loaded: 6491 sentences, 117523 tokens
ADJ: 0.9702855097109102
ADP: 0.99921875
ADV: 0.9624224123469216
AUX: 0.9529190207156308
CCONJ: 0.9849534272748985
DET: 0.9514528703047485
INTJ: 0.5
NOUN: 0.9900653049881896
NUM: 0.9750824477830707
PART: 0.9628318584070796
PRON: 0.9704269486878182
PROPN: 0.9664943123061014
PUNCT: 0.9996726983681676
SCONJ: 0.9475441106342394
SYM: 1.0
VERB: 0.9830832823025107
X: 0.75


## FEATS

In [11]:
from corpuscula.corpus_utils import download_ud, UniversalDependencies, \
                                    AdjustedForSpeech
import junky
from mordl import FeatsTagger

# Seed handed to the tagger's training routine.
SEED = 42

# Pretrained BERT model to tune, and the name the trained tagger is saved
# under / loaded from.
BERT_MODEL_FN = 'bert-base-multilingual-cased'
MODEL_FN = 'feats-bert_model'

# BERT tuning settings (forwarded via `word_emb_tune_params`).
BERT_MAX_LEN = 512
BERT_EPOCHS = 3
BERT_BATCH_SIZE = 8

# SynTagRus is enabled below for real model training. To only try the
# pipeline out on UD Taiga as an example, comment SynTagRus and uncomment
# Taiga.
#corpus_name = 'UD_Russian-Taiga'
corpus_name = 'UD_Russian-SynTagRus'
# NOTE(review): overwrite=False presumably keeps an already downloaded
# copy of the corpus -- confirm against the corpuscula docs.
download_ud(corpus_name, overwrite=False)
corpus = UniversalDependencies(corpus_name)
#corpus = AdjustedForSpeech(corpus)

In [12]:
# Create the FEATS tagger; the train split is used for training and the dev
# split is loaded as the test corpus.
tagger = FeatsTagger()
tagger.load_train_corpus(corpus.train)
tagger.load_test_corpus(corpus.dev)

# NOTE(review): 'cuda:6' is the device of the machine this notebook was run
# on; change it to a device that exists on yours.
_ = tagger.train(
    MODEL_FN,
    device='cuda:6',
    word_emb_type='bert',
    # Name of the tuned BERT model; with the settings above it evaluates to
    # 'feats_bert-base-multilingual-cased_len512_ep3_bat8_seed42'.
    word_emb_path=(
        MODEL_FN.replace('-bert_model', '_' + BERT_MODEL_FN)
        + '_len{}_ep{}_bat{}_seed{}'.format(
            BERT_MAX_LEN, BERT_EPOCHS, BERT_BATCH_SIZE, SEED
        )
    ),
    word_emb_tune_params={
        'model_name': BERT_MODEL_FN,
        'max_len': BERT_MAX_LEN,
        'epochs': BERT_EPOCHS,
        'batch_size': BERT_BATCH_SIZE
    },
    rnn_emb_dim=None,
    cnn_emb_dim=None,
    cnn_kernels=range(1, 7),
    upos_emb_dim=300,
    lstm_layers=3,
    seed=SEED,
    # The reload cell below reads `tagger.embs`; keep_embs=True presumably
    # keeps the embeddings on the tagger for that purpose -- TODO confirm.
    keep_embs=True,
)

Train: Load corpus
Corpus has been loaded: 48814 sentences, 871526 tokens
Test: Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


=== FEATS TAGGER TRAINING PIPELINE ===

BERT MODEL TUNING 'bert-base-multilingual-cased'. The result's model name will be 'feats_bert-base-multilingual-cased_len512_ep3_bat8_seed42'
Loading tokenizer...
Tokenizer is loaded. Vocab size: 119547
Corpora processing... done.
Loading model 'bert-base-multilingual-cased'... done.
Epoch 1: 100%|██████████| 48814/48814 [12:23<00:00, 65.61it/s, train_loss=0.119] 
Average train loss: 0.48633251200094013


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Dev: accuracy = 0.94740854
Dev: precision = 0.61015736
Dev: recall = 0.63154997
Dev: f1_score = 0.60670891
NB: Scores may be high because of labels stretching
Saving model to feats_bert-base-multilingual-cased_len512_ep3_bat8_seed42
Epoch 2: 100%|██████████| 48814/48814 [12:26<00:00, 65.57it/s, train_loss=0.102] 
Average train loss: 0.13621492406871671
Dev: accuracy = 0.96035391
Dev: precision = 0.72256610
Dev: recall = 0.74398063
Dev: f1_score = 0.72458306
NB: Scores may be high because of labels stretching
Saving model to feats_bert-base-multilingual-cased_len512_ep3_bat8_seed42
Epoch 3: 100%|██████████| 48814/48814 [12:23<00:00, 65.65it/s, train_loss=0.0174] 
Average train loss: 0.07456367332809576
Dev: accuracy = 0.96498032
Dev: precision = 0.76500494
Dev: recall = 0.77045950
Dev: f1_score = 0.75925111
NB: Scores may be high because of labels stretching
Saving model to feats_bert-base-multilingual-cased_len512_ep3_bat8_seed42

DATASETS CREATION
Tokenizing
100%|██████████| 48814/488

Epoch 25: 100%|██████████| 48814/48814 [04:11<00:00, 196.33it/s, train_loss=0.0084] 
Epoch 25: 
Losses: train = 0.00848115, test = 0.03186245
Test: accuracy = 0.98268179
Test: precision = 0.85341732
Test: recall = 0.85012187
Test: f1_score = 0.84648362
BAD EPOCHS: 3 (<< >)
Epoch 26: 100%|██████████| 48814/48814 [04:12<00:00, 200.36it/s, train_loss=0.00872]
Epoch 26: 
Losses: train = 0.00853294, test = 0.03149688
Test: accuracy = 0.98280839
Test: precision = 0.86580909
Test: recall = 0.85988717
Test: f1_score = 0.85725722
new maximum score 0.98280839
Saving state_dict... done.
Epoch 27: 100%|██████████| 48814/48814 [04:11<00:00, 191.57it/s, train_loss=0.00849]
Epoch 27: 
Losses: train = 0.00839034, test = 0.03202971
Test: accuracy = 0.98272399
Test: precision = 0.86262957
Test: recall = 0.85523915
Test: f1_score = 0.85343395
BAD EPOCHS: 1 (<< <)
Epoch 28: 100%|██████████| 48814/48814 [04:10<00:00, 193.14it/s, train_loss=0.00874]
Epoch 28: 
Losses: train = 0.00839503, test = 0.03146320
T

In [13]:
# CoNLL-U files that the dev / test predictions are saved to.
conllu_fn = MODEL_FN.replace('_model', '.conllu')
res_dev = 'corpora/_dev_' + conllu_fn
res_test = 'corpora/_test_' + conllu_fn

# Start from a fresh tagger. When a `tagger` already exists in the notebook
# namespace (e.g. left by the training cell), its embeddings are handed over
# to the new instance; otherwise no embeddings are passed.
if 'tagger' in globals():
    tagger = FeatsTagger(embs=tagger.embs)
else:
    tagger = FeatsTagger(embs=None)
tagger.load(MODEL_FN)
# NOTE(review): presumably resets leftover tqdm progress-bar state -- confirm
# against the junky docs.
junky.clear_tqdm()

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [14]:
# Tag the dev split and save the tagged corpus to `res_dev`.
# NOTE(review): the exact effect of clone_ds=True is not visible here --
# confirm against the mordl documentation.
_ = tagger.predict(corpus.dev, clone_ds=True, save_to=res_dev)

Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Vectorizing
100%|██████████| 103/103 [00:10<00:00,  9.69it/s]
Reordering
100%|██████████| 6584/6584 [00:23<00:00, 279.03it/s]


In [15]:
# Evaluate the tagger on the gold dev split. Only the gold corpus is given,
# so the predictions are presumably produced on the fly.
_ = tagger.evaluate(corpus.dev)

Evaluating FEATS


Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Processing corpus
100%|██████████| 6584/6584 [01:09<00:00, 94.56it/s] 
FEATS total: 118488 tokens, 287162 tags
    correct: 116461 tokens, 284019 tags
      wrong: 2027 tokens, 3143 tags [507 excess / 438 absent / 2198 wrong type]
   Accuracy: 0.9721902096367029 / 0.989054958525153
[Total accuracy: 0.9828927823914658 / 0.989054958525153]
[By sentence accuracy: 0.7828068043742406]


In [16]:
# Evaluate the predictions saved in `res_dev` against the gold dev split.
# The scores are expected to match the on-the-fly evaluation.
_ = tagger.evaluate(corpus.dev, res_dev)

Evaluating FEATS
Load corpus
[==> 2200                                                          

Load corpus
[> 0                                                             [> 100                                                            [> 200                                                            [> 300                                                            [> 400                                                            [> 500                                                            [> 600                                                            [> 700                                                            [> 800                                                            [> 900                                                            [> 1000                                                            [=> 1100                                                           [=> 1200                                                           [=> 1300                                                           [=> 1400                                      

Corpus has been loaded: 6584 sentences, 118692 tokens
FEATS total: 118488 tokens, 287162 tags
    correct: 116461 tokens, 284019 tags
      wrong: 2027 tokens, 3143 tags [507 excess / 438 absent / 2198 wrong type]
   Accuracy: 0.9721902096367029 / 0.989054958525153
[Total accuracy: 0.9828927823914658 / 0.989054958525153]
[By sentence accuracy: 0.7828068043742406]


Corpus has been loaded: 6584 sentences, 118692 tokens


In [17]:
# Tag the test split and save the tagged corpus to `res_test`.
_ = tagger.predict(corpus.test, save_to=res_test)

Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Processing corpus
100%|██████████| 6491/6491 [01:07<00:00, 96.15it/s] 


In [18]:
# Evaluate the tagger on the gold test split.
# NOTE(review): the exact effect of clone_ds=True is not visible here --
# confirm against the mordl documentation.
_ = tagger.evaluate(corpus.test, clone_ds=True)

Evaluating FEATS


Load corpus
[====> 4600                                                        

Vectorizing
100%|██████████| 102/102 [00:10<00:00,  9.76it/s]
Reordering
100%|██████████| 6491/6491 [00:23<00:00, 278.72it/s]
FEATS total: 117329 tokens, 285082 tags
    correct: 115318 tokens, 281895 tags
      wrong: 2011 tokens, 3187 tags [466 excess / 514 absent / 2207 wrong type]
   Accuracy: 0.9721714823425218 / 0.9888207603426382
[Total accuracy: 0.9828601624491814 / 0.9888207603426382]
[By sentence accuracy: 0.7724541673085811]


Corpus has been loaded: 6491 sentences, 117523 tokens


In [19]:
# Evaluate the predictions saved in `res_test` against the gold test split.
# The scores are expected to match the on-the-fly evaluation.
_ = tagger.evaluate(corpus.test, res_test)

Evaluating FEATS
Load corpus
[> 0                                                             [> 100                                                            [> 200                                                            [> 300                                                            [> 400                                                            [> 500                                                            [> 600                                                            [> 700                                                            [> 800                                                            [> 900                                                            [> 1000                                                            [=> 1100                                                           [=> 1200                                                           [=> 1300                                                           [=> 1400                     

Load corpus
[> 0                                                             [> 100                                                            [> 200                                                            [> 300                                                            [> 400                                                            [> 500                                                            [> 600                                                            [> 700                                                            [> 800                                                            [> 900                                                            [> 1000                                                            [=> 1100                                                           [=> 1200                                                           [=> 1300                                                           [=> 1400                                      

Corpus has been loaded: 6491 sentences, 117523 tokens
FEATS total: 117329 tokens, 285082 tags
    correct: 115318 tokens, 281895 tags
      wrong: 2011 tokens, 3187 tags [466 excess / 514 absent / 2207 wrong type]
   Accuracy: 0.9721714823425218 / 0.9888207603426382
[Total accuracy: 0.9828601624491814 / 0.9888207603426382]
[By sentence accuracy: 0.7724541673085811]


Corpus has been loaded: 6491 sentences, 117523 tokens


In [20]:
# Gold test sentences and the predictions previously saved to `res_test`
# (read back through the tagger's private `_get_corpus` helper).
corp_gold = list(corpus.test())
corp_test = list(tagger._get_corpus(res_test))

# Every feature name occurring in the FEATS field of the gold corpus. The
# first element of each corpus item is iterated as the sentence's tokens.
tags = sorted({
    feat_name
    for sentence in corp_gold
    for token in sentence[0]
    for feat_name in token['FEATS'].keys()
})

# Score each feature on its own.
# NOTE(review): log_file=None presumably silences the evaluation report --
# confirm against the mordl documentation.
for tag in tags:
    tag_score = tagger.evaluate(corp_gold, corp_test,
                                feats=tag, log_file=None)
    print('{}: {}'.format(tag, tag_score))

Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens
Animacy: 0.9845278020460502
Aspect: 0.9904033480049066
Case: 0.9824618444707952
Degree: 0.9971560611446854
Foreign: 0.703030303030303
Gender: 0.9849326582070889
Mood: 0.9963483456899414
Number: 0.99327512783834
Person: 0.9975793820304714
Polarity: 0.8695652173913043
Tense: 0.9944692239072257
Variant: 0.9956575682382134
VerbForm: 0.9983404286023523
Voice: 0.9891767082762104


## LEMMA

Note: For lemmata we use *FastText* word embeddings instead of *BERT*.
With *FastText*, accuracy is much higher (0.9946 vs. 0.9913 for *BERT*).

**NB:** For this task, we use Russian *FastText* embeddings
provided by *Facebook*:
[cc.ru.300.bin.gz](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz).
We highly recommend them because they deliver the highest
evaluation scores. Also, embeddings provided by *DeepPavlov*
([ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin](http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin))
could be used, too. They deliver just slightly worse model
quality.

Embeddings from RusVectores are the worst choice because of
inappropriate preprocessing. If you want your model to achieve
high scores, use embeddings without any lemmatization,
punctuation removal and other archaic transformations.
Embeddings of words with part of speech tags appended are also
useless.

In [21]:
from corpuscula.corpus_utils import download_ud, UniversalDependencies, \
                                    AdjustedForSpeech
import junky
from mordl import LemmaTagger

# Seed handed to the tagger's training routine.
SEED = 42

# FastText model file passed to training as `word_emb_path`, and the name
# the trained tagger is saved under / loaded from.
FT_MODEL_FN = 'cc.ru.300.bin'
MODEL_FN = 'lemma-ft_model'

# SynTagRus is enabled below for real model training. To only try the
# pipeline out on UD Taiga as an example, comment SynTagRus and uncomment
# Taiga.
#corpus_name = 'UD_Russian-Taiga'
corpus_name = 'UD_Russian-SynTagRus'
# NOTE(review): overwrite=False presumably keeps an already downloaded
# copy of the corpus -- confirm against the corpuscula docs.
download_ud(corpus_name, overwrite=False)
corpus = UniversalDependencies(corpus_name)
#corpus = AdjustedForSpeech(corpus)

In [22]:
# Create the LEMMA tagger; the train split is used for training and the dev
# split is loaded as the test corpus.
tagger = LemmaTagger()
tagger.load_train_corpus(corpus.train)
tagger.load_test_corpus(corpus.dev)

# NOTE(review): 'cuda:6' is the device of the machine this notebook was run
# on; change it to a device that exists on yours.
_ = tagger.train(
    MODEL_FN,
    device='cuda:6',
    # FastText embeddings are used here instead of BERT.
    word_emb_type='ft',
    word_emb_path=FT_MODEL_FN,
    rnn_emb_dim=300,
    cnn_emb_dim=None,
    cnn_kernels=range(1, 7),
    upos_emb_dim=300,
    lstm_layers=1,
    seed=SEED,
    # The reload cell below reads `tagger.embs`; keep_embs=True presumably
    # keeps the embeddings on the tagger for that purpose -- TODO confirm.
    keep_embs=True,
)

Train: Load corpus
Corpus has been loaded: 48814 sentences, 871526 tokens
Test: Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Parse corpus
done: 55398 sentences, 979787 acceptable tokens (plus 575 for YO letters)
Fit corpus dict... done.

Preliminary trainset preparation:
stage 1 of 3... done.
stage 2 of 3... done.
Lengths: [1883 {'allow_replace': True, 'allow_copy': True},
          1881 {'allow_replace': True, 'allow_copy': False},
          1915 {'allow_replace': False, 'allow_copy': True},
          1881 {'allow_replace': False, 'allow_copy': False}]
min = {'allow_replace': True, 'allow_copy': False}
stage 3 of 3... done.

=== LEMMA TAGGER TRAINING PIPELINE ===

DATASETS CREATION

MODEL CREATION
Config saved

MODEL TRAINING
Epoch 1: 100%|██████████| 48814/48814 [02:09<00:00, 377.14it/s, train_loss=0.143]
Epoch 1: 
Losses: train = 0.31242337, test = 0.09480128
Test: accuracy = 0.95196982
Test: precision = 0.22320700
Test: recall = 0.21054018
Test: f1_score = 0.20824801
new maximum score 0.95196982
Saving state_dict... done.
Epoch 2: 100%|██████████| 48814/48814 [02:10<00:00, 373.20it/s, train_loss=0.0826]


Epoch 26: 100%|██████████| 48814/48814 [02:07<00:00, 377.57it/s, train_loss=0.00862]
Epoch 26: 
Losses: train = 0.00836087, test = 0.01787965
Test: accuracy = 0.99291911
Test: precision = 0.81615304
Test: recall = 0.79733326
Test: f1_score = 0.79880132
BAD EPOCHS: 2 (<< <)
Epoch 27: 100%|██████████| 48814/48814 [02:09<00:00, 369.25it/s, train_loss=0.0082] 
Epoch 27: 
Losses: train = 0.00790170, test = 0.01729759
Test: accuracy = 0.99293599
Test: precision = 0.82556965
Test: recall = 0.80496194
Test: f1_score = 0.80820625
BAD EPOCHS: 2 (<< >)
Epoch 28: 100%|██████████| 48814/48814 [02:08<00:00, 373.04it/s, train_loss=0.00753]
Epoch 28: 
Losses: train = 0.00752251, test = 0.01729198
Test: accuracy = 0.99299507
Test: precision = 0.81649609
Test: recall = 0.80098045
Test: f1_score = 0.80246074
BAD EPOCHS: 2 (<< >)
Epoch 29: 100%|██████████| 48814/48814 [02:07<00:00, 389.08it/s, train_loss=0.00737]
Epoch 29: 
Losses: train = 0.00740685, test = 0.01725600
Test: accuracy = 0.99303727
Test: pr

Epoch 2: 100%|██████████| 48814/48814 [02:06<00:00, 384.93it/s, train_loss=0.00373]
Epoch 2: 
Losses: train = 0.00372300, test = 0.01732891
Test: accuracy = 0.99400783
Test: precision = 0.85083628
Test: recall = 0.83242052
Test: f1_score = 0.83483378
new maximum score 0.99400783
Saving state_dict... done.
Epoch 3: 100%|██████████| 48814/48814 [02:03<00:00, 393.85it/s, train_loss=0.00374]
Epoch 3: 
Losses: train = 0.00361163, test = 0.01766310
Test: accuracy = 0.99400783
Test: precision = 0.84866047
Test: recall = 0.83094964
Test: f1_score = 0.83326091
BAD EPOCHS: 1 (== =)
Epoch 4: 100%|██████████| 48814/48814 [02:06<00:00, 379.89it/s, train_loss=0.00375]
Epoch 4: 
Losses: train = 0.00358771, test = 0.01818226
Test: accuracy = 0.99400783
Test: precision = 0.84923373
Test: recall = 0.83071453
Test: f1_score = 0.83305865
BAD EPOCHS: 2 (== =)
Epoch 5: 100%|██████████| 48814/48814 [02:05<00:00, 391.83it/s, train_loss=0.00357]
Epoch 5: 
Losses: train = 0.00348782, test = 0.01760684
Test: acc

Epoch 31: 100%|██████████| 48814/48814 [02:28<00:00, 323.35it/s, train_loss=0.00299]
Epoch 31: 
Losses: train = 0.00294290, test = 0.01757238
Test: accuracy = 0.99409223
Test: precision = 0.84522938
Test: recall = 0.82898481
Test: f1_score = 0.83021282
BAD EPOCHS: 5 (<< <)
Maximum bad epochs exceeded. Process has been stopped. Best score 0.9941597461346297 (on epoch 24)
Elapsed time: 1h 12m 36s

=== LEMMA TAGGER TRAINING HAS FINISHED === Total time: 3h 18m 58s ===

Use the `.load('lemma-ft_model')` method to start working with the LEMMA tagger.


In [23]:
# CoNLL-U files that the dev / test predictions are saved to.
conllu_fn = MODEL_FN.replace('_model', '.conllu')
res_dev = 'corpora/_dev_' + conllu_fn
res_test = 'corpora/_test_' + conllu_fn

# Start from a fresh tagger. When a `tagger` already exists in the notebook
# namespace (e.g. left by the training cell), its embeddings are handed over
# to the new instance; otherwise no embeddings are passed.
if 'tagger' in globals():
    tagger = LemmaTagger(embs=tagger.embs)
else:
    tagger = LemmaTagger(embs=None)
tagger.load(MODEL_FN)
# NOTE(review): presumably resets leftover tqdm progress-bar state -- confirm
# against the junky docs.
junky.clear_tqdm()

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [24]:
# Lemmatize the dev split and save the result to `res_dev`.
# NOTE(review): the exact effects of use_cdict_coef=True and clone_ds=True
# are not visible here -- confirm against the mordl documentation.
_ = tagger.predict(
    corpus.dev,
    use_cdict_coef=True,
    clone_ds=True,
    save_to=res_dev,
)

Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


In [25]:
# Evaluate the tagger on the gold dev split with default settings. Only the
# gold corpus is given, so the predictions are presumably produced on the fly.
_ = tagger.evaluate(corpus.dev)

Evaluating LEMMA


Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Processing corpus
100%|██████████| 6584/6584 [00:15<00:00, 423.29it/s]
LEMMA total: 118488
    correct: 117817
      wrong: 671
   Accuracy: 0.9943369792721626
[By sentence accuracy: 0.910996354799514]


In [26]:
# Same evaluation with use_cdict_coef=True. In the recorded run this gave a
# slightly higher accuracy than the default (0.99442 vs. 0.99434).
_ = tagger.evaluate(corpus.dev, use_cdict_coef=True)

Evaluating LEMMA


Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Processing corpus
100%|██████████| 6584/6584 [00:15<00:00, 414.78it/s]
LEMMA total: 118488
    correct: 117827
      wrong: 661
   Accuracy: 0.9944213760043211
[By sentence accuracy: 0.912363304981774]


In [27]:
# Evaluate the predictions saved in `res_dev` (produced above with
# use_cdict_coef=True) against the gold dev split.
_ = tagger.evaluate(corpus.dev, res_dev)

Evaluating LEMMA
Load corpus
[> 0                                                             [> 100                                                            [> 200                                                            [> 300                                                            

Load corpus
[> 0                                                             [> 100                                                            [> 200                                                            [> 300                                                            

[> 400                                                            [> 500                                                            [> 600                                                            [> 700                                                            

[> 400                                                            [> 500                                                            [> 600                                                            [> 700                                                            

[> 800                                                            [> 900                                                            [> 1000                                                            [=> 1100                                                           

[> 800                                                            [> 900                                                            [> 1000                                                            [=> 1100                                                           

[=> 1200                                                           [=> 1300                                                           [=> 1400                                                           [=> 1500                                                           [=> 1600                                                           

[=> 1200                                                           [=> 1300                                                           [=> 1400                                                           [=> 1500                                                           [=> 1600                                                           

[=> 1700                                                           [=> 1800                                                           [=> 1900                                                           [=> 2000                                                           [==> 2100                                                          [==> 2200                                                          

[=> 1700                                                           [=> 1800                                                           [=> 1900                                                           [=> 2000                                                           [==> 2100                                                          [==> 2200                                                          

[==> 2300                                                          [==> 2400                                                          [==> 2500                                                          [==> 2600                                                          [==> 2700                                                          [==> 2800                                                          

[==> 2300                                                          [==> 2400                                                          [==> 2500                                                          [==> 2600                                                          [==> 2700                                                          [==> 2800                                                          

[==> 2900                                                          [==> 3000                                                          [===> 3100                                                         [===> 3200                                                         

[==> 2900                                                          [==> 3000                                                          [===> 3100                                                         [===> 3200                                                         

[===> 3300                                                         [===> 3400                                                         [===> 3500                                                         [===> 3600                                                         [===> 3700                                                         [===> 3800                                                         [===> 3900                                                         [===> 4000                                                         [====> 4100                                                        

[===> 3300                                                         [===> 3400                                                         [===> 3500                                                         [===> 3600                                                         [===> 3700                                                         [===> 3800                                                         [===> 3900                                                         [===> 4000                                                         [====> 4100                                                        

[====> 4200                                                        [====> 4300                                                        [====> 4400                                                        [====> 4500                                                        [====> 4600                                                        [====> 4700                                                        

[====> 4200                                                        [====> 4300                                                        [====> 4400                                                        [====> 4500                                                        [====> 4600                                                        [====> 4700                                                        

[====> 4800                                                        [====> 4900                                                        [====> 5000                                                        

[====> 4800                                                        [====> 4900                                                        [====> 5000                                                        

[=====> 5100                                                       [=====> 5200                                                       [=====> 5300                                                       [=====> 5400                                                       [=====> 5500                                                       [=====> 5600                                                       

[=====> 5100                                                       [=====> 5200                                                       [=====> 5300                                                       [=====> 5400                                                       [=====> 5500                                                       [=====> 5600                                                       





Corpus has been loaded: 6584 sentences, 118692 tokens
LEMMA total: 118488
    correct: 117827
      wrong: 661
   Accuracy: 0.9944213760043211
[By sentence accuracy: 0.912363304981774]


Corpus has been loaded: 6584 sentences, 118692 tokens


In [28]:
# Tag the test split with use_cdict_coef=True and write the result to
# res_test, so that it can be scored from the file in a later cell.
_ = tagger.predict(corpus.test, use_cdict_coef=True, save_to=res_test)

Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Processing corpus
100%|██████████| 6491/6491 [00:20<00:00, 314.63it/s]


In [29]:
# Test-split evaluation without use_cdict_coef.
# NOTE(review): clone_ds=True is forwarded to mordl; its effect is not
# visible in this notebook - confirm against the mordl docs.
_ = tagger.evaluate(corpus.test, clone_ds=True)

Evaluating LEMMA


Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


LEMMA total: 117329
    correct: 116697
      wrong: 632
   Accuracy: 0.9946134374280868
[By sentence accuracy: 0.9137266985056232]


In [30]:
# Test-split evaluation with use_cdict_coef=True (clone_ds=True as in the
# previous test evaluation), to compare the two variants on the test split.
_ = tagger.evaluate(corpus.test, use_cdict_coef=True, clone_ds=True)

Evaluating LEMMA


Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


LEMMA total: 117329
    correct: 116692
      wrong: 637
   Accuracy: 0.994570822217866
[By sentence accuracy: 0.9131104606378062]


In [31]:
# Evaluate from file: compare the test split with the predictions saved to
# res_test. They were produced with use_cdict_coef=True, so the scores are
# expected to match that variant.
_ = tagger.evaluate(corpus.test, res_test)

Evaluating LEMMA
Load corpus
[> 600                                                            

Load corpus
[> 0                                                             [> 100                                                            [> 200                                                            [> 300                                                            [> 400                                                            [> 500                                                            [> 600                                                            

[> 1000                                                            

[> 700                                                            [> 800                                                            [> 900                                                            [> 1000                                                            

[=> 1500                                                           

[=> 1100                                                           [=> 1200                                                           [=> 1300                                                           [=> 1400                                                           [=> 1500                                                           

[==> 2100                                                          

[=> 1600                                                           [=> 1700                                                           [=> 1800                                                           [=> 1900                                                           [=> 2000                                                           [==> 2100                                                          [==> 2200                                                          

[==> 2600                                                          

[==> 2300                                                          [==> 2400                                                          [==> 2500                                                          [==> 2600                                                          

[===> 3100                                                         

[==> 2700                                                          [==> 2800                                                          [==> 2900                                                          [==> 3000                                                          [===> 3100                                                         

[===> 3600                                                         

[===> 3200                                                         [===> 3300                                                         [===> 3400                                                         [===> 3500                                                         [===> 3600                                                         [===> 3700                                                         [===> 3800                                                         

[====> 4200                                                        

[===> 3900                                                         [===> 4000                                                         [====> 4100                                                        [====> 4200                                                        [====> 4300                                                        

[====> 4700                                                        

[====> 4400                                                        [====> 4500                                                        [====> 4600                                                        [====> 4700                                                        [====> 4800                                                        

[=====> 5200                                                       

[====> 4900                                                        [====> 5000                                                        [=====> 5100                                                       [=====> 5200                                                       [=====> 5300                                                       

[=====> 5700                                                       

[=====> 5400                                                       [=====> 5500                                                       [=====> 5600                                                       [=====> 5700                                                       [=====> 5800                                                       





Corpus has been loaded: 6491 sentences, 117523 tokens
LEMMA total: 117329
    correct: 116692
      wrong: 637
   Accuracy: 0.994570822217866
[By sentence accuracy: 0.9131104606378062]


Corpus has been loaded: 6491 sentences, 117523 tokens


## CoNLL18 Validation

In [32]:
from corpuscula.corpus_utils import download_ud, get_ud_test_path
import junky
from mordl import UposTagger, FeatsTagger, LemmaTagger, conll18_ud_eval

# SynTagRus is the corpus that is active here (the one intended for real
# model training). UD Taiga is kept only as an example alternative: to use
# it, uncomment the Taiga line and comment out the SynTagRus one.
#corpus_name = 'UD_Russian-Taiga'
corpus_name = 'UD_Russian-SynTagRus'
download_ud(corpus_name, overwrite=False)

# corpus_gold: the corpus's test file as returned by get_ud_test_path(); it
# serves as the gold standard. corpus_test: the file the tagged copy of that
# test set is saved to.
corpus_gold = get_ud_test_path(corpus_name)
corpus_test = 'corpora/_test_tagged.conllu'

In [33]:
# Drop the tagger left over from the previous section, if there is one.
# A bare `del tagger` raises NameError when this section is run on a fresh
# kernel, so the name is removed only when it is actually defined.
globals().pop('tagger', None)

# Load the three trained models used by the validation pipeline. Each one is
# loaded with both `device` and `dataset_device` set to the same GPU.
tagger_u = UposTagger()
tagger_u.load('upos-bert_model', device='cuda:6', dataset_device='cuda:6')
tagger_f = FeatsTagger()
tagger_f.load('feats-bert_model', device='cuda:6', dataset_device='cuda:6')
tagger_l = LemmaTagger()
tagger_l.load('lemma-ft_model', device='cuda:6', dataset_device='cuda:6')

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.
Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.
Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [34]:
# Run the three taggers as a chain over the gold test file: UPOS first,
# FEATS on top of the UPOS output, LEMMA last. Only the final stage writes
# its result to disk (corpus_test).
upos_tagged = tagger_u.predict(corpus_gold)
upos_feats_tagged = tagger_f.predict(upos_tagged)
_ = tagger_l.predict(upos_feats_tagged, save_to=corpus_test)

# Neither the intermediate results nor the models are used after this cell.
del upos_tagged, upos_feats_tagged
del tagger_u, tagger_f, tagger_l

Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens
Processing corpus
100%|██████████| 6491/6491 [01:17<00:00, 83.37it/s]
Processing corpus
100%|██████████| 6491/6491 [01:16<00:00, 84.69it/s]
Processing corpus
100%|██████████| 6491/6491 [00:16<00:00, 390.27it/s]


In [35]:
# Score the tagged file (corpus_test) against the gold test file
# (corpus_gold) with mordl's conll18_ud_eval; the call prints the metrics
# table shown below.
conll18_ud_eval(corpus_gold, corpus_test)

Metric     | Precision |    Recall |  F1 Score | AligndAcc
-----------+-----------+-----------+-----------+-----------
Tokens     |    100.00 |    100.00 |    100.00 |
Sentences  |    100.00 |    100.00 |    100.00 |
Words      |    100.00 |    100.00 |    100.00 |
UPOS       |     99.16 |     99.16 |     99.16 |     99.16
XPOS       |    100.00 |    100.00 |    100.00 |    100.00
UFeats     |     97.76 |     97.76 |     97.76 |     97.76
AllTags    |     97.58 |     97.58 |     97.58 |     97.58
Lemmas     |     98.66 |     98.66 |     98.66 |     98.66
UAS        |    100.00 |    100.00 |    100.00 |    100.00
LAS        |    100.00 |    100.00 |    100.00 |    100.00
CLAS       |    100.00 |    100.00 |    100.00 |    100.00
MLAS       |     96.25 |     96.25 |     96.25 |     96.25
BLEX       |     97.97 |     97.97 |     97.97 |     97.97


## MISC:NE

Note: the corpora we used are proprietary. You will have to find other corpora.

In [36]:
from mordl import UposTagger, FeatsTagger

# The NE tagger is trained on corpora that already carry UPOS and FEATS
# tags, so each NER split is first run through the two trained taggers.
tagger_u = UposTagger()
tagger_u.load('upos-bert_model', device='cuda:6', dataset_device='cuda:6')
tagger_f = FeatsTagger()
tagger_f.load('feats-bert_model', device='cuda:6', dataset_device='cuda:6')

# (source file, destination file) for every split.
for src_path, dst_path in [
    ('corpora/ner_train.conllu', 'corpora/ner_train_upos_feats.conllu'),
    ('corpora/ner_dev.conllu', 'corpora/ner_dev_upos_feats.conllu'),
    ('corpora/ner_test.conllu', 'corpora/ner_test_upos_feats.conllu'),
]:
    # UPOS tagging feeds FEATS tagging; only the final result is saved.
    tagger_f.predict(tagger_u.predict(src_path), save_to=dst_path)

# The taggers are not used after the loop.
del tagger_u, tagger_f

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.
Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.
Load corpus
Corpus has been loaded: 30390 sentences, 378829 tokens
Processing corpus
100%|██████████| 30390/30390 [03:53<00:00, 130.04it/s]
Processing corpus
100%|██████████| 30390/30390 [03:55<00:00, 128.92it/s]
Load corpus
[====] 3799                                                        
Corpus has been loaded: 3799 sentences, 47280 tokens
Processing corpus
100%|██████████| 3799/3799 [00:29<00:00, 130.52it/s]
Processing corpus
100%|██████████| 3799/3799 [00:29<00:00, 130.02it/s]
Load corpus
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
Processing corpus
100%|██████████| 3798/3798 [00:28<00:00, 131.71it/s]
Processing corpus
100%|██████████| 3798/3798 [00:29<00:00, 130.87it/s]


In [37]:
import junky
from mordl import NeTagger

# Pretrained BERT model that gets tuned, and the name the trained NE tagger
# is saved under / loaded from.
BERT_MODEL_FN = 'bert-base-multilingual-cased'
MODEL_FN = 'misc-ne-bert_model'

# Seed passed to training.
SEED = 42

# BERT tuning parameters.
BERT_MAX_LEN = 512
BERT_EPOCHS = 3
BERT_BATCH_SIZE = 8

# NER corpora pre-tagged with UPOS and FEATS.
corpus_train = 'corpora/ner_train_upos_feats.conllu'
corpus_dev = 'corpora/ner_dev_upos_feats.conllu'
corpus_test = 'corpora/ner_test_upos_feats.conllu'

In [38]:
# Train the MISC:NE tagger. The train split is the training corpus; the dev
# split is loaded as the "test" corpus used during training.
tagger = NeTagger()
tagger.load_train_corpus(corpus_train)
tagger.load_test_corpus(corpus_dev)

# Name for the tuned BERT embeddings: the tagger name with '-bert_model'
# replaced by '_<bert model>', followed by the tuning parameters, e.g.
# 'misc-ne_bert-base-multilingual-cased_len512_ep3_bat8_seed42'.
bert_emb_path = (
    MODEL_FN.replace('-bert_model', '_' + BERT_MODEL_FN)
    + '_len{}_ep{}_bat{}_seed{}'.format(BERT_MAX_LEN, BERT_EPOCHS,
                                        BERT_BATCH_SIZE, SEED)
)

# Parameters of the BERT tuning stage.
bert_tune_params = {
    'model_name': BERT_MODEL_FN,
    'max_len': BERT_MAX_LEN,
    'epochs': BERT_EPOCHS,
    'batch_size': BERT_BATCH_SIZE,
}

# keep_embs=True: a later cell reads `tagger.embs` when it builds a new
# NeTagger instance.
_ = tagger.train(
    MODEL_FN,
    device='cuda:6',
    word_emb_type='bert',
    word_emb_path=bert_emb_path,
    word_emb_tune_params=bert_tune_params,
    rnn_emb_dim=None,
    cnn_emb_dim=None,
    cnn_kernels=range(1, 7),
    upos_emb_dim=300,
    lstm_layers=2,
    seed=SEED,
    keep_embs=True,
)

Train: 

Load corpus
Corpus has been loaded: 30390 sentences, 378829 tokens
Load corpus
[=> 1500                                                           

Test: 

[====] 3799                                                        
Corpus has been loaded: 3799 sentences, 47280 tokens
Parse corpus
done: 34189 sentences, 408004 acceptable tokens (plus 0 for YO letters)
Fit corpus dict... done.

=== MISC:NE TAGGER TRAINING PIPELINE ===

BERT MODEL TUNING 'bert-base-multilingual-cased'. The result's model name will be 'misc-ne_bert-base-multilingual-cased_len512_ep3_bat8_seed42'
Loading tokenizer...
Tokenizer is loaded. Vocab size: 119547
Corpora processing... done.
Loading model 'bert-base-multilingual-cased'... done.
Epoch 1: 100%|██████████| 30390/30390 [06:54<00:00, 74.16it/s, train_loss=0.222] 
Average train loss: 0.22115723773073204
Dev: accuracy = 0.94016071
Dev: precision = 0.64285216
Dev: recall = 0.60400200
Dev: f1_score = 0.60815842
NB: Scores may be high because of labels stretching
Saving model to misc-ne_bert-base-multilingual-cased_len512_ep3_bat8_seed42
Epoch 2: 100%|██████████| 30390/30390 [06:53<00:00, 72.34it/s, train_loss=0.232] 


Epoch 3: 100%|██████████| 30390/30390 [01:12<00:00, 431.45it/s, train_loss=0.0297]
Epoch 3: 
Losses: train = 0.02933711, test = 0.08516556
Test: accuracy = 0.94263959
Test: precision = 0.73195317
Test: recall = 0.66416926
Test: f1_score = 0.69201386
BAD EPOCHS: 3 (<< <)
Epoch 4: 100%|██████████| 30390/30390 [01:11<00:00, 428.26it/s, train_loss=0.0289]
Epoch 4: 
Losses: train = 0.02911509, test = 0.08611045
Test: accuracy = 0.94295685
Test: precision = 0.73630820
Test: recall = 0.67433371
Test: f1_score = 0.69980265
BAD EPOCHS: 3 (<< >)
Epoch 5: 100%|██████████| 30390/30390 [01:12<00:00, 415.42it/s, train_loss=0.0291]
Epoch 5: 
Losses: train = 0.02899472, test = 0.08680843
Test: accuracy = 0.94295685
Test: precision = 0.74488739
Test: recall = 0.66745949
Test: f1_score = 0.69845889
BAD EPOCHS: 4 (<< =)
Epoch 6: 100%|██████████| 30390/30390 [01:10<00:00, 434.59it/s, train_loss=0.029] 
Epoch 6: 
Losses: train = 0.02930892, test = 0.08500303
Test: accuracy = 0.94266074
Test: precision = 0.

In [39]:
# Files that receive the tagger's predictions for the dev and test splits:
# the model name with '_model' replaced by the '.conllu' extension.
conllu_name = MODEL_FN.replace('_model', '.conllu')
res_dev = 'corpora/_dev_' + conllu_name
res_test = 'corpora/_test_' + conllu_name

# Start from a fresh tagger instance. If a `tagger` from an earlier cell is
# still defined, its embeddings are handed over to the new instance;
# otherwise embs=None is passed.
tagger = NeTagger(
    embs=None if 'tagger' not in globals() else globals()['tagger'].embs
)
tagger.load(MODEL_FN)
junky.clear_tqdm()

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [40]:
# Tag the dev split and write the predictions to res_dev.
# NOTE(review): clone_ds=True is forwarded to mordl; its effect is not
# visible in this notebook - confirm against the mordl docs.
_ = tagger.predict(corpus_dev, clone_ds=True, save_to=res_dev)

Load corpus
[====] 3799                                                        
Corpus has been loaded: 3799 sentences, 47280 tokens
Vectorizing
100%|██████████| 60/60 [00:05<00:00, 11.05it/s]
Reordering
100%|██████████| 3799/3799 [00:09<00:00, 390.78it/s]


In [41]:
# Evaluate on the dev split; only the corpus is passed, so the tagger
# produces the predictions itself during this call.
_ = tagger.evaluate(corpus_dev)

Evaluating MISC:NE
Load corpus
[====] 3799                                                        
Corpus has been loaded: 3799 sentences, 47280 tokens
Processing corpus
100%|██████████| 3799/3799 [00:27<00:00, 138.51it/s]
MISC:NE total: 17015
      correct: 14327
        wrong: 2688 [952 excess / 1001 absent / 735 wrong type]
     Accuracy: 0.84202174551866
[Total accuracy: 0.9431472081218274]
[By sentence accuracy: 0.6285864701237168]


In [42]:
# Evaluate from file: compare the dev split with the predictions that were
# saved to res_dev. The scores are expected to match the direct evaluation.
_ = tagger.evaluate(corpus_dev, res_dev)

Evaluating MISC:NE
Load corpus
Load corpus
[====] 3799                                                        
Corpus has been loaded: 3799 sentences, 47280 tokens
[====] 3799                                                        
Corpus has been loaded: 3799 sentences, 47280 tokens
MISC:NE total: 17015
      correct: 14327
        wrong: 2688 [952 excess / 1001 absent / 735 wrong type]
     Accuracy: 0.84202174551866
[Total accuracy: 0.9431472081218274]
[By sentence accuracy: 0.6285864701237168]


In [43]:
# Tag the test split and write the predictions to res_test for the
# file-based evaluations that follow.
_ = tagger.predict(corpus_test, save_to=res_test)

Load corpus
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
Processing corpus
100%|██████████| 3798/3798 [00:27<00:00, 138.89it/s]


In [44]:
# Evaluate on the test split directly from the corpus.
# NOTE(review): clone_ds=True is forwarded to mordl; its effect is not
# visible in this notebook.
_ = tagger.evaluate(corpus_test, clone_ds=True)

Evaluating MISC:NE
Load corpus
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
MISC:NE total: 16699
      correct: 14076
        wrong: 2623 [1052 excess / 927 absent / 644 wrong type]
     Accuracy: 0.8429247260314989
[Total accuracy: 0.944340703645546]
[By sentence accuracy: 0.633754607688257]


In [45]:
# Evaluate from file: compare the test split with the predictions that were
# saved to res_test.
_ = tagger.evaluate(corpus_test, res_test)

Evaluating MISC:NE
Load corpus
Load corpus
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
MISC:NE total: 16699
      correct: 14076
        wrong: 2623 [1052 excess / 927 absent / 644 wrong type]
     Accuracy: 0.8429247260314989
[Total accuracy: 0.944340703645546]
[By sentence accuracy: 0.633754607688257]


In [46]:
# Per-tag breakdown on the test split. Both corpora are materialized as
# lists because they are re-used for every tag.
# NOTE(review): _get_corpus is a private mordl method; the gold corpus is
# loaded with asis=True and the predicted one without - confirm the intent.
corp_gold = list(tagger._get_corpus(corpus_test, asis=True))
corp_test = list(tagger._get_corpus(res_test))

# All distinct non-empty NE labels found in the gold corpus. The first item
# of each sentence is iterated as its tokens (presumably the sentence is a
# (tokens, metadata) pair - TODO confirm).
tags = sorted({
    token['MISC']['NE']
    for sentence in corp_gold
    for token in sentence[0]
    if token['MISC'].get('NE')
})

# Evaluate every label separately; log_file=None is passed so that the only
# output is the line printed here.
for tag in tags:
    score = tagger.evaluate(corp_gold, corp_test, label=tag, log_file=None)
    print('{}: {}'.format(tag, score))

Load corpus
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
Load corpus
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
Address: 0.8397981709239988
City: 0.0
Date: 0.0
Department: 0.7721854304635761
Facility: 0.36936936936936937
Geo: 0.8536779324055666
Goal: 0.4025423728813559
Location: 0.3488372093023256
Organization: 0.8544906570223026
Person: 0.7182539682539683
PersonProperty: 0.7565011820330969
Phone: 0.7247191011235955
Time: 0.0


In [47]:
import gc

# Drop the reference to the tagger and run a garbage-collection pass.
# gc.collect() is the cell's last expression, so its return value (the
# number of unreachable objects found) is displayed as the cell output.
del tagger
gc.collect()

0