## UPOS

In [1]:
from corpuscula.corpus_utils import download_ud, UniversalDependencies, \
                                    AdjustedForSpeech
import junky
from mordl import UposTagger, FeatsTagger

# --- Configuration of the UPOS tagger run ---
BERT_MODEL_FN = 'xlm-roberta-base'
MODEL_FN = 'upos-bert_model'
SEED = 42
BERT_MAX_LEN = 0
BERT_EPOCHS = 3
BERT_BATCH_SIZE = 8
DEVICE = 'cuda:0'

# SynTagRus is the corpus selected below. To run the notebook on UD Taiga
# instead, comment the SynTagRus line and uncomment the Taiga one.
#corpus_name = 'UD_Russian-Taiga'
corpus_name = 'UD_Russian-SynTagRus'
# NOTE(review): overwrite=False presumably keeps an already downloaded copy
# of the corpus -- confirm against the corpuscula docs.
download_ud(corpus_name, overwrite=False)
corpus = UniversalDependencies(corpus_name)
# Optional wrapper, disabled in this run:
#corpus = AdjustedForSpeech(corpus)

In [2]:
# Create the UPOS tagger; the train split is used for training and the dev
# split is loaded as the test (validation) corpus.
tagger = UposTagger()
tagger.load_train_corpus(corpus.train)
tagger.load_test_corpus(corpus.dev)

# Options passed as `word_transform_kwargs`.
# Reference values listed by the notebook author:
#   BertDataset.transform() (for BERT-descendant models) params:
#     {'max_len': 0, 'batch_size': 64, 'hidden_ids': '10',
#      'aggregate_hiddens_op': 'cat',
#      'aggregate_subtokens_op': 'absmax', 'to': junky.CPU,
#      'loglevel': 1}
#   WordDataset.transform() (for other models) params:
#     {'check_lower': True}
bert_transform_kwargs = dict(
    max_len=BERT_MAX_LEN,
    hidden_ids=10,
    aggregate_subtokens_op='absmax',
)

# Name given as 'save_as' for stage 3; it encodes the BERT model name and
# the main stage-3 hyperparameters.
stage3_save_name = (
    MODEL_FN.replace('-bert_model', '_' + BERT_MODEL_FN)
    + f'_len{BERT_MAX_LEN}_ep{BERT_EPOCHS}_bat{BERT_BATCH_SIZE}_seed{SEED}'
)
# Reference values for `stage3_params` listed by the notebook author:
#   {'save_as': None, 'epochs': 3, 'batch_size': 8,
#    'lr': 2e-5, 'betas': (0.9, 0.999), 'eps': 1e-8,
#    'weight_decay': .01, 'amsgrad': False,
#    'num_warmup_steps': 3, 'max_grad_norm': 1.}
stage3_settings = {
    'save_as': stage3_save_name,
    'epochs': BERT_EPOCHS,
    'batch_size': BERT_BATCH_SIZE,
    'lr': 2e-5,
    'num_warmup_steps': 3,
}

_ = tagger.train(
    MODEL_FN,
    device=DEVICE,
    control_metric='accuracy',
    max_epochs=None,
    min_epochs=0,
    bad_epochs=5,
    max_grad_norm=None,
    tags_to_remove=None,
    # word embeddings
    word_emb_type='bert',
    word_emb_path=BERT_MODEL_FN,
    word_transform_kwargs=bert_transform_kwargs,
    # Reference values for `stage1_params`:
    #   {'lr': .0001, 'betas': (0.9, 0.999), 'eps': 1e-8,
    #    'weight_decay': 0, 'amsgrad': False,
    #    'max_epochs': None, 'min_epochs': None,
    #    'bad_epochs': None, 'batch_size': None,
    #    'max_grad_norm': None}
    stage1_params=None,
    # Reference values for `stage2_params`:
    #   {'lr': .001, 'momentum': .9, 'weight_decay': 0,
    #    'dampening': 0, 'nesterov': False,
    #    'max_epochs': None, 'min_epochs': None,
    #    'bad_epochs': None, 'batch_size': None,
    #    'max_grad_norm': None}
    stage2_params=None,
    stage3_params=stage3_settings,
    # training schedule
    stages=[1, 2, 3, 1, 2],
    save_stages=True,
    load_from=None,
    learn_on_padding=True,
    remove_padding_intent=False,
    seed=SEED,
    start_time=None,
    keep_embs=False,
    # model architecture
    rnn_emb_dim=None,
    cnn_emb_dim=None,
    cnn_kernels=range(1, 7),
    emb_bn=True,
    emb_do=.2,
    final_emb_dim=512,
    pre_bn=True,
    pre_do=.5,
    lstm_layers=1,
    lstm_do=0,
    tran_layers=0,
    tran_heads=8,
    post_bn=True,
    post_do=.4,
)

Train: Load corpus
Corpus has been loaded: 48814 sentences, 871526 tokens
Test: Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Parse corpus
done: 55398 sentences, 979787 acceptable tokens (plus 575 for YO letters)
Fit corpus dict... done.

=== UPOS TAGGER TRAINING PIPELINE ===

DATASETS CREATION
Tokenizing
100%|██████████| 48814/48814 [00:33<00:00, 1440.55it/s]
Vectorizing
100%|██████████| 763/763 [00:52<00:00, 14.42it/s]
Reordering
100%|██████████| 48814/48814 [01:31<00:00, 531.22it/s]
Tokenizing
100%|██████████| 6584/6584 [00:05<00:00, 1242.25it/s]
Vectorizing
100%|██████████| 103/103 [00:07<00:00, 13.65it/s]
Reordering
100%|██████████| 6584/6584 [00:12<00:00, 520.08it/s]

MODEL CREATION

MODEL TRAINING 1 (STAGE 1, SEED 2746317214)
Epoch 1: 100%|██████████| 48814/48814 [00:56<00:00, 862.58it/s, train_loss=0.0516]


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1: 
Losses: train = 0.15643125, test = 0.02975556
Test: accuracy = 0.97729728
Test: precision = 0.85226360
Test: recall = 0.83896966
Test: f1_score = 0.84458696
new maximum score 0.97729728
Config saved
Saving state_dict... done.
Epoch 2: 100%|██████████| 48814/48814 [00:56<00:00, 858.24it/s, train_loss=0.0346]
Epoch 2: 
Losses: train = 0.03847520, test = 0.02256675
Test: accuracy = 0.98220073
Test: precision = 0.91003034
Test: recall = 0.86468484
Test: f1_score = 0.87352699
new maximum score 0.98220073
Config saved
Saving state_dict... done.
Epoch 3: 100%|██████████| 48814/48814 [00:56<00:00, 864.25it/s, train_loss=0.0278]
Epoch 3: 
Losses: train = 0.02958618, test = 0.01952541
Test: accuracy = 0.98443724
Test: precision = 0.97178462
Test: recall = 0.89774008
Test: f1_score = 0.91046597
new maximum score 0.98443724
Config saved
Saving state_dict... done.
Epoch 4: 100%|██████████| 48814/48814 [00:56<00:00, 859.48it/s, train_loss=0.0246]
Epoch 4: 
Losses: train = 0.02500334, test 

In [3]:
# Files that will receive the predictions for the dev and test splits.
conllu_name = MODEL_FN.replace('_model', '.conllu')
res_dev = 'corpora/_dev_' + conllu_name
res_test = 'corpora/_test_' + conllu_name

# If a `tagger` is still defined from an earlier cell, hand its `embs` over
# to the new instance; otherwise pass None.
if 'tagger' in globals():
    prev_embs = globals()['tagger'].embs
else:
    prev_embs = None
tagger = UposTagger(embs=prev_embs)
tagger.load(MODEL_FN)
junky.clear_tqdm()

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [4]:
# Tag the dev split and save the predictions to `res_dev`. The result is
# bound to `_` so that the cell does not echo it.
_ = tagger.predict(corpus.dev, clone_ds=True, save_to=res_dev)

Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Vectorizing
100%|██████████| 103/103 [00:07<00:00, 13.68it/s]
Reordering
100%|██████████| 6584/6584 [00:12<00:00, 521.25it/s]


In [5]:
# Evaluate on the dev split. Only the gold corpus is passed, so the tags are
# presumably predicted during the call -- confirm against the mordl docs.
_ = tagger.evaluate(corpus.dev)

Evaluating UPOS


Load corpus
[===> 4000                                                         

Processing corpus
100%|██████████| 6584/6584 [00:42<00:00, 154.10it/s]
UPOS total: 118488
   correct: 117592
     wrong: 896
  Accuracy: 0.9924380527985956
[By sentence accuracy: 0.8824422843256379]


Corpus has been loaded: 6584 sentences, 118692 tokens


In [6]:
# Evaluate the predictions saved in `res_dev` against the gold dev split.
_ = tagger.evaluate(corpus.dev, res_dev)

Evaluating UPOS
Load corpus
[==> 3000                                                          

Load corpus
[==> 3000                                                          

Corpus has been loaded: 6584 sentences, 118692 tokens
UPOS total: 118488
   correct: 117592
     wrong: 896
  Accuracy: 0.9924380527985956
[By sentence accuracy: 0.8824422843256379]


Corpus has been loaded: 6584 sentences, 118692 tokens


In [7]:
# Tag the test split and save the predictions to `res_test`.
_ = tagger.predict(corpus.test, save_to=res_test)

Load corpus
[==> 2500                                                          

Processing corpus
  0%|          | 0/6491 [00:00<?, ?it/s]

Corpus has been loaded: 6491 sentences, 117523 tokens


100%|██████████| 6491/6491 [00:41<00:00, 155.11it/s]


In [8]:
# Evaluate on the test split with clone_ds=True.
# NOTE(review): what clone_ds changes is not visible in this notebook --
# check the mordl docs.
_ = tagger.evaluate(corpus.test, clone_ds=True)

Evaluating UPOS


Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Vectorizing
100%|██████████| 102/102 [00:07<00:00, 13.78it/s]
Reordering
100%|██████████| 6491/6491 [00:12<00:00, 519.56it/s]
UPOS total: 117329
   correct: 116571
     wrong: 758
  Accuracy: 0.9935395341305219
[By sentence accuracy: 0.8941611462024341]


In [9]:
# NOTE(review): this is the same call as in the previous cell (test split,
# clone_ds=True).
_ = tagger.evaluate(corpus.test, clone_ds=True)

Evaluating UPOS


Load corpus
[===> 3300                                                         

Vectorizing
100%|██████████| 102/102 [00:07<00:00, 13.28it/s]
Reordering
100%|██████████| 6491/6491 [00:12<00:00, 519.04it/s]
UPOS total: 117329
   correct: 116571
     wrong: 758
  Accuracy: 0.9935395341305219
[By sentence accuracy: 0.8941611462024341]


Corpus has been loaded: 6491 sentences, 117523 tokens


In [10]:
# Evaluate the predictions saved in `res_test` against the gold test split.
_ = tagger.evaluate(corpus.test, res_test)

Evaluating UPOS
Load corpus
[=====> 5600                                                       

Load corpus
[=====> 5600                                                       

Corpus has been loaded: 6491 sentences, 117523 tokens
UPOS total: 117329
   correct: 116571
     wrong: 758
  Accuracy: 0.9935395341305219
[By sentence accuracy: 0.8941611462024341]


Corpus has been loaded: 6491 sentences, 117523 tokens


In [11]:
# Materialise the gold test corpus and the saved predictions once, so they
# can be reused for every tag below.
corp_gold = list(corpus.test())
corp_test = list(tagger._get_corpus(res_test))

# Every non-empty UPOS value found in the gold corpus. Element [0] of each
# corpus item is iterated as the collection of tokens.
tags = sorted({
    token['UPOS']
    for item in corp_gold
    for token in item[0]
    if token['UPOS']
})

# Print one score per UPOS tag.
for tag in tags:
    tag_score = tagger.evaluate(corp_gold, corp_test,
                                label=tag, log_file=None)
    print('{}: {}'.format(tag, tag_score))

Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens
ADJ: 0.9741541775105448
ADP: 0.9993924138529642
ADV: 0.9738396624472574
AUX: 0.9537299338999056
CCONJ: 0.9880696731090431
DET: 0.9653818700927909
INTJ: 0.6
NOUN: 0.9933170901496693
NUM: 0.9783327212633125
PART: 0.9714540588760036
PRON: 0.9814851290131967
PROPN: 0.964455466005373
PUNCT: 1.0
SCONJ: 0.9668109668109668
SYM: 1.0
VERB: 0.9865982539439424
X: 0.78125


## FEATS

In [12]:
from corpuscula.corpus_utils import download_ud, UniversalDependencies, \
                                    AdjustedForSpeech
import junky
from mordl import FeatsTagger

# --- Configuration of the FEATS tagger run ---
BERT_MODEL_FN = 'xlm-roberta-base'
MODEL_FN = 'feats-bert_model'
SEED = 42
BERT_MAX_LEN = 0
BERT_EPOCHS = 3
BERT_BATCH_SIZE = 8
DEVICE = 'cuda:0'

# SynTagRus is the corpus selected below. To run the notebook on UD Taiga
# instead, comment the SynTagRus line and uncomment the Taiga one.
#corpus_name = 'UD_Russian-Taiga'
corpus_name = 'UD_Russian-SynTagRus'
# NOTE(review): overwrite=False presumably keeps an already downloaded copy
# of the corpus -- confirm against the corpuscula docs.
download_ud(corpus_name, overwrite=False)
corpus = UniversalDependencies(corpus_name)
# Optional wrapper, disabled in this run:
#corpus = AdjustedForSpeech(corpus)

In [13]:
# Create the FEATS tagger; the train split is used for training and the dev
# split is loaded as the test (validation) corpus.
tagger = FeatsTagger()
tagger.load_train_corpus(corpus.train)
tagger.load_test_corpus(corpus.dev)

# Options passed as `word_transform_kwargs`.
# Reference values listed by the notebook author:
#   BertDataset.transform() (for BERT-descendant models) params:
#     {'max_len': 0, 'batch_size': 64, 'hidden_ids': '10',
#      'aggregate_hiddens_op': 'cat',
#      'aggregate_subtokens_op': 'absmax', 'to': junky.CPU,
#      'loglevel': 1}
#   WordDataset.transform() (for other models) params:
#     {'check_lower': True}
bert_transform_kwargs = dict(
    max_len=BERT_MAX_LEN,
    hidden_ids=10,
    aggregate_subtokens_op='absmax',
)

# Name given as 'save_as' for stage 3; it encodes the BERT model name and
# the main stage-3 hyperparameters.
stage3_save_name = (
    MODEL_FN.replace('-bert_model', '_' + BERT_MODEL_FN)
    + f'_len{BERT_MAX_LEN}_ep{BERT_EPOCHS}_bat{BERT_BATCH_SIZE}_seed{SEED}'
)
# Reference values for `stage3_params` listed by the notebook author:
#   {'save_as': None, 'epochs': 3, 'batch_size': 8,
#    'lr': 2e-5, 'betas': (0.9, 0.999), 'eps': 1e-8,
#    'weight_decay': .01, 'amsgrad': False,
#    'num_warmup_steps': 3, 'max_grad_norm': 1.}
stage3_settings = {
    'save_as': stage3_save_name,
    'epochs': BERT_EPOCHS,
    'batch_size': BERT_BATCH_SIZE,
    'lr': 2e-5,
    'num_warmup_steps': 3,
}

_ = tagger.train(
    MODEL_FN,
    device=DEVICE,
    control_metric='accuracy',
    max_epochs=None,
    min_epochs=0,
    bad_epochs=5,
    max_grad_norm=None,
    tags_to_remove=None,
    # word embeddings
    word_emb_type='bert',
    word_emb_path=BERT_MODEL_FN,
    word_transform_kwargs=bert_transform_kwargs,
    # Reference values for `stage1_params`:
    #   {'lr': .0001, 'betas': (0.9, 0.999), 'eps': 1e-8,
    #    'weight_decay': 0, 'amsgrad': False,
    #    'max_epochs': None, 'min_epochs': None,
    #    'bad_epochs': None, 'batch_size': None,
    #    'max_grad_norm': None}
    stage1_params=None,
    # Reference values for `stage2_params`:
    #   {'lr': .001, 'momentum': .9, 'weight_decay': 0,
    #    'dampening': 0, 'nesterov': False,
    #    'max_epochs': None, 'min_epochs': None,
    #    'bad_epochs': None, 'batch_size': None,
    #    'max_grad_norm': None}
    stage2_params=None,
    stage3_params=stage3_settings,
    # training schedule
    stages=[1, 2, 3, 1, 2],
    save_stages=True,
    load_from=None,
    learn_on_padding=True,
    remove_padding_intent=False,
    seed=SEED,
    start_time=None,
    keep_embs=False,
    # model architecture
    rnn_emb_dim=None,
    cnn_emb_dim=200,
    cnn_kernels=range(1, 7),
    upos_emb_dim=200,
    emb_bn=True,
    emb_do=.2,
    final_emb_dim=512,
    pre_bn=True,
    pre_do=.5,
    lstm_layers=1,
    lstm_do=0,
    tran_layers=0,
    tran_heads=8,
    post_bn=True,
    post_do=.4,
)

Train: Load corpus
Corpus has been loaded: 48814 sentences, 871526 tokens
Test: Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens



=== FEATS TAGGER TRAINING PIPELINE ===

DATASETS CREATION
Tokenizing
100%|██████████| 48814/48814 [00:46<00:00, 1042.15it/s]
Vectorizing
100%|██████████| 763/763 [00:52<00:00, 14.47it/s]
Reordering
100%|██████████| 48814/48814 [01:32<00:00, 525.32it/s]
Tokenizing
100%|██████████| 6584/6584 [00:04<00:00, 1430.27it/s]
Vectorizing
100%|██████████| 103/103 [00:07<00:00, 13.68it/s]
Reordering
100%|██████████| 6584/6584 [00:12<00:00, 514.65it/s]

MODEL CREATION

MODEL TRAINING 1 (STAGE 1, SEED 2746317214)
Epoch 1: 100%|██████████| 48814/48814 [01:35<00:00, 511.61it/s, train_loss=0.221]
Epoch 1: 
Losses: train = 0.64466713, test = 0.10057570
Test: accuracy = 0.94098980
Test: precision = 0.34546797
Test: recall = 0.33843755
Test: f1_score = 0.32736530
new maximum score 0.94098980
Config saved
Saving state_dict... done.
Epoch 2:   0%|          | 0/48814 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2: 100%|██████████| 48814/48814 [01:35<00:00, 508.75it/s, train_loss=0.0901]
Epoch 2: 
Losses: train = 0.11484653, test = 0.04867176
Test: accuracy = 0.96900952
Test: precision = 0.51644279
Test: recall = 0.51109260
Test: f1_score = 0.50111097
new maximum score 0.96900952
Config saved
Saving state_dict... done.
Epoch 3: 100%|██████████| 48814/48814 [01:36<00:00, 508.04it/s, train_loss=0.0622]
Epoch 3: 
Losses: train = 0.06865228, test = 0.03626011
Test: accuracy = 0.97636048
Test: precision = 0.62968233
Test: recall = 0.61436918
Test: f1_score = 0.60710569
new maximum score 0.97636048
Config saved
Saving state_dict... done.
Epoch 4: 100%|██████████| 48814/48814 [01:35<00:00, 513.09it/s, train_loss=0.05]  
Epoch 4: 
Losses: train = 0.05205300, test = 0.03133088
Test: accuracy = 0.97987138
Test: precision = 0.69269151
Test: recall = 0.68563225
Test: f1_score = 0.67787682
new maximum score 0.97987138
Config saved
Saving state_dict... done.
Epoch 5: 100%|██████████| 48814/48814 [01:3

In [14]:
# Files that will receive the predictions for the dev and test splits.
conllu_name = MODEL_FN.replace('_model', '.conllu')
res_dev = 'corpora/_dev_' + conllu_name
res_test = 'corpora/_test_' + conllu_name

# If a `tagger` is still defined from an earlier cell, hand its `embs` over
# to the new instance; otherwise pass None.
if 'tagger' in globals():
    prev_embs = globals()['tagger'].embs
else:
    prev_embs = None
tagger = FeatsTagger(embs=prev_embs)
tagger.load(MODEL_FN)
junky.clear_tqdm()

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [15]:
# Tag the dev split and save the predictions to `res_dev`. The result is
# bound to `_` so that the cell does not echo it.
_ = tagger.predict(corpus.dev, clone_ds=True, save_to=res_dev)

Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Vectorizing
100%|██████████| 103/103 [00:07<00:00, 13.66it/s]
Reordering
100%|██████████| 6584/6584 [00:12<00:00, 516.45it/s]


In [16]:
# Evaluate on the dev split. Only the gold corpus is passed, so the tags are
# presumably predicted during the call -- confirm against the mordl docs.
_ = tagger.evaluate(corpus.dev)

Evaluating FEATS


Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Processing corpus
100%|██████████| 6584/6584 [00:46<00:00, 142.17it/s]
FEATS total: 118488 tokens, 287014 tags
    correct: 117150 tokens, 285090 tags
      wrong: 1338 tokens, 1924 tags [251 excess / 290 absent / 1383 wrong type]
   Accuracy: 0.9816418094754606 / 0.9932964942476674
[Total accuracy: 0.9887077172371885 / 0.9932964942476674]
[By sentence accuracy: 0.8462940461725394]


In [17]:
# Evaluate the predictions saved in `res_dev` against the gold dev split.
_ = tagger.evaluate(corpus.dev, res_dev)

Evaluating FEATS
Load corpus
[> 1000                                                            

Load corpus
[> 1000                                                            

Corpus has been loaded: 6584 sentences, 118692 tokens
FEATS total: 118488 tokens, 287014 tags
    correct: 117150 tokens, 285090 tags
      wrong: 1338 tokens, 1924 tags [251 excess / 290 absent / 1383 wrong type]
   Accuracy: 0.9816418094754606 / 0.9932964942476674
[Total accuracy: 0.9887077172371885 / 0.9932964942476674]
[By sentence accuracy: 0.8462940461725394]


Corpus has been loaded: 6584 sentences, 118692 tokens


In [18]:
# Tag the test split and save the predictions to `res_test`.
_ = tagger.predict(corpus.test, save_to=res_test)

Load corpus
[=====> 5500                                                       

Processing corpus
  0%|          | 0/6491 [00:00<?, ?it/s]

Corpus has been loaded: 6491 sentences, 117523 tokens


100%|██████████| 6491/6491 [00:45<00:00, 144.19it/s]


In [19]:
# Evaluate on the test split with clone_ds=True.
# NOTE(review): what clone_ds changes is not visible in this notebook --
# check the mordl docs.
_ = tagger.evaluate(corpus.test, clone_ds=True)

Evaluating FEATS


Load corpus
[==> 2700                                                          

Vectorizing
100%|██████████| 102/102 [00:07<00:00, 13.81it/s]
Reordering
100%|██████████| 6491/6491 [00:12<00:00, 519.14it/s]
FEATS total: 117329 tokens, 284950 tags
    correct: 115927 tokens, 282826 tags
      wrong: 1402 tokens, 2124 tags [299 excess / 382 absent / 1443 wrong type]
   Accuracy: 0.9805973041047358 / 0.9925460607124057
[Total accuracy: 0.9880506950540787 / 0.9925460607124057]
[By sentence accuracy: 0.83484825142505]


Corpus has been loaded: 6491 sentences, 117523 tokens


In [20]:
# Evaluate the predictions saved in `res_test` against the gold test split.
_ = tagger.evaluate(corpus.test, res_test)

Evaluating FEATS
Load corpus

Load corpus


Corpus has been loaded: 6491 sentences, 117523 tokens
FEATS total: 117329 tokens, 284950 tags
    correct: 115927 tokens, 282826 tags
      wrong: 1402 tokens, 2124 tags [299 excess / 382 absent / 1443 wrong type]
   Accuracy: 0.9805973041047358 / 0.9925460607124057
[Total accuracy: 0.9880506950540787 / 0.9925460607124057]
[By sentence accuracy: 0.83484825142505]


Corpus has been loaded: 6491 sentences, 117523 tokens


In [21]:
# Materialise the gold test corpus and the saved predictions once, so they
# can be reused for every feature below.
corp_gold = list(corpus.test())
corp_test = list(tagger._get_corpus(res_test))

# Every key that occurs in the FEATS field of the gold corpus. Element [0]
# of each corpus item is iterated as the collection of tokens.
tags = sorted({
    feat_name
    for item in corp_gold
    for token in item[0]
    for feat_name in token['FEATS'].keys()
})

# Print one score per feature name.
for tag in tags:
    feat_score = tagger.evaluate(corp_gold, corp_test,
                                 feats=tag, log_file=None)
    print('{}: {}'.format(tag, feat_score))

Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens
Animacy: 0.9899892843043258
Aspect: 0.9945146156622158
Case: 0.9884052425141108
Degree: 0.9979249421948183
Foreign: 0.7455089820359282
Gender: 0.9891480501364186
Mood: 0.9987825124515772
Number: 0.9954606009769577
Person: 0.9994301994301994
Polarity: 0.9555555555555556
Tense: 0.9980355388874007
Variant: 0.9981366459627329
VerbForm: 0.9996391194514616
Voice: 0.991699747383616


## LEMMA

For lemmata, besides *BERT* word embeddings, one can use *FastText*. In this case, model performance on the *SynTagRus* test dataset is just slightly worse (0.9945 vs. 0.9948; we think it can be tuned further if needed). So, we give here training snippets for both versions of the tagger: *BERT* (next snippet) and *FastText* (see further).

### *BERT* Lemmata Tagger

In [22]:
from corpuscula.corpus_utils import download_ud, UniversalDependencies, \
                                    AdjustedForSpeech
import junky
from mordl import LemmaTagger

# --- Configuration of the BERT-based LEMMA tagger run ---
BERT_MODEL_FN = 'xlm-roberta-base'
MODEL_FN = 'lemma-bert_model'
SEED = 42
BERT_MAX_LEN = 0
BERT_EPOCHS = 2
BERT_BATCH_SIZE = 8
DEVICE = 'cuda:0'

# SynTagRus is the corpus selected below. To run the notebook on UD Taiga
# instead, comment the SynTagRus line and uncomment the Taiga one.
#corpus_name = 'UD_Russian-Taiga'
corpus_name = 'UD_Russian-SynTagRus'
# NOTE(review): overwrite=False presumably keeps an already downloaded copy
# of the corpus -- confirm against the corpuscula docs.
download_ud(corpus_name, overwrite=False)
corpus = UniversalDependencies(corpus_name)
# Optional wrapper, disabled in this run:
#corpus = AdjustedForSpeech(corpus)

In [23]:
# Create the LEMMA tagger; the train split is used for training and the dev
# split is loaded as the test (validation) corpus.
tagger = LemmaTagger()
tagger.load_train_corpus(corpus.train)
tagger.load_test_corpus(corpus.dev)

# Name given as 'save_as' for stage 3; it encodes the BERT model name and
# the main stage-3 hyperparameters.
stage3_save_name = (
    MODEL_FN.replace('-bert_model', '_' + BERT_MODEL_FN)
    + f'_len{BERT_MAX_LEN}_ep{BERT_EPOCHS}_bat{BERT_BATCH_SIZE}_seed{SEED}'
)
# Reference values for `stage3_params` listed by the notebook author:
#   {'save_as': None, 'epochs': 3, 'batch_size': 8,
#    'lr': 2e-5, 'betas': (0.9, 0.999), 'eps': 1e-8,
#    'weight_decay': .01, 'amsgrad': False,
#    'num_warmup_steps': 3, 'max_grad_norm': 1.}
stage3_settings = {
    'save_as': stage3_save_name,
    'epochs': BERT_EPOCHS,
    'batch_size': BERT_BATCH_SIZE,
    'lr': 2e-5,
    'num_warmup_steps': 6,
}

_ = tagger.train(
    MODEL_FN,
    device=DEVICE,
    control_metric='accuracy',
    max_epochs=None,
    min_epochs=0,
    bad_epochs=5,
    max_grad_norm=None,
    tags_to_remove=None,
    # word embeddings
    word_emb_type='bert',
    word_emb_path=BERT_MODEL_FN,
    # Reference values for `word_transform_kwargs`:
    #   BertDataset.transform() (for BERT-descendant models) params:
    #     {'max_len': 0, 'batch_size': 64, 'hidden_ids': '10',
    #      'aggregate_hiddens_op': 'cat',
    #      'aggregate_subtokens_op': 'absmax', 'to': junky.CPU,
    #      'loglevel': 1}
    #   WordDataset.transform() (for other models) params:
    #     {'check_lower': True}
    word_transform_kwargs=None,
    # Reference values for `stage1_params`:
    #   {'lr': .0001, 'betas': (0.9, 0.999), 'eps': 1e-8,
    #    'weight_decay': 0, 'amsgrad': False,
    #    'max_epochs': None, 'min_epochs': None,
    #    'bad_epochs': None, 'batch_size': None,
    #    'max_grad_norm': None}
    stage1_params=None,
    # Reference values for `stage2_params`:
    #   {'lr': .001, 'momentum': .9, 'weight_decay': 0,
    #    'dampening': 0, 'nesterov': False,
    #    'max_epochs': None, 'min_epochs': None,
    #    'bad_epochs': None, 'batch_size': None,
    #    'max_grad_norm': None}
    stage2_params=None,
    stage3_params=stage3_settings,
    # training schedule
    stages=[1, 2, 3, 1, 2],
    save_stages=True,
    load_from=None,
    learn_on_padding=False,
    remove_padding_intent=False,
    seed=SEED,
    start_time=None,
    keep_embs=False,
    # model architecture
    rnn_emb_dim=384,
    cnn_emb_dim=None,
    cnn_kernels=range(1, 7),
    upos_emb_dim=256,
    emb_bn=True,
    emb_do=.2,
    final_emb_dim=512,
    pre_bn=True,
    pre_do=.5,
    lstm_layers=1,
    lstm_do=0,
    tran_layers=0,
    tran_heads=8,
    post_bn=True,
    post_do=.4,
)

Train: Load corpus
Corpus has been loaded: 48814 sentences, 871526 tokens
Test: Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Parse corpus
done: 55398 sentences, 979787 acceptable tokens (plus 575 for YO letters)
Fit corpus dict... done.

Preliminary trainset preparation:
stage 1 of 3... done.
stage 2 of 3... done.
Lengths: [1883 {'allow_replace': True, 'allow_copy': True},
          1881 {'allow_replace': True, 'allow_copy': False},
          1915 {'allow_replace': False, 'allow_copy': True},
          1881 {'allow_replace': False, 'allow_copy': False}]
min = {'allow_replace': True, 'allow_copy': False}
stage 3 of 3... done.


=== LEMMA TAGGER TRAINING PIPELINE ===

DATASETS CREATION
Tokenizing
100%|██████████| 48814/48814 [00:35<00:00, 1367.89it/s]
Vectorizing
100%|██████████| 763/763 [00:53<00:00, 14.37it/s]
Reordering
100%|██████████| 48814/48814 [01:35<00:00, 511.17it/s]
Tokenizing
100%|██████████| 6584/6584 [00:04<00:00, 1412.20it/s]
Vectorizing
100%|██████████| 103/103 [00:07<00:00, 13.59it/s]
Reordering
100%|██████████| 6584/6584 [00:13<00:00, 505.12it/s]

MODEL CREATION

MODEL TRAINING 1 (STAGE 1, SE

  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1: 
Losses: train = 0.93437836, test = 0.27618447
Test: accuracy = 0.94639120
Test: precision = 0.16864180
Test: recall = 0.15537752
Test: f1_score = 0.15196150
new maximum score 0.94639120
Config saved
Saving state_dict... done.
Epoch 2: 100%|██████████| 48814/48814 [02:25<00:00, 336.19it/s, train_loss=0.243]
Epoch 2: 
Losses: train = 0.28908512, test = 0.16822170
Test: accuracy = 0.96693336
Test: precision = 0.27221991
Test: recall = 0.25992282
Test: f1_score = 0.25380498


  _warn_prf(average, modifier, msg_start, len(result))


new maximum score 0.96693336
Config saved
Saving state_dict... done.
Epoch 3: 100%|██████████| 48814/48814 [02:03<00:00, 395.71it/s, train_loss=0.174]
Epoch 3: 
Losses: train = 0.19191877, test = 0.12877040
Test: accuracy = 0.97489197
Test: precision = 0.35043221
Test: recall = 0.33488583
Test: f1_score = 0.33015670
new maximum score 0.97489197
Config saved
Saving state_dict... done.
Epoch 4: 100%|██████████| 48814/48814 [01:41<00:00, 481.26it/s, train_loss=0.139]
Epoch 4: 
Losses: train = 0.14606707, test = 0.10475494
Test: accuracy = 0.98001485
Test: precision = 0.41212327
Test: recall = 0.40493035
Test: f1_score = 0.39756519
new maximum score 0.98001485
Config saved
Saving state_dict... done.
Epoch 5: 100%|██████████| 48814/48814 [01:42<00:00, 475.09it/s, train_loss=0.114]
Epoch 5: 
Losses: train = 0.11907239, test = 0.08988474
Test: accuracy = 0.98283370
Test: precision = 0.45866163
Test: recall = 0.44508637
Test: f1_score = 0.43907982
new maximum score 0.98283370
Config saved
Savi

In [24]:
# Files that will receive the predictions for the dev and test splits.
conllu_name = MODEL_FN.replace('_model', '.conllu')
res_dev = 'corpora/_dev_' + conllu_name
res_test = 'corpora/_test_' + conllu_name

# If a `tagger` is still defined from an earlier cell, hand its `embs` over
# to the new instance; otherwise pass None.
if 'tagger' in globals():
    prev_embs = globals()['tagger'].embs
else:
    prev_embs = None
tagger = LemmaTagger(embs=prev_embs)
tagger.load(MODEL_FN)
junky.clear_tqdm()

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [25]:
# Tag the dev split and save the predictions to `res_dev`. The result is
# bound to `_` so that the cell does not echo it.
_ = tagger.predict(corpus.dev, clone_ds=True, save_to=res_dev)

Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Vectorizing
100%|██████████| 103/103 [00:07<00:00, 13.67it/s]
Reordering
100%|██████████| 6584/6584 [00:12<00:00, 517.22it/s]


In [26]:
# Evaluate on the dev split. Only the gold corpus is passed, so the lemmata
# are presumably predicted during the call -- confirm against the mordl docs.
_ = tagger.evaluate(corpus.dev)

Evaluating LEMMA


Load corpus
[> 100                                                            

Processing corpus
  0%|          | 0/6584 [00:00<?, ?it/s]

Corpus has been loaded: 6584 sentences, 118692 tokens


100%|██████████| 6584/6584 [00:46<00:00, 140.38it/s]
LEMMA total: 118488
    correct: 117897
      wrong: 591
   Accuracy: 0.9950121531294308
[By sentence accuracy: 0.9198055893074119]


In [27]:
# Evaluate the predictions saved in `res_dev` against the gold dev split.
_ = tagger.evaluate(corpus.dev, res_dev)

Evaluating LEMMA
Load corpus
[=> 1800                                                           

Load corpus
[=> 1700                                                           

Corpus has been loaded: 6584 sentences, 118692 tokens
LEMMA total: 118488
    correct: 117897
      wrong: 591
   Accuracy: 0.9950121531294308
[By sentence accuracy: 0.9198055893074119]


Corpus has been loaded: 6584 sentences, 118692 tokens


In [28]:
# Tag the test split and save the predictions to `res_test`.
_ = tagger.predict(corpus.test, save_to=res_test)

Load corpus
[====> 4500                                                        

Processing corpus
  0%|          | 0/6491 [00:00<?, ?it/s]

Corpus has been loaded: 6491 sentences, 117523 tokens


100%|██████████| 6491/6491 [00:45<00:00, 143.61it/s]


In [29]:
# Evaluate on the test split with clone_ds=True.
# NOTE(review): what clone_ds changes is not visible in this notebook --
# check the mordl docs.
_ = tagger.evaluate(corpus.test, clone_ds=True)

Evaluating LEMMA


Load corpus
[====> 4500                                                        

Vectorizing
100%|██████████| 102/102 [00:07<00:00, 13.76it/s]
Reordering
100%|██████████| 6491/6491 [00:12<00:00, 522.39it/s]
LEMMA total: 117329
    correct: 116741
      wrong: 588
   Accuracy: 0.9949884512780302
[By sentence accuracy: 0.9183484825142505]


Corpus has been loaded: 6491 sentences, 117523 tokens


In [30]:
# Evaluate the predictions saved in `res_test` against the gold test split.
_ = tagger.evaluate(corpus.test, res_test)

Evaluating LEMMA
Load corpus
[=====> 5900                                                       

Load corpus
[=====> 5800                                                       

Corpus has been loaded: 6491 sentences, 117523 tokens
LEMMA total: 117329
    correct: 116741
      wrong: 588
   Accuracy: 0.9949884512780302
[By sentence accuracy: 0.9183484825142505]


Corpus has been loaded: 6491 sentences, 117523 tokens


### *FastText* Lemmata Tagger

**NB:** For this task, we use Russian *FastText* embeddings provided by *Facebook*: [cc.ru.300.bin.gz](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz). We highly recommend them because they deliver the highest evaluation scores. Embeddings provided by *DeepPavlov* ([ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin](http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin)) can be used, too; they deliver just slightly worse model performance.

One can also try embeddings from *RusVectores*. Earlier (long ago), they were the worst choice because of inappropriate preprocessing, but it seems this has now been corrected. We didn't try them, but if you definitely want a *FastText* model, embeddings from *RusVectores* are also worth checking.

If you want your model to achieve high scores, use embeddings trained without lemmatization, removal of punctuation, or any other archaic transformations. Embeddings of words with part-of-speech tags appended to the end are also useless (for obvious reasons).

In [31]:
from corpuscula.corpus_utils import download_ud, UniversalDependencies, \
                                    AdjustedForSpeech
import junky
from mordl import LemmaTagger

# --- Configuration of the FastText-based LEMMA tagger run ---
# Path to the FastText binary, relative to the notebook's working directory.
FT_MODEL_FN = '../mordl/cc.ru.300.bin'
MODEL_FN = 'lemma-ft_model'
SEED = 42
DEVICE = 'cuda:0'

# SynTagRus is the corpus selected below. To run the notebook on UD Taiga
# instead, comment the SynTagRus line and uncomment the Taiga one.
#corpus_name = 'UD_Russian-Taiga'
corpus_name = 'UD_Russian-SynTagRus'
# NOTE(review): overwrite=False presumably keeps an already downloaded copy
# of the corpus -- confirm against the corpuscula docs.
download_ud(corpus_name, overwrite=False)
corpus = UniversalDependencies(corpus_name)
# Optional wrapper, disabled in this run:
#corpus = AdjustedForSpeech(corpus)

In [23]:
# Train the FastText-based lemma tagger: corpus.train is loaded as the train
# corpus and corpus.dev as the test corpus.
tagger = LemmaTagger()
tagger.load_train_corpus(corpus.train)
tagger.load_test_corpus(corpus.dev)

# The commented-out dicts below each *_kwargs / stage*_params argument list
# the keys that argument accepts (presumably with their default values -
# confirm in the mordl docs). None is passed for all of them here, and only
# stages 1 and 2 are run (stages=[1, 2]). The return value is bound to `_`
# so that it is not echoed as cell output.
_ = tagger.train(MODEL_FN, device=DEVICE, control_metric='accuracy',
                 max_epochs=None, min_epochs=0, bad_epochs=5,
                 max_grad_norm=None, tags_to_remove=None, word_emb_type='ft',
                 word_emb_path=FT_MODEL_FN, word_transform_kwargs=None,
                     # BertDataset.transform() (for BERT-descendant models)
                     # params:
                     # {'max_len': 0, 'batch_size': 64, 'hidden_ids': '10',
                     #  'aggregate_hiddens_op': 'cat',
                     #  'aggregate_subtokens_op': 'absmax', 'to': junky.CPU,
                     #  'loglevel': 1}
                     # WordDataset.transform() (for other models) params:
                     # {'check_lower': True}
                 stage1_params=None,
                     # {'lr': .0001, 'betas': (0.9, 0.999), 'eps': 1e-8,
                     #  'weight_decay': 0, 'amsgrad': False,
                     #  'max_epochs': None, 'min_epochs': None,
                     #  'bad_epochs': None, 'batch_size': None,
                     #  'max_grad_norm': None}
                 stage2_params=None,
                     # {'lr': .001, 'momentum': .9, 'weight_decay': 0,
                     #  'dampening': 0, 'nesterov': False,
                     #  'max_epochs': None, 'min_epochs': None,
                     #  'bad_epochs': None, 'batch_size': None,
                     #  'max_grad_norm': None}
                 stage3_params=None,
                     # {'save_as': None, 'epochs': 3, 'batch_size': 8,
                     #  'lr': 2e-5, 'betas': (0.9, 0.999), 'eps': 1e-8,
                     #  'weight_decay': .01, 'amsgrad': False,
                     #  'num_warmup_steps': 3, 'max_grad_norm': 1.}
                 stages=[1, 2], save_stages=True, load_from=None,
                 learn_on_padding=False, remove_padding_intent=False,
                 seed=SEED, start_time=None, keep_embs=False,
                 rnn_emb_dim=300, cnn_emb_dim=None, cnn_kernels=range(1, 7),
                 upos_emb_dim=200, emb_bn=True, emb_do=.2,
                 final_emb_dim=512, pre_bn=True, pre_do=.5,
                 lstm_layers=1, lstm_do=0, tran_layers=0, tran_heads=8,
                 post_bn=True, post_do=.4)

Train: Load corpus
Corpus has been loaded: 48814 sentences, 871526 tokens
Test: Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Parse corpus
done: 55398 sentences, 979787 acceptable tokens (plus 575 for YO letters)
Fit corpus dict... done.

Preliminary trainset preparation:
stage 1 of 3... done.
stage 2 of 3... done.
Lengths: [1883 {'allow_replace': True, 'allow_copy': True},
          1881 {'allow_replace': True, 'allow_copy': False},
          1915 {'allow_replace': False, 'allow_copy': True},
          1881 {'allow_replace': False, 'allow_copy': False}]
min = {'allow_replace': True, 'allow_copy': False}
stage 3 of 3... done.


=== LEMMA TAGGER TRAINING PIPELINE ===

DATASETS CREATION

MODEL CREATION

MODEL TRAINING 1 (STAGE 1, SEED 2746317214)
Epoch 1: 100%|██████████| 48814/48814 [01:25<00:00, 569.12it/s, train_loss=0.386]
Epoch 1: 
Losses: train = 0.85293718, test = 0.24180616
Test: accuracy = 0.95061103
Test: precision = 0.20712500
Test: recall = 0.20030005
Test: f1_score = 0.19704682
new maximum score 0.95061103
Config saved
Saving state_dict... done.
Epoch 2: 100%|██████████| 48814/48814 [01:25<00:00, 5

In [33]:
# Files that will receive the tagged dev / test corpora; both names are
# derived from MODEL_FN.
res_dev, res_test = (
    'corpora/_dev_' + MODEL_FN.replace('_model', '.conllu'),
    'corpora/_test_' + MODEL_FN.replace('_model', '.conllu'),
)

# Re-create the tagger. When a tagger from an earlier cell is still in the
# namespace, its embeddings are handed over to the new instance; otherwise
# the new instance starts without any.
if 'tagger' in globals():
    tagger = LemmaTagger(embs=globals()['tagger'].embs)
else:
    tagger = LemmaTagger(embs=None)
tagger.load(MODEL_FN)
junky.clear_tqdm()

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [34]:
# Tag the dev split and write the tagged corpus to res_dev; binding the
# result to `_` keeps it from being echoed as cell output.
# NOTE(review): the exact effect of clone_ds=True is not visible here - see
# the mordl docs.
_ = tagger.predict(corpus.dev, clone_ds=True, save_to=res_dev)

Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


In [35]:
# Score the tagger on the gold dev split alone (no pre-tagged corpus is
# passed, so the tagger presumably predicts on the fly - confirm).
_ = tagger.evaluate(corpus.dev)

Evaluating LEMMA


Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Processing corpus
100%|██████████| 6584/6584 [00:08<00:00, 785.64it/s]
LEMMA total: 118488
    correct: 117849
      wrong: 639
   Accuracy: 0.9946070488150699
[By sentence accuracy: 0.9146415552855407]


In [36]:
# Score the tagged corpus stored at res_dev against the gold dev split.
_ = tagger.evaluate(corpus.dev, res_dev)

Evaluating LEMMA
Load corpus
[===> 3800                                                         

Load corpus
[===> 3700                                                         

Corpus has been loaded: 6584 sentences, 118692 tokens
LEMMA total: 118488
    correct: 117849
      wrong: 639
   Accuracy: 0.9946070488150699
[By sentence accuracy: 0.9146415552855407]


Corpus has been loaded: 6584 sentences, 118692 tokens


In [37]:
# Tag the test split and write the tagged corpus to res_test.
_ = tagger.predict(corpus.test, save_to=res_test)

Load corpus

Processing corpus
  0%|          | 0/6491 [00:00<?, ?it/s]

Corpus has been loaded: 6491 sentences, 117523 tokens


100%|██████████| 6491/6491 [00:08<00:00, 800.66it/s]


In [38]:
# Score the tagger on the gold test split alone.
# NOTE(review): the exact effect of clone_ds=True is not visible here - see
# the mordl docs.
_ = tagger.evaluate(corpus.test, clone_ds=True)

Evaluating LEMMA


Load corpus
[====> 4500                                                        

LEMMA total: 117329
    correct: 116684
      wrong: 645
   Accuracy: 0.9945026378815127
[By sentence accuracy: 0.9109536281004468]


Corpus has been loaded: 6491 sentences, 117523 tokens


In [39]:
# Score the tagged corpus stored at res_test against the gold test split.
_ = tagger.evaluate(corpus.test, res_test)

Evaluating LEMMA
Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens
LEMMA total: 117329
    correct: 116684
      wrong: 645
   Accuracy: 0.9945026378815127
[By sentence accuracy: 0.9109536281004468]


Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


## CoNLL18 Validation

In [40]:
from corpuscula.corpus_utils import download_ud, get_ud_test_path
import junky
from mordl import UposTagger, FeatsTagger, LemmaTagger, conll18_ud_eval

# SynTagRus is the corpus used for real model training. To run the notebook
# on UD Taiga as an example corpus instead, uncomment the Taiga line and
# comment out the SynTagRus one.
#corpus_name = 'UD_Russian-Taiga'
corpus_name = 'UD_Russian-SynTagRus'
download_ud(corpus_name, overwrite=False)
DEVICE = 'cuda:0'

# Gold test file of the chosen corpus, and the file that will receive the
# copy of it tagged by our models.
corpus_gold = get_ud_test_path(corpus_name)
corpus_test = 'corpora/_test_tagged.conllu'

In [41]:
# Release the tagger left over from a previous section, if there is one,
# before loading three BERT-based models onto DEVICE. A bare `del tagger`
# raises NameError when this section is run without the earlier sections
# (no `tagger` in the namespace); pop() with a default is a no-op in that
# case and removes the name exactly like `del` otherwise.
globals().pop('tagger', None)

# Load the trained UPOS, FEATS and LEMMA models, keeping both the model and
# its dataset on DEVICE.
tagger_u = UposTagger()
tagger_u.load('upos-bert_model', device=DEVICE, dataset_device=DEVICE)
tagger_f = FeatsTagger()
tagger_f.load('feats-bert_model', device=DEVICE, dataset_device=DEVICE)
tagger_l = LemmaTagger()
tagger_l.load('lemma-bert_model', device=DEVICE, dataset_device=DEVICE)

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.
Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.
Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [42]:
# Chain the three taggers over the gold test corpus: UPOS first, then FEATS,
# then LEMMA. Each predict() result is passed straight in as the next
# tagger's input, and only the final result is written to corpus_test.
_ = tagger_l.predict(
    tagger_f.predict(
        tagger_u.predict(corpus_gold)
    ), save_to=corpus_test
)

# Drop the references to the three models once the tagged file is written.
del tagger_u, tagger_f, tagger_l

Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens
Processing corpus
100%|██████████| 6491/6491 [00:39<00:00, 162.29it/s]
Processing corpus
100%|██████████| 6491/6491 [00:43<00:00, 150.59it/s]
Processing corpus
100%|██████████| 6491/6491 [00:43<00:00, 150.00it/s]


In [43]:
# Compare the tagged file with the gold test corpus using the CoNLL18 UD
# evaluation; the metrics table is the cell output.
conll18_ud_eval(corpus_gold, corpus_test)

Metric     | Precision |    Recall |  F1 Score | AligndAcc
-----------+-----------+-----------+-----------+-----------
Tokens     |    100.00 |    100.00 |    100.00 |
Sentences  |    100.00 |    100.00 |    100.00 |
Words      |    100.00 |    100.00 |    100.00 |
UPOS       |     99.35 |     99.35 |     99.35 |     99.35
XPOS       |    100.00 |    100.00 |    100.00 |    100.00
UFeats     |     98.36 |     98.36 |     98.36 |     98.36
AllTags    |     98.21 |     98.21 |     98.21 |     98.21
Lemmas     |     98.88 |     98.88 |     98.88 |     98.88
UAS        |    100.00 |    100.00 |    100.00 |    100.00
LAS        |    100.00 |    100.00 |    100.00 |    100.00
CLAS       |    100.00 |    100.00 |    100.00 |    100.00
MLAS       |     97.22 |     97.22 |     97.22 |     97.22
BLEX       |     98.29 |     98.29 |     98.29 |     98.29


## MISC:NE

Note: the corpora we used are proprietary. You will have to find other corpora.

In [46]:
from mordl import UposTagger, FeatsTagger

DEVICE = 'cuda:0'
PREFIX = 'ner-old-'

# Load the trained UPOS and FEATS models, keeping both the model and its
# dataset on DEVICE.
tagger_u = UposTagger()
tagger_u.load('upos-bert_model', device=DEVICE, dataset_device=DEVICE)
tagger_f = FeatsTagger()
tagger_f.load('feats-bert_model', device=DEVICE, dataset_device=DEVICE)

# For every split, run UPOS tagging on the source file, feed the result to
# the FEATS tagger and save the outcome next to it as *_upos_feats.conllu.
for src_path, dst_path in zip(
    [f'corpora/{PREFIX}train.conllu',
     f'corpora/{PREFIX}dev.conllu',
     f'corpora/{PREFIX}test.conllu'],
    [f'corpora/{PREFIX}train_upos_feats.conllu',
     f'corpora/{PREFIX}dev_upos_feats.conllu',
     f'corpora/{PREFIX}test_upos_feats.conllu'],
):
    tagger_f.predict(tagger_u.predict(src_path), save_to=dst_path)

# The two helper models are not needed any more.
del tagger_u, tagger_f

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.
Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.
Load corpus
Corpus has been loaded: 24042 sentences, 455243 tokens
Processing corpus
100%|██████████| 24042/24042 [02:19<00:00, 172.87it/s]
Processing corpus
100%|██████████| 24042/24042 [02:32<00:00, 158.15it/s]
Load corpus
[====] 3005                                                        
Corpus has been loaded: 3005 sentences, 57512 tokens
Processing corpus
100%|██████████| 3005/3005 [00:17<00:00, 176.71it/s]
Processing corpus
100%|██████████| 3005/3005 [00:18<00:00, 162.61it/s]
Load corpus
[====] 3006                                                        
Corpus has been loaded: 3006 sentences, 58201 tokens
Processing corpus
100%|██████████| 3006/3006 [00:17<00:00, 174.13it/s]
Processing corpus
100%|██████████| 3006/3006 [00:18<00:00, 160.72it/s]


In [47]:
import junky
from mordl import NeTagger

# Pretrained transformer used for word embeddings, the name of the model
# being trained, the random seed and the torch device.
BERT_MODEL_FN = 'xlm-roberta-base'
MODEL_FN = 'misc-ne-bert_model'
SEED = 42
DEVICE = 'cuda:0'

# BERT-related settings referenced by the training cell.
BERT_MAX_LEN = 0
BERT_EPOCHS = 2
BERT_BATCH_SIZE = 8

# Corpora already tagged with UPOS and FEATS, one file per split.
PREFIX = 'ner-old-'
corpus_train, corpus_dev, corpus_test = (
    f'corpora/{PREFIX}train_upos_feats.conllu',
    f'corpora/{PREFIX}dev_upos_feats.conllu',
    f'corpora/{PREFIX}test_upos_feats.conllu',
)

In [48]:
# Train the BERT-based named-entity tagger: corpus_train is loaded as the
# train corpus and corpus_dev as the test corpus.
tagger = NeTagger()
tagger.load_train_corpus(corpus_train)
tagger.load_test_corpus(corpus_dev)

# The commented-out dicts below each *_kwargs / stage*_params argument list
# the keys that argument accepts (presumably with their default values -
# confirm in the mordl docs). The stage sequence here is [1, 2, 3, 1, 2], and
# stage 3 saves its result under a name built from the BERT settings and the
# seed. The return value is bound to `_` so it is not echoed as cell output.
_ = tagger.train(MODEL_FN, device=DEVICE, control_metric='accuracy',
                 max_epochs=None, min_epochs=0, bad_epochs=5,
                 max_grad_norm=None, tags_to_remove=None, word_emb_type='bert',
                 word_emb_path=BERT_MODEL_FN, word_transform_kwargs={
                     'max_len': BERT_MAX_LEN, 'hidden_ids': 10, 'aggregate_subtokens_op': 'absmax'
                     # NOTE(review): 'hidden_ids' is passed as the int 10
                     # above, while the reference below shows the string
                     # '10' - confirm which form is intended.
                     # BertDataset.transform() (for BERT-descendant models)
                     # params:
                     # {'max_len': 0, 'batch_size': 64, 'hidden_ids': '10',
                     #  'aggregate_hiddens_op': 'cat',
                     #  'aggregate_subtokens_op': 'absmax', 'to': junky.CPU,
                     #  'loglevel': 1}
                     # WordDataset.transform() (for other models) params:
                     # {'check_lower': True}
                 },
                 stage1_params=None,
                     # {'lr': .0001, 'betas': (0.9, 0.999), 'eps': 1e-8,
                     #  'weight_decay': 0, 'amsgrad': False,
                     #  'max_epochs': None, 'min_epochs': None,
                     #  'bad_epochs': None, 'batch_size': None,
                     #  'max_grad_norm': None}
                 stage2_params=None,
                     # {'lr': .001, 'momentum': .9, 'weight_decay': 0,
                     #  'dampening': 0, 'nesterov': False,
                     #  'max_epochs': None, 'min_epochs': None,
                     #  'bad_epochs': None, 'batch_size': None,
                     #  'max_grad_norm': None}
                 stage3_params={
                     'save_as': MODEL_FN.replace('-bert_model', '_' + BERT_MODEL_FN)
                              + f'_len{BERT_MAX_LEN}_ep{BERT_EPOCHS}_bat{BERT_BATCH_SIZE}_seed{SEED}',
                     'epochs': BERT_EPOCHS,
                     'batch_size': BERT_BATCH_SIZE,
                     'lr': 4e-5, 'num_warmup_steps': 1,
                     # NOTE(review): the reference below names the key
                     # 'max_epochs', but the key actually passed above is
                     # 'epochs' - confirm the right name in the mordl docs.
                     # {'save_as': None, 'max_epochs': 3, 'batch_size': 8,
                     #  'lr': 2e-5, 'betas': (0.9, 0.999), 'eps': 1e-8,
                     #  'weight_decay': .01, 'amsgrad': False,
                     #  'num_warmup_steps': 0, 'max_grad_norm': 1.}
                 },
                 stages=[1, 2, 3, 1, 2], save_stages=True, load_from=None,
                 learn_on_padding=True, remove_padding_intent=False,
                 seed=SEED, start_time=None, keep_embs=False,
                 rnn_emb_dim=None, cnn_emb_dim=None, cnn_kernels=range(1, 7),
                 upos_emb_dim=300, emb_bn=True, emb_do=.2,
                 final_emb_dim=512, pre_bn=True, pre_do=.5,
                 lstm_layers=1, lstm_do=0, tran_layers=0, tran_heads=8,
                 post_bn=True, post_do=.4)

Train: 

Load corpus
Corpus has been loaded: 24042 sentences, 455243 tokens
Load corpus
[====] 3005                                                        
Corpus has been loaded: 3005 sentences, 57512 tokens
Parse corpus
[> 400                                                            

Test: 

done: 27047 sentences, 470458 acceptable tokens (plus 5712 for YO letters)
Fit corpus dict... done.


=== MISC:NE TAGGER TRAINING PIPELINE ===

DATASETS CREATION
Tokenizing
100%|██████████| 24042/24042 [00:26<00:00, 895.02it/s]
Vectorizing
100%|██████████| 376/376 [00:24<00:00, 15.23it/s]
Reordering
100%|██████████| 24042/24042 [00:39<00:00, 615.36it/s]
Tokenizing
100%|██████████| 3005/3005 [00:02<00:00, 1377.27it/s]
Vectorizing
100%|██████████| 47/47 [00:03<00:00, 14.38it/s]
Reordering
100%|██████████| 3005/3005 [00:05<00:00, 600.07it/s]

MODEL CREATION

MODEL TRAINING 1 (STAGE 1, SEED 2746317214)
Epoch 1: 100%|██████████| 24042/24042 [00:31<00:00, 762.74it/s, train_loss=0.144]
Epoch 1: 
Losses: train = 0.24623045, test = 0.09434854
Test: accuracy = 0.91806927
Test: precision = 0.62082768
Test: recall = 0.58602199
Test: f1_score = 0.59330271
new maximum score 0.91806927
Config saved
Saving state_dict... done.
Epoch 2: 100%|██████████| 24042/24042 [00:31<00:00, 755.30it/s, train_loss=0

In [49]:
# Files that will receive the tagged dev / test corpora; both names are
# derived from MODEL_FN.
res_dev, res_test = (
    'corpora/_dev_' + MODEL_FN.replace('_model', '.conllu'),
    'corpora/_test_' + MODEL_FN.replace('_model', '.conllu'),
)

# Re-create the tagger. When a tagger from an earlier cell is still in the
# namespace, its embeddings are handed over to the new instance; otherwise
# the new instance starts without any.
if 'tagger' in globals():
    tagger = NeTagger(embs=globals()['tagger'].embs)
else:
    tagger = NeTagger(embs=None)
tagger.load(MODEL_FN)
junky.clear_tqdm()

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [50]:
# Tag the dev corpus file and write the tagged corpus to res_dev; binding
# the result to `_` keeps it from being echoed as cell output.
# NOTE(review): the exact effect of clone_ds=True is not visible here - see
# the mordl docs.
_ = tagger.predict(corpus_dev, clone_ds=True, save_to=res_dev)

Load corpus
[====] 3005                                                        
Corpus has been loaded: 3005 sentences, 57512 tokens
Vectorizing
100%|██████████| 47/47 [00:03<00:00, 14.50it/s]
Reordering
100%|██████████| 3005/3005 [00:04<00:00, 612.43it/s]


In [51]:
# Score the tagger on the gold dev corpus alone (no pre-tagged corpus is
# passed, so the tagger presumably predicts on the fly - confirm).
_ = tagger.evaluate(corpus_dev)

Evaluating MISC:NE
Load corpus
[====] 3005                                                        
Corpus has been loaded: 3005 sentences, 57512 tokens
Processing corpus
100%|██████████| 3005/3005 [00:17<00:00, 168.77it/s]
MISC:NE total: 16147
      correct: 13091
        wrong: 3056 [1084 excess / 1711 absent / 261 wrong type]
     Accuracy: 0.8107388369356536
[Total accuracy: 0.9468632633189595]
[By sentence accuracy: 0.5444259567387687]


In [52]:
# Score the tagged corpus stored at res_dev against the gold dev corpus.
_ = tagger.evaluate(corpus_dev, res_dev)

Evaluating MISC:NE
Load corpus
Load corpus
[====] 3005                                                        
Corpus has been loaded: 3005 sentences, 57512 tokens
[====] 3005                                                        
Corpus has been loaded: 3005 sentences, 57512 tokens
MISC:NE total: 16147
      correct: 13091
        wrong: 3056 [1084 excess / 1711 absent / 261 wrong type]
     Accuracy: 0.8107388369356536
[Total accuracy: 0.9468632633189595]
[By sentence accuracy: 0.5444259567387687]


In [53]:
# Tag the test corpus file and write the tagged corpus to res_test.
_ = tagger.predict(corpus_test, save_to=res_test)

Load corpus
[====] 3006                                                        
Corpus has been loaded: 3006 sentences, 58201 tokens
Processing corpus
100%|██████████| 3006/3006 [00:18<00:00, 164.77it/s]


In [54]:
# Score the tagger on the gold test corpus alone.
# NOTE(review): the exact effect of clone_ds=True is not visible here - see
# the mordl docs.
_ = tagger.evaluate(corpus_test, clone_ds=True)

Evaluating MISC:NE
Load corpus
[====] 3006                                                        
Corpus has been loaded: 3006 sentences, 58201 tokens
Vectorizing
100%|██████████| 47/47 [00:03<00:00, 14.12it/s]
Reordering
100%|██████████| 3006/3006 [00:04<00:00, 603.36it/s]
MISC:NE total: 16195
      correct: 13222
        wrong: 2973 [1051 excess / 1724 absent / 198 wrong type]
     Accuracy: 0.8164248224760728
[Total accuracy: 0.948918403463858]
[By sentence accuracy: 0.5508982035928144]


In [55]:
# Score the tagged corpus stored at res_test against the gold test corpus.
_ = tagger.evaluate(corpus_test, res_test)

Evaluating MISC:NE
Load corpus
Load corpus
[====] 3006                                                        
Corpus has been loaded: 3006 sentences, 58201 tokens
[====] 3006                                                        
Corpus has been loaded: 3006 sentences, 58201 tokens
MISC:NE total: 16195
      correct: 13222
        wrong: 2973 [1051 excess / 1724 absent / 198 wrong type]
     Accuracy: 0.8164248224760728
[Total accuracy: 0.948918403463858]
[By sentence accuracy: 0.5508982035928144]


In [56]:
# Materialise the gold test corpus and the tagged one so that both can be
# re-used for every label below.
corp_gold = list(tagger._get_corpus(corpus_test, asis=True))
corp_test = list(tagger._get_corpus(res_test))

# Collect every non-empty MISC:NE value found in the gold corpus. Each corpus
# item is indexed with [0] to reach its tokens (presumably the item is a
# (tokens, metadata) pair - confirm).
tags = sorted({
    token['MISC'].get('NE')
    for sentence in corp_gold
    for token in sentence[0]
    if token['MISC'].get('NE')
})

# Report the evaluation result for each label separately.
for tag in tags:
    print('{}: {}'.format(
        tag,
        tagger.evaluate(corp_gold, corp_test, label=tag, log_file=None),
    ))

Load corpus
[====] 3006                                                        
Corpus has been loaded: 3006 sentences, 58201 tokens
Load corpus
[====] 3006                                                        
Corpus has been loaded: 3006 sentences, 58201 tokens
Address: 0.8687350835322196
Age: 0.9747447608812466
City: 0.9107142857142857
Date: 0.9185867895545314
Department: 0.26136363636363635
Designation: 0.7242152466367713
Facility: 0.0
Geo: 0.6814159292035398
Goal: 0.7220701667081364
Location: 0.0
Money: 0.9542760372565622
Organization: 0.7089715536105032
Period: 0.7673755755316817
Person: 0.9527896995708155
Personproperty: 0.8291457286432161
Phone: 0.927710843373494
Sequence: 0.07692307692307693
Term: 0.5151515151515151
Time: 0.7350427350427351


In [57]:
import gc

# Drop the tagger and run a garbage-collection pass. gc.collect() is the last
# expression of the cell, so its return value is echoed as the cell output.
del tagger
gc.collect()

0