## UPOS

In [1]:
from corpuscula.corpus_utils import download_ud, UniversalDependencies, \
                                    AdjustedForSpeech
import junky
from mordl import UposTagger

# Pretrained BERT model that the training cell fine-tunes for word embeddings.
BERT_MODEL_FN = 'bert-base-multilingual-cased'
# Name under which the trained UPOS tagger is saved and later reloaded.
MODEL_FN = 'upos_model'
# Random seed passed to the training call.
SEED=42

# SynTagRus is enabled here for real model training. To only try the pipeline
# on UD Taiga as an example, comment SynTagRus and uncomment Taiga.
#corpus_name = 'UD_Russian-Taiga'
corpus_name = 'UD_Russian-SynTagRus'
# NOTE(review): overwrite=False presumably keeps an already downloaded copy
# of the corpus - confirm in the corpuscula docs.
download_ud(corpus_name, overwrite=False)
corpus = UniversalDependencies(corpus_name)
# Optional wrapper, disabled in this run.
#corpus = AdjustedForSpeech(corpus)

Downloading UniversalDependencies/UD_Russian-SynTagRus contents
done: 9525 bytes


In [2]:
# Train the UPOS tagger: the train split is used for fitting and the dev
# split is loaded as the test corpus for the training run.
tagger = UposTagger()
tagger.load_train_corpus(corpus.train)
tagger.load_test_corpus(corpus.dev)

# BERT fine-tuning settings. The path of the tuned embeddings is built from
# these values and SEED, so the name cannot drift from the parameters that
# were actually used (it was a hard-coded '_len512_ep3_bat8_seed42' before).
bert_tune_params = {
    'model_name': BERT_MODEL_FN,
    'max_len': 512, 'epochs': 3, 'batch_size': 8
}
bert_emb_path = '{}_len{}_ep{}_bat{}_seed{}'.format(
    MODEL_FN.replace('model', BERT_MODEL_FN),
    bert_tune_params['max_len'], bert_tune_params['epochs'],
    bert_tune_params['batch_size'], SEED
)

# NOTE: device='cuda:8' is a hard-coded GPU index; change it to a device
# that exists on your machine.
_ = tagger.train(MODEL_FN, device='cuda:8', word_emb_type='bert',
                 word_emb_path=bert_emb_path,
                 word_emb_tune_params=bert_tune_params,
                 rnn_emb_dim=None, cnn_emb_dim=None, cnn_kernels=range(1, 7),
                 lstm_layers=3, seed=SEED)

Train: Load corpus
Corpus has been loaded: 48814 sentences, 871526 tokens
Test: Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


=== UPOS TAGGER TRAINING PIPELINE ===

BERT MODEL TUNING 'bert-base-multilingual-cased'. The result's model name will be 'upos_bert-base-multilingual-cased_len512_ep3_bat8_seed42'
Loading tokenizer...
Tokenizer is loaded. Vocab size: 119547
Corpora processing... done.
Loading model 'bert-base-multilingual-cased'... done.
Epoch 1: 100%|██████████| 48814/48814 [12:32<00:00, 64.86it/s, train_loss=0.0226] 
Average train loss: 0.08670417480997196
Dev: accuracy = 0.98225193
Dev: precision = 0.96434810
Dev: recall = 0.88591372
Dev: f1_score = 0.89677961
NB: Scores may be high because of tags stretching
Saving model to upos_bert-base-multilingual-cased_len512_ep3_bat8_seed42
Epoch 2: 100%|██████████| 48814/48814 [12:24<00:00, 65.54it/s, train_loss=0.00107] 
Average train loss: 0.03311146485173351
Dev: accuracy = 0.98572055
Dev: precision = 0.96505439
Dev: recall = 0.93086512
Dev: f1_score = 0.94336020
NB: Scores may be high because of tags stretching
Saving model to upos_bert-base-multilingual

In [3]:
# Files that receive the dev / test predictions of this tagger.
res_dev, res_test = (
    prefix + MODEL_FN.replace('_model', '.conllu')
    for prefix in ('corpora/_dev_', 'corpora/_test_')
)

# Start from a fresh tagger object and restore the trained model from disk.
tagger = UposTagger()
tagger.load(MODEL_FN)

# NOTE(review): presumably resets progress bars left over from training -
# confirm in the junky docs.
junky.clear_tqdm()

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [4]:
# Tag the dev split and save the predictions to res_dev for the file-based
# evaluation below. `_ =` keeps the return value out of the cell output.
# NOTE(review): with clone_ds=True the log shows "Vectorizing"/"Adjusting"
# stages instead of "Processing corpus" - confirm its meaning in MorDL docs.
_ = tagger.predict(corpus.dev, clone_ds=True, save_to=res_dev)

Load corpus

Vectorizing
100%|██████████| 103/103 [00:10<00:00,  9.68it/s]
Adjusting
100%|██████████| 6584/6584 [00:23<00:00, 275.34it/s]



Corpus has been loaded: 6584 sentences, 118692 tokens


In [5]:
# Tag the dev split on the fly and score the result against its gold UPOS tags.
_ = tagger.evaluate(corpus.dev)

Evaluating UPOS


Load corpus
[=====> 5500                                                       

Processing corpus
  0%|          | 0/6584 [00:00<?, ?it/s]

Corpus has been loaded: 6584 sentences, 118692 tokens


100%|██████████| 6584/6584 [01:08<00:00, 95.47it/s] 
UPOS total: 118488
   correct: 117279
     wrong: 1209
  Accuracy: 0.9897964350820336


In [6]:
# Score the predictions saved in res_dev against the gold dev split; the
# numbers are expected to match the on-the-fly evaluation of the same split.
_ = tagger.evaluate(corpus.dev, res_dev)

Evaluating UPOS
Load corpus
[> 700                                                            

Load corpus
[> 700                                                            

[==> 2400                                                          

[==> 2400                                                          

[===> 3700                                                         

[=====> 5200                                                       

[=====> 5900                                                       

[=====> 5900                                                       

Corpus has been loaded: 6584 sentences, 118692 tokens
UPOS total: 118488
   correct: 117279
     wrong: 1209
  Accuracy: 0.9897964350820336


Corpus has been loaded: 6584 sentences, 118692 tokens


In [7]:
# Tag the test split and save the predictions to res_test.
_ = tagger.predict(corpus.test, save_to=res_test)

Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Processing corpus
100%|██████████| 6491/6491 [01:07<00:00, 95.92it/s] 


In [8]:
# Tag the test split on the fly and score it against the gold UPOS tags.
# NOTE(review): clone_ds=True switches the log to "Vectorizing"/"Adjusting"
# stages - confirm its exact meaning in the MorDL docs.
_ = tagger.evaluate(corpus.test, clone_ds=True)

Evaluating UPOS


Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Vectorizing
100%|██████████| 102/102 [00:10<00:00,  9.74it/s]
Adjusting
100%|██████████| 6491/6491 [00:23<00:00, 274.60it/s]
UPOS total: 117329
   correct: 116337
     wrong: 992
  Accuracy: 0.9915451422921869


In [9]:
# Score the predictions saved in res_test against the gold test split.
_ = tagger.evaluate(corpus.test, res_test)

Evaluating UPOS
Load corpus
[=> 1200                                                           

Load corpus
[=> 1200                                                           

[==> 2300                                                          

[==> 2300                                                          

[===> 3500                                                         

[====> 5000                                                        

Corpus has been loaded: 6491 sentences, 117523 tokens
UPOS total: 117329
   correct: 116337
     wrong: 992
  Accuracy: 0.9915451422921869


Corpus has been loaded: 6491 sentences, 117523 tokens


In [10]:
# Materialize the gold test corpus and the saved predictions as lists,
# because both are read again for every tag in the loop below.
corp_gold = list(corpus.test())
corp_test = list(tagger._get_corpus(res_test))

# All non-empty UPOS values in the gold corpus. The first element of each
# corpus item is iterated as the sequence of token mappings.
tags = sorted({
    token['UPOS']
    for sentence in corp_gold
    for token in sentence[0]
    if token['UPOS']
})

# One evaluation per tag, restricted to that tag via `label`.
for tag in tags:
    tag_score = tagger.evaluate(corp_gold, corp_test,
                                label=tag, log_file=None)
    print('{}: {}'.format(tag, tag_score))

Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens
ADJ: 0.9703829634931997
ADP: 0.9993054952686865
ADV: 0.9627454270850814
AUX: 0.9485500467726847
CCONJ: 0.9847182425978988
DET: 0.9493670886075949
INTJ: 0.5
NOUN: 0.9901998262380539
NUM: 0.9709558823529412
PART: 0.9644970414201184
PRON: 0.97059400117624
PROPN: 0.9657308009909166
PUNCT: 0.9996726830636865
SCONJ: 0.9476190476190476
SYM: 1.0
VERB: 0.982405140758874
X: 0.6875


## FEATS

In [11]:
from corpuscula.corpus_utils import download_ud, UniversalDependencies, \
                                    AdjustedForSpeech
import junky
from mordl import FeatsTagger

# Pretrained BERT model that the training cell fine-tunes for word embeddings.
BERT_MODEL_FN = 'bert-base-multilingual-cased'
# Name under which the trained FEATS tagger is saved and later reloaded.
MODEL_FN = 'feats_model'
# Random seed passed to the training call.
SEED=42

# SynTagRus is enabled here for real model training. To only try the pipeline
# on UD Taiga as an example, comment SynTagRus and uncomment Taiga.
#corpus_name = 'UD_Russian-Taiga'
corpus_name = 'UD_Russian-SynTagRus'
# NOTE(review): overwrite=False presumably keeps an already downloaded copy
# of the corpus - confirm in the corpuscula docs.
download_ud(corpus_name, overwrite=False)
corpus = UniversalDependencies(corpus_name)
# Optional wrapper, disabled in this run.
#corpus = AdjustedForSpeech(corpus)

Downloading UniversalDependencies/UD_Russian-SynTagRus contents
done: 9525 bytes


In [12]:
# Train the FEATS tagger: the train split is used for fitting and the dev
# split is loaded as the test corpus for the training run.
tagger = FeatsTagger()
tagger.load_train_corpus(corpus.train)
tagger.load_test_corpus(corpus.dev)

# BERT fine-tuning settings. The path of the tuned embeddings is built from
# these values and SEED, so the name cannot drift from the parameters that
# were actually used (it was a hard-coded '_len512_ep3_bat8_seed42' before).
bert_tune_params = {
    'model_name': BERT_MODEL_FN,
    'max_len': 512, 'epochs': 3, 'batch_size': 8
}
bert_emb_path = '{}_len{}_ep{}_bat{}_seed{}'.format(
    MODEL_FN.replace('model', BERT_MODEL_FN),
    bert_tune_params['max_len'], bert_tune_params['epochs'],
    bert_tune_params['batch_size'], SEED
)

# NOTE: device='cuda:8' is a hard-coded GPU index; change it to a device
# that exists on your machine.
_ = tagger.train(MODEL_FN, device='cuda:8', word_emb_type='bert',
                 word_emb_path=bert_emb_path,
                 word_emb_tune_params=bert_tune_params,
                 rnn_emb_dim=None, cnn_emb_dim=None, cnn_kernels=range(1, 7),
                 upos_emb_dim=200, lstm_layers=3, seed=SEED)

Train: Load corpus
Corpus has been loaded: 48814 sentences, 871526 tokens
Test: Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


=== FEATS TAGGER TRAINING PIPELINE ===

BERT MODEL TUNING 'bert-base-multilingual-cased'. The result's model name will be 'feats_bert-base-multilingual-cased_len512_ep3_bat8_seed42'
Loading tokenizer...
Tokenizer is loaded. Vocab size: 119547
Corpora processing... done.
Loading model 'bert-base-multilingual-cased'... done.
Epoch 1: 100%|██████████| 48814/48814 [12:28<00:00, 65.23it/s, train_loss=0.203] 
Average train loss: 0.48633251200094013


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Dev: accuracy = 0.94740854
Dev: precision = 0.61015736
Dev: recall = 0.63154997
Dev: f1_score = 0.60670891
NB: Scores may be high because of tags stretching
Saving model to feats_bert-base-multilingual-cased_len512_ep3_bat8_seed42
Epoch 2: 100%|██████████| 48814/48814 [12:28<00:00, 65.25it/s, train_loss=0.177]  
Average train loss: 0.13621492406871671
Dev: accuracy = 0.96035391
Dev: precision = 0.72256610
Dev: recall = 0.74398063
Dev: f1_score = 0.72458306
NB: Scores may be high because of tags stretching
Saving model to feats_bert-base-multilingual-cased_len512_ep3_bat8_seed42
Epoch 3: 100%|██████████| 48814/48814 [12:28<00:00, 65.18it/s, train_loss=0.0404] 
Average train loss: 0.07456367332809576
Dev: accuracy = 0.96498032
Dev: precision = 0.76500494
Dev: recall = 0.77045950
Dev: f1_score = 0.75925111
NB: Scores may be high because of tags stretching
Saving model to feats_bert-base-multilingual-cased_len512_ep3_bat8_seed42

DATASETS CREATION
Tokenizing
100%|██████████| 48814/48814 [0

In [13]:
# Files that receive the dev / test predictions of this tagger.
res_dev, res_test = (
    prefix + MODEL_FN.replace('_model', '.conllu')
    for prefix in ('corpora/_dev_', 'corpora/_test_')
)

# Start from a fresh tagger object and restore the trained model from disk.
tagger = FeatsTagger()
tagger.load(MODEL_FN)

# NOTE(review): presumably resets progress bars left over from training -
# confirm in the junky docs.
junky.clear_tqdm()

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [14]:
# Tag the dev split and save the predictions to res_dev for the file-based
# evaluation below. `_ =` keeps the return value out of the cell output.
# NOTE(review): with clone_ds=True the log shows "Vectorizing"/"Adjusting"
# stages instead of "Processing corpus" - confirm its meaning in MorDL docs.
_ = tagger.predict(corpus.dev, clone_ds=True, save_to=res_dev)

Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Vectorizing
100%|██████████| 103/103 [00:10<00:00,  9.65it/s]
Adjusting
100%|██████████| 6584/6584 [00:23<00:00, 280.21it/s]


In [15]:
# Tag the dev split on the fly and score the result against its gold FEATS;
# the report counts both whole tokens and individual feature tags.
_ = tagger.evaluate(corpus.dev)

Evaluating FEATS


Load corpus
[==> 2700                                                          

Processing corpus
  0%|          | 0/6584 [00:00<?, ?it/s]

Corpus has been loaded: 6584 sentences, 118692 tokens


100%|██████████| 6584/6584 [01:09<00:00, 94.79it/s] 
FEATS total: 118488 tokens, 287191 tags
    correct: 116483 tokens, 284084 tags
      wrong: 2005 tokens, 3107 tags [471 excess / 467 absent / 2169 wrong type]
   Accuracy: 0.972497325176264 / 0.9891814158521681
[Total accuracy: 0.9830784552022146 / 0.9891814158521681]


In [16]:
# Score the predictions saved in res_dev against the gold dev split; the
# numbers are expected to match the on-the-fly evaluation of the same split.
_ = tagger.evaluate(corpus.dev, res_dev)

Evaluating FEATS
Load corpus
[> 600                                                            

Load corpus
[> 600                                                            

[==> 3000                                                          

[==> 3000                                                          

[====> 4100                                                        

[====> 4100                                                        

[=====> 5200                                                       

[=====> 5200                                                       

[=====> 5900                                                       

[=====> 5900                                                       

Corpus has been loaded: 6584 sentences, 118692 tokens
FEATS total: 118488 tokens, 287191 tags
    correct: 116483 tokens, 284084 tags
      wrong: 2005 tokens, 3107 tags [471 excess / 467 absent / 2169 wrong type]
   Accuracy: 0.972497325176264 / 0.9891814158521681
[Total accuracy: 0.9830784552022146 / 0.9891814158521681]


Corpus has been loaded: 6584 sentences, 118692 tokens


In [17]:
# Tag the test split and save the predictions to res_test.
_ = tagger.predict(corpus.test, save_to=res_test)

Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Processing corpus
100%|██████████| 6491/6491 [01:07<00:00, 95.57it/s] 


In [18]:
# Tag the test split on the fly and score it against the gold FEATS.
# NOTE(review): clone_ds=True switches the log to "Vectorizing"/"Adjusting"
# stages - confirm its exact meaning in the MorDL docs.
_ = tagger.evaluate(corpus.test, clone_ds=True)

Evaluating FEATS


Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Vectorizing
100%|██████████| 102/102 [00:10<00:00,  9.71it/s]
Adjusting
100%|██████████| 6491/6491 [00:23<00:00, 277.58it/s]
FEATS total: 117329 tokens, 285121 tags
    correct: 115310 tokens, 281884 tags
      wrong: 2019 tokens, 3237 tags [465 excess / 553 absent / 2219 wrong type]
   Accuracy: 0.9720627101523475 / 0.9886469253404695
[Total accuracy: 0.982791978112828 / 0.9886469253404695]


In [19]:
# Score the predictions saved in res_test against the gold test split.
_ = tagger.evaluate(corpus.test, res_test)

Evaluating FEATS
Load corpus
[> 900                                                            

Load corpus
[> 800                                                            

[==> 2200                                                          

[==> 2500                                                          

[===> 3400                                                         

[===> 3700                                                         

Corpus has been loaded: 6491 sentences, 117523 tokens
FEATS total: 117329 tokens, 285121 tags
    correct: 115310 tokens, 281884 tags
      wrong: 2019 tokens, 3237 tags [465 excess / 553 absent / 2219 wrong type]
   Accuracy: 0.9720627101523475 / 0.9886469253404695
[Total accuracy: 0.982791978112828 / 0.9886469253404695]


Corpus has been loaded: 6491 sentences, 117523 tokens


In [20]:
# Materialize the gold test corpus and the saved predictions as lists,
# because both are read again for every feature in the loop below.
corp_gold = list(corpus.test())
corp_test = list(tagger._get_corpus(res_test))

# All feature names used in the gold corpus. The first element of each
# corpus item is iterated as the sequence of token mappings.
tags = sorted({
    feat_name
    for sentence in corp_gold
    for token in sentence[0]
    for feat_name in token['FEATS'].keys()
})

# One evaluation per feature name, restricted to it via `feats`.
for tag in tags:
    feat_score = tagger.evaluate(corp_gold, corp_test,
                                 feats=tag, log_file=None)
    print('{}: {}'.format(tag, feat_score))

Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens
Animacy: 0.9842530775514803
Aspect: 0.9900425716141136
Case: 0.9822159329942249
Degree: 0.9970380901605355
Foreign: 0.7090909090909091
Gender: 0.9849553168812003
Mood: 0.9962381057756141
Number: 0.9931442663378545
Person: 0.9970106761565837
Polarity: 0.8695652173913043
Tense: 0.9942892834835371
Variant: 0.9962756052141527
VerbForm: 0.9981239627678765
Voice: 0.9887437766072589


## LEMMA
Note: For lemmata, consider using *FastText* word embeddings instead of *BERT* (see the notebook `mordl-ft.ipynb`). Accuracy is much higher.

In [21]:
from corpuscula.corpus_utils import download_ud, UniversalDependencies, \
                                    AdjustedForSpeech
import junky
from mordl import LemmaTagger

# Pretrained BERT model that the training cell fine-tunes for word embeddings.
BERT_MODEL_FN = 'bert-base-multilingual-cased'
# Name under which the trained LEMMA tagger is saved and later reloaded.
MODEL_FN = 'lemma_model'
# Random seed passed to the training call.
SEED=42

# SynTagRus is enabled here for real model training. To only try the pipeline
# on UD Taiga as an example, comment SynTagRus and uncomment Taiga.
#corpus_name = 'UD_Russian-Taiga'
corpus_name = 'UD_Russian-SynTagRus'
# NOTE(review): overwrite=False presumably keeps an already downloaded copy
# of the corpus - confirm in the corpuscula docs.
download_ud(corpus_name, overwrite=False)
corpus = UniversalDependencies(corpus_name)
# Optional wrapper, disabled in this run.
#corpus = AdjustedForSpeech(corpus)

Downloading UniversalDependencies/UD_Russian-SynTagRus contents
done: 9525 bytes


In [22]:
# Train the LEMMA tagger: the train split is used for fitting and the dev
# split is loaded as the test corpus for the training run.
tagger = LemmaTagger()
tagger.load_train_corpus(corpus.train)
tagger.load_test_corpus(corpus.dev)

# BERT fine-tuning settings. The path of the tuned embeddings is built from
# these values and SEED, so the name cannot drift from the parameters that
# were actually used (it was a hard-coded '_len512_ep3_bat8_seed42' before).
bert_tune_params = {
    'model_name': BERT_MODEL_FN,
    'max_len': 512, 'epochs': 3, 'batch_size': 8
}
bert_emb_path = '{}_len{}_ep{}_bat{}_seed{}'.format(
    MODEL_FN.replace('model', BERT_MODEL_FN),
    bert_tune_params['max_len'], bert_tune_params['epochs'],
    bert_tune_params['batch_size'], SEED
)

# NOTE: device='cuda:8' is a hard-coded GPU index; change it to a device
# that exists on your machine.
_ = tagger.train(MODEL_FN, device='cuda:8', word_emb_type='bert',
                 word_emb_path=bert_emb_path,
                 word_emb_tune_params=bert_tune_params,
                 rnn_emb_dim=None, cnn_emb_dim=None, cnn_kernels=range(1, 7),
                 upos_emb_dim=300, lstm_layers=3, seed=SEED)

Train: Load corpus
Corpus has been loaded: 48814 sentences, 871526 tokens
Test: Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Parse corpus
[#################################################] 100%           
done: 48814 sentences, 862287 acceptable tokens (plus 321 for YO letters)
Fit corpus dict... done.

Preliminary trainset preparation:
stage 1 of 3... done.
stage 2 of 3... done.
Lengths: [1883 {'allow_replace': True, 'allow_copy': True},
          1881 {'allow_replace': True, 'allow_copy': False},
          1915 {'allow_replace': False, 'allow_copy': True},
          1881 {'allow_replace': False, 'allow_copy': False}]
min = {'allow_replace': True, 'allow_copy': False}
stage 3 of 3... done.

=== LEMMA TAGGER TRAINING PIPELINE ===

BERT MODEL TUNING 'bert-base-multilingual-cased'. The result's model name will be 'lemma_bert-base-multilingual-cased_len512_ep3_bat8_seed42'
Loading tokenizer...
Tokenizer is loaded. Vocab size: 119547
Corpora processing... done.
Loading model 'bert-base-multilingual-cased'... done.
Epoch 1: 100%|██████████| 48814/48814 [12:27<00:00, 65.28it/s, train_loss=0.407] 
Average train lo

In [23]:
# Files that receive the dev / test predictions of this tagger.
res_dev, res_test = (
    prefix + MODEL_FN.replace('_model', '.conllu')
    for prefix in ('corpora/_dev_', 'corpora/_test_')
)

# Start from a fresh tagger object and restore the trained model from disk.
tagger = LemmaTagger()
tagger.load(MODEL_FN)

# NOTE(review): presumably resets progress bars left over from training -
# confirm in the junky docs.
junky.clear_tqdm()

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [24]:
# Lemmatize the dev split and save the predictions to res_dev for the
# file-based evaluation below. `_ =` keeps the return value out of the output.
# NOTE(review): with clone_ds=True the log shows "Vectorizing"/"Adjusting"
# stages instead of "Processing corpus" - confirm its meaning in MorDL docs.
_ = tagger.predict(corpus.dev, clone_ds=True, save_to=res_dev)

Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Vectorizing
100%|██████████| 103/103 [00:10<00:00,  9.56it/s]
Adjusting
100%|██████████| 6584/6584 [00:23<00:00, 282.71it/s]


In [25]:
# Lemmatize the dev split on the fly and score the result against its gold
# lemmata.
_ = tagger.evaluate(corpus.dev)

Evaluating LEMMA


Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens


Processing corpus
100%|██████████| 6584/6584 [01:09<00:00, 94.38it/s] 
LEMMA total: 118488
    correct: 117459
      wrong: 1029
   Accuracy: 0.9913155762608872


In [26]:
# Score the predictions saved in res_dev against the gold dev split; the
# numbers are expected to match the on-the-fly evaluation of the same split.
_ = tagger.evaluate(corpus.dev, res_dev)

Evaluating LEMMA
Load corpus
[> 900                                                            

Load corpus
[> 900                                                            

[=> 1600                                                           

[=> 1600                                                           

[===> 3100                                                         

[===> 3100                                                         

[====> 5000                                                        

[====> 5000                                                        





Corpus has been loaded: 6584 sentences, 118692 tokens
LEMMA total: 118488
    correct: 117459
      wrong: 1029
   Accuracy: 0.9913155762608872


Corpus has been loaded: 6584 sentences, 118692 tokens


In [27]:
# Lemmatize the test split and save the predictions to res_test.
_ = tagger.predict(corpus.test, save_to=res_test)

Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Processing corpus
100%|██████████| 6491/6491 [01:08<00:00, 95.04it/s] 


In [28]:
# Lemmatize the test split on the fly and score it against the gold lemmata.
# NOTE(review): clone_ds=True switches the log to "Vectorizing"/"Adjusting"
# stages - confirm its exact meaning in the MorDL docs.
_ = tagger.evaluate(corpus.test, clone_ds=True)

Evaluating LEMMA


Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


Vectorizing
100%|██████████| 102/102 [00:10<00:00,  9.65it/s]
Adjusting
100%|██████████| 6491/6491 [00:23<00:00, 278.95it/s]
LEMMA total: 117329
    correct: 116310
      wrong: 1019
   Accuracy: 0.9913150201569945


In [29]:
# Score the predictions saved in res_test against the gold test split.
_ = tagger.evaluate(corpus.test, res_test)

Evaluating LEMMA
Load corpus
[> 700                                                            

Load corpus
[> 700                                                            

[=> 1300                                                           

[=> 1300                                                           

[==> 2300                                                          

[==> 2200                                                          

[===> 3100                                                         

[==> 3000                                                          

[====> 4700                                                        

[====> 4700                                                        

[=====> 5800                                                       

[=====> 5800                                                       

Corpus has been loaded: 6491 sentences, 117523 tokens
LEMMA total: 117329
    correct: 116310
      wrong: 1019
   Accuracy: 0.9913150201569945


Corpus has been loaded: 6491 sentences, 117523 tokens


## CoNLL18 Validation

In [30]:
from corpuscula.corpus_utils import download_ud, get_ud_test_path
import junky
from mordl import UposTagger, FeatsTagger, LemmaTagger, conll18_ud_eval

# SynTagRus is enabled here, matching the corpus selected in the training
# sections of this notebook. To only try the pipeline on UD Taiga as an
# example, comment SynTagRus and uncomment Taiga.
#corpus_name = 'UD_Russian-Taiga'
corpus_name = 'UD_Russian-SynTagRus'
# NOTE(review): overwrite=False presumably keeps an already downloaded copy
# of the corpus - confirm in the corpuscula docs.
download_ud(corpus_name, overwrite=False)

# Gold test file of the corpus, and the file that receives the output of the
# UPOS -> FEATS -> LEMMA tagger chain.
corpus_gold = get_ud_test_path(corpus_name)
corpus_test = 'corpora/_test_tagged.conllu'

Downloading UniversalDependencies/UD_Russian-SynTagRus contents
done: 9525 bytes


In [31]:
# The same hard-coded GPU is used for `device` and `dataset_device` of all
# three taggers; change the index to a device that exists on your machine.
load_kwargs = dict(device='cuda:11', dataset_device='cuda:11')

# Restore the three models trained in the sections above.
tagger_u = UposTagger()
tagger_u.load('upos_model', **load_kwargs)

tagger_f = FeatsTagger()
tagger_f.load('feats_model', **load_kwargs)

tagger_l = LemmaTagger()
tagger_l.load('lemma_model', **load_kwargs)

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.
Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.
Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [32]:
# Chain the taggers: each stage receives the output of the previous one, and
# only the last stage writes the result to disk.
upos_tagged = tagger_u.predict(corpus_gold)
feats_tagged = tagger_f.predict(upos_tagged)
_ = tagger_l.predict(feats_tagged, save_to=corpus_test)

Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens
Processing corpus
100%|██████████| 6491/6491 [01:07<00:00, 96.49it/s] 
Processing corpus
100%|██████████| 6491/6491 [01:07<00:00, 95.95it/s] 
Processing corpus
100%|██████████| 6491/6491 [01:08<00:00, 94.37it/s] 


In [33]:
# Compare the tagged file with the gold test file and print the metric table.
# NOTE(review): only UPOS, UFeats, Lemmas and the metrics combining them are
# informative here. The tagged file is produced from the gold file, so the
# columns these taggers do not predict (XPOS, UAS, LAS, ...) presumably carry
# over unchanged, which would explain their 100.00 scores - confirm.
conll18_ud_eval(corpus_gold, corpus_test)

Metric     | Precision |    Recall |  F1 Score | AligndAcc
-----------+-----------+-----------+-----------+-----------
Tokens     |    100.00 |    100.00 |    100.00 |
Sentences  |    100.00 |    100.00 |    100.00 |
Words      |    100.00 |    100.00 |    100.00 |
UPOS       |     99.15 |     99.15 |     99.15 |     99.15
XPOS       |    100.00 |    100.00 |    100.00 |    100.00
UFeats     |     97.75 |     97.75 |     97.75 |     97.75
AllTags    |     97.55 |     97.55 |     97.55 |     97.55
Lemmas     |     98.57 |     98.57 |     98.57 |     98.57
UAS        |    100.00 |    100.00 |    100.00 |    100.00
LAS        |    100.00 |    100.00 |    100.00 |    100.00
CLAS       |    100.00 |    100.00 |    100.00 |    100.00
MLAS       |     96.22 |     96.22 |     96.22 |     96.22
BLEX       |     97.83 |     97.83 |     97.83 |     97.83


## MISC:NE

Note: The corpora we used are proprietary. You will have to find other corpora.

In [34]:
from mordl import UposTagger, FeatsTagger

# Restore the UPOS and FEATS models trained in the sections above.
tagger_u = UposTagger()
tagger_u.load('upos_model')
tagger_f = FeatsTagger()
tagger_f.load('feats_model')

# (source corpus, destination file) pairs for the train / dev / test parts.
ner_corpora = [
    ('corpora/ner_train.conllu', 'corpora/ner_train_upos_feats.conllu'),
    ('corpora/ner_dev.conllu', 'corpora/ner_dev_upos_feats.conllu'),
    ('corpora/ner_test.conllu', 'corpora/ner_test_upos_feats.conllu'),
]

# Add UPOS, then FEATS, to every NER corpus and save the result.
for source_fn, tagged_fn in ner_corpora:
    upos_tagged = tagger_u.predict(source_fn)
    tagger_f.predict(upos_tagged, save_to=tagged_fn)

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.
Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.
Load corpus
Corpus has been loaded: 30390 sentences, 378829 tokens
Processing corpus
100%|██████████| 30390/30390 [03:54<00:00, 129.38it/s]
Processing corpus
100%|██████████| 30390/30390 [03:55<00:00, 128.91it/s]
Load corpus
[====] 3799                                                        
Corpus has been loaded: 3799 sentences, 47280 tokens
Processing corpus
100%|██████████| 3799/3799 [00:29<00:00, 128.57it/s]
Processing corpus
100%|██████████| 3799/3799 [00:29<00:00, 128.80it/s]
Load corpus
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
Processing corpus
100%|██████████| 3798/3798 [00:29<00:00, 129.88it/s]
Processing corpus
100%|██████████| 3798/3798 [00:29<00:00, 127.57it/s]


In [35]:
import junky
from mordl import NeTagger

# Pretrained BERT model that the training cell fine-tunes for word embeddings.
BERT_MODEL_FN = 'bert-base-multilingual-cased'
# Name under which the trained MISC:NE tagger is saved and later reloaded.
MODEL_FN = 'misc-ne_model'
# Random seed passed to the training call.
SEED=42

# NER corpora with UPOS and FEATS already filled in; these are the files
# written by the tagger_u / tagger_f loop earlier in this section.
corpus_train = 'corpora/ner_train_upos_feats.conllu'
corpus_dev = 'corpora/ner_dev_upos_feats.conllu'
corpus_test = 'corpora/ner_test_upos_feats.conllu'

In [36]:
# Train the MISC:NE tagger: the train file is used for fitting and the dev
# file is loaded as the test corpus for the training run.
tagger = NeTagger()
tagger.load_train_corpus(corpus_train)
tagger.load_test_corpus(corpus_dev)

# BERT fine-tuning settings. The path of the tuned embeddings is built from
# these values and SEED, so the name cannot drift from the parameters that
# were actually used (it was a hard-coded '_len512_ep3_bat8_seed42' before).
bert_tune_params = {
    'model_name': BERT_MODEL_FN,
    'max_len': 512, 'epochs': 3, 'batch_size': 8
}
bert_emb_path = '{}_len{}_ep{}_bat{}_seed{}'.format(
    MODEL_FN.replace('model', BERT_MODEL_FN),
    bert_tune_params['max_len'], bert_tune_params['epochs'],
    bert_tune_params['batch_size'], SEED
)

# NOTE: device='cuda:8' is a hard-coded GPU index; change it to a device
# that exists on your machine.
_ = tagger.train(MODEL_FN, device='cuda:8', word_emb_type='bert',
                 word_emb_path=bert_emb_path,
                 word_emb_tune_params=bert_tune_params,
                 rnn_emb_dim=None, cnn_emb_dim=None, cnn_kernels=range(1, 7),
                 upos_emb_dim=300, lstm_layers=2, seed=SEED)

Train: 

Load corpus
Corpus has been loaded: 30390 sentences, 378829 tokens


Test: 

Load corpus
[====] 3799                                                        
Corpus has been loaded: 3799 sentences, 47280 tokens
Parse corpus
[###############################] 100%                             
done: 30390 sentences, 362786 acceptable tokens (plus 0 for YO letters)
Fit corpus dict... done.
=== MISC:NE TAGGER TRAINING PIPELINE ===

BERT MODEL TUNING 'bert-base-multilingual-cased'. The result's model name will be 'misc-ne_bert-base-multilingual-cased_len512_ep3_bat8_seed42'
Loading tokenizer...
Tokenizer is loaded. Vocab size: 119547
Corpora processing... done.
Loading model 'bert-base-multilingual-cased'... done.
Epoch 1: 100%|██████████| 30390/30390 [07:06<00:00, 71.19it/s, train_loss=0.506] 
Average train loss: 0.22115723773073204
Dev: accuracy = 0.94016071
Dev: precision = 0.64285216
Dev: recall = 0.60400200
Dev: f1_score = 0.60815842
NB: Scores may be high because of tags stretching
Saving model to misc-ne_bert-base-multilingual-cased_len512_ep3_bat8_seed42
Epoch

In [37]:
# Files that receive the dev / test predictions of this tagger.
res_dev, res_test = (
    prefix + MODEL_FN.replace('_model', '.conllu')
    for prefix in ('corpora/_dev_', 'corpora/_test_')
)

# Start from a fresh tagger object and restore the trained model from disk.
tagger = NeTagger()
tagger.load(MODEL_FN)

# NOTE(review): presumably resets progress bars left over from training -
# confirm in the junky docs.
junky.clear_tqdm()

Loading dataset... done.
Creating model... done.
Loading state_dict... done.
Fit corpus dict... done.


In [38]:
# Tag the dev corpus with named entities and save the predictions to res_dev
# for the file-based evaluation below. `_ =` keeps the return value out of
# the cell output.
# NOTE(review): confirm the exact meaning of clone_ds=True in the MorDL docs.
_ = tagger.predict(corpus_dev, clone_ds=True, save_to=res_dev)

Load corpus
[====] 3799                                                        
Corpus has been loaded: 3799 sentences, 47280 tokens


In [40]:
# Tag the dev corpus on the fly and score the result against its gold
# MISC:NE labels.
_ = tagger.evaluate(corpus_dev)

Evaluating MISC:NE
Load corpus
[====] 3799                                                        
Corpus has been loaded: 3799 sentences, 47280 tokens
Processing corpus
100%|██████████| 3799/3799 [00:28<00:00, 132.82it/s]
MISC:NE total: 17041
      correct: 14346
        wrong: 2695 [914 excess / 1027 absent / 754 wrong type]
     Accuracy: 0.8418520039903762
[Total accuracy: 0.9429991539763113]


In [41]:
# Score the predictions saved in res_dev against the gold dev corpus; the
# numbers are expected to match the on-the-fly evaluation of the same corpus.
_ = tagger.evaluate(corpus_dev, res_dev)

Evaluating MISC:NE
Load corpus
Load corpus
[====] 3799                                                        
Corpus has been loaded: 3799 sentences, 47280 tokens
[====] 3799                                                        
Corpus has been loaded: 3799 sentences, 47280 tokens
MISC:NE total: 17041
      correct: 14346
        wrong: 2695 [914 excess / 1027 absent / 754 wrong type]
     Accuracy: 0.8418520039903762
[Total accuracy: 0.9429991539763113]


In [42]:
# Tag the test corpus with named entities and save the predictions to
# res_test.
_ = tagger.predict(corpus_test, save_to=res_test)

Load corpus
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
Processing corpus
100%|██████████| 3798/3798 [00:28<00:00, 133.49it/s]


In [43]:
# Tag the test corpus on the fly and score it against the gold MISC:NE labels.
# NOTE(review): confirm the exact meaning of clone_ds=True in the MorDL docs.
_ = tagger.evaluate(corpus_test, clone_ds=True)

Evaluating MISC:NE
Load corpus
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
MISC:NE total: 16722
      correct: 14096
        wrong: 2626 [1027 excess / 950 absent / 649 wrong type]
     Accuracy: 0.8429613682573854
[Total accuracy: 0.9442770445189492]


In [44]:
# Score the predictions saved in res_test against the gold test corpus.
_ = tagger.evaluate(corpus_test, res_test)

Evaluating MISC:NE
Load corpus
Load corpus
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
MISC:NE total: 16722
      correct: 14096
        wrong: 2626 [1027 excess / 950 absent / 649 wrong type]
     Accuracy: 0.8429613682573854
[Total accuracy: 0.9442770445189492]


In [45]:
# Materialize the gold test corpus and the saved predictions as lists,
# because both are read again for every label in the loop below.
corp_gold = list(tagger._get_corpus(corpus_test, asis=True))
corp_test = list(tagger._get_corpus(res_test))

# All non-empty NE labels stored in the MISC field of the gold tokens. The
# first element of each corpus item is iterated as the token sequence.
tags = sorted({
    ne_label
    for ne_label in (token['MISC'].get('NE')
                     for sentence in corp_gold
                     for token in sentence[0])
    if ne_label
})

# One evaluation per NE label, restricted to that label via `label`.
for tag in tags:
    label_score = tagger.evaluate(corp_gold, corp_test,
                                  label=tag, log_file=None)
    print('{}: {}'.format(tag, label_score))

Load corpus
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
Load corpus
[====] 3798                                                        
Corpus has been loaded: 3798 sentences, 47126 tokens
Address: 0.8411097099621689
City: 0.0
Date: 0.0
Department: 0.77129750982962
Facility: 0.3817427385892116
Geo: 0.8568019093078759
Goal: 0.4030054644808743
Location: 0.38461538461538464
Organization: 0.8547898752573574
Person: 0.7269076305220884
PersonProperty: 0.7540603248259861
Phone: 0.7158469945355191
Time: 0.0
