# Benchmark evaluation

## I. Evaluate algorithms on the benchmark data 

In [1]:
from helper_utils import RecallEstimator

# Benchmark description file; the estimator loads the evaluation data from it
# and collects the results of every tagger evaluated in this notebook.
benchmark_description = 'amundsen_01/data_description.csv'
r_estimator = RecallEstimator(benchmark_description)

Loaded evaluation benchmark of size 535.


### I.I. EstNLTK's default model

In [2]:
from estnltk.taggers import NerTagger

# EstNLTK's default NER tagger; output_layer sets the name of the layer it produces.
nertagger = NerTagger(output_layer='estnltk_ner')
# Evaluate the tagger on the benchmark and register the result under the given
# leaderboard name. The returned summary (recall, its 95% confidence interval,
# correct / incorrect counts) is displayed as the cell output.
r_estimator.evaluate_tagger(nertagger, "EstNLTK-default-NER")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 535/535 [00:54<00:00,  9.90it/s]


{'Recall': 0.5892016803782091,
 'Recall-95CI%': (0.5442531601445354, 0.6341502006118828),
 'correct': 344,
 'incorrect': 191}

In [3]:
# Show the results collected so far (only the default EstNLTK model at this point).
r_estimator.leaderboard()

Unnamed: 0,Recall,Recall-95CI%,correct,incorrect
EstNLTK-default-NER,0.589202,"(0.5442531601445354, 0.6341502006118828)",344,191


### I.II. Kairit's model I

#### A. Use EstNLTK's words layer as a basis of tokenization

In [4]:
from estnltk_neural.taggers import EstBERTNERTagger

# Evaluate neural NER aligned with EstNLTK's words layer:
# custom_words_layer='words' makes the tagger use that layer as the basis of tokenization.
# No model_location is passed here, so the tagger falls back to its own default model.
kairit_11 = EstBERTNERTagger(output_layer='estbertner_v1_estnltk_words', custom_words_layer='words')
# Register the result on the leaderboard under the given name; the returned summary is the cell output.
r_estimator.evaluate_tagger(kairit_11, "EstBertNER-v1-estnltk_words")

  from .autonotebook import tqdm as notebook_tqdm
Downloading resources index: 20.1kB [00:00, 10.5MB/s]
Some weights of the model checkpoint at C:\Programmid\Miniconda3\envs\py39_devel\lib\site-packages\estnltk-1.7.2-py3.9-win-amd64.egg\estnltk\estnltk_resources\estbert\ner_model_hf_tartunlp_2022-05-06\ were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████████████

{'Recall': 0.7393959687879113,
 'Recall-95CI%': (0.6992909997792455, 0.779500937796577),
 'correct': 423,
 'incorrect': 112}

#### B. Use BERT's default tokenization

In [5]:
from estnltk_neural.taggers import EstBERTNERTagger

# Evaluate neural NER with its own (BERT's) tokenization:
# custom_words_layer=None means no EstNLTK words layer is used as the basis.
# No model_location is passed here, so the tagger falls back to its own default model.
kairit_12 = EstBERTNERTagger(output_layer='estbertner_v1_bert_tokens', custom_words_layer=None)
# Register the result on the leaderboard under the given name; the returned summary is the cell output.
r_estimator.evaluate_tagger(kairit_12, "EstBertNER-v1-bert_tokens")

Some weights of the model checkpoint at C:\Programmid\Miniconda3\envs\py39_devel\lib\site-packages\estnltk-1.7.2-py3.9-win-amd64.egg\estnltk\estnltk_resources\estbert\ner_model_hf_tartunlp_2022-05-06\ were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 535/535 [00:51<00:00, 10.30it/s]


{'Recall': 0.5986107200179038,
 'Recall-95CI%': (0.5538265828949592, 0.6433948571408484),
 'correct': 345,
 'incorrect': 190}

### I.III. Kairit's model II

#### A. Use EstNLTK's words layer as a basis of tokenization

In [6]:
from estnltk_neural.taggers import EstBERTNERTagger
from estnltk import get_resource_paths

# Resolve the location of the latest "estbertner_v2" resource
# (download_missing=True asks for it to be fetched when it is not available locally).
estbertner_v2_location = get_resource_paths(
    "estbertner_v2",
    only_latest=True,
    download_missing=True,
)

# Second model, aligned with EstNLTK's words layer as the basis of tokenization.
kairit_21 = EstBERTNERTagger(
    output_layer='estbertner_v2_estnltk_words',
    custom_words_layer='words',
    model_location=estbertner_v2_location,
)

# Evaluate and register the result under the given leaderboard name.
r_estimator.evaluate_tagger(kairit_21, "EstBertNER-v2-estnltk_words")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 535/535 [00:58<00:00,  9.22it/s]


{'Recall': 0.6205693492710501,
 'Recall-95CI%': (0.576236009719034, 0.6649026888230662),
 'correct': 321,
 'incorrect': 214}

#### B. Use BERT's default tokenization

In [7]:
# Import the tagger in this cell too, so the cell does not rely on an earlier
# cell having been executed first.
from estnltk_neural.taggers import EstBERTNERTagger
from estnltk import get_resource_paths

# Second model with its own (BERT's) tokenization: custom_words_layer=None means
# no EstNLTK words layer is used as the basis. The model location is resolved
# from the latest "estbertner_v2" resource (download_missing=True asks for it
# to be fetched when it is not available locally).
kairit_22 = EstBERTNERTagger(output_layer='estbertner_v2_bert_tokens', custom_words_layer=None,
                             model_location=get_resource_paths("estbertner_v2", only_latest=True, download_missing=True))
# Evaluate and register the result under the given leaderboard name;
# the returned summary is displayed as the cell output.
r_estimator.evaluate_tagger(kairit_22, "EstBertNER-v2-bert_tokens")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 535/535 [01:03<00:00,  8.45it/s]


{'Recall': 0.27861353796148314,
 'Recall-95CI%': (0.23765408170807967, 0.3195729942148866),
 'correct': 131,
 'incorrect': 404}

## II. Summarise results

In [8]:
# Show all evaluated taggers side by side; in the output below the rows are
# ordered by descending recall.
r_estimator.leaderboard()

Unnamed: 0,Recall,Recall-95CI%,correct,incorrect
EstBertNER-v1-estnltk_words,0.739396,"(0.6992909997792455, 0.779500937796577)",423,112
EstBertNER-v2-estnltk_words,0.620569,"(0.576236009719034, 0.6649026888230662)",321,214
EstBertNER-v1-bert_tokens,0.598611,"(0.5538265828949592, 0.6433948571408484)",345,190
EstNLTK-default-NER,0.589202,"(0.5442531601445354, 0.6341502006118828)",344,191
EstBertNER-v2-bert_tokens,0.278614,"(0.23765408170807967, 0.3195729942148866)",131,404


In [9]:
# Keep the default row ordering (no sorting by recall) ...
leaderboard_default_order = r_estimator.leaderboard(order_by_recall=False)
# ... and save the table to CSV
leaderboard_default_order.to_csv('leaderboard_amundsen_01.csv')

---