# Benchmark evaluation

## I. Evaluate algorithms on the benchmark data 

In [1]:
from helper_utils import RecallEstimator

# Build the recall estimator from the benchmark description file. Every tagger
# evaluated later in this notebook is scored through this one shared instance.
r_estimator = RecallEstimator('amundsen_01/data_description.csv')

Loaded evaluation benchmark of size 535.


### I.I. EstNLTK's default model

In [2]:
from estnltk.taggers import NerTagger

# Baseline: EstNLTK's default NER tagger, writing its output to the 'estnltk_ner' layer.
nertagger = NerTagger(output_layer='estnltk_ner')
# The second argument is the label under which this run is listed in the leaderboard.
r_estimator.evaluate_tagger(nertagger, "EstNLTK-default-NER")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 535/535 [00:54<00:00,  9.73it/s]


{'Recall': 0.5875822776214714,
 'Recall-95CI%': (0.542378287169957, 0.6327862680729859),
 'correct': 344,
 'incorrect': 191}

In [3]:
# Show the results collected so far (one row per evaluated tagger).
r_estimator.leaderboard()

Unnamed: 0,Recall,Recall-95CI%,correct,incorrect
EstNLTK-default-NER,0.587582,"(0.542378287169957, 0.6327862680729859)",344,191


### I.II. Kairit's model I

#### A. Use EstNLTK's words layer as a basis of tokenization

In [4]:
from estnltk_neural.taggers import EstBERTNERTagger

# Neural NER (model I), with its output aligned to EstNLTK's 'words' layer
kairit_11 = EstBERTNERTagger(
    output_layer='estbertner_v1_estnltk_words',
    custom_words_layer='words',
)
r_estimator.evaluate_tagger(kairit_11, "EstBertNER-v1-estnltk_words")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at C:\Programmid\Miniconda3\envs\py39_devel\lib\site-packages\estnltk-1.7.2-py3.9-win-amd64.egg\estnltk\estnltk_resources\estbert\ner_model_hf_tartunlp_2022-05-06\ were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

{'Recall': 0.7378266049169795,
 'Recall-95CI%': (0.6974392663356724, 0.7782139434982865),
 'correct': 423,
 'incorrect': 112}

#### B. Use Bert's default tokenization

In [5]:
from estnltk_neural.taggers import EstBERTNERTagger

# Neural NER (model I) on Bert's own tokenization: no custom words layer is supplied
kairit_12 = EstBERTNERTagger(
    output_layer='estbertner_v1_bert_tokens',
    custom_words_layer=None,
)
r_estimator.evaluate_tagger(kairit_12, "EstBertNER-v1-bert_tokens")

Some weights of the model checkpoint at C:\Programmid\Miniconda3\envs\py39_devel\lib\site-packages\estnltk-1.7.2-py3.9-win-amd64.egg\estnltk\estnltk_resources\estbert\ner_model_hf_tartunlp_2022-05-06\ were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 535/535 [00:54<00:00

{'Recall': 0.597255930126802,
 'Recall-95CI%': (0.5522190204013548, 0.6422928398522492),
 'correct': 345,
 'incorrect': 190}

### I.III. Kairit's model II

#### A. Use EstNLTK's words layer as a basis of tokenization

In [6]:
from estnltk_neural.taggers import EstBERTNERTagger
from estnltk import get_resource_paths

# Resolve the "estbertner_v2" resource (latest only; missing resources are downloaded)
estbertner_v2_location = get_resource_paths(
    "estbertner_v2",
    only_latest=True,
    download_missing=True,
)

# Neural NER (model II), with its output aligned to EstNLTK's 'words' layer
kairit_21 = EstBERTNERTagger(
    output_layer='estbertner_v2_estnltk_words',
    custom_words_layer='words',
    model_location=estbertner_v2_location,
)
r_estimator.evaluate_tagger(kairit_21, "EstBertNER-v2-estnltk_words")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 535/535 [01:07<00:00,  7.90it/s]


{'Recall': 0.6212461225635655,
 'Recall-95CI%': (0.5767026408929118, 0.6657896042342192),
 'correct': 321,
 'incorrect': 214}

#### B. Use Bert's default tokenization

In [7]:
from estnltk import get_resource_paths

# Resolve the "estbertner_v2" resource (latest only; missing resources are downloaded)
estbertner_v2_location = get_resource_paths(
    "estbertner_v2",
    only_latest=True,
    download_missing=True,
)

# Neural NER (model II) on Bert's own tokenization: no custom words layer is supplied.
# EstBERTNERTagger is expected to be in scope from an earlier cell's import.
kairit_22 = EstBERTNERTagger(
    output_layer='estbertner_v2_bert_tokens',
    custom_words_layer=None,
    model_location=estbertner_v2_location,
)
r_estimator.evaluate_tagger(kairit_22, "EstBertNER-v2-bert_tokens")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 535/535 [01:12<00:00,  7.41it/s]


{'Recall': 0.27987202626926433,
 'Recall-95CI%': (0.23864724524359443, 0.32109680729493423),
 'correct': 131,
 'incorrect': 404}

## II. Summarise results

In [8]:
# Display every evaluated tagger; the output below is ordered by recall, highest first.
r_estimator.leaderboard()

Unnamed: 0,Recall,Recall-95CI%,correct,incorrect
EstBertNER-v1-estnltk_words,0.737827,"(0.6974392663356724, 0.7782139434982865)",423,112
EstBertNER-v2-estnltk_words,0.621246,"(0.5767026408929118, 0.6657896042342192)",321,214
EstBertNER-v1-bert_tokens,0.597256,"(0.5522190204013548, 0.6422928398522492)",345,190
EstNLTK-default-NER,0.587582,"(0.542378287169957, 0.6327862680729859)",344,191
EstBertNER-v2-bert_tokens,0.279872,"(0.23864724524359443, 0.32109680729493423)",131,404


In [9]:
# Fetch the leaderboard in its default ordering (no sorting by recall), then write it to CSV
unsorted_leaderboard = r_estimator.leaderboard(order_by_recall=False)
unsorted_leaderboard.to_csv('leaderboard_amundsen_01.csv')

---