# Benchmark evaluation

## I. Evaluate algorithms on the benchmark data 

In [1]:
from helper_utils import RecallEstimator

# Description file of the evaluation benchmark used throughout this notebook
benchmark_description = 'amundsen_02/data_description.csv'

# Shared estimator: every tagger below is evaluated through this one object,
# which is also what produces the leaderboard tables
r_estimator = RecallEstimator(benchmark_description)

Loaded evaluation benchmark of size 1685.


### I.I. EstNLTK's default model

In [2]:
from estnltk.taggers import NerTagger

# EstNLTK's default NER tagger, writing its annotations to the 'estnltk_ner' layer
nertagger = NerTagger(output_layer='estnltk_ner')

# Evaluate and file the result under the name used in the leaderboard
default_ner_scores = r_estimator.evaluate_tagger(nertagger, "EstNLTK-default-NER")
default_ner_scores

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1685/1685 [03:00<00:00,  9.35it/s]


{'Recall': 0.5863719147123075,
 'Recall-95CI%': (0.5556871420680223, 0.6170566873565928),
 'correct': 775,
 'incorrect': 910}

In [3]:
# Show the results collected so far (only the default EstNLTK tagger at this point)
r_estimator.leaderboard()

Unnamed: 0,Recall,Recall-95CI%,correct,incorrect
EstNLTK-default-NER,0.586372,"(0.5556871420680223, 0.6170566873565928)",775,910


### I.II. Kairit's model I

#### A. Use EstNLTK's words layer as a basis of tokenization

In [4]:
from estnltk_neural.taggers import EstBERTNERTagger

# Neural NER (model I) whose output is aligned with EstNLTK's 'words' layer.
# No model_location is passed, so the tagger falls back to its default model.
kairit_11 = EstBERTNERTagger(
    output_layer='estbertner_v1_estnltk_words',
    custom_words_layer='words',
)

# Evaluate and file the result under the name used in the leaderboard
kairit_11_scores = r_estimator.evaluate_tagger(kairit_11, "EstBertNER-v1-estnltk_words")
kairit_11_scores

  from .autonotebook import tqdm as notebook_tqdm
Downloading resources index: 20.1kB [00:00, 20.1MB/s]
Some weights of the model checkpoint at C:\Programmid\Miniconda3\envs\py39_devel\lib\site-packages\estnltk-1.7.2-py3.9-win-amd64.egg\estnltk\estnltk_resources\estbert\ner_model_hf_tartunlp_2022-05-06\ were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 20%|██████████████████████████████████                                                      

ERROR:estbertner_tagger.py:204: Skipping [UNK] token with tag 'I-ORG' from annotations.


 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 1428/1685 [02:33<00:42,  6.08it/s]

ERROR:estbertner_tagger.py:204: Skipping [UNK] token with tag 'B-PER' from annotations.
ERROR:estbertner_tagger.py:204: Skipping [UNK] token with tag 'I-PER' from annotations.
ERROR:estbertner_tagger.py:204: Skipping [UNK] token with tag 'I-PER' from annotations.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1685/1685 [03:01<00:00,  9.30it/s]


{'Recall': 0.7641942790662142,
 'Recall-95CI%': (0.7377452106929332, 0.7906433474394952),
 'correct': 1257,
 'incorrect': 428}

#### B. Use Bert's default tokenization

In [5]:
from estnltk_neural.taggers import EstBERTNERTagger

# Neural NER (model I) on its own (Bert's) tokenization:
# custom_words_layer=None means no EstNLTK words layer is used as the basis.
kairit_12 = EstBERTNERTagger(
    output_layer='estbertner_v1_bert_tokens',
    custom_words_layer=None,
)

# Evaluate and file the result under the name used in the leaderboard
kairit_12_scores = r_estimator.evaluate_tagger(kairit_12, "EstBertNER-v1-bert_tokens")
kairit_12_scores

Some weights of the model checkpoint at C:\Programmid\Miniconda3\envs\py39_devel\lib\site-packages\estnltk-1.7.2-py3.9-win-amd64.egg\estnltk\estnltk_resources\estbert\ner_model_hf_tartunlp_2022-05-06\ were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 20%|██████████████████████████████████                                                                                                                                       | 339/1685 [00:39<03:12

ERROR:estbertner_tagger.py:204: Skipping [UNK] token with tag 'I-ORG' from annotations.


 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 1428/1685 [02:45<00:42,  6.12it/s]

ERROR:estbertner_tagger.py:204: Skipping [UNK] token with tag 'B-PER' from annotations.
ERROR:estbertner_tagger.py:204: Skipping [UNK] token with tag 'I-PER' from annotations.
ERROR:estbertner_tagger.py:204: Skipping [UNK] token with tag 'I-PER' from annotations.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1685/1685 [03:22<00:00,  8.33it/s]


{'Recall': 0.6201637929998809,
 'Recall-95CI%': (0.5899237291790453, 0.6504038568207164),
 'correct': 1048,
 'incorrect': 637}

### I.III. Kairit's model II

#### A. Use EstNLTK's words layer as a basis of tokenization

In [6]:
from estnltk_neural.taggers import EstBERTNERTagger
from estnltk import get_resource_paths

# Resolve the latest "estbertner_v2" resource, downloading it when it is missing locally
estbertner_v2_location = get_resource_paths("estbertner_v2", only_latest=True, download_missing=True)

# Neural NER (model II) whose output is aligned with EstNLTK's 'words' layer
kairit_21 = EstBERTNERTagger(
    output_layer='estbertner_v2_estnltk_words',
    custom_words_layer='words',
    model_location=estbertner_v2_location,
)

# NOTE(review): ignore_errors=True presumably lets the evaluation continue past tagger
# failures such as the "Cannot find bert_token's ... text location" messages printed
# in this cell's output -- confirm against helper_utils.
kairit_21_scores = r_estimator.evaluate_tagger(kairit_21, "EstBertNER-v2-estnltk_words", ignore_errors=True)
kairit_21_scores

 ("(!) Cannot find bert_token's 'y' text location at 'es ” ( APIM ) ) kuni 31. detsembrini 2000 ;'.", "in the 'EstBERTNERTagger'")
 ("(!) Cannot find bert_token's 'lait' text location at ' Laïta jõe suudme vaheline rannikuala ,'.", "in the 'EstBERTNERTagger'")
 ("(!) Cannot find bert_token's 'zo' text location at ' Žórarinn Eldjárn ; Soome - Joni Pyysalo , Peter Mickwitz ; Rootsi - Ida Börjel , Simon Marainen ; Norra - Anne Bøe ; Taani - Peter Laugesen ; Fääri saared - Sigri Gaini ; Läti - Kārlis Vērdiņš ; Leedu - Birute Mar ( Marcinkeviciute ) ; Saksamaa - Klavki ; Peterburi - Valeri Shubinski ; Eesti - Doris Kareva , Eeva Park , Triin Soomets , Elo Viiding , Merca , Indrek Hirv , Hando Runnel , Priidu Beier , fs , Kalju Kruusa .'.", "in the 'EstBERTNERTagger'")
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1685/1685 [03:34<00:00,  7.84it/s]


{'Recall': 0.5952331757385971,
 'Recall-95CI%': (0.564650369113671, 0.6258159823635233),
 'correct': 815,
 'incorrect': 870}

#### B. Use Bert's default tokenization

In [7]:
# Import the tagger here as well (as the other EstBERT cells do), so that this cell
# does not silently depend on an earlier cell having been executed.
from estnltk_neural.taggers import EstBERTNERTagger
from estnltk import get_resource_paths

# Resolve the latest "estbertner_v2" resource, downloading it when it is missing locally
estbertner_v2_location = get_resource_paths("estbertner_v2", only_latest=True, download_missing=True)

# Neural NER (model II) on its own (Bert's) tokenization:
# custom_words_layer=None means no EstNLTK words layer is used as the basis.
kairit_22 = EstBERTNERTagger(
    output_layer='estbertner_v2_bert_tokens',
    custom_words_layer=None,
    model_location=estbertner_v2_location,
)

# NOTE(review): ignore_errors=True presumably lets the evaluation continue past tagger
# failures such as the "Cannot find bert_token's ... text location" messages printed
# in this cell's output -- confirm against helper_utils.
r_estimator.evaluate_tagger(kairit_22, "EstBertNER-v2-bert_tokens", ignore_errors=True)

 ("(!) Cannot find bert_token's 'y' text location at 'es ” ( APIM ) ) kuni 31. detsembrini 2000 ;'.", "in the 'EstBERTNERTagger'")
 ("(!) Cannot find bert_token's 'lait' text location at ' Laïta jõe suudme vaheline rannikuala ,'.", "in the 'EstBERTNERTagger'")
 ("(!) Cannot find bert_token's 'zo' text location at ' Žórarinn Eldjárn ; Soome - Joni Pyysalo , Peter Mickwitz ; Rootsi - Ida Börjel , Simon Marainen ; Norra - Anne Bøe ; Taani - Peter Laugesen ; Fääri saared - Sigri Gaini ; Läti - Kārlis Vērdiņš ; Leedu - Birute Mar ( Marcinkeviciute ) ; Saksamaa - Klavki ; Peterburi - Valeri Shubinski ; Eesti - Doris Kareva , Eeva Park , Triin Soomets , Elo Viiding , Merca , Indrek Hirv , Hando Runnel , Priidu Beier , fs , Kalju Kruusa .'.", "in the 'EstBERTNERTagger'")
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1685/1685 [03:19<00:00,  8.45it/s]


{'Recall': 0.23890777215371042,
 'Recall-95CI%': (0.21233939043790626, 0.26547615386951456),
 'correct': 412,
 'incorrect': 1273}

## II. Summarise results

In [8]:
# Final comparison of all evaluated taggers; the table displayed below is in
# descending order of recall (best model first)
r_estimator.leaderboard()

Unnamed: 0,Recall,Recall-95CI%,correct,incorrect
EstBertNER-v1-estnltk_words,0.764194,"(0.7377452106929332, 0.7906433474394952)",1257,428
EstBertNER-v1-bert_tokens,0.620164,"(0.5899237291790453, 0.6504038568207164)",1048,637
EstBertNER-v2-estnltk_words,0.595233,"(0.564650369113671, 0.6258159823635233)",815,870
EstNLTK-default-NER,0.586372,"(0.5556871420680223, 0.6170566873565928)",775,910
EstBertNER-v2-bert_tokens,0.238908,"(0.21233939043790626, 0.26547615386951456)",412,1273


In [9]:
# Persist the results: request the leaderboard without recall-based sorting
# (i.e. in its default ordering) and write it to a CSV file
leaderboard_csv = 'leaderboard_amundsen_02.csv'
leaderboard_default_order = r_estimator.leaderboard(order_by_recall=False)
leaderboard_default_order.to_csv(leaderboard_csv)

---