# Benchmark evaluation

## I. Evaluate algorithms on the benchmark data

In [1]:
from helper_functions import load_evaluation_data
from helper_functions import evaluate_benchmark
from helper_functions import find_recall_estimate

# Load the benchmark (gold-standard) data once; every evaluation below is run
# against this same object. Its exact structure is defined in helper_functions.
gold_standard = load_evaluation_data()

In [2]:
# Sanity check: number of items in the loaded benchmark data.
len(gold_standard)

535

In [3]:
# Accumulates the recall estimate of each evaluated model, keyed by the model's
# leaderboard name; turned into a table in the summary section.
all_results = {}

### I.I. EstNLTK's default model

In [4]:
from estnltk.taggers import NerTagger

# EstNLTK's default NER tagger, writing its annotations to the 'estnltk_ner' layer.
nertagger = NerTagger(output_layer='estnltk_ner')
# Evaluate the tagger against the gold standard.
# NOTE: `results` is overwritten by every evaluation cell in this notebook, so
# the cells that read it must be run directly after the matching evaluation.
results = evaluate_benchmark(gold_standard, nertagger)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 535/535 [00:54<00:00,  9.77it/s]


In [5]:
# Tally the per-item outcomes ('yes' / 'no') for EstNLTK's default model.
results['correct'].value_counts()

correct
yes    344
no     191
Name: count, dtype: int64

In [6]:
# Store the recall estimate (point estimate plus 95% CI, see output) for the
# default model under its leaderboard name ...
all_results["EstNLTK-default-NER"] = find_recall_estimate(results)
# ... and echo the stored entry as the cell output.
all_results["EstNLTK-default-NER"]

{'Recall': 0.5875822776214714,
 'Recall-95CI%': (0.542378287169957, 0.6327862680729859)}

### I.II. Kairit's model I

#### A. Use EstNLTK's words layer as the basis for tokenization

In [7]:
from estnltk_neural.taggers import EstBERTNERTagger

# Evaluate on neural NER aligned with EstNLTK's words layer
# (custom_words_layer='words'). No model_location is given, so the tagger's
# default model is used; output goes to the 'estbertner_v1_estnltk_words' layer.
kairit_11 = EstBERTNERTagger(output_layer='estbertner_v1_estnltk_words', custom_words_layer='words')

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at C:\Programmid\Miniconda3\envs\py39_devel\lib\site-packages\estnltk-1.7.2-py3.9-win-amd64.egg\estnltk\estnltk_resources\estbert\ner_model_hf_tartunlp_2022-05-06\ were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Evaluate model I (EstNLTK words tokenization); overwrites the previous `results`.
results = evaluate_benchmark(gold_standard, kairit_11)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 535/535 [00:52<00:00, 10.18it/s]


In [9]:
# Tally the per-item outcomes ('yes' / 'no') for model I with EstNLTK words.
results['correct'].value_counts()

correct
yes    423
no     112
Name: count, dtype: int64

In [10]:
# Record the recall estimate for model I (EstNLTK words tokenization);
# relies on `results` still holding the kairit_11 evaluation.
all_results["Kairit-I-estnltk_words"] = find_recall_estimate(results)
# Echo the stored entry as the cell output.
all_results["Kairit-I-estnltk_words"]

{'Recall': 0.7378266049169795,
 'Recall-95CI%': (0.6974392663356724, 0.7782139434982865)}

#### B. Use BERT's default tokenization

In [11]:
from estnltk_neural.taggers import EstBERTNERTagger

# Evaluate on neural NER with its own tokenization: custom_words_layer=None
# means no EstNLTK words layer is supplied to the tagger. Output goes to the
# 'estbertner_v1_bert_tokens' layer.
kairit_12 = EstBERTNERTagger(output_layer='estbertner_v1_bert_tokens', custom_words_layer=None)

Some weights of the model checkpoint at C:\Programmid\Miniconda3\envs\py39_devel\lib\site-packages\estnltk-1.7.2-py3.9-win-amd64.egg\estnltk\estnltk_resources\estbert\ner_model_hf_tartunlp_2022-05-06\ were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Evaluate model I (BERT tokenization); overwrites the previous `results`.
results = evaluate_benchmark(gold_standard, kairit_12)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 535/535 [00:55<00:00,  9.69it/s]


In [13]:
# Tally the per-item outcomes ('yes' / 'no') for model I with BERT tokenization.
results['correct'].value_counts()

correct
yes    345
no     190
Name: count, dtype: int64

In [14]:
# Record the recall estimate for model I (BERT tokenization);
# relies on `results` still holding the kairit_12 evaluation.
all_results["Kairit-I-bert_tokens"] = find_recall_estimate(results)
# Echo the stored entry as the cell output.
all_results["Kairit-I-bert_tokens"]

{'Recall': 0.597255930126802,
 'Recall-95CI%': (0.5522190204013548, 0.6422928398522492)}

### I.III. Kairit's model II

#### A. Use EstNLTK's words layer as the basis for tokenization

In [15]:
from estnltk_neural.taggers import EstBERTNERTagger
from estnltk import get_resource_paths

# Model II: same tagger class, but the model is loaded from the "estbertner_v2"
# resource instead of the default location. Aligned with EstNLTK's words layer.
# NOTE(review): download_missing=True presumably fetches the resource when it is
# not available locally -- confirm against EstNLTK's resource utilities.
kairit_21 = EstBERTNERTagger(output_layer='estbertner_v2_estnltk_words', custom_words_layer='words', 
                             model_location=get_resource_paths("estbertner_v2", only_latest=True, download_missing=True))

In [16]:
# Evaluate model II (EstNLTK words tokenization); overwrites the previous `results`.
results = evaluate_benchmark(gold_standard, kairit_21)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 535/535 [01:05<00:00,  8.15it/s]


In [17]:
# Tally the per-item outcomes ('yes' / 'no') for model II with EstNLTK words.
results['correct'].value_counts()

correct
yes    321
no     214
Name: count, dtype: int64

In [18]:
# Record the recall estimate for model II (EstNLTK words tokenization);
# relies on `results` still holding the kairit_21 evaluation.
all_results["Kairit-II-estnltk_words"] = find_recall_estimate(results)
# Echo the stored entry as the cell output.
all_results["Kairit-II-estnltk_words"]

{'Recall': 0.6212461225635655,
 'Recall-95CI%': (0.5767026408929118, 0.6657896042342192)}

#### B. Use BERT's default tokenization

In [19]:
# Model II with the tagger's own tokenization (custom_words_layer=None), loaded
# from the "estbertner_v2" resource.
# The tagger class is imported here too, so that this cell does not silently
# depend on an earlier cell having been run (the other tagger cells in this
# notebook import it themselves as well).
from estnltk import get_resource_paths
from estnltk_neural.taggers import EstBERTNERTagger

# NOTE(review): download_missing=True presumably fetches the resource when it is
# not available locally -- confirm against EstNLTK's resource utilities.
kairit_22 = EstBERTNERTagger(output_layer='estbertner_v2_bert_tokens', custom_words_layer=None,
                             model_location=get_resource_paths("estbertner_v2", only_latest=True, download_missing=True))

In [20]:
# Evaluate model II (BERT tokenization); overwrites the previous `results`.
results = evaluate_benchmark(gold_standard, kairit_22)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 535/535 [01:04<00:00,  8.31it/s]


In [21]:
# Tally the per-item outcomes ('yes' / 'no') for model II with BERT tokenization.
results['correct'].value_counts()

correct
no     404
yes    131
Name: count, dtype: int64

In [22]:
# Record the recall estimate for model II (BERT tokenization);
# relies on `results` still holding the kairit_22 evaluation.
all_results["Kairit-II-bert_tokens"] = find_recall_estimate(results)
# Echo the stored entry as the cell output.
all_results["Kairit-II-bert_tokens"]

{'Recall': 0.27987202626926433,
 'Recall-95CI%': (0.23864724524359443, 0.32109680729493423)}

## II. Summarise results

In [25]:
# Output as CSV
import pandas as pd

# One row per model: with orient='index' the keys of all_results become the
# row index, and the keys of each per-model dict ('Recall', 'Recall-95CI%')
# become the columns.
leaderboard = pd.DataFrame.from_dict(all_results, orient='index')
display(leaderboard)
# NOTE(review): each 'Recall-95CI%' cell is a 2-tuple, so it is written to the
# CSV as a quoted string, and the index column is written without a header
# name. Readers of leaderboard.csv have to parse the tuple text themselves.
leaderboard.to_csv('leaderboard.csv')

Unnamed: 0,Recall,Recall-95CI%
EstNLTK-default-NER,0.587582,"(0.542378287169957, 0.6327862680729859)"
Kairit-I-estnltk_words,0.737827,"(0.6974392663356724, 0.7782139434982865)"
Kairit-I-bert_tokens,0.597256,"(0.5522190204013548, 0.6422928398522492)"
Kairit-II-estnltk_words,0.621246,"(0.5767026408929118, 0.6657896042342192)"
Kairit-II-bert_tokens,0.279872,"(0.23864724524359443, 0.32109680729493423)"


---