# Evaluation Comparison

## eCoNLL 

In [1]:
import os
import sys
# Put the sibling ../src directory at the front of the import search path.
sys.path.insert(0, os.path.abspath(os.path.join('..', 'src')))

In [2]:
from econll.parser import merge_tag, parse, merge

In [3]:
from econll.aligner import align
from econll.rebaser import rebase

In [4]:
from econll.scorer import chunkeval, tokeneval

## Data

In [5]:
import nltk

In [6]:
# Make sure the CoNLL-2002 corpus is available to NLTK; the call reports
# "already up-to-date" when the package has been downloaded before.
nltk.download('conll2002')

[nltk_data] Downloading package conll2002 to /Users/eas/nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


True

In [7]:
from nltk.corpus import conll2002

In [8]:
# Load the 'esp.testa' split as sentences of (token, IOB tag) pairs; the POS
# column returned by `iob_sents` is dropped.
# The 'esp.train' split could be loaded the same way but is not used here:
# trn = [[(text, iob) for text, pos, iob in sent] for sent in conll2002.iob_sents('esp.train')]
tst = [
    [(word, tag) for word, _pos, tag in sentence]
    for sentence in conll2002.iob_sents('esp.testa')
]

In [9]:
# Reference data, split into parallel per-sentence views.
# tokens of every sentence
tst_tok = [[word for word, _ in sentence] for sentence in tst]
# tags of every sentence
tst_tag = [[label for _, label in sentence] for sentence in tst]
# each sentence as a single space-joined string
tst_txt = [" ".join(words) for words in tst_tok]

## spaCy Predictions

In [10]:
import spacy

In [11]:
# One-time setup: uncomment to download the spaCy model loaded in the next cell.
# !python -m spacy download es_core_news_sm

In [12]:
# Load the `es_core_news_sm` spaCy pipeline; the model package must already be
# installed (see the commented-out download command in the previous cell).
nlp = spacy.load("es_core_news_sm")

In [13]:
# Run spaCy over every test sentence and keep (token text, tag) pairs.
# An empty `ent_type_` is replaced by None before it is handed, together with
# the token's IOB flag, to `merge_tag`.
tst_hyp = [
    [(token.text, merge_tag((token.ent_type_ or None), token.ent_iob_)) for token in nlp(text)]
    for text in tst_txt
]

In [14]:
# Compare input & output: count the sentences whose spaCy token count differs
# from the reference token count.
diff = sum(
    len(ref_sent) != len(hyp_sent)
    for ref_sent, hyp_sent in zip(tst, tst_hyp)
)
print(f"refs: {len(tst)} vs. hyps: {len(tst_hyp)} -> {diff} with different tokenization")

refs: 1915 vs. hyps: 1915 -> 132 with different tokenization


## Alignment & Transfer

In [15]:
# Split the spaCy output into parallel per-sentence token and tag lists.
hyp_tok = [[word for word, _ in sentence] for sentence in tst_hyp]
hyp_tag = [[label for _, label in sentence] for sentence in tst_hyp]

In [16]:
# Rebase the predicted tags of each sentence from the spaCy tokens onto the
# reference tokens, using the IOB scheme.
aln_tag = [
    rebase(ref_tokens, hyp_tokens, hyp_labels, scheme="IOB")
    for ref_tokens, hyp_tokens, hyp_labels in zip(tst_tok, hyp_tok, hyp_tag)
]

In [17]:
# Repeat the length check on the rebased tags: count the sentences whose tag
# sequence length differs from the reference sentence length.
diff = sum(
    len(ref_sent) != len(aligned_sent)
    for ref_sent, aligned_sent in zip(tst, aln_tag)
)
print(f"refs: {len(tst)} vs. hyps: {len(aln_tag)} -> {diff} with different tokenization")

refs: 1915 vs. hyps: 1915 -> 0 with different tokenization


In [18]:
# Display the rebased tag sequence of the first sentence.
aln_tag[0]

['B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O']

## Evaluation Comparison

In [19]:
from sklearn.metrics import classification_report as sklearn_report
from seqeval.metrics import classification_report as seqeval_report

### `scikit-learn`

In [20]:
# Per-tag report: scikit-learn works on flat label lists, so both the
# reference and the rebased tags are flattened across sentences first.
print(sklearn_report(
    y_true=[label for sentence in tst_tag for label in sentence],
    y_pred=[label for sentence in aln_tag for label in sentence],
    digits=4,
))

              precision    recall  f1-score   support

       B-LOC     0.4620    0.7419    0.5694       984
      B-MISC     0.2041    0.2876    0.2388       445
       B-ORG     0.7638    0.4982    0.6031      1700
       B-PER     0.6962    0.7201    0.7080      1222
       I-LOC     0.3400    0.7596    0.4697       337
      I-MISC     0.3087    0.2997    0.3041       654
       I-ORG     0.7556    0.3690    0.4958      1366
       I-PER     0.7958    0.8847    0.8379       859
           O     0.9817    0.9812    0.9814     45356

    accuracy                         0.9222     52923
   macro avg     0.5898    0.6158    0.5787     52923
weighted avg     0.9306    0.9222    0.9226     52923



### `seqeval`

In [21]:
# Per-entity-type report computed directly on the per-sentence tag lists.
print(seqeval_report(y_true=tst_tag, y_pred=aln_tag, digits=4))

              precision    recall  f1-score   support

         LOC     0.4563    0.7320    0.5622       985
        MISC     0.1707    0.2404    0.1996       445
         ORG     0.7142    0.4659    0.5639      1700
         PER     0.6875    0.7111    0.6991      1222

   micro avg     0.5434    0.5719    0.5573      4352
   macro avg     0.5072    0.5374    0.5062      4352
weighted avg     0.5927    0.5719    0.5642      4352



### `econll`

In [22]:
# Print econll's chunk-level evaluation of the rebased tags against the
# reference tags (per-label rows followed by summary rows).
print(chunkeval(tst_tag, aln_tag))


Chunk-Level Evaluation

label     	 pre  	 rec  	 f1s  	 gold 	 pred 	 true 

LOC       	0.4563	0.7320	0.5622	   985	  1580	   721
MISC      	0.1707	0.2404	0.1996	   445	   627	   107
ORG       	0.7142	0.4659	0.5639	  1700	  1109	   792
PER       	0.6875	0.7111	0.6991	  1222	  1264	   869

token     	0.9222	0.9222	0.9222	 52923	 52923	 48804
block     	0.4480	0.4480	0.4480	  1915	  1915	   858
spans     	0.7869	0.8281	0.8070	  4352	  4580	  3604
micro     	0.5434	0.5719	0.5573	  4352	  4580	  2489
macro     	0.5072	0.5374	0.5062	  4352	  4580	  2489
weighted  	0.5927	0.5719	0.5642	  4352	  4580	  2489



In [23]:
# Print econll's token-level evaluation of the rebased tags against the
# reference tags (per-tag rows followed by summary rows).
print(tokeneval(tst_tag, aln_tag))


Token-Level Evaluation

label     	 pre  	 rec  	 f1s  	 gold 	 pred 	 true 

B-LOC     	0.4620	0.7419	0.5694	   984	  1580	   730
B-MISC    	0.2041	0.2876	0.2388	   445	   627	   128
B-ORG     	0.7638	0.4982	0.6031	  1700	  1109	   847
B-PER     	0.6962	0.7201	0.7080	  1222	  1264	   880
I-LOC     	0.3400	0.7596	0.4697	   337	   753	   256
I-MISC    	0.3087	0.2997	0.3041	   654	   635	   196
I-ORG     	0.7556	0.3690	0.4958	  1366	   667	   504
I-PER     	0.7958	0.8847	0.8379	   859	   955	   760
O         	0.9817	0.9812	0.9814	 45356	 45333	 44503

block     	0.4480	0.4480	0.4480	  1915	  1915	   858
micro     	0.9222	0.9222	0.9222	 52923	 52923	 48804
macro     	0.5898	0.6158	0.5787	 52923	 52923	 48804
weighted  	0.9306	0.9222	0.9226	 52923	 52923	 48804

