In [23]:
import sys; sys.path.insert(0, '../..')
import spacy
import stanza
from params import SPACY_MODEL_PATH, BERT_MODEL_PATH, DEVICE
from utils import init_model, bert_ner

# Show which device the project configuration (params.DEVICE) selected.
print('device: ', DEVICE)
# Load the spaCy Dutch NLP model from the configured path.
spacy_model = spacy.load(SPACY_MODEL_PATH)
# Load the stanza Dutch pipeline (the log below lists the processors it loads).
stanza_model = stanza.Pipeline(lang='nl')
# Load the BERT Dutch NER model and its tokenizer; the third value returned by
# init_model is discarded.
# NOTE(review): every later `evaluate(..., 'bert')` call uses this global
# `bert_model`, i.e. whichever checkpoint BERT_MODEL_PATH pointed to when this
# cell ran.
bert_model, bert_tokenizer, _ = init_model(BERT_MODEL_PATH)


device:  cuda


2020-05-08 10:50:36 INFO: Loading these models for language: nl (Dutch):
| Processor | Package |
-----------------------
| tokenize  | alpino  |
| pos       | alpino  |
| lemma     | alpino  |
| depparse  | alpino  |
| ner       | conll02 |

2020-05-08 10:50:36 INFO: Use device: gpu
2020-05-08 10:50:36 INFO: Loading: tokenize
2020-05-08 10:50:36 INFO: Loading: pos
2020-05-08 10:50:36 INFO: Loading: lemma
2020-05-08 10:50:36 INFO: Loading: depparse
2020-05-08 10:50:37 INFO: Loading: ner
2020-05-08 10:50:38 INFO: Done loading processors!


In [24]:
def convert_spacy_tags_to_standards(spacy_tags):
    """Map spaCy (IOB, entity-type) pairs onto PER / ORG / LOC / MISC tags.

    Args:
        spacy_tags: iterable of ``(iob, etype)`` pairs, e.g. ``('B', 'GPE')``.

    Returns:
        list of strings, one per input pair: ``'O'`` for tokens outside an
        entity or with an unmapped entity type, otherwise ``iob + '-' + label``
        (for example ``'B-LOC'``).
    """
    # spaCy type -> standard label. DATE, TIME, PERCENT, MONEY, QUANTITY,
    # ORDINAL and CARDINAL are deliberately not listed, so they end up as 'O'.
    standard_label = {
        'PERSON': 'PER',
        'ORG': 'ORG',
        'GPE': 'LOC',
        'LOC': 'LOC',
    }
    for misc_type in ('NORP', 'FAC', 'PRODUCT', 'EVENT',
                      'WORK_OF_ART', 'LAW', 'LANGUAGE'):
        standard_label[misc_type] = 'MISC'

    def to_standard(iob, etype):
        # Tokens outside any entity keep their 'O' marker untouched.
        if iob == 'O':
            return iob
        label = standard_label.get(etype)
        return 'O' if label is None else iob + '-' + label

    return [to_standard(iob, etype) for iob, etype in spacy_tags]
    
    

# The entity types spaCy can emit are listed at:
# https://spacy.io/api/annotation#named-entities
def spacy_ner(input_string):
    """Tag ``input_string`` with the global spaCy model.

    Args:
        input_string: raw text; spaCy applies its own tokenization.

    Returns:
        list with one standardised tag per spaCy token, as produced by
        ``convert_spacy_tags_to_standards``.
    """
    iob_and_type = []
    for token in spacy_model(input_string):
        iob_and_type.append((token.ent_iob_, token.ent_type_))
    return convert_spacy_tags_to_standards(iob_and_type)



def stanza_ner(input_string):
    """Tag ``input_string`` with the global stanza pipeline.

    Tags starting with ``E-`` are rewritten to ``I-`` and tags starting with
    ``S-`` to ``B-``; every other tag (including ``'O'``) is passed through.

    Args:
        input_string: raw text; stanza applies its own tokenization.

    Returns:
        flat list with one tag per stanza token, across all sentences.
    """
    prefix_rewrite = {'E-': 'I-', 'S-': 'B-'}
    bio_tags = []
    for sentence in stanza_model(input_string).sentences:
        for token in sentence.tokens:
            tag = token.ner
            prefix, remainder = tag[0:2], tag[2:]
            # Unlisted prefixes map to themselves, so the tag is unchanged.
            bio_tags.append(prefix_rewrite.get(prefix, prefix) + remainder)
    return bio_tags
    

def bert_ner_(words):
    """Tag a pre-tokenized sentence with the global BERT model and tokenizer.

    Args:
        words: sequence of word strings.

    Returns:
        whatever ``bert_ner`` returns for these words.
    """
    # bert_ner takes a label sequence as its fourth argument; no gold labels
    # exist at prediction time, so an all-'O' sequence of the same length is
    # passed instead.
    dummy_labels = ['O' for _unused in words]
    return bert_ner(bert_model, bert_tokenizer, words, dummy_labels)

In [25]:
# Candidate sentences for a quick side-by-side check of the three taggers.
# They used to be assigned to `s` one after another, so only the last
# assignment ever took effect; keeping them in a list makes the choice explicit.
example_sentences = [
    'Waar is Amsterdam UMC?',
    "Dat is in Italië , Spanje of Engeland misschien geen probleem , maar volgens ' Der Kaiser ' in Duitsland wel .",
    'De Europese Centrale Bank verhoogde gisteren haar centraal ...',
    'Vandaag is het haar 46e verjaardag.',
    'Er zijn 1 eend en 2 honden.',
    'Het is de vierde keer dat hij 200 euro verloor, met 15% rente, op 26 Juli.',
    'Hij spreekt Engels.',
]
# Change the index to try another sentence; -1 is the one used for the output below.
s = example_sentences[-1]

spacy_preds = spacy_ner(s)
print(f'spacy predictions ({len(spacy_preds)}): ', spacy_preds)

# The BERT tagger takes a word list, so reuse spaCy's tokenization for it.
doc = spacy_model(s)
words = [t.text for t in doc]
bert_preds = bert_ner_(words)
print(f'bert predictions ({len(bert_preds)}): ', bert_preds)

stanza_preds = stanza_ner(s)
print(f'stanza predictions ({len(stanza_preds)}): ', stanza_preds)


spacy predictions (4):  ['O', 'O', 'B-MISC', 'O']
bert predictions (4):  ['O', 'O', 'I-MISC', 'O']
stanza predictions (4):  ['O', 'O', 'B-MISC', 'O']


In [26]:
import re
import time
from tqdm import tqdm
from ner.evaluation.ner_evaluation.ner_eval import Evaluator
# from seqeval.metrics import f1_score, precision_score, recall_score



# Prepare datasets for evaluation.
def prepare_data(data_path):
    """Read a token-per-line annotation file into per-sentence (words, tags) pairs.

    Each non-blank line must hold a token and its tag separated by one or more
    spaces; only the first two columns are used. A blank line ends the current
    sentence. Lines containing ``-DOCSTART`` are skipped.

    Differences from the previous implementation:
      * the file is closed deterministically (``with`` block);
      * a final sentence that is not followed by a blank line is kept instead
        of being silently dropped;
      * empty sentences (consecutive blank lines, or the blank line that follows
        a ``-DOCSTART`` line) are no longer emitted as ``([], [])`` pairs;
      * Windows line endings and lines made only of spaces count as blank.

    Args:
        data_path: path of the text file to read.

    Returns:
        list of ``(words, tags)`` tuples; both members are lists of strings of
        equal length.

    Raises:
        ValueError: if a non-blank line has fewer than two columns.
    """
    pairs = []
    words = []
    tags = []
    with open(data_path, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            if '-DOCSTART' in line:
                continue
            # Split on spaces only (as before) and drop the empty fields that
            # repeated, leading or trailing spaces produce.
            parts = [field for field in line.rstrip('\r\n').split(' ') if field]
            if not parts:
                # Sentence boundary: flush only if something was collected.
                if words:
                    pairs.append((words, tags))
                    words, tags = [], []
                continue
            if len(parts) < 2:
                raise ValueError(
                    f'{data_path}:{line_no}: expected "<token> <tag>", got {line!r}')
            words.append(parts[0])
            tags.append(parts[1])

    # The file may end without a trailing blank line.
    if words:
        pairs.append((words, tags))

    return pairs
   

def _add_f1(scores):
    """Store the harmonic mean of scores['precision'] and scores['recall'] under 'f1'."""
    p = scores['precision']
    r = scores['recall']
    # Guard against 0/0 when both precision and recall are zero.
    scores['f1'] = 2 * (p * r) / (p + r) if (p + r) > 0 else 0


def ner_eval(true, pred, labels=('LOC', 'MISC', 'PER', 'ORG')):
    """Run the entity-level ``Evaluator`` and add an F1 score to every result.

    Args:
        true: gold tag sequences, one list per sentence.
        pred: predicted tag sequences, aligned with ``true``.
        labels: entity labels to evaluate. The default is an immutable tuple,
            and a fresh list copy is handed to ``Evaluator``, so neither the
            default nor a caller-supplied list can be modified through it.

    Returns:
        ``(results, results_agg)`` as returned by ``Evaluator.evaluate()``.
        ``results`` maps a scoring mode to a dict of scores, ``results_agg``
        maps a label to such a mapping; every score dict that carries
        'precision' and 'recall' also gets an 'f1' entry.
    """
    evaluator = Evaluator(true, pred, list(labels))
    results, results_agg = evaluator.evaluate()

    for scores in results.values():
        _add_f1(scores)

    for per_label in results_agg.values():
        for scores in per_label.values():
            _add_f1(scores)

    return results, results_agg



# mode = 'bert' | 'spacy' | 'stanza'
def evaluate(data_path, mode='bert'):
    """Score one NER tagger on an annotated file and time it per sentence.

    Args:
        data_path: file readable by ``prepare_data``.
        mode: which tagger to run: 'bert', 'spacy' or 'stanza'.

    Returns:
        dict with 'mode', 'data', 'avg_ms_per_sent' and
        'avg_num_words_per_sent', merged with both dicts returned by
        ``ner_eval`` (overall scores per scoring mode and scores per label).

    Raises:
        ValueError: if ``mode`` is not one of the three supported values
            (previously any other value silently ran the BERT tagger on a
            joined string), or if the file contains no sentences (previously
            a ZeroDivisionError at best).
    """
    ner_funcs = {'bert': bert_ner_, 'spacy': spacy_ner, 'stanza': stanza_ner}
    if mode not in ner_funcs:
        raise ValueError(
            f"unknown mode {mode!r}; expected one of {sorted(ner_funcs)}")
    ner_func = ner_funcs[mode]

    pairs = prepare_data(data_path)
    if not pairs:
        raise ValueError(f'no sentences found in {data_path!r}')

    ms_per_sent = []
    num_words_per_sent = []
    preds = []
    tags = []
    for sent_words, sent_tags in tqdm(pairs):
        num_words_per_sent.append(len(sent_words))
        # The BERT tagger takes the word list; spaCy and stanza take raw text
        # and tokenize it themselves.
        model_input = sent_words if mode == 'bert' else ' '.join(sent_words)

        # perf_counter is monotonic and sub-millisecond; the old code rounded
        # wall-clock time to whole milliseconds.
        start = time.perf_counter()
        sent_preds = ner_func(model_input)
        ms_per_sent.append((time.perf_counter() - start) * 1000)

        # A tagger's own tokenization can yield a different token count than
        # the gold annotation. As before, the longer sequence is cut at the end
        # so both have equal length (best-effort; no re-alignment is attempted).
        common_len = min(len(sent_preds), len(sent_tags))
        sent_preds = sent_preds[:common_len]
        sent_tags = sent_tags[:common_len]

        preds.append(sent_preds)
        tags.append(sent_tags)

    labels = ['LOC', 'MISC', 'PER', 'ORG']
    results, results_agg = ner_eval(tags, preds, labels)
    output = {
        'mode': mode,
        'data': data_path,
        'avg_ms_per_sent': sum(ms_per_sent) / len(ms_per_sent),
        'avg_num_words_per_sent': sum(num_words_per_sent) / len(num_words_per_sent)
    }
    output.update(results)
    output.update(results_agg)

    return output

In [5]:
# =================== demorgen2000, mini ==============================================================
# BERT checkpoint the author labels "all" (presumably trained on all datasets — TODO confirm),
# scored on the demorgen2000 mini split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'demorgen2000'
subset = 'mini' # mini, dev, test, train
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 241/241 [00:02<00:00, 85.34it/s]


{'mode': 'bert',
 'data': 'data/demorgen2000/mini.txt',
 'avg_ms_per_sent': 11.639004149377593,
 'avg_num_words_per_sent': 11.551867219917012,
 'ent_type': {'correct': 149,
  'incorrect': 13,
  'partial': 0,
  'missed': 5,
  'spurious': 7,
  'possible': 167,
  'actual': 169,
  'precision': 0.8816568047337278,
  'recall': 0.8922155688622755,
  'f1': 0.8869047619047618},
 'partial': {'correct': 161,
  'incorrect': 0,
  'partial': 1,
  'missed': 5,
  'spurious': 7,
  'possible': 167,
  'actual': 169,
  'precision': 0.9556213017751479,
  'recall': 0.9670658682634731,
  'f1': 0.9613095238095238},
 'strict': {'correct': 148,
  'incorrect': 14,
  'partial': 0,
  'missed': 5,
  'spurious': 7,
  'possible': 167,
  'actual': 169,
  'precision': 0.8757396449704142,
  'recall': 0.8862275449101796,
  'f1': 0.880952380952381},
 'exact': {'correct': 161,
  'incorrect': 1,
  'partial': 0,
  'missed': 5,
  'spurious': 7,
  'possible': 167,
  'actual': 169,
  'precision': 0.9526627218934911,
  'recall':

In [7]:
# =================== demorgen2000, test ==============================================================
# BERT checkpoint the author labels "all" (presumably trained on all datasets — TODO confirm),
# scored on the demorgen2000 test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'demorgen2000'
subset = 'test' # mini, dev, test, train
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 5195/5195 [01:02<00:00, 82.84it/s]


{'mode': 'bert',
 'data': 'data/demorgen2000/test.txt',
 'strict': {'f1': 0.8715148031043404,
  'precision': 0.868021757801317,
  'recall': 0.8750360750360751},
 'exact': {'f1': 0.9473986777809715,
  'precision': 0.943601488691669,
  'recall': 0.9512265512265512},
 'type': {'f1': 0.8792756539235412,
  'precision': 0.875751503006012,
  'recall': 0.8828282828282829},
 'partial': {'f1': 0.9525725783271055,
  'precision': 0.9487546521614658,
  'recall': 0.9564213564213564},
 'avg_ms_per_sent': 12.007314725697785,
 'avg_num_words_per_sent': 13.257940327237728}

In [5]:
# BERT checkpoint the author labels "demorgen2000" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the demorgen2000 test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'demorgen2000'
subset = 'test' # mini, dev, test, train
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 5195/5195 [01:04<00:00, 80.90it/s]


{'mode': 'bert',
 'data': 'data/demorgen2000/test.txt',
 'f1': 0.9042853477033617,
 'precision': 0.896236012207528,
 'recall': 0.9124805800103574,
 'avg_ms_per_sent': 12.292974013474495,
 'avg_num_words_per_sent': 13.257940327237728}

In [23]:
# BERT checkpoint the author labels "wikiner" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the demorgen2000 test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'demorgen2000'
subset = 'test' # mini, dev, test, train
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 5195/5195 [01:04<00:00, 80.84it/s]


{'mode': 'bert',
 'data': 'data/demorgen2000/test.txt',
 'f1': 0.7248223474638569,
 'precision': 0.687906976744186,
 'recall': 0.7659243915069912,
 'avg_ms_per_sent': 12.30567853705486,
 'avg_num_words_per_sent': 13.257940327237728}

In [7]:
# BERT checkpoint the author labels "parliamentary" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the demorgen2000 test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'demorgen2000'
subset = 'test' # mini, dev, test, train
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 5195/5195 [01:02<00:00, 82.70it/s]


{'mode': 'bert',
 'data': 'data/demorgen2000/test.txt',
 'f1': 0.07957153787299158,
 'precision': 0.15226939970717424,
 'recall': 0.05385810460901087,
 'avg_ms_per_sent': 12.026564003849856,
 'avg_num_words_per_sent': 13.257940327237728}

In [5]:
# BERT checkpoint the author labels "meantimenews" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the demorgen2000 test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'demorgen2000'
subset = 'test' # mini, dev, test, train
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 5195/5195 [01:01<00:00, 83.82it/s]


{'mode': 'bert',
 'data': 'data/demorgen2000/test.txt',
 'f1': 0.030332681017612523,
 'precision': 0.2743362831858407,
 'recall': 0.01605385810460901,
 'avg_ms_per_sent': 11.8725697786333,
 'avg_num_words_per_sent': 13.257940327237728}

In [5]:
# BERT checkpoint the author labels "europeananews" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the demorgen2000 test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'demorgen2000'
subset = 'test' # mini, dev, test, train
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 5195/5195 [01:05<00:00, 79.15it/s]


{'mode': 'bert',
 'data': 'data/demorgen2000/test.txt',
 'f1': 0.07966651227420102,
 'precision': 0.37719298245614036,
 'recall': 0.044536509580528225,
 'avg_ms_per_sent': 12.567276227141482,
 'avg_num_words_per_sent': 13.257940327237728}

In [8]:
# stanza baseline on the demorgen2000 test split.
# The target is set explicitly so this cell no longer depends on `dataset` and
# `subset` left in the kernel by whichever cell happened to run before it.
dataset = 'demorgen2000'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'stanza')

100%|██████████| 5195/5195 [05:20<00:00, 16.19it/s]


{'mode': 'stanza',
 'data': 'data/demorgen2000/test.txt',
 'strict': {'f1': 0.8731353885495099,
  'precision': 0.8787532170431799,
  'recall': 0.8675889328063241},
 'exact': {'f1': 0.9561017189941754,
  'precision': 0.9622533600228768,
  'recall': 0.9500282326369283},
 'type': {'f1': 0.8779656201164937,
  'precision': 0.8836145267372033,
  'recall': 0.8723884810841332},
 'partial': {'f1': 0.9595112942179287,
  'precision': 0.9656848727480698,
  'recall': 0.953416149068323},
 'avg_ms_per_sent': 61.5097208854668,
 'avg_num_words_per_sent': 13.257940327237728}

In [9]:
# spaCy baseline on the demorgen2000 test split.
dataset = 'demorgen2000'
subset = 'test' 
evaluate(f'data/{dataset}/{subset}.txt', 'spacy')

100%|██████████| 5195/5195 [00:41<00:00, 126.12it/s]


{'mode': 'spacy',
 'data': 'data/demorgen2000/test.txt',
 'strict': {'f1': 0.5039596832253419,
  'precision': 0.5147058823529411,
  'recall': 0.4936530324400564},
 'exact': {'f1': 0.6928725701943845,
  'precision': 0.7076470588235294,
  'recall': 0.6787023977433004},
 'type': {'f1': 0.515766738660907,
  'precision': 0.5267647058823529,
  'recall': 0.5052186177715091},
 'partial': {'f1': 0.7058315334773217,
  'precision': 0.7208823529411764,
  'recall': 0.6913963328631876},
 'avg_ms_per_sent': 7.883926852743022,
 'avg_num_words_per_sent': 13.257940327237728}

In [10]:
# =================== wikiner, test ==================================================================
# BERT checkpoint the author labels "all" (presumably trained on all datasets — TODO confirm),
# scored on the wikiner test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'wikiner'
subset = 'test' # mini, dev, test, train
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 27648/27648 [06:04<00:00, 75.82it/s]


{'mode': 'bert',
 'data': 'data/wikiner/test.txt',
 'strict': {'f1': 0.9113751248899484,
  'precision': 0.9018736417565636,
  'recall': 0.921078941054147},
 'exact': {'f1': 0.9505485265459149,
  'precision': 0.9406386436165006,
  'recall': 0.9606694393345597},
 'type': {'f1': 0.9186162688324151,
  'precision': 0.9090392936155217,
  'recall': 0.9283971846756779},
 'partial': {'f1': 0.9548714499104749,
  'precision': 0.944916498619731,
  'recall': 0.9650383907862113},
 'avg_ms_per_sent': 13.11935763888889,
 'avg_num_words_per_sent': 18.826822916666668}

In [6]:
# BERT checkpoint the author labels "demorgen2000" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the wikiner test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'wikiner'
subset = 'test' # mini, dev, test, train
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 27648/27648 [06:05<00:00, 75.69it/s]


{'mode': 'bert',
 'data': 'data/wikiner/test.txt',
 'f1': 0.7943572267691918,
 'precision': 0.7723779854620976,
 'recall': 0.8176240156693448,
 'avg_ms_per_sent': 13.141746238425926,
 'avg_num_words_per_sent': 18.826822916666668}

In [29]:
# BERT checkpoint the author labels "wikiner" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the wikiner test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'wikiner'
subset = 'test' # mini, dev, test, train
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 27648/27648 [05:53<00:00, 78.23it/s]


{'mode': 'bert',
 'data': 'data/wikiner/test.txt',
 'f1': 0.9133919031224029,
 'precision': 0.9043728694016692,
 'recall': 0.9225926370068354,
 'avg_ms_per_sent': 12.719907407407407,
 'avg_num_words_per_sent': 18.826822916666668}

In [8]:
# BERT checkpoint the author labels "parliamentary" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the wikiner test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'wikiner'
subset = 'test' # mini, dev, test, train
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 27648/27648 [06:04<00:00, 75.83it/s]


{'mode': 'bert',
 'data': 'data/wikiner/test.txt',
 'f1': 0.05116010952586829,
 'precision': 0.09170283116346352,
 'recall': 0.03547587640404525,
 'avg_ms_per_sent': 13.11794704861111,
 'avg_num_words_per_sent': 18.826822916666668}

In [6]:
# BERT checkpoint the author labels "meantimenews" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the wikiner test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'wikiner'
subset = 'test' # mini, dev, test, train
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 27648/27648 [06:28<00:00, 71.20it/s]


{'mode': 'bert',
 'data': 'data/wikiner/test.txt',
 'f1': 0.013990621245591597,
 'precision': 0.2296437659033079,
 'recall': 0.007215093736259344,
 'avg_ms_per_sent': 13.964409722222221,
 'avg_num_words_per_sent': 18.826822916666668}

In [6]:
# BERT checkpoint the author labels "europeananews" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the wikiner test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'wikiner'
subset = 'test' # mini, dev, test, train
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 27648/27648 [06:22<00:00, 72.20it/s]


{'mode': 'bert',
 'data': 'data/wikiner/test.txt',
 'f1': 0.1988976037035187,
 'precision': 0.5961864829789358,
 'recall': 0.11935883599152576,
 'avg_ms_per_sent': 13.771158854166666,
 'avg_num_words_per_sent': 18.826822916666668}

In [11]:
# stanza baseline on the wikiner test split.
# The target is set explicitly so this cell no longer depends on `dataset` and
# `subset` left in the kernel by whichever cell happened to run before it.
dataset = 'wikiner'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'stanza')

100%|██████████| 27648/27648 [34:11<00:00, 13.47it/s]


{'mode': 'stanza',
 'data': 'data/wikiner/test.txt',
 'strict': {'f1': 0.7948412971463958,
  'precision': 0.7824656473775885,
  'recall': 0.8076147100537344},
 'exact': {'f1': 0.9122489703236968,
  'precision': 0.8980452874008128,
  'recall': 0.9269091708116098},
 'type': {'f1': 0.8083278450029981,
  'precision': 0.7957422101799884,
  'recall': 0.8213179920496994},
 'partial': {'f1': 0.9207517865743972,
  'precision': 0.9064157151151538,
  'recall': 0.9355486306705819},
 'avg_ms_per_sent': 73.92531105324075,
 'avg_num_words_per_sent': 18.826822916666668}

In [12]:
# spaCy baseline on the wikiner test split.
dataset = 'wikiner'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'spacy')

100%|██████████| 27648/27648 [03:57<00:00, 116.65it/s]


{'mode': 'spacy',
 'data': 'data/wikiner/test.txt',
 'strict': {'f1': 0.5263919940381135,
  'precision': 0.5637984902734384,
  'recall': 0.49364030270961046},
 'exact': {'f1': 0.7359948898115617,
  'precision': 0.7882961983169513,
  'recall': 0.6902018729657954},
 'type': {'f1': 0.5459597572660491,
  'precision': 0.584756778945928,
  'recall': 0.5119905752680657},
 'partial': {'f1': 0.7519535824550198,
  'precision': 0.8053889484366804,
  'recall': 0.705167628441924},
 'avg_ms_per_sent': 8.521882233796296,
 'avg_num_words_per_sent': 18.826822916666668}

In [13]:
# =================== parliamentary, test ==============================================================
# BERT checkpoint the author labels "all" (presumably trained on all datasets — TODO confirm),
# scored on the parliamentary test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'parliamentary'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 502/502 [00:05<00:00, 84.13it/s]


{'mode': 'bert',
 'data': 'data/parliamentary/test.txt',
 'strict': {'f1': 0.7308970099667774,
  'precision': 0.6875,
  'recall': 0.7801418439716312},
 'exact': {'f1': 0.8073089700996677,
  'precision': 0.759375,
  'recall': 0.8617021276595744},
 'type': {'f1': 0.7475083056478405,
  'precision': 0.703125,
  'recall': 0.7978723404255319},
 'partial': {'f1': 0.823920265780731,
  'precision': 0.775,
  'recall': 0.8794326241134752},
 'avg_ms_per_sent': 11.840637450199203,
 'avg_num_words_per_sent': 20.685258964143426}

In [7]:
# BERT checkpoint the author labels "demorgen2000" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the parliamentary test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'parliamentary'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 502/502 [00:05<00:00, 83.82it/s]


{'mode': 'bert',
 'data': 'data/parliamentary/test.txt',
 'f1': 0.6666666666666666,
 'precision': 0.6170212765957447,
 'recall': 0.725,
 'avg_ms_per_sent': 11.876494023904382,
 'avg_num_words_per_sent': 20.685258964143426}

In [32]:
# BERT checkpoint the author labels "wikiner" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the parliamentary test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'parliamentary'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 502/502 [00:05<00:00, 84.02it/s]


{'mode': 'bert',
 'data': 'data/parliamentary/test.txt',
 'f1': 0.5816993464052287,
 'precision': 0.5493827160493827,
 'recall': 0.6180555555555556,
 'avg_ms_per_sent': 11.834661354581673,
 'avg_num_words_per_sent': 20.888446215139442}

In [7]:
# BERT checkpoint the author labels "meantimenews" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the parliamentary test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'parliamentary'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 502/502 [00:06<00:00, 77.55it/s]


{'mode': 'bert',
 'data': 'data/parliamentary/test.txt',
 'f1': 0,
 'precision': 0.0,
 'recall': 0.0,
 'avg_ms_per_sent': 12.820717131474103,
 'avg_num_words_per_sent': 20.685258964143426}

In [7]:
# BERT checkpoint the author labels "europeananews" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the parliamentary test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'parliamentary'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 502/502 [00:06<00:00, 80.89it/s]


{'mode': 'bert',
 'data': 'data/parliamentary/test.txt',
 'f1': 0.11620795107033638,
 'precision': 0.40425531914893614,
 'recall': 0.06785714285714285,
 'avg_ms_per_sent': 12.294820717131474,
 'avg_num_words_per_sent': 20.685258964143426}

In [14]:
# stanza baseline on the parliamentary test split.
# The target is set explicitly so this cell no longer depends on `dataset` and
# `subset` left in the kernel by whichever cell happened to run before it.
dataset = 'parliamentary'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'stanza')

100%|██████████| 502/502 [00:38<00:00, 13.05it/s]


{'mode': 'stanza',
 'data': 'data/parliamentary/test.txt',
 'strict': {'f1': 0.6564625850340136,
  'precision': 0.6286644951140065,
  'recall': 0.6868327402135231},
 'exact': {'f1': 0.8095238095238095,
  'precision': 0.7752442996742671,
  'recall': 0.8469750889679716},
 'type': {'f1': 0.6666666666666667,
  'precision': 0.6384364820846905,
  'recall': 0.697508896797153},
 'partial': {'f1': 0.8333333333333334,
  'precision': 0.7980456026058632,
  'recall': 0.8718861209964412},
 'avg_ms_per_sent': 76.3406374501992,
 'avg_num_words_per_sent': 20.685258964143426}

In [15]:
# spaCy baseline on the parliamentary test split.
# The target is set explicitly so this cell no longer depends on `dataset` and
# `subset` left in the kernel by whichever cell happened to run before it.
dataset = 'parliamentary'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'spacy')

100%|██████████| 502/502 [00:04<00:00, 122.18it/s]


{'mode': 'spacy',
 'data': 'data/parliamentary/test.txt',
 'strict': {'f1': 0.3493761140819964,
  'precision': 0.35125448028673834,
  'recall': 0.3475177304964539},
 'exact': {'f1': 0.5490196078431373,
  'precision': 0.5519713261648745,
  'recall': 0.5460992907801419},
 'type': {'f1': 0.35650623885918004,
  'precision': 0.35842293906810035,
  'recall': 0.3546099290780142},
 'partial': {'f1': 0.5668449197860964,
  'precision': 0.5698924731182796,
  'recall': 0.5638297872340425},
 'avg_ms_per_sent': 8.121513944223107,
 'avg_num_words_per_sent': 20.685258964143426}

In [16]:
# =================== meantimenews, test ==============================================================
# BERT checkpoint the author labels "all" (presumably trained on all datasets — TODO confirm),
# scored on the meantimenews test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'meantimenews'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 534/534 [00:06<00:00, 83.75it/s]


{'mode': 'bert',
 'data': 'data/meantimenews/test.txt',
 'strict': {'f1': 0.3508178228990412,
  'precision': 0.3478747203579418,
  'recall': 0.35381114903299204},
 'exact': {'f1': 0.3666102650874224,
  'precision': 0.3635346756152125,
  'recall': 0.36973833902161546},
 'type': {'f1': 0.4568527918781726,
  'precision': 0.45302013422818793,
  'recall': 0.46075085324232085},
 'partial': {'f1': 0.4247038917089679,
  'precision': 0.4211409395973154,
  'recall': 0.4283276450511945},
 'avg_ms_per_sent': 11.887640449438202,
 'avg_num_words_per_sent': 21.96067415730337}

In [8]:
# BERT checkpoint the author labels "demorgen2000" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the meantimenews test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'meantimenews'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 534/534 [00:06<00:00, 83.17it/s]


{'mode': 'bert',
 'data': 'data/meantimenews/test.txt',
 'f1': 0.24343163538873994,
 'precision': 0.2292929292929293,
 'recall': 0.25942857142857145,
 'avg_ms_per_sent': 11.962546816479401,
 'avg_num_words_per_sent': 21.96067415730337}

In [35]:
# BERT checkpoint the author labels "wikiner" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the meantimenews test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'meantimenews'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 534/534 [00:06<00:00, 82.92it/s]


{'mode': 'bert',
 'data': 'data/meantimenews/test.txt',
 'f1': 0.2543103448275862,
 'precision': 0.24057084607543322,
 'recall': 0.26971428571428574,
 'avg_ms_per_sent': 12.00374531835206,
 'avg_num_words_per_sent': 21.96067415730337}

In [9]:
# BERT checkpoint the author labels "parliamentary" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the meantimenews test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'meantimenews'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 534/534 [00:06<00:00, 82.83it/s]


{'mode': 'bert',
 'data': 'data/meantimenews/test.txt',
 'f1': 0.02009273570324575,
 'precision': 0.031026252983293555,
 'recall': 0.014857142857142857,
 'avg_ms_per_sent': 12.014981273408239,
 'avg_num_words_per_sent': 21.96067415730337}

In [8]:
# BERT checkpoint the author labels "europeananews" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the meantimenews test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'meantimenews'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 534/534 [00:06<00:00, 80.84it/s]


{'mode': 'bert',
 'data': 'data/meantimenews/test.txt',
 'f1': 0.010672358591248666,
 'precision': 0.08064516129032258,
 'recall': 0.005714285714285714,
 'avg_ms_per_sent': 12.320224719101123,
 'avg_num_words_per_sent': 21.96067415730337}

In [17]:
# stanza baseline on the meantimenews test split.
# The target is set explicitly so this cell no longer depends on `dataset` and
# `subset` left in the kernel by whichever cell happened to run before it.
dataset = 'meantimenews'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'stanza')

100%|██████████| 534/534 [00:43<00:00, 12.35it/s]


{'mode': 'stanza',
 'data': 'data/meantimenews/test.txt',
 'strict': {'f1': 0.21269669017905588,
  'precision': 0.20310880829015543,
  'recall': 0.22323462414578588},
 'exact': {'f1': 0.26044492674986436,
  'precision': 0.24870466321243523,
  'recall': 0.2733485193621868},
 'type': {'f1': 0.29734129137276183,
  'precision': 0.2839378238341969,
  'recall': 0.3120728929384966},
 'partial': {'f1': 0.31795984807379274,
  'precision': 0.30362694300518134,
  'recall': 0.3337129840546697},
 'avg_ms_per_sent': 80.66666666666667,
 'avg_num_words_per_sent': 21.96067415730337}

In [18]:
# spaCy baseline on the meantimenews test split.
# The target is set explicitly so this cell no longer depends on `dataset` and
# `subset` left in the kernel by whichever cell happened to run before it.
dataset = 'meantimenews'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'spacy')

100%|██████████| 534/534 [00:04<00:00, 114.06it/s]


{'mode': 'spacy',
 'data': 'data/meantimenews/test.txt',
 'strict': {'f1': 0.07129686539643515,
  'precision': 0.07733333333333334,
  'recall': 0.0661345496009122},
 'exact': {'f1': 0.14382298709280886,
  'precision': 0.156,
  'recall': 0.13340935005701254},
 'type': {'f1': 0.11677934849416104,
  'precision': 0.12666666666666668,
  'recall': 0.10832383124287344},
 'partial': {'f1': 0.1917639827904118,
  'precision': 0.208,
  'recall': 0.17787913340935005},
 'avg_ms_per_sent': 8.709737827715356,
 'avg_num_words_per_sent': 21.96067415730337}

In [19]:
# =================== europeananews, test ==============================================================
# BERT checkpoint the author labels "all" (presumably trained on all datasets — TODO confirm),
# scored on the europeananews test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'europeananews'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 602/602 [00:08<00:00, 69.54it/s]


{'mode': 'bert',
 'data': 'data/europeananews/test.txt',
 'strict': {'f1': 0.5812200137080192,
  'precision': 0.5832187070151307,
  'recall': 0.5792349726775956},
 'exact': {'f1': 0.6004112405757368,
  'precision': 0.6024759284731774,
  'recall': 0.5983606557377049},
 'type': {'f1': 0.6237148732008225,
  'precision': 0.6258596973865199,
  'recall': 0.6215846994535519},
 'partial': {'f1': 0.6223440712816998,
  'precision': 0.624484181568088,
  'recall': 0.6202185792349727},
 'avg_ms_per_sent': 14.317275747508306,
 'avg_num_words_per_sent': 100.7375415282392}

In [9]:
# BERT checkpoint the author labels "demorgen2000" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the europeananews test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'europeananews'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 602/602 [00:09<00:00, 66.84it/s]


{'mode': 'bert',
 'data': 'data/europeananews/test.txt',
 'f1': 0.36254809747755457,
 'precision': 0.263681592039801,
 'recall': 0.5800273597811217,
 'avg_ms_per_sent': 14.890365448504983,
 'avg_num_words_per_sent': 100.7375415282392}

In [38]:
# BERT checkpoint the author labels "wikiner" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the europeananews test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'europeananews'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 602/602 [00:08<00:00, 68.44it/s]


{'mode': 'bert',
 'data': 'data/europeananews/test.txt',
 'f1': 0.3360723089564503,
 'precision': 0.2401644157369348,
 'recall': 0.5595075239398085,
 'avg_ms_per_sent': 14.544850498338871,
 'avg_num_words_per_sent': 100.7375415282392}

In [10]:
# BERT checkpoint the author labels "parliamentary" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the europeananews test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'europeananews'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 602/602 [00:09<00:00, 66.83it/s]


{'mode': 'bert',
 'data': 'data/europeananews/test.txt',
 'f1': 0.05328135152696556,
 'precision': 0.050742574257425746,
 'recall': 0.0560875512995896,
 'avg_ms_per_sent': 14.87375415282392,
 'avg_num_words_per_sent': 100.7375415282392}

In [8]:
# BERT checkpoint the author labels "meantimenews" (presumably fine-tuned on that dataset
# only — TODO confirm), scored on the europeananews test split.
# NOTE(review): the checkpoint is not chosen here; 'bert' mode uses the global `bert_model`
# loaded from BERT_MODEL_PATH in the setup cell — verify the intended one is loaded.
dataset = 'europeananews'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'bert')

100%|██████████| 602/602 [00:09<00:00, 63.58it/s]


{'mode': 'bert',
 'data': 'data/europeananews/test.txt',
 'f1': 0,
 'precision': 0.0,
 'recall': 0.0,
 'avg_ms_per_sent': 15.654485049833887,
 'avg_num_words_per_sent': 100.7375415282392}

In [20]:
# stanza baseline on the europeananews test split.
# The target is set explicitly so this cell no longer depends on `dataset` and
# `subset` left in the kernel by whichever cell happened to run before it.
dataset = 'europeananews'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'stanza')

100%|██████████| 602/602 [01:26<00:00,  6.96it/s]


{'mode': 'stanza',
 'data': 'data/europeananews/test.txt',
 'strict': {'f1': 0.056329932461560574,
  'precision': 0.046445497630331754,
  'recall': 0.07155896312522819},
 'exact': {'f1': 0.07989653685874407,
  'precision': 0.06587677725118483,
  'recall': 0.10149689667761957},
 'type': {'f1': 0.07730995832734588,
  'precision': 0.06374407582938389,
  'recall': 0.0982110259218693},
 'partial': {'f1': 0.09484121281793362,
  'precision': 0.07819905213270142,
  'recall': 0.12048192771084337},
 'avg_ms_per_sent': 143.45348837209303,
 'avg_num_words_per_sent': 100.7375415282392}

In [21]:
# spaCy baseline on the europeananews test split.
# The target is set explicitly so this cell no longer depends on `dataset` and
# `subset` left in the kernel by whichever cell happened to run before it.
dataset = 'europeananews'
subset = 'test'
evaluate(f'data/{dataset}/{subset}.txt', 'spacy')

100%|██████████| 602/602 [00:20<00:00, 29.42it/s]


{'mode': 'spacy',
 'data': 'data/europeananews/test.txt',
 'strict': {'f1': 0.03851444291609354,
  'precision': 0.030911901081916538,
  'recall': 0.05107624954396206},
 'exact': {'f1': 0.06932599724896837,
  'precision': 0.05564142194744977,
  'recall': 0.09193724917913171},
 'type': {'f1': 0.055020632737276476,
  'precision': 0.044159858688452194,
  'recall': 0.07296607077708865},
 'partial': {'f1': 0.07991746905089409,
  'precision': 0.06414219474497682,
  'recall': 0.10598321780372127},
 'avg_ms_per_sent': 33.895348837209305,
 'avg_num_words_per_sent': 100.7375415282392}

In [28]:
#++++++++++++++++++++++++ write the evaluation results into a csv file +++++++++++++++++++++++++++++++++
def write_evaluation_to_csv(output_path='evaluation.csv', data_subset='test'):
    """Evaluate every (dataset, model) combination and write one row per run.

    Each run calls ``evaluate('data/<dataset>/<data_subset>.txt', <model>)``.
    The file keeps the existing layout: fields separated by ``', '``, a header
    line, then one row per run with ``experiment_id, model_name, dataset``
    followed by f1/precision/recall for each scoring mode
    (strict, exact, type, partial), first overall and then per entity type
    (PER, LOC, ORG, MISC).

    Args:
        output_path: file to (over)write; defaults to the previous hard-coded
            'evaluation.csv'.
        data_subset: which split file to evaluate; defaults to 'test'.

    Returns:
        None. The result is the file written at ``output_path``.
    """
    ner_models = ['bert', 'stanza', 'spacy']
    datasets = ['demorgen2000', 'europeananews', 'meantimenews', 'parliamentary', 'wikiner']
    entity_types = ['', 'PER', 'LOC', 'ORG', 'MISC']  # '' means overall scores
    score_modes = ['strict', 'exact', 'type', 'partial']
    score_metrics = ['f1', 'precision', 'recall']

    # One (entity_type, score_mode, score_metric) triple per score column, in
    # file order; shared by the header and every row so they cannot drift apart.
    score_columns = [
        (entity_type, score_mode, score_metric)
        for entity_type in entity_types
        for score_mode in score_modes
        for score_metric in score_metrics
    ]

    header = ['experiment_id', 'model_name', 'dataset']
    for entity_type, score_mode, score_metric in score_columns:
        col = score_metric + '_' + score_mode
        if entity_type != '':
            col += '_' + entity_type
        header.append(col)

    # `with` guarantees the file is closed (and finished rows are kept) even if
    # one of the long-running evaluations raises part-way through.
    with open(output_path, 'w') as output:
        output.write(', '.join(header) + '\n')
        count = 0
        for dataset in datasets:
            for ner_model in ner_models:
                count += 1
                result = evaluate(f'data/{dataset}/{data_subset}.txt', ner_model)
                row = [str(count), ner_model, dataset]
                for entity_type, score_mode, score_metric in score_columns:
                    # The evaluation result stores the "type" mode under 'ent_type'.
                    _score_mode = 'ent_type' if score_mode == 'type' else score_mode
                    if entity_type == '':
                        score = result[_score_mode][score_metric]
                    else:
                        score = result[entity_type][_score_mode][score_metric]
                    row.append(str(score))
                output.write(', '.join(row) + '\n')
                # Each run can take minutes; push the finished row to disk now.
                output.flush()
    
    
# Runs all 15 (dataset, model) evaluations and writes them to evaluation.csv.
# This is slow: the progress bars below show a single run taking up to 40 minutes.
write_evaluation_to_csv()

100%|██████████| 5195/5195 [01:01<00:00, 84.79it/s]
100%|██████████| 5195/5195 [05:21<00:00, 16.16it/s]
100%|██████████| 5195/5195 [00:42<00:00, 122.85it/s]
100%|██████████| 602/602 [00:08<00:00, 68.80it/s]
100%|██████████| 602/602 [01:30<00:00,  6.63it/s]
100%|██████████| 602/602 [00:24<00:00, 24.10it/s]
100%|██████████| 534/534 [00:06<00:00, 78.95it/s]
100%|██████████| 534/534 [00:46<00:00, 11.61it/s]
100%|██████████| 534/534 [00:05<00:00, 97.38it/s] 
100%|██████████| 502/502 [00:06<00:00, 78.79it/s]
100%|██████████| 502/502 [00:41<00:00, 12.17it/s]
100%|██████████| 502/502 [00:04<00:00, 104.93it/s]
100%|██████████| 27648/27648 [06:05<00:00, 75.54it/s]
100%|██████████| 27648/27648 [40:00<00:00, 11.52it/s]
100%|██████████| 27648/27648 [04:59<00:00, 92.46it/s] 
