In [1]:
import ast
import pandas as pd

from itertools import chain

from transformers import pipeline

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Load Test Dataset

In [2]:
data_path = "../data/fine-tuning/"
test_df = pd.read_csv(data_path + "mwb-texts_NER_test.tsv", sep="\t", converters={0: ast.literal_eval, 1: ast.literal_eval})
test_df

Unnamed: 0,tokens,labels
0,"[sîn, ungevüegez, rîs, in, der, hende, als, ei...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[hiez, daz, aller, valsch, an, im, verswant,, ...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, ..."
2,"[het, aldâ, genomn, der, stolze, küene, Wâleis...","[O, O, O, O, O, O, B, B, O, B, O, B, B, B, O, ..."
3,"[erslagn., nu, vergebt, im, sîne, schulde, dur...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[umbe, begunder, gâhen,, des, küneges, vanen, ...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, ..."
...,...,...
595,"[guoten, ritter, zimet,, swenn, er, den, schil...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
596,"[der, brust, bewart,, so, ist, werder, prîs, d...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
597,"[ein, herre, genant, alsus,, der, künec, Iels,...","[O, O, O, O, O, O, B, O, B, O, B, O, O, B, B, ..."
598,"[vor, im, her, gesant,, sît, ichz, lebende, im...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [3]:
y_true = list(test_df["labels"].explode())

# Evaluate Majority Baseline

In [4]:
majority_test_df = test_df[["tokens"]].copy()
majority_test_df["labels"] = majority_test_df["tokens"].apply(lambda x: ["O" for token in x])

majority_test_df

Unnamed: 0,tokens,labels
0,"[sîn, ungevüegez, rîs, in, der, hende, als, ei...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[hiez, daz, aller, valsch, an, im, verswant,, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[het, aldâ, genomn, der, stolze, küene, Wâleis...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[erslagn., nu, vergebt, im, sîne, schulde, dur...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[umbe, begunder, gâhen,, des, küneges, vanen, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...
595,"[guoten, ritter, zimet,, swenn, er, den, schil...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
596,"[der, brust, bewart,, so, ist, werder, prîs, d...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
597,"[ein, herre, genant, alsus,, der, künec, Iels,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
598,"[vor, im, her, gesant,, sît, ichz, lebende, im...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [5]:
y_majority = list(majority_test_df["labels"].explode()) #list(chain(*y_majority)) # flatten list

{
    "precision": precision_score(y_true, y_majority, pos_label="B"),
    "recall": recall_score(y_true, y_majority, pos_label="B"),
    "f1_micro": f1_score(y_true, y_majority, average="micro"),
    "f1_macro": f1_score(y_true, y_majority, average="macro"),
    "accuracy": accuracy_score(y_true, y_majority)  
}

  _warn_prf(average, modifier, msg_start, len(result))


{'precision': 0.0,
 'recall': 0.0,
 'f1_micro': 0.9297507540879505,
 'f1_macro': 0.48179836288100036,
 'accuracy': 0.9297507540879505}

# Evaluate Fine-tuned gbert

In [6]:
def get_aligned_labels(tokens, y_predicted):
    # get start/end indices
    start_end_data = [(y["start"], y["end"]) for y in y_predicted]
    
    # build labels
    labels = []
    i = 0
    start = 0
    end = 0
    for word in tokens:
        end = start + len(word)
        label = "O"
        for j, entity in enumerate(start_end_data[i:]):
            if start <= entity[0] and end >= entity[1]:
                i += j
                label = "B"
                break
        labels.append(label)
        start = end + 1
    return labels

In [7]:
model_path = "../models/gbert-base-mwb-NER"
gbert = pipeline("token-classification", model=model_path)

In [8]:
#finetuned_gbert_test_df = test_df[["tokens"]].copy()
#finetuned_gbert_test_df["labels"] = finetuned_gbert_test_df["tokens"].apply(lambda x: get_aligned_labels(x, gbert(" ".join(x), aggregation_strategy="first")))

finetuned_gbert_test_df = pd.read_csv("../data/predictions/mwb-texts_NER_test_gbert.tsv", sep="\t", converters={0: ast.literal_eval, 1: ast.literal_eval})
finetuned_gbert_test_df

Unnamed: 0,tokens,labels
0,"[sîn, ungevüegez, rîs, in, der, hende, als, ei...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[hiez, daz, aller, valsch, an, im, verswant,, ...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, ..."
2,"[het, aldâ, genomn, der, stolze, küene, Wâleis...","[O, O, O, O, O, O, B, B, O, B, O, B, B, B, O, ..."
3,"[erslagn., nu, vergebt, im, sîne, schulde, dur...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[umbe, begunder, gâhen,, des, küneges, vanen, ...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, ..."
...,...,...
595,"[guoten, ritter, zimet,, swenn, er, den, schil...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
596,"[der, brust, bewart,, so, ist, werder, prîs, d...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
597,"[ein, herre, genant, alsus,, der, künec, Iels,...","[O, O, O, B, O, O, B, O, B, O, B, O, O, B, B, ..."
598,"[vor, im, her, gesant,, sît, ichz, lebende, im...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [9]:
#finetuned_gbert_test_df.to_csv("../data/predictions/mwb-texts_NER_test_gbert.tsv", sep="\t", index=False)

In [10]:
y_gbert = list(finetuned_gbert_test_df["labels"].explode()) #list(chain(*y_gbert)) # flatten list

{
    "precision": precision_score(y_true, y_gbert, pos_label="B"),
    "recall": recall_score(y_true, y_gbert, pos_label="B"),
    "f1_micro": f1_score(y_true, y_gbert, average="micro"),
    "f1_macro": f1_score(y_true, y_gbert, average="macro"),
    "accuracy": accuracy_score(y_true, y_gbert)  
}

{'precision': 0.9559819413092551,
 'recall': 0.9570621468926553,
 'f1_micro': 0.9938879187172567,
 'f1_macro': 0.9766173263421651,
 'accuracy': 0.9938879187172567}

# Evaluate Fine-tuned christinbeck/GHisBERT

In [11]:
model_path = "../models/ghisbert-mwb-NER"
ghisbert = pipeline("token-classification", model=model_path)

In [12]:
#finetuned_ghisbert_test_df = test_df[["tokens"]].copy()
#finetuned_ghisbert_test_df["labels"] = finetuned_ghisbert_test_df["tokens"].apply(lambda x: get_aligned_labels(x, ghisbert(" ".join(x), aggregation_strategy="first")))

finetuned_ghisbert_test_df = pd.read_csv("../data/predictions/mwb-texts_NER_test_ghisbert.tsv", sep="\t", converters={0: ast.literal_eval, 1: ast.literal_eval})
finetuned_ghisbert_test_df

Unnamed: 0,tokens,labels
0,"[sîn, ungevüegez, rîs, in, der, hende, als, ei...","[O, O, O, O, O, O, O, O, O, O, O, O, B, O, O, ..."
1,"[hiez, daz, aller, valsch, an, im, verswant,, ...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, ..."
2,"[het, aldâ, genomn, der, stolze, küene, Wâleis...","[O, O, O, O, O, O, B, B, O, B, O, B, B, B, O, ..."
3,"[erslagn., nu, vergebt, im, sîne, schulde, dur...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[umbe, begunder, gâhen,, des, küneges, vanen, ...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, ..."
...,...,...
595,"[guoten, ritter, zimet,, swenn, er, den, schil...","[O, O, O, O, O, O, O, O, O, B, O, O, O, O, O, ..."
596,"[der, brust, bewart,, so, ist, werder, prîs, d...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
597,"[ein, herre, genant, alsus,, der, künec, Iels,...","[O, O, O, O, O, O, B, O, B, O, B, O, O, O, B, ..."
598,"[vor, im, her, gesant,, sît, ichz, lebende, im...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [13]:
#finetuned_ghisbert_test_df.to_csv("../data/predictions/mwb-texts_NER_test_ghisbert.tsv", sep="\t", index=False)

In [14]:
y_ghisbert = list(finetuned_ghisbert_test_df["labels"].explode()) #list(chain(*y_ghisbert)) # flatten list

{
    "precision": precision_score(y_true, y_ghisbert, pos_label="B"),
    "recall": recall_score(y_true, y_ghisbert, pos_label="B"),
    "f1_micro": f1_score(y_true, y_ghisbert, average="micro"),
    "f1_macro": f1_score(y_true, y_ghisbert, average="macro"),
    "accuracy": accuracy_score(y_true, y_ghisbert)  
}

{'precision': 0.8067581837381204,
 'recall': 0.8632768361581921,
 'f1_micro': 0.9758691855850135,
 'f1_macro': 0.9105248323662948,
 'accuracy': 0.9758691855850135}

# Evaluate TreeTagger

In [15]:
import treetaggerwrapper

def get_tt_label(tagger, tokens):
    output = tagger.tag_text([f"<w>{x}</w>" for x in tokens])
    labels = []
    
    is_content = False
    was_punct = False
    for i in range(len(output)):
        if output[i] == "<w>":
            is_content = True
            continue
        if not is_content:
            continue
            
        if output[i] == "</w>" and was_punct:
            labels.append("O")
            was_punct = False
            continue
        
        pos = output[i].split("\t")[1] # MEMO: output is a list with {token}\t{POS}\t{lemma}
        if pos == "PUNCT":
            was_punct = True
            continue
        if pos == "PROPN":
            labels.append("B")
        else:
            labels.append("O")
        is_content = False
    return labels

# MEMO: paths need to be adapted!
tagger = treetaggerwrapper.TreeTagger(TAGDIR='../lib/tree-tagger/tree-tagger-linux-3.2.4', TAGPARFILE='../lib/tree-tagger/middle-high-german.par', TAGOPT="-token -lemma -sgml")

  punct2find_re = re.compile("([^ ])([[" + ALONEMARKS + "])",
  DnsHostMatch_re = re.compile("(" + DnsHost_expression + ")",
  UrlMatch_re = re.compile(UrlMatch_expression, re.VERBOSE | re.IGNORECASE)
  EmailMatch_re = re.compile(EmailMatch_expression, re.VERBOSE | re.IGNORECASE)


In [16]:
#tt_test_df = test_df[["tokens"]].copy()
#tt_test_df["labels"] = tt_test_df["tokens"].apply(lambda x: get_tt_label(tagger, x))
tt_test_df = pd.read_csv("../data/predictions/mwb-texts_NER_test_tt.tsv", sep="\t", converters={0: ast.literal_eval, 1: ast.literal_eval})
tt_test_df

Unnamed: 0,tokens,labels
0,"[sîn, ungevüegez, rîs, in, der, hende, als, ei...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[hiez, daz, aller, valsch, an, im, verswant,, ...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, ..."
2,"[het, aldâ, genomn, der, stolze, küene, Wâleis...","[O, O, O, O, O, O, B, B, O, B, O, B, O, B, O, ..."
3,"[erslagn., nu, vergebt, im, sîne, schulde, dur...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[umbe, begunder, gâhen,, des, küneges, vanen, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...
595,"[guoten, ritter, zimet,, swenn, er, den, schil...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
596,"[der, brust, bewart,, so, ist, werder, prîs, d...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
597,"[ein, herre, genant, alsus,, der, künec, Iels,...","[O, O, O, O, O, O, B, O, B, O, O, O, O, B, B, ..."
598,"[vor, im, her, gesant,, sît, ichz, lebende, im...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [17]:
#tt_test_df.to_csv("../data/predictions/mwb-texts_NER_test_tt.tsv", sep="\t", index=False)

In [18]:
y_tt = list(tt_test_df["labels"].explode())

{
    "precision": precision_score(y_true, y_tt, pos_label="B"),
    "recall": recall_score(y_true, y_tt, pos_label="B"),
    "f1_micro": f1_score(y_true, y_tt, average="micro"),
    "f1_macro": f1_score(y_true, y_tt, average="macro"),
    "accuracy": accuracy_score(y_true, y_tt)  
}

{'precision': 0.9622377622377623,
 'recall': 0.7774011299435029,
 'f1_micro': 0.9822193999047468,
 'f1_macro': 0.925253432785218,
 'accuracy': 0.9822193999047468}

# Evaluate off-the-shelf NER model

In [19]:
import stanza

def get_stanza_entities(stanza_output):
    entities = []
    for entity in stanza_output.ents:
        if entity.type in ["PER", "LOC"]:
            entities.append({"word": entity.text, "start": entity.start_char, "end": entity.end_char})
    return entities

stanza = stanza.Pipeline(lang='de', processors='tokenize,ner')

2024-06-03 14:50:16 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-06-03 14:50:16 INFO: Downloaded file to /home/pb/stanza_resources/resources.json
2024-06-03 14:50:17 INFO: Loading these models for language: de (German):
| Processor | Package      |
----------------------------
| tokenize  | gsd          |
| mwt       | gsd          |
| ner       | germeval2014 |

2024-06-03 14:50:17 INFO: Using device: cpu
2024-06-03 14:50:17 INFO: Loading: tokenize
2024-06-03 14:50:17 INFO: Loading: mwt
2024-06-03 14:50:17 INFO: Loading: ner
2024-06-03 14:50:18 INFO: Done loading processors!


In [20]:
#stanza_test_df = test_df[["tokens"]].copy()
#stanza_test_df["labels"] = stanza_test_df["tokens"].apply(lambda x: get_aligned_labels(x, get_stanza_entities(stanza(" ".join(x)))))

stanza_test_df = pd.read_csv("../data/predictions/mwb-texts_NER_test_stanza.tsv", sep="\t", converters={0: ast.literal_eval, 1: ast.literal_eval})
stanza_test_df

Unnamed: 0,tokens,labels
0,"[sîn, ungevüegez, rîs, in, der, hende, als, ei...","[O, O, O, O, O, O, O, O, O, O, B, O, B, O, B, ..."
1,"[hiez, daz, aller, valsch, an, im, verswant,, ...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, ..."
2,"[het, aldâ, genomn, der, stolze, küene, Wâleis...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[erslagn., nu, vergebt, im, sîne, schulde, dur...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[umbe, begunder, gâhen,, des, küneges, vanen, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...
595,"[guoten, ritter, zimet,, swenn, er, den, schil...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
596,"[der, brust, bewart,, so, ist, werder, prîs, d...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
597,"[ein, herre, genant, alsus,, der, künec, Iels,...","[O, O, O, O, O, O, O, O, B, O, O, O, O, O, B, ..."
598,"[vor, im, her, gesant,, sît, ichz, lebende, im...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [21]:
#stanza_test_df.to_csv("../data/predictions/mwb-texts_NER_test_stanza.tsv", sep="\t", index=False)

In [22]:
y_stanza = list(stanza_test_df["labels"].explode())

{
    "precision": precision_score(y_true, y_stanza, pos_label="B"),
    "recall": recall_score(y_true, y_stanza, pos_label="B"),
    "f1_micro": f1_score(y_true, y_stanza, average="micro"),
    "f1_macro": f1_score(y_true, y_stanza, average="macro"),
    "accuracy": accuracy_score(y_true, y_stanza)  
}

{'precision': 0.868020304568528,
 'recall': 0.19322033898305085,
 'f1_micro': 0.9412605175424671,
 'f1_macro': 0.6426968817402612,
 'accuracy': 0.9412605175424671}

# Evaluate String Matching

In [23]:
import re

def remove_punctuation(token):
    punct_pattern = r"\[\]_/()?!.,:;><›‹»«“”„‘’‚‟‛'" + '"'
    return re.sub(f"^[{punct_pattern}]*?(.+?)[{punct_pattern}]*$", "\\1", token)

names = list(pd.read_csv("../models/names-mwb.tsv", sep="\t", header=0, names=["name"])["name"])
len(names)

4974

In [24]:
type_list_test_df = test_df[["tokens"]].copy()
type_list_test_df["labels"] = type_list_test_df["tokens"].apply(lambda x: ["B" if remove_punctuation(token) in names else "O" for token in x])
type_list_test_df

Unnamed: 0,tokens,labels
0,"[sîn, ungevüegez, rîs, in, der, hende, als, ei...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[hiez, daz, aller, valsch, an, im, verswant,, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[het, aldâ, genomn, der, stolze, küene, Wâleis...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[erslagn., nu, vergebt, im, sîne, schulde, dur...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[umbe, begunder, gâhen,, des, küneges, vanen, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...
595,"[guoten, ritter, zimet,, swenn, er, den, schil...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
596,"[der, brust, bewart,, so, ist, werder, prîs, d...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
597,"[ein, herre, genant, alsus,, der, künec, Iels,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B, O, ..."
598,"[vor, im, her, gesant,, sît, ichz, lebende, im...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [25]:
y_type_list = list(type_list_test_df["labels"].explode())

{
    "precision": precision_score(y_true, y_type_list, pos_label="B"),
    "recall": recall_score(y_true, y_type_list, pos_label="B"),
    "f1_micro": f1_score(y_true, y_type_list, average="micro"),
    "f1_macro": f1_score(y_true, y_type_list, average="macro"),
    "accuracy": accuracy_score(y_true, y_type_list)  
}

{'precision': 0.9956709956709957,
 'recall': 0.519774011299435,
 'f1_micro': 0.9661057310684236,
 'f1_macro': 0.832547471481593,
 'accuracy': 0.9661057310684236}