In [1]:
import ast
import pandas as pd

from datasets import ClassLabel, Dataset, Features, Sequence, Value



# Load Data

In [2]:
# Directory holding the fine-tuning data, relative to the current working
# directory. The trailing "/" is required: file names are appended to this
# value by plain string concatenation.
data_path = "../data/fine-tuning/"

In [3]:
# Load the gold-standard samples. The `tokens` and `labels` columns are stored
# as Python list literals, so they are parsed with ast.literal_eval (which only
# evaluates literals, never arbitrary code).
# The converters are keyed by column name rather than by position, so the right
# columns are parsed even if the column order of the file changes.
df = pd.read_csv(
    data_path + "mwb-texts_NER_goldstandard.tsv",
    sep="\t",
    converters={"tokens": ast.literal_eval, "labels": ast.literal_eval},
)
df

Unnamed: 0,source,n,tokens,labels
0,Wh,57329,"[durh, den, markys., dâ, wart, unverdrozzen, d...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,UvZLanz,46148,"[und, er, sich, ûf, ein, puneiz, mit, rîchem, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,UvZLanz,20409,"[trûrikheit, vergaʒ., den, walt, nante, man, d...","[O, O, O, O, O, O, O, O, B, O, O, O, O, O,..."
3,Wh,41911,"[krône, bî, der, zît, truoc:, daz, was, gar, â...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,Wig,54091,"[truoc, den, ir, grôziu, schœne, sluoc., owê, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...,...
2995,RvEBarl,76048,"[im, sîne, man, lange, weinende, dan., sie, sp...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O,..."
2996,RvEBarl,79897,"[morgen, anehuop,, bî, dem, hol, vil, nâhen, g...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O,..."
2997,Wh,36607,"[marhcrâve, ir, dô, sagete:, ‘dâ, kumet, der, ...","[O, O, O, O, O, O, O, O, O, O, B, O, B..."
2998,Wig,48137,"[enpfâhen, von, des, heldes, hant, der, den, h...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O,..."


# Build Hugging Face Dataset

In [4]:
labels = ["O", "B"]

# Schema: a sequence of string tokens paired with a sequence of class labels
# whose names come from `labels`.
ner_features = Features(
    {
        "tokens": Sequence(Value("string")),
        "labels": Sequence(ClassLabel(names=labels)),
    }
)

# Only the two modelling columns are passed on to the dataset.
dataset = Dataset.from_pandas(df[["tokens", "labels"]], features=ner_features)
dataset

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 3000
})

# Apply Train/Test Split

In [5]:
# Tag inventory; a tag's position in this list is its integer id.
label_names = ['O', 'B']

# Both directions of the id <-> tag mapping, derived from the same list.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [6]:
# Shuffle and hold out 20% of the samples as the test split; the fixed seed
# keeps the split reproducible.
# NOTE: `dataset` is rebound here from the full dataset to the split result
# (with "train" and "test" entries), so re-running this cell would operate on
# the split result rather than on the full dataset.
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=0)
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 2400
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 600
    })
})

# Inspect and Store Train Dataset

In [7]:
train_df = dataset["train"].to_pandas()

# Convert each row's token sequence to a plain Python list directly.
# Joining on ", " and splitting again (the previous approach) would re-segment
# any token that itself contains ", " and would turn an empty sequence into [""].
train_df["tokens"] = train_df["tokens"].map(list)

# Decode the integer class ids back into their string tags.
train_df["labels"] = train_df["labels"].map(lambda ids: [id2label[i] for i in ids])
train_df

Unnamed: 0,tokens,labels
0,"[sulen, ouch, Glorîône, und, dem, stolzen, Fau...","[O, O, B, O, O, O, B, O, O, O, B, ..."
1,"[niender, funde, enkeinen, degen, sô, stæte,, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-P..."
2,"[wartmannes, mâl., nû, sage, mir,, helt,, al, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[uns, danne, daz, dehein, getwerc, enwære, noc...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O,..."
4,"[elliu, chunne, hie, in, erde, joch, in, himel...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...
2395,"[küneginne, schiet,, sô, daz, si, tâten, klage...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O,..."
2396,"[wir, noch, hiute, werben, alsô, daz, vor, uns...","[O, O, O, O, O, O, O, O, O, B, O, O, O, O,..."
2397,"[Postefar, von, Laudundrehte,, und, den, herzo...","[B, O, B, O, O, O, B, O, B, O,..."
2398,"[zuo, geriten,, si, heten, Franzoyse, überstri...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O,..."


In [8]:
# Export is disabled: the call below is commented out, so running this cell
# writes nothing. Uncomment it to store the training split as a TSV file.
#train_df.to_csv(data_path + "mwb-texts_NER_train.tsv", sep="\t", index=False)

In [10]:
# Provenance table for the whole corpus. The dataset built above only carries
# `tokens` and `labels`, so `source` and `n` are recovered by matching on the
# space-joined text of each sample.
df_concatenated = df[["source", "n", "tokens"]].assign(
    tokens=lambda frame: frame["tokens"].str.join(" ")
)

# Keep one provenance row per distinct text. Without this, a text occurring
# more than once would fan out into extra rows in a join, and re-aligning the
# result by position would pair later samples with the wrong source.
# If a text is duplicated, its first occurrence wins.
train_provenance = df_concatenated.drop_duplicates(subset="tokens").set_index("tokens")

# Keyed lookup instead of join + positional re-alignment: row order and index
# of train_df are left untouched. Texts without a match yield NaN.
train_text = train_df["tokens"].str.join(" ")
train_df_concatenated = train_df.assign(
    source=train_text.map(train_provenance["source"]),
    n=train_text.map(train_provenance["n"]),
)
train_df_concatenated

Unnamed: 0,tokens,labels,source,n
0,"[sulen, ouch, Glorîône, und, dem, stolzen, Fau...","[O, O, B, O, O, O, B, O, O, O, B, ...",Wh,3885
1,"[niender, funde, enkeinen, degen, sô, stæte,, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-P...",UvZLanz,11669
2,"[wartmannes, mâl., nû, sage, mir,, helt,, al, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Wh,52531
3,"[uns, danne, daz, dehein, getwerc, enwære, noc...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O,...",Er,10377
4,"[elliu, chunne, hie, in, erde, joch, in, himel...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",VEzzo,1506
...,...,...,...,...
2395,"[küneginne, schiet,, sô, daz, si, tâten, klage...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O,...",Parz,126436
2396,"[wir, noch, hiute, werben, alsô, daz, vor, uns...","[O, O, O, O, O, O, O, O, O, B, O, O, O, O,...",Wh,52956
2397,"[Postefar, von, Laudundrehte,, und, den, herzo...","[B, O, B, O, O, O, B, O, B, O,...",Parz,121601
2398,"[zuo, geriten,, si, heten, Franzoyse, überstri...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O,...",Wh,67111


In [11]:
# Token-level tag frequencies across the training split.
train_tag_counts = train_df["labels"].explode().value_counts()
train_tag_counts

O        46736
B     3582
Name: labels, dtype: int64

In [12]:
# Number of training samples per value of `source`.
train_source_counts = train_df_concatenated["source"].value_counts()
train_source_counts

Parz        1032
Wh           625
UvZLanz      267
RvEBarl      141
Er           107
Wig          104
VAlex         90
Vateruns      25
VEzzo          4
SuTheol        2
Wahrh          2
SüklZw         1
Name: source, dtype: int64

In [13]:
# Training samples whose tag sequence contains "B", counted per `source`.
train_has_b = train_df["labels"].str.join(" ").str.contains("B")
train_df_concatenated.loc[train_has_b, "source"].value_counts()

Parz        724
Wh          473
UvZLanz     174
RvEBarl     117
Wig          97
Er           83
VAlex        68
Vateruns      5
SuTheol       2
SüklZw        1
VEzzo         1
Wahrh         1
Name: source, dtype: int64

# Inspect and Store Test Dataset

In [14]:
test_df = dataset["test"].to_pandas()

# Convert each row's token sequence to a plain Python list directly.
# Joining on ", " and splitting again (the previous approach) would re-segment
# any token that itself contains ", " and would turn an empty sequence into [""].
test_df["tokens"] = test_df["tokens"].map(list)

# Decode the integer class ids back into their string tags.
test_df["labels"] = test_df["labels"].map(lambda ids: [id2label[i] for i in ids])
test_df

Unnamed: 0,tokens,labels
0,"[sîn, ungevüegez, rîs, in, der, hende, als, ei...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[hiez, daz, aller, valsch, an, im, verswant,, ...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O,..."
2,"[het, aldâ, genomn, der, stolze, küene, Wâleis...","[O, O, O, O, O, O, B, B, O, B, O, ..."
3,"[erslagn., nu, vergebt, im, sîne, schulde, dur...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[umbe, begunder, gâhen,, des, küneges, vanen, ...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O,..."
...,...,...
595,"[guoten, ritter, zimet,, swenn, er, den, schil...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
596,"[der, brust, bewart,, so, ist, werder, prîs, d...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
597,"[ein, herre, genant, alsus,, der, künec, Iels,...","[O, O, O, O, O, O, B, O, B, O, B, ..."
598,"[vor, im, her, gesant,, sît, ichz, lebende, im...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [15]:
# Export is disabled: the call below is commented out, so running this cell
# writes nothing. Uncomment it to store the test split as a TSV file.
#test_df.to_csv(data_path + "mwb-texts_NER_test.tsv", sep="\t", index=False)

In [16]:
# `df_concatenated` (built in an earlier cell) holds `source`, `n` and the
# space-joined text of every sample. Keep one row per distinct text: otherwise
# a text occurring more than once would fan out into extra rows in a join, and
# re-aligning the result by position would pair later samples with the wrong
# source. If a text is duplicated, its first occurrence wins.
test_provenance = df_concatenated.drop_duplicates(subset="tokens").set_index("tokens")

# Keyed lookup instead of join + positional re-alignment: row order and index
# of test_df are left untouched. Texts without a match yield NaN.
test_text = test_df["tokens"].str.join(" ")
test_df_concatenated = test_df.assign(
    source=test_text.map(test_provenance["source"]),
    n=test_text.map(test_provenance["n"]),
)
test_df_concatenated

Unnamed: 0,tokens,labels,source,n
0,"[sîn, ungevüegez, rîs, in, der, hende, als, ei...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Wh,41874
1,"[hiez, daz, aller, valsch, an, im, verswant,, ...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O,...",Parz,79057
2,"[het, aldâ, genomn, der, stolze, küene, Wâleis...","[O, O, O, O, O, O, B, B, O, B, O, ...",Parz,111510
3,"[erslagn., nu, vergebt, im, sîne, schulde, dur...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Parz,67344
4,"[umbe, begunder, gâhen,, des, küneges, vanen, ...","[O, O, O, O, O, O, O, O, O, O, B, O, O, O,...",Parz,32712
...,...,...,...,...
595,"[guoten, ritter, zimet,, swenn, er, den, schil...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",UvZLanz,10325
596,"[der, brust, bewart,, so, ist, werder, prîs, d...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Parz,443
597,"[ein, herre, genant, alsus,, der, künec, Iels,...","[O, O, O, O, O, O, B, O, B, O, B, ...",Er,7671
598,"[vor, im, her, gesant,, sît, ichz, lebende, im...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Parz,17370


In [17]:
# Token-level tag frequencies across the test split.
test_tag_counts = test_df["labels"].explode().value_counts()
test_tag_counts

O        11713
B      885
Name: labels, dtype: int64

In [18]:
# Number of test samples per value of `source`.
test_source_counts = test_df_concatenated["source"].value_counts()
test_source_counts

Parz        263
Wh          150
UvZLanz      68
RvEBarl      42
VAlex        29
Wig          20
Er           18
Vateruns      9
VEzzo         1
Name: source, dtype: int64

In [19]:
# Test samples whose tag sequence contains "B", counted per `source`.
test_has_b = test_df["labels"].str.join(" ").str.contains("B")
test_df_concatenated.loc[test_has_b, "source"].value_counts()

Parz        185
Wh          117
UvZLanz      41
RvEBarl      34
VAlex        23
Wig          17
Er           16
Vateruns      4
VEzzo         1
Name: source, dtype: int64