# Bert-based morphological tagger's comparison with Vabamorf
## Training tagger on UD EST-EDT treebank

## Table of contents
   1. [**Gathering Data**](#andmete_kogumine)
   2. [**Model Training**](#mudeli_treenimine)
   3. [**Vabamorf evaluation on UD corpus**](#vabamorfi_hindamine)


[end](#end)

In [1]:
import os
import evaluate
import pkg_resources
import types
import pandas as pd
import numpy as np
import estnltk, estnltk.converters, estnltk.taggers

from bert_morph_tagger_notebook_functions import NotebookFunctions
from simpletransformers.ner import NERModel, NERArgs
from tqdm import tqdm
from bert_morph_tagger import BertMorphTagger

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Get locally imported modules from current notebook - https://stackoverflow.com/questions/40428931/package-for-listing-version-of-packages-used-in-a-jupyter-notebook - Alex P. Miller
def get_imports():
    
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            # Split ensures you get root package, 
            # not just imported function
            name = val.__name__.split(".")[0]

        elif isinstance(val, type):
            name = val.__module__.split(".")[0]
            
        # Some packages are weird and have different
        # imported names vs. system/pip names. Unfortunately,
        # there is no systematic way to get pip names from
        # a package's imported name. You'll have to add
        # exceptions to this list manually!
        poorly_named_packages = {
            "PIL": "Pillow",
            "sklearn": "scikit-learn"
        }
        if name in poorly_named_packages.keys():
            name = poorly_named_packages[name]
            
        yield name
imports = list(set(get_imports()))

# The only way I found to get the version of the root package
# from only the name of the package is to cross-check the names 
# of installed packages vs. imported packages
requirements = []
for m in pkg_resources.working_set:
    if m.project_name in imports and m.project_name!="pip":
        requirements.append((m.project_name, m.version))

for r in requirements:
    print("{}=={}".format(*r))

estnltk==1.7.3
evaluate==0.4.2
numpy==1.26.4
pandas==2.2.2
simpletransformers==0.70.1
torch==2.5.1
tqdm==4.66.5


<a id='andmete_kogumine'></a>

### Gathering data

Initializing the model<!-- Mudeli ülesehitamine -->

In [None]:
unique_labels = NotebookFunctions.get_unique_labels("./unique_labels.json")
model = NotebookFunctions.initialize_model("NER_mudel", unique_labels=unique_labels)

Reading in the UD corpus. The .csv file is created in the [second notebook](./02_eval_UD_Est-EDT_treebank.ipynb) in the "Creating and preparing the dataset from converted UD corpus" section.

In [17]:
csv_ud_file = "ud_andmestik.csv"
df_ud = pd.read_csv(csv_ud_file, keep_default_na=False)

Extracting test set

In [18]:
train_df_ud = df_ud[df_ud['source'].str.contains('ud-train')].copy()
test_df_ud = df_ud[df_ud['source'].str.contains('ud-test')].copy()
dev_df_ud = df_ud[df_ud['source'].str.contains('ud-dev')].copy()
print(train_df_ud.shape)
print(test_df_ud.shape)
print(dev_df_ud.shape)

(344589, 7)
(48489, 7)
(44748, 7)


Labels unknown to Vabamorf and replacing them with appropriate known unique labels

In [19]:
NotebookFunctions.unknown_labels(unique_labels, train_df_ud);
NotebookFunctions.unknown_labels(unique_labels, test_df_ud);
NotebookFunctions.unknown_labels(unique_labels, dev_df_ud);

Labels in data that are not in unique labels list:
94053     T
94054     T
94056     T
94057     T
99199     T
         ..
432622    T
432623    T
432671    T
432672    T
432673    T
Name: labels, Length: 721, dtype: object
Unique:
['T' 'sg n_D' 'sg ad_D' 'sg p_D' 'sg g_DET' 'sg n_T' 'sg g_2']
Labels in data that are not in unique labels list:
46373    T
46374    T
46375    T
48036    T
48037    T
        ..
85108    T
85682    T
85683    T
85941    T
85942    T
Name: labels, Length: 85, dtype: object
Unique:
['T' 'sg g_place']
Labels in data that are not in unique labels list:
59            T
60            T
2101          T
2699          T
4837          T
4838          T
4840          T
4841          T
7022     sg n_p
7111          T
7112          T
7905          T
10014         T
19029         T
19030         T
19031         T
19155         T
19156         T
19157         T
19208         T
19250         T
19251         T
26068         T
27281         T
27282         T
27283         T

In [20]:
# Replace unknown pos 'T' with '?'
train_df_ud['labels'] = train_df_ud['labels'].apply(lambda x: '?' if x == 'T' else x)
train_df_ud['labels'] = train_df_ud['labels'].apply(lambda x: 'sg n_H' if x == 'sg n_T' else x)
# Change DET to P
train_df_ud['labels'] = train_df_ud['labels'].apply(lambda x: 'sg g_P' if x == 'sg g_DET' else x)
# Remove form from adverbs
train_df_ud['labels'] = train_df_ud['labels'].apply(lambda x: 'D' if x == 'sg ad_D' else x)
train_df_ud['labels'] = train_df_ud['labels'].apply(lambda x: 'D' if x == 'sg n_D' else x)
train_df_ud['labels'] = train_df_ud['labels'].apply(lambda x: 'D' if x == 'sg p_D' else x)
# Change 2 to S
train_df_ud['labels'] = train_df_ud['labels'].apply(lambda x: 'D' if x == 'sg g_2' else x)
# Test corpus
# Replace unknown pos 'T' with '?'
test_df_ud['labels'] = test_df_ud['labels'].apply(lambda x: '?' if x == 'T' else x)
# Replace unknown pos 'place' with 'S'
test_df_ud['labels'] = test_df_ud['labels'].apply(lambda x: 'sg g_S' if x == 'sg g_place' else x)
# Dev corpus
dev_df_ud['labels'] = dev_df_ud['labels'].apply(lambda x: '?' if x == 'T' else x)
dev_df_ud['labels'] = dev_df_ud['labels'].apply(lambda x: 'sg n_P' if x == 'sg n_p' else x)

In [21]:
NotebookFunctions.unknown_labels(unique_labels, train_df_ud);
NotebookFunctions.unknown_labels(unique_labels, test_df_ud);
NotebookFunctions.unknown_labels(unique_labels, dev_df_ud);

Labels in data that are not in unique labels list:
Series([], Name: labels, dtype: object)
Unique:
[]
Labels in data that are not in unique labels list:
Series([], Name: labels, dtype: object)
Unique:
[]
Labels in data that are not in unique labels list:
Series([], Name: labels, dtype: object)
Unique:
[]


Save sets to .csv files

In [22]:
train_df_ud.to_csv('UD_train.csv', index=False)
test_df_ud.to_csv('UD_test.csv', index=False)
dev_df_ud.to_csv('UD_dev.csv', index=False)

Removing unnecessary columns for the model<!-- Mudelile ebavajalike veergude eemaldamine -->

In [5]:
train_df_ud = train_df_ud.drop(labels=['file_prefix', 'source'], axis=1)
test_df_ud = test_df_ud.drop(labels=['file_prefix', 'source'], axis=1)
dev_df_ud = dev_df_ud.drop(labels=['file_prefix', 'source'], axis=1)

In [6]:
display(train_df_ud)
display(test_df_ud)
display(dev_df_ud)

Unnamed: 0,sentence_id,words,form,pos,labels
0,0,Iga,sg n,P,sg n_P
1,0,üheksas,sg n,O,sg n_O
2,0,kroon,sg n,S,sg n_S
3,0,tuli,s,V,s_V
4,0,salapärastelt,pl abl,A,pl abl_A
...,...,...,...,...,...
344584,1123,järgi,,K,K
344585,1123,",",,Z,Z
344586,1123,%,,N,N
344587,1123,",",,Z,Z


Unnamed: 0,sentence_id,words,form,pos,labels
0,0,Palju,,D,D
1,0,olulisi,pl p,A,pl p_A
2,0,komponente,pl p,S,pl p_S
3,0,",",,Z,Z
4,0,nagu,,J,J
...,...,...,...,...,...
48484,442,osutus,s,V,s_V
48485,442,sissetulekute,pl g,S,pl g_S
48486,442,ebavõrdsust,sg p,S,sg p_S
48487,442,suurendavaks,sg tr,A,sg tr_A


Unnamed: 0,sentence_id,words,form,pos,labels
0,0,Aga,,J,J
1,0,mulle,sg all,P,sg all_P
2,0,tundub,b,V,b_V
3,0,",",,Z,Z
4,0,et,,J,J
...,...,...,...,...,...
44743,468,ei,neg,V,neg_V
44744,468,peitu,neg o,V,neg o_V
44745,468,neurodegeneratiivne,sg n,A,sg n_A
44746,468,protsess,sg n,S,sg n_S


<a id='mudeli_treenimine'></a>

### Model training

Training model

In [12]:
model.args.output_dir = 'NER_mudel_v2'
model.args.cache_dir = 'NER_mudel_v2' + '/cache'
model.args.best_model_dir = 'NER_mudel_v2' + '/best_model'

In [13]:
if not os.path.exists(".\\NER_mudel_v2\\config.json"):
    # Train model
    print("Training model")
    model.train_model(train_df_ud, output_dir='NER_mudel_v2')
else:
    model = NotebookFunctions.initialize_model('NER_mudel_v2', unique_labels)

Initializing custom metrics to be used for evaluation

In [7]:
poseval = evaluate.load("evaluate-metric/poseval", module_type="metric")

def custom_metrics(preds, labels):

    # Evaluate using poseval
    result = poseval.compute(predictions=preds, references=labels)

    return result

Evaluating the model<!-- Mudeli hindamine -->

In [15]:
# Evaluate the model
result, model_outputs, preds_list = model.eval_model(test_df_ud, extra_metrics=custom_metrics)

100%|██████████| 3/3 [00:09<00:00,  3.06s/it]
  with amp.autocast():
Running Evaluation: 100%|██████████| 12/12 [00:04<00:00,  2.96it/s]


In [16]:
print(f"Evaluation Loss:{result['eval_loss']:.4f}")
print(f"Precision: \t{result['extra_metrics']['weighted avg']['precision']:.4f}")
print(f"Recall: \t{result['extra_metrics']['weighted avg']['recall']:.4f}")
print(f"F1 Score: \t{result['extra_metrics']['weighted avg']['f1-score']:.4f}")

Evaluation Loss:0.1823
Precision: 	0.9778
Recall: 	0.9765
F1 Score: 	0.9769


<a id='vabamorf_ud_korpusel'></a>

<a id='vabamorfi_hindamine'></a>

### Vabamorf evaluation on UD corpus

In [None]:
df_ud_vabamorf = pd.read_csv('ud_vabamorf.csv', keep_default_na=False)

In [None]:
NotebookFunctions.clean_df(df_ud_vabamorf, 'ud_vabamorf.csv')

Assigning NaN values in columns form and pos with an empty string
Removing NaN words
Modified dataframe saved to ud_vabamorf.csv


In [None]:
NotebookFunctions.create_labels_column(df_ud_vabamorf, 'ud_vabamorf.csv')

Creating column 'labels'
Column 'labels' created
Modified dataframe saved to ud_vabamorf.csv


In [None]:
test_df_ud_vabamorf = df_ud_vabamorf[df_ud_vabamorf['source'].str.contains('ud-test')].copy()

In [None]:
test_df_ud = df_ud[df_ud['source'].str.contains('ud-test')].copy()

In [None]:
def group_labels_by_sentence(df):
    # Preparing data for seqeval metrics (needs nested lists)
    grouped = df.groupby(['source', 'sentence_id'])['labels'].apply(list)
    return grouped.reset_index(drop=True).tolist()

labels_true = group_labels_by_sentence(test_df_ud)
labels_pred = group_labels_by_sentence(test_df_ud_vabamorf)

# precision = sk.metrics.precision_score(test_df_ud['labels'], test_df_ud_vabamorf['labels'], average='weighted')
# recall = sk.metrics.recall_score(test_df_ud['labels'], test_df_ud_vabamorf['labels'], average='weighted')
# f1 = sk.metrics.f1_score(test_df_ud['labels'], test_df_ud_vabamorf['labels'], average='weighted')
# print(f"Precision: \t{precision:.4f}")
# print(f"Recall: \t{recall:.4f}")
# print(f"F1 Score: \t{f1:.4f}")

results = poseval.compute(predictions=labels_true, references=labels_pred)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
print(f"Precision: \t{results["weighted avg"]["precision"]:.4f}")
print(f"Recall: \t{results["weighted avg"]["recall"]:.4f}")
print(f"F1 Score: \t{results["weighted avg"]["f1-score"]:.4f}")

Precision: 	0.9194
Recall: 	0.9067
F1 Score: 	0.9082


| Model         | Precision | Recall | F1 score |
|---------------|-----------|--------|----------|
| Bert_morph_v2 | 0.9778    | 0.9765 | 0.9769   |
| Vabamorf      | 0.9194    | 0.9067 | 0.9082   |

\* Metrics are from weighted average

## END

<a id='end'></a>