## Table of contents
   1. [**Estonian Universal Dependencies' EDT corpus**](#eesti_ud_edt_korpus)
      1. [**Converting UD corpus to Vabamorf format**](#eesti_ud_edt_konverteerimine)
      2. [**Creating and preparing the dataset from converted UD corpus**](#teisendatud_ud_andmestiku_loomine_ja_tootlemine)
      3. [**Model testing**](#mudeli_testimine)

In [1]:
import os
import csv
import re
import gc
import itertools
import time
import json
import logging
import pkg_resources
import types
import evaluate
import pandas as pd
import sklearn as sk
import numpy as np
import estnltk
import torch
import simpletransformers
from simpletransformers.ner import NERModel, NERArgs
from est_ud_utils import load_ud_file_texts_with_corrections, load_ud_file_with_corrections
from est_ud_morph_conv import convert_ud_layer_to_reduced_morph_layer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Get locally imported modules from current notebook - https://stackoverflow.com/questions/40428931/package-for-listing-version-of-packages-used-in-a-jupyter-notebook - Alex P. Miller
def get_imports():
    
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            # Split ensures you get root package, 
            # not just imported function
            name = val.__name__.split(".")[0]

        elif isinstance(val, type):
            name = val.__module__.split(".")[0]
            
        # Some packages are weird and have different
        # imported names vs. system/pip names. Unfortunately,
        # there is no systematic way to get pip names from
        # a package's imported name. You'll have to add
        # exceptions to this list manually!
        poorly_named_packages = {
            "PIL": "Pillow",
            "sklearn": "scikit-learn"
        }
        if name in poorly_named_packages.keys():
            name = poorly_named_packages[name]
            
        yield name
imports = list(set(get_imports()))

# The only way I found to get the version of the root package
# from only the name of the package is to cross-check the names 
# of installed packages vs. imported packages
requirements = []
for m in pkg_resources.working_set:
    if m.project_name in imports and m.project_name!="pip":
        requirements.append((m.project_name, m.version))

for r in requirements:
    print("{}=={}".format(*r))

estnltk==1.7.3
evaluate==0.4.2
numpy==1.26.4
pandas==2.2.2
scikit-learn==1.5.1
simpletransformers==0.70.1
torch==2.4.0


<a id='eesti_ud_edt_korpus'></a>

### Estonian Universal Dependencies' EDT [corpus](https://github.com/UniversalDependencies/UD_Estonian-EDT)

<a id='eesti_ud_edt_konverteerimine'></a>

#### Converting UD corpus to Vabamorf format

["Convert Universal Dependencies' corpus to Vabamorf format" notebook](https://github.com/estnltk/estnltk-model-training/blob/main/ud_morph_tools/amb_morph_reordering/01_convert_ud_corpus_to_vm.ipynb) was used to convert Estonian UD EDT treebank into Vabamorf format.

In [None]:
def find_no_xpostag_rows():
    """
    Finds rows in the Estonian UD EDT treebank that contain rows where
    <code>xpostag == '_'</code>

    <i>In file <code>est_ud_utils.py</code> class <code>EstUDCorrectionsRewriter</code> has function <code>rewrite</code>, which has comment: \n
    #72: If <code>xpostag == '_'</code>, then add it based on upostag \n
    But not all xpostag conditions exist in the code as convertion throws an <code>AssertionError</code>.</i>
    """
    no_xpostag_regex = r"^\d+\t\S+\t\S+\t\S+\t_"
    conllu_dir = "UD_Estonian-EDT-r2.14"
    conllu_files = ["et_edt-ud-dev.conllu", "et_edt-ud-test.conllu", "et_edt-ud-train.conllu"]
    for c_file in conllu_files:
        print("\n", c_file, "\n")
        with open(file=os.path.join(conllu_dir, c_file), mode="r") as f:
            text = f.read()
            # Find all matches
            matches = re.findall(no_xpostag_regex, text, re.MULTILINE)

            # Print the matching rows
            for match in matches:
                print(match)

In [None]:
ud_corpus_dir = "UD_Estonian-EDT-r2.14" # UD Corpus location
output_dir = 'UD_converted' # Output directory

In [None]:
def convert_ud_to_vabamorf(ud_corpus_dir, output_dir):
    """Converts Universal Dependencies' (UD) corpus to Vabamorf format

    Args:
        ud_corpus_dir (str): path to directory containing UD corpus .conllu files
        output_dir (str): path to directory, where Vabamorf jsons files will be written
    """
    # Create directory if it doesn't exist
    if not os.path.isdir( output_dir ):
        os.makedirs(output_dir)
    assert os.path.isdir( output_dir )

    # Load UD corpus' files as EstNLTK Text objects
    loaded_texts  = []
    ud_layer_name = 'ud_syntax'
    for fname in os.listdir( ud_corpus_dir ):
        #if 'train' in fname:
        #    continue
        #if 'dev' in fname:
        #    continue
        #if 'test' in fname:
        #    continue
        if fname.endswith('.conllu'):
            fpath = os.path.join( ud_corpus_dir, fname )
            texts = load_ud_file_texts_with_corrections( fpath, ud_layer_name )
            for text in texts:
                text.meta['file'] = fname
                loaded_texts.append( text )

    # Convert UD's morphosyntactic annotations to Vabamorf-like annotations
    for tid, text in enumerate(loaded_texts):
        convert_ud_layer_to_reduced_morph_layer( text, 'ud_syntax', 'ud_morph_reduced', add_layer=True )
        fname = text.meta['file'].replace('.conllu', '_'+('{:03d}'.format(tid))+'.json')
        fpath = os.path.join(output_dir, fname)
        estnltk.converters.text_to_json(text, file=fpath)

<a id='teisendatud_ud_andmestiku_loomine_ja_tootlemine'></a>

#### Creating and preparing the dataset from converted UD corpus

In [3]:
def create_df_ud_corpus(jsons, in_dir, tokenizer, csv_filename):
    """
    Creates a new dataset from converted the Estonian UD EDT <a href="https://github.com/UniversalDependencies/UD_Estonian-EDT">corpus</a>. \n
    For each <code>.json</code> file, the following info is gathered:
    <ul>
        <li><code>sentence_id</code> -- given for each sentence</li>
        <li><code>words</code> -- words gathered from text</li>
        <li><code>form</code> -- word form notation</li>
        <li><code>pos</code> -- part of speech</li>
        <li><code>file_prefix</code> -- metadata</li>
        <li><code>source</code> -- file name where the text is taken from</li>
    </ul>
    <a href="https://github.com/Filosoft/vabamorf/blob/e6d42371006710175f7ec328c98f90b122930555/doc/tagset.md">Tables of morphological categories</a> for more information about <code>form</code> and <code>pos</code>.

    Args:
        jsons (list[str]): List of json files from which to read in the text
        in_dir (str): Directory containing list of files (<code>jsons</code>)
        tokenizer (str): Use goldstandard (<code>ud_morph_reduced</code>) or Vabamorf tokenization ((<code>morph_analysis</code>))
        csv_filename (str): CSV filename where to save the gathered text
    """
    if tokenizer not in {'ud_morph_reduced', 'morph_analysis'}:
        raise ValueError("create_df_ud_corpus: tokenizer must be one of %r." % {'ud_morph_reduced', 'morph_analysis'})

    tokens = list()
    sentence_id = 0
    fieldnames = ['sentence_id', 'words', 'form', 'pos', 'file_prefix', 'source']

    print("Beginning tokenization file by file. This can take a while.")
    for file_name in jsons:
        # print(f"Beginning to tokenize {file_name}")
        sentence_id = 0

        # Tokenization
        text = estnltk.converters.json_to_text(file=os.path.join(in_dir, file_name))
        if tokenizer == 'morph_analysis':
            text.tag_layer('morph_analysis')
        file_prefix = text.meta.get('file_prefix')
        for sentence in text.sentences:
            if tokenizer == 'ud_morph_reduced':
                sentence_analysis = sentence.ud_morph_reduced
                for text, form, pos in zip(sentence_analysis.text, sentence_analysis.form, sentence_analysis.pos):
                    if text:
                        tokens.append((sentence_id, text, form[0], pos[0], file_prefix, file_name)) # In case of multiplicity, select the first or index 0
            else:
                sentence_analysis = sentence.morph_analysis
                for text, form, pos in zip(sentence_analysis.text, sentence_analysis.form, sentence_analysis.partofspeech):
                    if text:
                        tokens.append((sentence_id, text, form[0], pos[0], file_prefix, file_name)) # In case of multiplicity, select the first or index 0
            sentence_id += 1
        # print(f"{file_name} tokenized")

    print("Tokenization completed successfully")
    print("Creating Pandas dataframe")
    df = pd.DataFrame(data=tokens, columns=fieldnames)
    df.to_csv(path_or_buf=csv_filename, index=False)
    print(f"Tokenized texts saved to {csv_filename}\n")

In [4]:
def clean_df(df, df_file_name):
    """Finishes dataframe by:
    <ul>
        <li>filling NaN values in columns <code>form</code> and <code>pos</code>with empty strings;</li>
        <li>removing NaN words.</li>
    </ul>

    Args:
        df (pandas.core.frame.DataFrame): Pandas dataframe to clean
        df_file_name (str): CSV file name from which dataframe was created
    """
    print("Assigning NaN values in columns form and pos with an empty string")
    # NaN values are assigned with an empty string
    df['form'] = df['form'].fillna('')
    df['pos'] = df['pos'].fillna('')
    print("Removing NaN words")
    # Removing NaN words
    df.dropna(subset=['words'], inplace=True)
    if df_file_name:
        df.to_csv(path_or_buf=df_file_name, index=False)
        print(f"Modified dataframe saved to {df_file_name}")
    else:
        print("Dataframe cleaned")

In [5]:
# New 'labels' column
def create_labels_column(df):
    """
    Creates a new column <code>labels</code> concatenating the values of columns <code>pos</code> (part of speech) and <code>form</code> (word form notation)

    Args:
        df (pandas.core.frame.DataFrame): Pandas dataframe to create a new column
    """
    df['labels'] = df.apply(lambda row: str(row['form']) + '_' + str(row['pos']) if row['form'] and row['pos'] else str(row['form']) or str(row['pos']), axis=1)
    print("Column 'labels' created")

In [6]:
ud_dir = "UD_converted"
jsons = os.listdir(ud_dir)
# print(jsons)

['et_edt-ud-dev_000.json', 'et_edt-ud-dev_001.json', 'et_edt-ud-dev_002.json', 'et_edt-ud-dev_003.json', 'et_edt-ud-dev_004.json', 'et_edt-ud-dev_005.json', 'et_edt-ud-dev_006.json', 'et_edt-ud-dev_007.json', 'et_edt-ud-dev_008.json', 'et_edt-ud-test_009.json', 'et_edt-ud-test_010.json', 'et_edt-ud-test_011.json', 'et_edt-ud-test_012.json', 'et_edt-ud-test_013.json', 'et_edt-ud-test_014.json', 'et_edt-ud-train_015.json', 'et_edt-ud-train_016.json', 'et_edt-ud-train_017.json', 'et_edt-ud-train_018.json', 'et_edt-ud-train_019.json', 'et_edt-ud-train_020.json', 'et_edt-ud-train_021.json', 'et_edt-ud-train_022.json', 'et_edt-ud-train_023.json', 'et_edt-ud-train_024.json', 'et_edt-ud-train_025.json', 'et_edt-ud-train_026.json', 'et_edt-ud-train_027.json', 'et_edt-ud-train_028.json', 'et_edt-ud-train_029.json', 'et_edt-ud-train_030.json', 'et_edt-ud-train_031.json', 'et_edt-ud-train_032.json', 'et_edt-ud-train_033.json', 'et_edt-ud-train_034.json', 'et_edt-ud-train_035.json', 'et_edt-ud-trai

In [7]:
if not os.path.exists('.\\ud_andmestik.csv'):
    create_df_ud_corpus(jsons, ud_dir, 'ud_morph_reduced', 'ud_andmestik.csv')

In [8]:
csv_ud_file = "ud_andmestik.csv"
df_ud = pd.read_csv(csv_ud_file, keep_default_na=False)

In [61]:
clean_df(df_ud, csv_ud_file)

Assigning NaN values in columns form and pos with an empty string
Removing NaN words
Modified dataframe saved to ud_andmestik.csv


In [9]:
create_labels_column(df_ud)
display(df_ud.head(5))
print(df_ud.shape)

Column 'labels' created


Unnamed: 0,sentence_id,words,form,pos,file_prefix,source,labels
0,0,Aga,,J,aja_ee199920,et_edt-ud-dev_000.json,J
1,0,mulle,sg all,P,aja_ee199920,et_edt-ud-dev_000.json,sg all_P
2,0,tundub,b,V,aja_ee199920,et_edt-ud-dev_000.json,b_V
3,0,",",,Z,aja_ee199920,et_edt-ud-dev_000.json,Z
4,0,et,,J,aja_ee199920,et_edt-ud-dev_000.json,J


(437826, 7)


<a id='mudeli_testimine'></a>

#### Model testing with UD corpus

Extracting test set

In [10]:
test_df_ud = df_ud[df_ud['source'].str.contains('ud-test')].copy()
print(test_df_ud.shape)

(48489, 7)


Reading in unique labels that the model is trained with

In [11]:
# Reading in unique labels
with open("unique_labels.json", 'r') as f:
    unique_labels = json.load(f)

Labels unknown to Vabamorf and replacing them with appropriate known unique labels

In [12]:
def unknown_labels(unique_labels_list, data):
    """Finds labels that are not present in the unique labels list.

    Args:
        unique_labels_list (list): list of unique labels (obtained from reading <code>unique_labels.json</code> file)
        data (pandas.core.frame.DataFrame): data to check for labels
    """
    df_unique_labels = pd.DataFrame(unique_labels_list, columns=['labels'])
    unique_labels_series = df_unique_labels['labels']
    df_labels = data['labels']#.drop_duplicates()
    labels_not_in_unique = df_labels[~df_labels.isin(unique_labels_series)]

    print("Labels in data that are not in unique labels list:")
    print(labels_not_in_unique)
    print("Unique:")
    print(labels_not_in_unique.unique())
    return labels_not_in_unique

In [13]:
labels_not_in_unique = unknown_labels(unique_labels, test_df_ud)

Labels in data that are not in unique labels list:
46373    T
46374    T
46375    T
48036    T
48037    T
        ..
85108    T
85682    T
85683    T
85941    T
85942    T
Name: labels, Length: 85, dtype: object
Unique:
['T' 'sg g_place']


In [14]:
# Replace unknown pos 'T' with '?'
test_df_ud['labels'] = test_df_ud['labels'].apply(lambda x: '?' if x == 'T' else x)
# Replace unknown pos 'place' with 'S'
test_df_ud['labels'] = test_df_ud['labels'].apply(lambda x: 'sg g_S' if x == 'sg g_place' else x)

In [15]:
labels_not_in_unique = unknown_labels(unique_labels, test_df_ud)

Labels in data that are not in unique labels list:
Series([], Name: labels, dtype: object)
Unique:
[]


Removing unnecessary columns for the model<!-- Mudelile ebavajalike veergude eemaldamine -->

In [16]:
test_df_ud = test_df_ud.drop(labels=['file_prefix', 'source'], axis=1)

Initializing the model<!-- Mudeli ülesehitamine -->

In [17]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Configurations
model_args = NERArgs()
model_args.train_batch_size = 8
model_args.evaluate_during_training = False
model_args.learning_rate = 5e-5
model_args.num_train_epochs = 10
model_args.use_early_stopping = True
model_args.use_cuda = torch.cuda.is_available()  # Use GPU if available
model_args.save_eval_checkpoints = False
model_args.save_model_every_epoch = False # Takes a lot of storage space
model_args.save_steps = -1
model_args.overwrite_output_dir = True
model_args.cache_dir = 'NER_mudel/cache'
model_args.best_model_dir = 'NER_mudel/best_model'
model_args.output_dir = 'NER_mudel'
model_args.use_multiprocessing = False

# Initialization
model = NERModel("camembert", "NER_mudel", args=model_args, labels=unique_labels)



Initializing custom metrics to be used for evaluation

In [24]:
poseval = evaluate.load("evaluate-metric/poseval", module_type="metric")

def custom_metrics(preds, labels):

    # Evaluate using poseval
    result = poseval.compute(predictions=preds, references=labels)

    return result

Evaluating the model<!-- Mudeli hindamine -->

In [25]:
# Evaluate the model
result, model_outputs, preds_list = model.eval_model(test_df_ud, extra_metrics=custom_metrics)

INFO:ner_model.py:1884:  Converting to features started.


100%|██████████| 3/3 [00:09<00:00,  3.01s/it]
  with amp.autocast():
Running Evaluation: 100%|██████████| 12/12 [00:01<00:00,  6.18it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


INFO:ner_model.py:1253: {'eval_loss': 0.9056411236524582, 'precision': 0.8831920868852051, 'recall': 0.9080618372301009, 'f1_score': 0.8954543163836114, 'extra_metrics': {'?': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0.0}, '?_H': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 8.0}, '?_N': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 490.0}, '?_O': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 116.0}, '?_S': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1.0}, '?_Y': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 276.0}, 'A': {'precision': 0.5136268343815513, 'recall': 0.6749311294765841, 'f1-score': 0.5833333333333334, 'support': 363.0}, 'D': {'precision': 0.9274108461700781, 'recall': 0.9771009337483326, 'f1-score': 0.9516076648262423, 'support': 4498.0}, 'G': {'precision': 0.9134615384615384, 'recall': 0.979381443298969, 'f1-score': 0.945273631840796, 'support': 97.0}, 'H': {'precision'

In [26]:
print(f"Evaluation Loss:{result['eval_loss']:.4f}")
print(f"Precision: \t{result['extra_metrics']['weighted avg']['precision']:.4f}")
print(f"Recall: \t{result['extra_metrics']['weighted avg']['recall']:.4f}")
print(f"F1 Score: \t{result['extra_metrics']['weighted avg']['f1-score']:.4f}")

Evaluation Loss:0.9056
Precision: 	0.9315
Recall: 	0.9162
F1 Score: 	0.9187


#### Vabamorf evaluation on UD corpus

In [46]:
if not os.path.exists('.\\ud_vabamorf.csv'):
    create_df_ud_corpus(jsons, ud_dir, 'morph_analysis', 'ud_vabamorf.csv')

In [47]:
test_df_ud = df_ud[df_ud['source'].str.contains('ud-test')].copy()

In [48]:
df_ud_vabamorf = pd.read_csv('ud_vabamorf.csv', keep_default_na=False)

In [49]:
clean_df(df_ud_vabamorf, 'ud_vabamorf.csv')

Assigning NaN values in columns form and pos with an empty string
Removing NaN words
Modified dataframe saved to ud_vabamorf.csv


In [50]:
create_labels_column(df_ud_vabamorf)

Column 'labels' created


In [51]:
test_df_ud_vabamorf = df_ud_vabamorf[df_ud_vabamorf['source'].str.contains('ud-test')].copy()

In [52]:
def group_labels_by_sentence(df):
    # Preparing data for seqeval metrics (needs nested lists)
    grouped = df.groupby(['source', 'sentence_id'])['labels'].apply(list)
    return grouped.reset_index(drop=True).tolist()

labels_true = group_labels_by_sentence(test_df_ud)
labels_pred = group_labels_by_sentence(test_df_ud_vabamorf)

# precision = sk.metrics.precision_score(test_df_ud['labels'], test_df_ud_vabamorf['labels'], average='weighted')
# recall = sk.metrics.recall_score(test_df_ud['labels'], test_df_ud_vabamorf['labels'], average='weighted')
# f1 = sk.metrics.f1_score(test_df_ud['labels'], test_df_ud_vabamorf['labels'], average='weighted')
# print(f"Precision: \t{precision:.4f}")
# print(f"Recall: \t{recall:.4f}")
# print(f"F1 Score: \t{f1:.4f}")

results = poseval.compute(predictions=labels_true, references=labels_pred)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [53]:
print(f"Precision: \t{results["weighted avg"]["precision"]:.4f}")
print(f"Recall: \t{results["weighted avg"]["recall"]:.4f}")
print(f"F1 Score: \t{results["weighted avg"]["f1-score"]:.4f}")

Precision: 	0.9194
Recall: 	0.9067
F1 Score: 	0.9082


| Model    | Precision | Recall | F1 score |
|----------|-----------|--------|----------|
| Bert     | 0.9315    | 0.9162 | 0.9187   |
| Vabamorf | 0.9194    | 0.9067 | 0.9082   |

\* Metrics are from weighted average