# Bert-based morphological tagger's comparison with Vabamorf

## Table of contents
1. [**Gathering unused data and comparing Vabamorf / NER model**](#kasutamata_andmete_kogumine)
   1. [**Defined functions**](#def_func)
   2. [**Gathering unused data**](#gather_data)
   3. [**Creating comparison data**](#data_creation)
   4. [*Using own custom prediction (deprecated)*](#deprecated_pred)
   5. [**Using BertMorphTagger**](#estnltk_tagger)
   
[end](#end)

In [1]:
import os
import typing
import itertools
import json
import logging
import warnings
import pkg_resources
import types
import pandas as pd
import numpy as np
import estnltk, estnltk.converters, estnltk.taggers
import torch
import math
from morph_eval_utils import MorphDiffSummarizer, MorphDiffFinder, write_formatted_diff_str_to_file
from tqdm import tqdm
from simpletransformers.ner import NERModel, NERArgs
from bert_morph_tagger import BertMorphTagger

  from .autonotebook import tqdm as notebook_tqdm


INFO:config.py:58: PyTorch version 2.4.0 available.


In [2]:
# Get locally imported modules from current notebook - https://stackoverflow.com/questions/40428931/package-for-listing-version-of-packages-used-in-a-jupyter-notebook - Alex P. Miller
def get_imports():
    """Yield the root package name behind each module, class or function in ``globals()``.

    Only modules, classes and functions are considered; other global values
    (strings, numbers, DataFrames, ...) carry no reliable package information
    and are skipped instead of being reported under their variable name.

    Yields:
        str: Root package name (the part before the first dot), translated to
        the pip distribution name for the known exceptions listed below.
    """
    # Some packages are weird and have different
    # imported names vs. system/pip names. Unfortunately,
    # there is no systematic way to get pip names from
    # a package's imported name. You'll have to add
    # exceptions to this list manually!
    poorly_named_packages = {
        "PIL": "Pillow",
        "sklearn": "scikit-learn"
    }
    origin_types = (type, types.FunctionType, types.BuiltinFunctionType)

    # Iterate over a snapshot: the namespace may change while this generator
    # is suspended, which would break iteration over the live dict.
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            module_path = val.__name__
        elif isinstance(val, origin_types):
            module_path = getattr(val, "__module__", None)
        else:
            continue

        if not isinstance(module_path, str) or not module_path:
            continue

        # Split ensures you get root package,
        # not just imported function
        name = module_path.split(".")[0]
        yield poorly_named_packages.get(name, name)
imports = list(set(get_imports()))

# Package versions are only available from the installed distributions,
# so match the installed distribution names against the imported names
# (pip itself is never reported).
requirements = [
    (dist.project_name, dist.version)
    for dist in pkg_resources.working_set
    if dist.project_name in imports and dist.project_name != "pip"
]

for project, version in requirements:
    print("{}=={}".format(project, version))

estnltk==1.7.3
numpy==1.26.4
pandas==2.2.2
simpletransformers==0.70.1
torch==2.4.0
tqdm==4.66.5


## Gathering unused data and comparing Vabamorf / NER model <!-- Kasutamata andmete kogumine ja Vabamorfi ning NER mudeli võrdlemine -->

<a id='kasutamata_andmete_kogumine'></a>

### Defined functions

<a id='def_func'></a>

Reads texts from files, tokenizes them and saves the tokenized texts file by file as JSON.

In [11]:
def create_json_file_by_file_enc2017(
    jsons: typing.List[str], 
    in_dir: str, 
    save_dir: str, 
    do_morph_layer: bool = True, 
    bert_morph_tagger: typing.Optional[BertMorphTagger] = None, 
    necessary_layers: typing.Sequence[str] = ('words', 'sentences', 'morph_analysis', 'bert_morph_tagging')
    ):
    """
    Creates a JSON file for each text file.
    <ul>
        <li>Skips JSON files that have already been created.</li>
        <li>Converts JSON file into EstNLTK Text object.</li>
        <li>Adds text type metadata and morph analysis.</li>
        <li>Adds <code>BertMorphTagger</code> layer</li>
        <li>Removes unnecessary layers.</li>
        <li>Converts EstNLTK Text object into JSON using <code>estnltk.converters.text_to_json.</code></li>
    </ul>
    Args:
        jsons (list): List of json files from which to read in the text
        in_dir (str): Directory where to read the json files from
        save_dir (str): Directory where to save the new json files
        do_morph_layer (bool): Whether to tag the <code>morph_analysis</code> layer. If False and a tagger is given,
            only the <code>sentences</code> layer is tagged before the tagger is applied
        bert_morph_tagger (optional, BertMorphTagger): Configured <code>BertMorphTagger</code> class instance, if None, will not use this tagger
        necessary_layers (optional, sequence[str]): Text object layers that will not be deleted

    Raises:
        RuntimeError: If a text has no <code>texttype</code> metadata and the file name prefix is not recognised.
        AssertionError: If both morph layers exist but contain a different number of spans.
    """
    keep_layers = set(necessary_layers)

    # The output directory is the same for every file, so create it once
    os.makedirs(save_dir, exist_ok=True)

    print("Beginning tokenization file by file")
    for file_name in tqdm(jsons):
        out_path = os.path.join(save_dir, file_name)

        # Skipping previous JSON files
        if os.path.exists(out_path):
            continue

        # Convert json to EstNLTK Text object
        text = estnltk.converters.json_to_text(file=os.path.join(in_dir, file_name))

        # Add text type metadata
        text_type = text.meta.get('texttype') # Text type
        if not text_type:
            if file_name.startswith('wiki17'):
                text.meta.update({'texttype': 'wikipedia'})
            elif file_name.startswith('web13'):
                text.meta.update({'texttype': 'blogs_and_forums'})
            else:
                raise RuntimeError("Could not assign text type")

        # Add morph layer
        if do_morph_layer:
            text.tag_layer('morph_analysis')
        # Add BERT morph layer
        if isinstance(bert_morph_tagger, BertMorphTagger):
            if not do_morph_layer:
                text.tag_layer('sentences')
            text.add_layer(bert_morph_tagger.make_layer(text))

        # Remove unnecessary layers (iterate over a snapshot because layers are removed inside the loop)
        for layer in list(text.layers):
            if layer not in keep_layers:
                text.pop_layer(layer, cascading=False)

        # Both layers must have the same number of spans to be comparable.
        # Raised explicitly so the check also runs when Python is started with -O.
        if 'morph_analysis' in text.layers and 'bert_morph_tagging' in text.layers:
            n_morph = len(text.morph_analysis)
            n_bert = len(text.bert_morph_tagging)
            if n_morph != n_bert:
                raise AssertionError(
                    f"Failed to assert file '{file_name}'\n"
                    "Length of layers aren't the same:\n"
                    f"morph_analysis = {n_morph}\n"
                    f"bert_morph_tagging = {n_bert}"
                )

        # Save to JSON
        estnltk.converters.text_to_json(text=text, file=out_path)

    print("Tokenization completed successfully")


A function that collects texts for each text type *more or less* in proportion to the number of words given as `n` <!-- Funktsioon, millega kogutakse iga tekstiliigi kohta tekste enam-vähem proportsionaalselt sõnade arvu suhtes -->

In [12]:
# Function to collect texts file by file
def gather_rows_for_text_type(
    df: pd.DataFrame, 
    n: int, 
    random_state: typing.Optional[int] = None
    ):
    """Gathers about `n` (>= n) rows for each text type\n
    Ensures that all text types have about the same number of words.

    Whole source files are taken one after another until the text type has
    at least `n` rows, so a source file is never split. At least one source
    is always taken per text type; a type with fewer than `n` rows in total
    contributes all of its rows.

    Args:
        df (pd.DataFrame): The DataFrame containing the text data. Must have the columns `type` and `source`.
        n (int): Number of words to gather.
        random_state (optional, int): Seed for shuffling the source order. If None, sources are taken
            in order of first appearance. The global NumPy random state is left untouched.

    Returns:
        pd.DataFrame: Gathered rows for each text type.
    """

    def gather_rows_for_type(
        group: pd.DataFrame, 
        n: int, 
        random_state: typing.Optional[int] = None
        ):
        """Gathers about `n` (>= n) rows for the text type\n

        Args:
            group (pd.DataFrame): Rows of a single text type
            n (int): Number of words to gather
            random_state (int): Seed for shuffling the source order

        Returns:
            pd.DataFrame: Gathered rows for text type
        """
        # Copy into a plain array so that shuffling never touches the frame's data
        sources = np.array(group['source'].unique())

        # `is not None` so that seed 0 is honoured as well. A private RandomState
        # seeded with `random_state` yields the same permutation as seeding the
        # global generator, without the side effect on other code.
        if random_state is not None:
            np.random.RandomState(random_state).shuffle(sources)

        # Collect the pieces and concatenate once instead of re-copying
        # everything gathered so far on each iteration
        parts = []
        gathered = 0
        for source in sources:
            source_rows = group[group['source'] == source]
            parts.append(source_rows)
            gathered += len(source_rows)
            if gathered >= n:
                break

        if not parts:
            return group.iloc[0:0]
        return pd.concat(parts)

    per_type = [gather_rows_for_type(group, n, random_state) for _, group in df.groupby('type')]
    if not per_type:
        # Nothing to concatenate (empty input): return an empty frame with the same columns
        return df.iloc[0:0].copy()
    return pd.concat(per_type)

Initializing the model<!-- Mudeli ülesehitamine -->

In [13]:
def initialize_model(model_name, unique_labels, no_progress_bars=False):
    """Creates a simpletransformers NER model configured for morphological tagging.

    Side effects: sets the `simpletransformers.ner.ner_model` logger to ERROR
    and installs a process-wide filter that ignores `UserWarning`.

    Args:
        model_name (str): Model name or path. Also used as the output directory and
            as the parent of the `cache` and `best_model` directories.
        unique_labels (list): Labels the model predicts.
        no_progress_bars (bool): If True, the model is run in silent mode.

    Returns:
        NERModel: The initialized model. The GPU is used only when CUDA is available.
    """
    # Set up logging
    logger = logging.getLogger('simpletransformers.ner.ner_model')
    logger.setLevel(logging.ERROR)

    # Suppress specific warnings
    # warnings.filterwarnings("ignore", category=FutureWarning) # For warning message "FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated."
    warnings.filterwarnings("ignore", category=UserWarning) # For warnings like "UserWarning: <tag> seems not to be NE tag."

    # Configurations
    model_args = NERArgs()
    model_args.train_batch_size = 8
    model_args.evaluate_during_training = False
    model_args.learning_rate = 5e-5
    model_args.num_train_epochs = 10
    # NOTE(review): early stopping presumably has no effect while
    # evaluate_during_training is False - confirm against simpletransformers docs
    model_args.use_early_stopping = True
    model_args.save_eval_checkpoints = False
    model_args.save_model_every_epoch = False # Takes a lot of storage space
    model_args.save_steps = -1
    model_args.overwrite_output_dir = True
    model_args.cache_dir = model_name + '/cache'
    model_args.best_model_dir = model_name + '/best_model'
    model_args.output_dir = model_name
    model_args.use_multiprocessing = False
    model_args.silent = no_progress_bars

    # Initialization
    # `use_cuda` is an argument of the NERModel constructor, not a field of NERArgs;
    # it has to be passed here for the model to fall back to CPU when no GPU exists.
    model = NERModel(
        "camembert",
        model_name,
        args=model_args,
        labels=unique_labels,
        use_cuda=torch.cuda.is_available(),  # Use GPU if available
    )
    return model

Functions for predicting labels for groups of sentences. Sentences that are too long for the model are split into clauses, and clauses that are still too long are split into roughly equal-length parts.

In [14]:
def check_token_count(model, sentence):
    """Tells whether a sentence reaches the model's maximum sequence length.

    The words are joined with two spaces and tokenized with the model's
    tokenizer; the number of resulting input ids is compared to
    `model.args.max_seq_length`.

    Args:
        model (): NER model that is predicted with
        sentence (list): list of words in a sentence

    Returns:
        bool: True if the token count is greater than or equal to the model's maximum sequence length
    """
    joined_words = '  '.join(sentence)
    encoded = model.tokenizer(joined_words, return_tensors="pt")
    token_total = len(encoded["input_ids"][0])
    limit = model.args.max_seq_length
    return bool(token_total >= limit)

def get_clause_parts(model, clause):
    """Splits clause into roughly equal length clause segments that fit the model.
    <i>Clauses can get long when there is a list in a sentence</i>

    The initial number of segments is estimated from the token count of the whole
    clause. Because words differ in how many tokens they produce, and every segment
    gets its own special tokens, the estimate can be too low; the number of segments
    is therefore increased until every segment has fewer tokens than
    `model.args.max_seq_length` (the same limit `check_token_count` applies), or
    until every segment is a single word and cannot be split further.

    Args:
        model (): NER model that is predicted with
        clause (list): list of words in a clause

    Returns:
        list: List of clause segments. Each segment is a list of words. Empty for an empty clause.
    """
    words = list(clause)
    if not words:
        return []

    max_len = model.args.max_seq_length

    def token_count(part):
        encoded = model.tokenizer('  '.join(part), return_tensors="pt")
        return len(encoded["input_ids"][0])

    # At least one segment, and never more segments than words (no empty segments)
    n_parts = max(1, math.ceil(token_count(words) / max_len))
    n_parts = min(n_parts, len(words))

    # dtype=object keeps the original word objects; tolist() returns plain lists
    word_array = np.array(words, dtype=object)
    while True:
        parts = [part.tolist() for part in np.array_split(word_array, n_parts)]
        if n_parts >= len(words) or all(token_count(part) < max_len for part in parts):
            return parts
        n_parts += 1

def get_clauses(model, sentence):
    """Breaks a sentence into clauses with the EstNLTK `clauses` layer.

    The words are joined with two spaces into a new EstNLTK Text, which is then
    tagged. A clause for which `check_token_count` is true is further divided
    with `get_clause_parts`.

    Args:
        model (): NER model that is predicted with
        sentence (list): list of words in a sentence

    Returns:
        list: List of clauses. Each clause is a list of words.
    """
    text = estnltk.Text('  '.join(sentence))
    text.tag_layer('clauses')

    collected = []
    for clause in text.clauses:
        clause_words = clause.text
        if not check_token_count(model, clause_words):
            collected.append(clause_words)
            continue
        # Clause alone is too long for the model: divide it further
        collected.extend(get_clause_parts(model, clause_words))
    return collected

def get_clauses_labels(model, sentence):
    """Predicts labels clause by clause and returns them as one flat list.

    Args:
        model (): NER model that is predicted with
        sentence (list): list of words in a sentence

    Returns:
        list: List of tags predicted to words in clauses
    """
    clauses = get_clauses(model, sentence)

    # Predict tags; each prediction item is a dict whose first value is the label
    predictions, _ = model.predict(clauses, split_on_space=False)

    return [
        list(word_prediction.values())[0]
        for clause_prediction in predictions
        for word_prediction in clause_prediction
    ]

def get_sentence_labels(model, sentence):
    """Predicts labels for a single sentence in one model call.

    Args:
        model (): NER model that is predicted with
        sentence (list): list of words in a sentence

    Returns:
        list: List of tags predicted to words in a sentence
    """
    # Predict tags; the sentence is passed as a batch of one
    predictions, _ = model.predict([sentence], split_on_space=False)

    labels = []
    for word_prediction in predictions[0]:
        # Each item is a dict; its first value is the predicted label
        labels.append(list(word_prediction.values())[0])
    return labels

def process_groups(model, groups):
    """Predicts labels to a list of groups. This group contains sentences for each source text file.

    Each group is treated as one sentence (its `words` column). Sentences for which
    `check_token_count` is true are predicted clause by clause, the others in one call.
    The input groups are not modified; each result is a new frame with an added
    `ner_labels` column.

    Args:
        model (): NER model that is predicted with
        groups (): Iterable of `(key, DataFrame)` pairs, e.g. a pandas groupby object

    Raises:
        AssertionError: When the length of predicted labels mismatch the length of the sentence length. 
        <i>This might mean that generated clauses are wrong (meaning some words are missing because of <code>'  '.join(sentence)</code>)</i>

    Returns:
        list: List of DataFrames, one per group, with the predicted labels in `ner_labels`
    """
    # `display` only exists inside IPython/Jupyter; fall back to print elsewhere so
    # that a label mismatch is reported as AssertionError instead of NameError
    try:
        show = display
    except NameError:
        show = print

    chunk_results = []

    for _, group in groups:
        sentence = group.words.tolist()

        if check_token_count(model, sentence):  # Sentence splitting if token count is above model's max sequence length
            ner_labels = get_clauses_labels(model, sentence)
        else:
            ner_labels = get_sentence_labels(model, sentence)

        if len(ner_labels) != len(sentence):
            show(group)
            print(len(sentence), len(ner_labels))
            raise AssertionError("Predicted labels length mismatch sentence length.")

        # assign() returns a new frame, so the groupby slice is never written to
        chunk_results.append(group.assign(ner_labels=ner_labels))

    return chunk_results

Function to find morphological differences in multiple JSON files and save a summary.

In [15]:
def find_and_summarize_differences(jsons:typing.List[str], 
                                   in_dir:str, 
                                   output_dir:str, 
                                   morph_diff_finder:MorphDiffFinder, 
                                   morph_diff_summarizer:MorphDiffSummarizer,
                                   corpus_name:str = 'enc_2017'):
    """
    Finds morphological differences in multiple JSON files and saves a summary.

    This function processes a list of JSON files, identifying morphological 
    differences in the text of each file. The differences are then summarized 
    and stored in an output file. 

    For each JSON file:
    <ul>
        <li>Text objects are created from the JSON data.</li>
        <li>Morphological differences between the text layers are identified.</li>
        <li>These differences are recorded using the provided summarizer.</li>
        <li>If any differences are found, they are saved to a separate file.</li>
    </ul>
    Finally, a summary of all differences is written to a statistics file.

    Args:
        jsons (List[str]): A list of JSON filenames to be processed.
        in_dir (str): Directory path where the input JSON files are located.
        output_dir (str): Directory path where output summary files will be stored.
            Created if it does not exist.
        morph_diff_finder (MorphDiffFinder): An instance of MorphDiffFinder used 
            to identify morphological differences.
        morph_diff_summarizer (MorphDiffSummarizer): An instance of MorphDiffSummarizer 
            used to record and summarize the differences.
        corpus_name (str): Name used in the statistics file name 
            (<code>_{corpus_name}__stats.txt</code>).

    Outputs:
    <ul>
        <li>For each JSON file with differences, a text file 
          <code>_{file name}__ann_diffs.txt</code> will be created in the 
          output directory containing the formatted differences.</li>
        <li>A final summary file is created in the output directory with statistics 
          on the total differences across all processed files.</li>
    </ul>
    """
    # Finds and saves differences for each json file
    os.makedirs(output_dir, exist_ok=True)
    for file_name in jsons:  # not named `json`, which would shadow the json module
        text_obj = estnltk.converters.json_to_text(file=os.path.join(in_dir, file_name))
        text_type = text_obj.meta['texttype']
        morph_diff_layer, formatted_diffs_str, _ = morph_diff_finder.find_difference(text_obj, fname=file_name, text_cat=text_type)
        morph_diff_summarizer.record_from_diff_layer( 'morph_analysis', morph_diff_layer, text_type, start_new_doc=True )
        if formatted_diffs_str:
            fpath = os.path.join(output_dir, f'_{file_name}__ann_diffs.txt')
            write_formatted_diff_str_to_file( fpath, formatted_diffs_str )
    # Summarizes the collected differences into results
    summarizer_result_str = morph_diff_summarizer.get_diffs_summary_output( show_doc_count=True )
    # No quotes nested inside the f-string: that is a SyntaxError before Python 3.12
    fpath = os.path.join(output_dir, f'_{corpus_name}__stats.txt')
    with open(fpath, 'w', encoding='utf-8') as out_f:
        # Text mode already translates '\n' to the platform line ending;
        # writing os.linesep here would produce '\r\r\n' on Windows
        out_f.write( 'TOTAL DIFF STATISTICS:' + '\n' + summarizer_result_str )

### Gathering unused data

<a id='gather_data'></a>

Two files were created in the `finetune_bert_morph_tagger.ipynb` notebook: `andmestik.csv`, which contains the whole enc2017 corpus, and `model_data.csv`, which contains the subset of `andmestik.csv` that was used for model training.

In [None]:
# Read the whole corpus and the subset that was used for model training.
# keep_default_na=False keeps empty cells and words such as "NA" or "null"
# as strings instead of converting them to NaN.
df = pd.read_csv("andmestik.csv", keep_default_na=False)
model_df = pd.read_csv("model_data.csv", keep_default_na=False)

To get the other subset (unused data) of `andmestik.csv` not used in model training, we perform a left anti-join operation.

In [None]:
# Rows are compared on every column the two DataFrames have in common
common_columns = df.columns.intersection(model_df.columns)
# Left anti-join: the merge indicator marks rows that exist only in `df`;
# keep those rows and drop the indicator column again
unused_data = (
    df.merge(model_df, on=list(common_columns), how='left', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .drop(columns='_merge')
)

Example of the unused data, which is later saved into `unused_data.csv`.

In [None]:
# Show the result and save it; an already existing unused_data.csv is not overwritten
display(unused_data)
if not os.path.exists("./unused_data.csv"):
    unused_data.to_csv("unused_data.csv", index=False)

Unnamed: 0,sentence_id,words,form,pos,type,source,labels
131164,0,BAGDAD,sg n,H,periodicals,nc_10532_642042.json,sg n_H
131165,0,",",,Z,periodicals,nc_10532_642042.json,Z
131166,0,29.,?,O,periodicals,nc_10532_642042.json,?_O
131167,0,november,sg n,S,periodicals,nc_10532_642042.json,sg n_S
131168,0,(,,Z,periodicals,nc_10532_642042.json,Z
...,...,...,...,...,...,...,...
10555188,2,tulemusena,sg es,S,wikipedia,wiki17_99964_x.json,sg es_S
10555189,2,[,,Z,wikipedia,wiki17_99964_x.json,Z
10555190,2,2,?,N,wikipedia,wiki17_99964_x.json,?_N
10555191,2,],,Z,wikipedia,wiki17_99964_x.json,Z


### Creating comparison data

<a id='data_creation'></a>

Comparison data is generated from the unused data (`unused_data.csv`). Roughly 600,000 words are collected for each text type, for a total of ~3 million words. After collection, the resulting data is saved as `comparison_data.csv`.

In [None]:
# Reload the saved unused data; keep_default_na=False keeps empty cells as empty strings
unused_data = pd.read_csv("unused_data.csv", keep_default_na=False)

In [None]:
# At least 600,000 rows (one row per word) per text type; 42 seeds the shuffling of source files
comparison_data = gather_rows_for_text_type(unused_data, 600000, 42)

In [None]:
# Save only when the file does not exist yet, so an existing comparison set is not overwritten
if not os.path.exists("./comparison_data.csv"):
    comparison_data.to_csv("comparison_data.csv", index=False)

In [None]:
# Continue from the saved file; keep_default_na=False keeps empty cells as empty strings
comparison_data = pd.read_csv("comparison_data.csv", keep_default_na=False)

In [None]:
# Bare expression: the notebook renders the DataFrame as the cell output
comparison_data

Unnamed: 0,sentence_id,words,form,pos,type,source,labels
0,0,Koju,adt,S,blogs_and_forums,web13_274106_x.json,adt_S
1,0,jõudes,des,V,blogs_and_forums,web13_274106_x.json,des_V
2,0,ootas,s,V,blogs_and_forums,web13_274106_x.json,s_V
3,0,ees,,D,blogs_and_forums,web13_274106_x.json,D
4,0,üllatus,sg n,S,blogs_and_forums,web13_274106_x.json,sg n_S
...,...,...,...,...,...,...,...
3056351,17,aastaks,sg tr,S,wikipedia,wiki17_85786_x.json,sg tr_S
3056352,17,oli,s,V,wikipedia,wiki17_85786_x.json,s_V
3056353,17,ta,sg n,P,wikipedia,wiki17_85786_x.json,sg n_P
3056354,17,surnud,,A,wikipedia,wiki17_85786_x.json,A


### Using own custom prediction (deprecated)

<a id='deprecated_pred'></a>

The next steps were an attempt to obtain the labels with the model's own predict function and then do the comparison. However, the model predicts a label for each token and does not take Vabamorf's word tokenization into account. These results were produced before BertMorphTagger was created.

In [None]:
# One group per sentence; presumably sentence_id is only unique within a source file,
# hence the grouping by both columns
comparison_groups = comparison_data.groupby(['source', 'sentence_id'])

In [None]:
# Read the label set the model predicts. The encoding is given explicitly
# so that reading does not depend on the platform's default encoding.
with open("unique_labels.json", 'r', encoding='utf-8') as f:
    unique_labels = json.load(f)

model = initialize_model("NER_mudel", unique_labels)

In [None]:
# Predict labels sentence by sentence; returns one DataFrame per sentence group
predicted_chunks = process_groups(model, comparison_groups)

In [None]:
# Combine the per-sentence results into a single DataFrame
updated_comparison_data = pd.concat(predicted_chunks)

In [None]:
# Save the predictions (without the index) so the prediction step need not be repeated
updated_comparison_data.to_csv("updated_comparison_data.csv", index=False)

In [None]:
# Reload the saved predictions; keep_default_na=False keeps empty cells as empty strings
updated_comparison_data = pd.read_csv('./updated_comparison_data.csv', keep_default_na=False)

In [None]:
# Keep only the rows where the predicted label differs from the label in the `labels` column
label_differences = updated_comparison_data[updated_comparison_data['labels'] != updated_comparison_data['ner_labels']]

In [None]:
# The index is written as well (no index=False here), so the original row numbers are kept in the file
label_differences.to_csv("label_differences.csv")

In [None]:
# Render the differing rows as the cell output
display(label_differences)

Unnamed: 0,sentence_id,words,form,pos,type,source,labels,ner_labels
0,0,OKLAHOMA,sg g,H,periodicals,nc_10532_642051.json,sg g_H,sg n_H
5,0,Reuters-EPLO,?,Y,periodicals,nc_10532_642051.json,?_Y,sg n_H
8,0,Oklahoma,sg g,H,periodicals,nc_10532_642051.json,sg g_H,sg n_H
47,1,sajalases,sg in,S,periodicals,nc_10532_642051.json,sg in_S,sg in_A
61,2,Timothy,sg g,H,periodicals,nc_10532_642051.json,sg g_H,sg n_H
...,...,...,...,...,...,...,...,...
3056133,20,maini,pl ter,S,wikipedia,wiki17_99902_x.json,pl ter_S,sg ter_S
3056235,6,Kloogaranna,sg n,H,wikipedia,wiki17_99953_x.json,sg n_H,sg g_H
3056247,7,ehitatud,tud,V,wikipedia,wiki17_99953_x.json,tud_V,A
3056303,9,teise,adt,P,wikipedia,wiki17_99953_x.json,adt_P,sg g_P


In [None]:
# Share of words whose predicted label differs, as a percentage with three decimals
diff_count = len(label_differences)
total_count = len(updated_comparison_data)
diff_percent = round(diff_count / total_count * 100, 3)
print(f"Differences count: {diff_count} / {total_count} ({diff_percent}%)")

Differences count: 139917 / 3056356 (4.578%)


### Using BertMorphTagger

<a id='estnltk_tagger'></a>

Here, a new BertMorphTagger is used to predict the labels according to Vabamorf's format `form` and `partofspeech`. Note that the model predicts a label that is a concatenation of `form` and `partofspeech` joined with `_` (underscore).

In [16]:
# Comparison data determines which source files are processed
comparison_data = pd.read_csv("comparison_data.csv", keep_default_na=False)
# Directory of the input JSON files
in_dir = '_plain_texts_json'
# Every source file is processed once, whatever its number of rows
jsons = comparison_data['source'].unique().tolist()
# Tagger that loads the model from ./NER_mudel/
# NOTE(review): get_top_n_predictions=1 presumably keeps only the best label and
# token_level=False presumably yields word-level labels - confirm in BertMorphTagger
morph_tagger = BertMorphTagger('./NER_mudel/', get_top_n_predictions=1, token_level=False)

JSON file creation

In [None]:
# Tag every source file with both Vabamorf (do_morph_layer=True) and BertMorphTagger and
# save the result into _diff_morph_texts_json; files that already exist there are skipped
create_json_file_by_file_enc2017(jsons, in_dir, '_diff_morph_texts_json', True, bert_morph_tagger=morph_tagger)

Beginning tokenization file by file


100%|██████████| 5778/5778 [00:00<00:00, 29029.45it/s]

Tokenization completed successfully





Finding and summarizing differences using `MorphDiffFinder` and `MorphDiffSummarizer`

In [None]:
# Compare the Vabamorf layer with the BertMorphTagger layer on part of speech and form
morph_diff_finder = MorphDiffFinder('morph_analysis', 
                                    'bert_morph_tagging', 
                                    diff_attribs  = ['partofspeech', 'form'], 
                                    focus_attribs = ['partofspeech', 'form'] )
morph_diff_summarizer = MorphDiffSummarizer('morph_analysis', 'bert_morph_tagging' )
# From here on the input is the directory of tagged JSON files (in_dir is reassigned)
in_dir = './_diff_morph_texts_json/'
output_dir = './differences/'

In [None]:
# Write one diff file per document with differences and a statistics file into output_dir
find_and_summarize_differences(jsons, in_dir, output_dir, morph_diff_finder, morph_diff_summarizer)

Merging differences into one single file

In [None]:
# Only the per-document diff files are merged. The statistics file written by
# find_and_summarize_differences lives in the same directory and must not end up
# in the merged differences. Sorting makes the merge order reproducible, because
# os.listdir returns names in arbitrary order.
diff_files = sorted(
    file_name for file_name in os.listdir(output_dir)
    if file_name.endswith('__ann_diffs.txt')
)
lines = list()

In [None]:
# Read every diff file; `lines` gets one list of lines per file
for diff_file in diff_files:
    diff_path = os.path.join(output_dir, diff_file)
    with open(file=diff_path, mode='r', encoding='UTF-8') as f:
        lines.append(f.readlines())

In [None]:
# Write the lines of all diff files, in reading order, into one file
with open(file='differences__ann_diffs_.txt', mode='w', encoding='UTF-8') as f:
    f.writelines(itertools.chain.from_iterable(lines))

## END

<a id='end'></a>