### Newer BertMorphTagger (*NER_mudel_v2*) comparison with Vabamorf
#### Creating the differences file

In [None]:
import os
import pkg_resources
import types
import pandas as pd
import estnltk, estnltk.converters, estnltk.taggers

from morph_eval_utils import MorphDiffSummarizer, MorphDiffFinder, write_formatted_diff_str_to_file
from bert_morph_tagger_notebook_functions import NotebookFunctions
from tqdm import tqdm
from bert_morph_tagger import BertMorphTagger

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Get locally imported modules from current notebook - https://stackoverflow.com/questions/40428931/package-for-listing-version-of-packages-used-in-a-jupyter-notebook - Alex P. Miller
def get_imports():
    """Yield the root package name behind every module or class found in globals().

    Modules contribute the first dotted component of their ``__name__``;
    classes contribute the first dotted component of their ``__module__``.
    Globals of any other kind (plain variables, functions, ...) are skipped,
    so an ordinary variable whose name happens to equal an installed
    distribution is never reported as an import.

    Yields:
        str: A root package name, translated to its pip name when it is listed
            in ``poorly_named_packages``. The same name can be yielded several
            times; callers are expected to deduplicate.
    """
    # Some packages are weird and have different
    # imported names vs. system/pip names. Unfortunately,
    # there is no systematic way to get pip names from
    # a package's imported name. You'll have to add
    # exceptions to this list manually!
    poorly_named_packages = {
        "PIL": "Pillow",
        "sklearn": "scikit-learn"
    }

    # Iterate over a snapshot: this is a lazy generator, so the consumer could
    # bind new globals between two yields and invalidate a live dict iterator.
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            # Split ensures you get root package,
            # not just imported function
            name = val.__name__.split(".")[0]
        elif isinstance(val, type):
            name = val.__module__.split(".")[0]
        else:
            # Neither a module nor a class: not something we can trace back
            # to a package, so do not report the variable's own name.
            continue

        yield poorly_named_packages.get(name, name)
# Deduplicated root package names referenced from this notebook.
imports = list(set(get_imports()))

# Installed distributions are keyed by project name, not by import name, so
# versions are resolved by cross-checking every installed distribution
# against the imported names (pip itself is left out).
requirements = [
    (dist.project_name, dist.version)
    for dist in pkg_resources.working_set
    if dist.project_name in imports and dist.project_name != "pip"
]

# Print one pip-style "name==version" pin per matched distribution.
for pinned in requirements:
    print("{}=={}".format(*pinned))

estnltk==1.7.3
evaluate==0.4.2
numpy==1.26.4
pandas==2.2.2
simpletransformers==0.70.1
torch==2.5.1
tqdm==4.66.5


In [None]:
# Directory passed as the input location to the JSON-creation step.
in_dir = '_plain_texts_json'

# Tagger loaded from the local './NER_mudel_v2/' directory.
morph_tagger_v2 = BertMorphTagger('./NER_mudel_v2/')

# keep_default_na=False is passed so that pandas does not apply its default
# NaN markers when parsing the CSV.
comparison_data = pd.read_csv("comparison_data.csv", keep_default_na=False)

# Distinct values of the 'source' column, as a plain Python list.
# NOTE(review): presumably these identify the source JSON documents to process — confirm.
unused_jsons = comparison_data['source'].unique().tolist()

JSON file creation

In [None]:
# Run the v2 tagger over the listed sources, file by file, with
# '_diff_morph_texts_json_v2' passed as the third argument.
# NOTE(review): the meaning of the positional 'enc2017' and True arguments is
# defined by the helper and not visible here — confirm before changing them.
NotebookFunctions.create_json_file_by_file_enc2017(
    unused_jsons,
    in_dir,
    '_diff_morph_texts_json_v2',
    'enc2017',
    True,
    bert_morph_tagger=morph_tagger_v2,
)

Beginning to morphologically tag file by file


100%|██████████| 5778/5778 [3:00:21<00:00,  1.87s/it]     

Morphological tagging completed successfully





Finding and summarizing differences using `MorphDiffFinder` and `MorphDiffSummarizer`

In [None]:
# Locations used by the difference-finding step below.
# NOTE(review): in_dir is rebound here to '..._v2_complete/', whereas the JSON
# creation cell was given '_diff_morph_texts_json_v2' — confirm this is the
# intended input directory.
in_dir = './_diff_morph_texts_json_v2_complete/'
output_dir = './differences_v2_complete/'

# Both helpers are configured with the same pair of names,
# 'morph_analysis' and 'bert_morph_tagging'; the finder diffs and focuses on
# the 'partofspeech' and 'form' attributes.
morph_diff_finder = MorphDiffFinder(
    'morph_analysis',
    'bert_morph_tagging',
    diff_attribs=['partofspeech', 'form'],
    focus_attribs=['partofspeech', 'form'],
)
morph_diff_summarizer = MorphDiffSummarizer('morph_analysis', 'bert_morph_tagging')

In [None]:
# Hand the sources, both directories and the configured finder/summarizer to
# the notebook helper that finds and summarizes the differences.
NotebookFunctions.find_and_summarize_differences(
    unused_jsons,
    in_dir,
    output_dir,
    morph_diff_finder,
    morph_diff_summarizer,
)

100%|██████████| 17368/17368 [30:34<00:00,  9.47it/s]  


Merging differences into one single file

In [None]:
# os.listdir() returns entries in arbitrary order; sorting makes the order in
# which the difference files are merged reproducible between runs and machines.
diff_files = sorted(os.listdir(output_dir))
# Accumulator for the file contents: one list of lines per difference file.
lines = list()

In [None]:
# Start from an empty accumulator so that re-running this cell does not append
# every file's contents a second time.
lines = []
# Wrap the list itself rather than enumerate(...): tqdm can then read its
# length and show a total and ETA, and the unused loop index goes away.
for diff_file in tqdm(diff_files):
    with open(file=os.path.join(output_dir, diff_file), mode='r', encoding='UTF-8') as f:
        # Keep each file's lines as a separate list, in file order.
        lines.append(f.readlines())

17347it [00:10, 1730.38it/s]


In [None]:
# Concatenate the collected per-file contents, in list order, into one merged
# text file.
with open(file='differences__ann_diffs_v2_complete.txt', mode='w', encoding='UTF-8') as merged_file:
    for per_file_lines in lines:
        # The lines still carry their original line endings, so joining them
        # with an empty separator reproduces each file verbatim.
        merged_file.write("".join(per_file_lines))

Randomly pick 100 differences, evenly distributed across the text types

In [None]:
# Run the sampling script on the differences file, asking for 100 differences.
# NOTE(review): `-e` presumably requests the even split across text categories — confirm against pick_randomly_from_diffs.py.
# NOTE(review): the merge cell in this notebook writes 'differences__ann_diffs_v2_complete.txt', but this command reads 'differences__ann_diffs_v2.txt' — confirm the intended input file.
!python pick_randomly_from_diffs.py differences__ann_diffs_v2.txt 100 -e

INFO:pick_randomly_from_diffs.py:66: Collecting difference indexes ...
INFO:pick_randomly_from_diffs.py:85: 
INFO:pick_randomly_from_diffs.py:86: Differences by text category / subcorpus:
INFO:pick_randomly_from_diffs.py:91:  78888  (21.49%)  wikipedia
INFO:pick_randomly_from_diffs.py:91:  77124  (21.01%)  science
INFO:pick_randomly_from_diffs.py:91:  74853  (20.39%)  blogs_and_forums
INFO:pick_randomly_from_diffs.py:91:  69715  (18.99%)  periodicals
INFO:pick_randomly_from_diffs.py:91:  66459  (18.11%)  fiction
INFO:pick_randomly_from_diffs.py:94: 
INFO:pick_randomly_from_diffs.py:95:  367039  (100.0%)  TOTAL
INFO:pick_randomly_from_diffs.py:96: 
INFO:pick_randomly_from_diffs.py:112: Picking randomly 20 differences from each text category ...
INFO:pick_randomly_from_diffs.py:157: Collecting randomly picked differences ...
INFO:pick_randomly_from_diffs.py:187: Saving into  differences__ann_diffs__x100_even.txt ...
INFO:pick_randomly_from_diffs.py:193: Done.


Randomly pick 1000 differences, evenly distributed across the text types

In [None]:
# Run the sampling script on the differences file, asking for 1000 differences.
# NOTE(review): `-e` presumably requests the even split across text categories — confirm against pick_randomly_from_diffs.py.
# NOTE(review): the saved logs of the 100- and 1000-sample runs report different totals (367039 vs 534520) for the same input file name, so at least one saved output is probably stale — re-run both cells against the same file.
!python pick_randomly_from_diffs.py differences__ann_diffs_v2.txt 1000 -e

INFO:pick_randomly_from_diffs.py:66: Collecting difference indexes ...
INFO:pick_randomly_from_diffs.py:85: 
INFO:pick_randomly_from_diffs.py:86: Differences by text category / subcorpus:
INFO:pick_randomly_from_diffs.py:91:  131014  (24.51%)  wikipedia
INFO:pick_randomly_from_diffs.py:91:  125992  (23.57%)  science
INFO:pick_randomly_from_diffs.py:91:  104636  (19.58%)  blogs_and_forums
INFO:pick_randomly_from_diffs.py:91:  93402  (17.47%)  periodicals
INFO:pick_randomly_from_diffs.py:91:  79476  (14.87%)  fiction
INFO:pick_randomly_from_diffs.py:94: 
INFO:pick_randomly_from_diffs.py:95:  534520  (100.0%)  TOTAL
INFO:pick_randomly_from_diffs.py:96: 
INFO:pick_randomly_from_diffs.py:112: Picking randomly 200 differences from each text category ...
INFO:pick_randomly_from_diffs.py:157: Collecting randomly picked differences ...
INFO:pick_randomly_from_diffs.py:187: Saving into  differences__ann_diffs_v2_x1000_even.txt ...
INFO:pick_randomly_from_diffs.py:193: Done.


Summarized results

In [None]:
# Preview the beginning of the differences file: print at most its first 10 lines.
# The encoding is given explicitly, matching the other file accesses in this
# notebook, so decoding does not depend on the platform's locale default.
with open("./differences__ann_diffs_v2.txt", mode="r", encoding="UTF-8") as f:
    for i in range(10):
        line = f.readline()
        # readline() returns '' only at end of file (a blank line is '\n'),
        # so a file shorter than 10 lines ends the preview quietly instead of
        # raising StopIteration.
        if not line:
            break
        print(line.rstrip())

TOTAL DIFF STATISTICS:

morph_analysis
 blogs_and_forums | #docs: 924 | modified spans: 104636 / 600595 (0.1742) | annotations ratio: 1077830 / 1267462 (0.8504) | only in morph_analysis: 127951 (10.0951%) | only in bert_morph_tagging: 61681 (4.8665%)
 fiction          | #docs: 9 | modified spans: 79476 / 638806 (0.1244) | annotations ratio: 1201448 / 1353435 (0.8877) | only in morph_analysis: 113905 (8.4160%) | only in bert_morph_tagging: 38082 (2.8137%)
 periodicals      | #docs: 1799 | modified spans: 93402 / 600172 (0.1556) | annotations ratio: 1092528 / 1266743 (0.8625) | only in morph_analysis: 120307 (9.4973%) | only in bert_morph_tagging: 53908 (4.2556%)
 science          | #docs: 36 | modified spans: 125992 / 616552 (0.2043) | annotations ratio: 1062406 / 1301408 (0.8164) | only in morph_analysis: 153652 (11.8066%) | only in bert_morph_tagging: 85350 (6.5583%)
 wikipedia        | #docs: 3010 | modified spans: 131014 / 600255 (0.2183) | annotations ratio: 1011914 / 1257569 (0.80