## Experiments on using word-to-analyses-freq lexicons for reordering

In [1]:
from collections import defaultdict
import os, os.path
from estnltk.converters import json_to_text
from estnltk.taggers import MorphAnalysisReorderer

from eval_utils import GoldStandard
from eval_utils import add_normalized_form_to_words
from eval_utils import collect_matches
from eval_utils import write_out_freq_sorted_annotations
from eval_utils import evaluate_reorderer

In [2]:
# Directory with the gold standard corpus (JSON files are read from here)
input_dir = 'UD_converted'
assert os.path.isdir(input_dir)

### Create lexicons based on train data

In [3]:
# Load the gold standard texts used for building the lexicons and add
# pre-annotations. Files whose name contains 'dev' or 'test' are skipped;
# of the remaining files only the ones ending with '.json' are loaded.
loaded_texts = []
# os.listdir() returns entries in arbitrary order; sorting makes the
# document order (and the progress log printed below) reproducible.
for fname in sorted( os.listdir( input_dir ) ):
    if 'dev' in fname or 'test' in fname:
        continue
    if not fname.endswith('.json'):
        continue
    # Load Text with gold standard annotations
    text = json_to_text(file=os.path.join(input_dir, fname) )
    # Make sure the words layer has the 'normalized_form' attribute
    if 'normalized_form' not in text.words.attributes:
        add_normalized_form_to_words( text.words )
    assert 'normalized_form' in text.words.attributes
    # Add Vabamorf's default morph analysis
    text.tag_layer(['morph_analysis'])
    loaded_texts.append( text )
    print(' Loaded and pre-annotated ', fname)

 Loaded and pre-annotated  et_edt-ud-train_015.json
 Loaded and pre-annotated  et_edt-ud-train_016.json
 Loaded and pre-annotated  et_edt-ud-train_017.json
 Loaded and pre-annotated  et_edt-ud-train_018.json
 Loaded and pre-annotated  et_edt-ud-train_019.json
 Loaded and pre-annotated  et_edt-ud-train_020.json
 Loaded and pre-annotated  et_edt-ud-train_021.json
 Loaded and pre-annotated  et_edt-ud-train_022.json
 Loaded and pre-annotated  et_edt-ud-train_023.json
 Loaded and pre-annotated  et_edt-ud-train_024.json
 Loaded and pre-annotated  et_edt-ud-train_025.json
 Loaded and pre-annotated  et_edt-ud-train_026.json
 Loaded and pre-annotated  et_edt-ud-train_027.json
 Loaded and pre-annotated  et_edt-ud-train_028.json
 Loaded and pre-annotated  et_edt-ud-train_029.json
 Loaded and pre-annotated  et_edt-ud-train_030.json
 Loaded and pre-annotated  et_edt-ud-train_031.json
 Loaded and pre-annotated  et_edt-ud-train_032.json
 Loaded and pre-annotated  et_edt-ud-train_033.json
 Loaded and 

In [4]:
# Annotation fields passed to collect_matches() and, later, to the lexicon writer
focus_fields = ['lemma', 'ending', 'clitic', 'partofspeech', 'form']

# Collect matches over the texts loaded so far, using the 'ud_morph_reduced'
# layer and the UD_CORPUS gold standard type
word_matches = collect_matches(
    loaded_texts,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    focus_fields=focus_fields,
)

 Processed documents:                                 24
 Ambiguous words from total words:                    28613 / 344646 (8.30%)
 Ambiguous words successfully matched to gold morph:  26774 / 28613 (93.57%)
 Ambiguous words with indistinguishable annotations:  159 / 28613 (0.56%)


In [5]:
# Train lexicon, full variant: all analyses are included (freq_threshold=-1)
write_out_freq_sorted_annotations(
    'et_edt-ud-train_sorted_analyses_full.csv',
    word_matches,
    focus_fields,
    freq_threshold=-1,
    encoding='utf-8',
)

In [6]:
# Train lexicon, cut variant: analyses are included at frequency threshold 5
write_out_freq_sorted_annotations(
    'et_edt-ud-train_sorted_analyses_cut_5.csv',
    word_matches,
    focus_fields,
    freq_threshold=5,
    encoding='utf-8',
)

### Create lexicons based on train+dev data

In [7]:
# Add dev data to loaded_texts (which already holds the texts loaded above).
# NOTE: this cell appends to loaded_texts in place, so it should be run
# exactly once after the first loading cell; re-running it would append
# the dev texts a second time.
assert loaded_texts, 'loaded_texts is empty: run the train data loading cell first'
# os.listdir() returns entries in arbitrary order; sorting makes the
# document order (and the progress log printed below) reproducible.
for fname in sorted( os.listdir( input_dir ) ):
    # Skip train and test files; the remaining '.json' files are loaded
    if 'train' in fname or 'test' in fname:
        continue
    if not fname.endswith('.json'):
        continue
    # Load Text with gold standard annotations
    text = json_to_text(file=os.path.join(input_dir, fname) )
    # Make sure the words layer has the 'normalized_form' attribute
    if 'normalized_form' not in text.words.attributes:
        add_normalized_form_to_words( text.words )
    assert 'normalized_form' in text.words.attributes
    # Add Vabamorf's default morph analysis
    text.tag_layer(['morph_analysis'])
    loaded_texts.append( text )
    print(' Loaded and pre-annotated ', fname)

 Loaded and pre-annotated  et_edt-ud-dev_000.json
 Loaded and pre-annotated  et_edt-ud-dev_001.json
 Loaded and pre-annotated  et_edt-ud-dev_002.json
 Loaded and pre-annotated  et_edt-ud-dev_003.json
 Loaded and pre-annotated  et_edt-ud-dev_004.json
 Loaded and pre-annotated  et_edt-ud-dev_005.json
 Loaded and pre-annotated  et_edt-ud-dev_006.json
 Loaded and pre-annotated  et_edt-ud-dev_007.json
 Loaded and pre-annotated  et_edt-ud-dev_008.json


In [8]:
# Annotation fields passed to collect_matches() and, later, to the lexicon writer
focus_fields = ['lemma', 'ending', 'clitic', 'partofspeech', 'form']

# Re-collect matches, now over everything currently in loaded_texts
# (this overwrites the previous value of word_matches)
word_matches = collect_matches(
    loaded_texts,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    focus_fields=focus_fields,
)

 Processed documents:                                 33
 Ambiguous words from total words:                    32346 / 389278 (8.31%)
 Ambiguous words successfully matched to gold morph:  30273 / 32346 (93.59%)
 Ambiguous words with indistinguishable annotations:  170 / 32346 (0.53%)


In [9]:
# Train+dev lexicon, full variant: all analyses are included (freq_threshold=-1)
write_out_freq_sorted_annotations(
    'et_edt-ud-train_and_dev_sorted_analyses_full.csv',
    word_matches,
    focus_fields,
    freq_threshold=-1,
    encoding='utf-8',
)

In [10]:
# Train+dev lexicon, cut variant: analyses are included at frequency threshold 5
write_out_freq_sorted_annotations(
    'et_edt-ud-train_and_dev_sorted_analyses_cut_5.csv',
    word_matches,
    focus_fields,
    freq_threshold=5,
    encoding='utf-8',
)

### A small test

In [1]:
from estnltk import Text
from estnltk.taggers import VabamorfTagger

# Example sentence for the test
t = Text('See toimus 1. mail, ütles üks.')

# Switch off the reorderer that would otherwise be applied by default
t.layer_resolver.update(
    VabamorfTagger(use_reorderer=False)
)

# Add morph analysis without the reorderer
t.tag_layer('morph_analysis')

# Show only the ambiguous spans (more than one annotation)
t.morph_analysis[lambda span: len(span.annotations) > 1]

layer name,attributes,parent,enveloping,ambiguous,span count
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2

text,normalized_text,lemma,root,root_tokens,ending,clitic,form,partofspeech
mail,mail,maa,maa,['maa'],il,,pl ad,S
,mail,mai,mai,['mai'],l,,sg ad,S
üks,üks,üks,üks,['üks'],0,,sg n,N
,üks,üks,üks,['üks'],0,,sg n,P


In [3]:
# Try reordering: apply the full train lexicon to the already analysed text
from estnltk.taggers import MorphAnalysisReorderer

morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file=None,
    form_freq_csv_file=None,
)
morph_reorderer.retag(t)

# Show only the ambiguous spans (more than one annotation)
t.morph_analysis[lambda span: len(span.annotations) > 1]

layer name,attributes,parent,enveloping,ambiguous,span count
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2

text,normalized_text,lemma,root,root_tokens,ending,clitic,form,partofspeech
mail,mail,mai,mai,['mai'],l,,sg ad,S
,mail,maa,maa,['maa'],il,,pl ad,S
üks,üks,üks,üks,['üks'],0,,sg n,P
,üks,üks,üks,['üks'],0,,sg n,N


## Evaluation

### Training data: train || eval data: dev

In [13]:
# Lexicon: train, all analyses. Evaluation data: dev
# (exclude_strs lists 'train' and 'test').
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file=None,
    form_freq_csv_file=None,
)

input_dir = 'UD_converted'
assert os.path.isdir(input_dir)

evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'test'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 9 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           3733
   -- correct analysis first:      1749 / 3733 (46.85%)
   -- correct analysis not first:  1782 / 3733 (47.74%)
   -- correct analysis not found:  234 / 3733 (6.27%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           3733
   -- correct analysis first:      2759 / 3733 (73.91%)
   -- correct analysis not first:  772 / 3733 (20.68%)
   -- correct analysis not found:  234 / 3733 (6.27%)


 Summary: correct analysis first:  1749 / 3733 (46.85%) ==> 2759 / 3733 (73.91%)


In [14]:
# Lexicon: train, frequency threshold 5. Evaluation data: dev
# (exclude_strs lists 'train' and 'test').
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_cut_5.csv',
    postag_freq_csv_file=None,
    form_freq_csv_file=None,
)

input_dir = 'UD_converted'
assert os.path.isdir(input_dir)

evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'test'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 9 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           3733
   -- correct analysis first:      1749 / 3733 (46.85%)
   -- correct analysis not first:  1782 / 3733 (47.74%)
   -- correct analysis not found:  234 / 3733 (6.27%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           3733
   -- correct analysis first:      2702 / 3733 (72.38%)
   -- correct analysis not first:  829 / 3733 (22.21%)
   -- correct analysis not found:  234 / 3733 (6.27%)


 Summary: correct analysis first:  1749 / 3733 (46.85%) ==> 2702 / 3733 (72.38%)


### Training data: train || eval data: test

In [15]:
# Lexicon: train, all analyses. Evaluation data: test
# (exclude_strs lists 'train' and 'dev').
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file=None,
    form_freq_csv_file=None,
)

input_dir = 'UD_converted'
assert os.path.isdir(input_dir)

evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4139
   -- correct analysis first:      2099 / 4139 (50.71%)
   -- correct analysis not first:  1888 / 4139 (45.61%)
   -- correct analysis not found:  204 / 4139 (4.93%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4139
   -- correct analysis first:      3037 / 4139 (73.38%)
   -- correct analysis not first:  950 / 4139 (22.95%)
   -- correct analysis not found:  204 / 4139 (4.93%)


 Summary: correct analysis first:  2099 / 4139 (50.71%) ==> 3037 / 4139 (73.38%)


In [16]:
# Lexicon: train, frequency threshold 5. Evaluation data: test
# (exclude_strs lists 'train' and 'dev').
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_cut_5.csv',
    postag_freq_csv_file=None,
    form_freq_csv_file=None,
)

input_dir = 'UD_converted'
assert os.path.isdir(input_dir)

evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4139
   -- correct analysis first:      2099 / 4139 (50.71%)
   -- correct analysis not first:  1888 / 4139 (45.61%)
   -- correct analysis not found:  204 / 4139 (4.93%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4139
   -- correct analysis first:      3008 / 4139 (72.67%)
   -- correct analysis not first:  979 / 4139 (23.65%)
   -- correct analysis not found:  204 / 4139 (4.93%)


 Summary: correct analysis first:  2099 / 4139 (50.71%) ==> 3008 / 4139 (72.67%)


### Training data: train and dev || eval data: test

In [17]:
# Lexicon: train+dev, all analyses. Evaluation data: test
# (exclude_strs lists 'train' and 'dev').
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_and_dev_sorted_analyses_full.csv',
    postag_freq_csv_file=None,
    form_freq_csv_file=None,
)

input_dir = 'UD_converted'
assert os.path.isdir(input_dir)

evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4139
   -- correct analysis first:      2099 / 4139 (50.71%)
   -- correct analysis not first:  1888 / 4139 (45.61%)
   -- correct analysis not found:  204 / 4139 (4.93%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4139
   -- correct analysis first:      3043 / 4139 (73.52%)
   -- correct analysis not first:  944 / 4139 (22.81%)
   -- correct analysis not found:  204 / 4139 (4.93%)


 Summary: correct analysis first:  2099 / 4139 (50.71%) ==> 3043 / 4139 (73.52%)


In [18]:
# Lexicon: train+dev, frequency threshold 5. Evaluation data: test
# (exclude_strs lists 'train' and 'dev').
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_and_dev_sorted_analyses_cut_5.csv',
    postag_freq_csv_file=None,
    form_freq_csv_file=None,
)

input_dir = 'UD_converted'
assert os.path.isdir(input_dir)

evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4139
   -- correct analysis first:      2099 / 4139 (50.71%)
   -- correct analysis not first:  1888 / 4139 (45.61%)
   -- correct analysis not found:  204 / 4139 (4.93%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4139
   -- correct analysis first:      3012 / 4139 (72.77%)
   -- correct analysis not first:  975 / 4139 (23.56%)
   -- correct analysis not found:  204 / 4139 (4.93%)


 Summary: correct analysis first:  2099 / 4139 (50.71%) ==> 3012 / 4139 (72.77%)


## Summary

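* Evaluation using lexicons generated from UD corpus v2.5 (as of 2022-12-05). Each cell shows the percentage of ambiguous words whose correct analysis comes first: with the default ordering, and then after reordering with the given lexicon: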


           lexicon file:                               eval: dev              eval: test

    et_edt-ud-train_sorted_analyses_full.csv       46.85% --> 73.91%       50.71% --> 73.38%

    et_edt-ud-train_sorted_analyses_cut_5.csv      46.85% --> 72.38%       50.71% --> 72.67%

    et_edt-ud-train_and_dev_sorted_analyses_full.csv    --------           50.71% --> 73.52%

    et_edt-ud-train_and_dev_sorted_analyses_cut_5.csv   --------           50.71% --> 72.77%
                                                                          