## Experiments on using word-to-analyses-freq lexicons for reordering

In [1]:
from collections import defaultdict
import os, os.path
from estnltk.converters import json_to_text
from estnltk.taggers import MorphAnalysisReorderer

from eval_utils import GoldStandard
from eval_utils import add_normalized_form_to_words
from eval_utils import collect_matches
from eval_utils import write_out_freq_sorted_annotations
from eval_utils import evaluate_reorderer

In [2]:
# Directory that holds the corpus files with gold standard annotations;
# stop right away if it is not present.
input_dir = 'UD_converted'
assert os.path.isdir(input_dir)

### Create lexicons based on train data

In [3]:
# Load gold standard texts (everything except the dev and test splits)
# and add pre-annotations.
loaded_texts = []
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# the loaded documents reproducible across runs and file systems.
for fname in sorted( os.listdir( input_dir ) ):
    # Skip the dev and test splits as well as any non-JSON entries
    if 'dev' in fname or 'test' in fname or not fname.endswith('.json'):
        continue
    # Load Text with gold standard annotations
    text = json_to_text(file=os.path.join(input_dir, fname) )
    # Make sure the words layer has the 'normalized_form' attribute
    if 'normalized_form' not in text.words.attributes:
        add_normalized_form_to_words( text.words )
    assert 'normalized_form' in text.words.attributes
    # Add Vabamorf's default morph analysis
    text.tag_layer(['morph_analysis'])
    loaded_texts.append( text )
    print(' Loaded and pre-annotated ', fname)

 Loaded and pre-annotated  et_edt-ud-train_000.json
 Loaded and pre-annotated  et_edt-ud-train_001.json
 Loaded and pre-annotated  et_edt-ud-train_002.json
 Loaded and pre-annotated  et_edt-ud-train_003.json
 Loaded and pre-annotated  et_edt-ud-train_004.json
 Loaded and pre-annotated  et_edt-ud-train_005.json
 Loaded and pre-annotated  et_edt-ud-train_006.json
 Loaded and pre-annotated  et_edt-ud-train_007.json
 Loaded and pre-annotated  et_edt-ud-train_008.json
 Loaded and pre-annotated  et_edt-ud-train_009.json
 Loaded and pre-annotated  et_edt-ud-train_010.json
 Loaded and pre-annotated  et_edt-ud-train_011.json
 Loaded and pre-annotated  et_edt-ud-train_012.json
 Loaded and pre-annotated  et_edt-ud-train_013.json
 Loaded and pre-annotated  et_edt-ud-train_014.json
 Loaded and pre-annotated  et_edt-ud-train_015.json
 Loaded and pre-annotated  et_edt-ud-train_016.json
 Loaded and pre-annotated  et_edt-ud-train_017.json
 Loaded and pre-annotated  et_edt-ud-train_018.json
 Loaded and 

In [4]:
# Morphological analysis attributes passed on as the fields of interest
focus_fields = ['lemma', 'ending', 'clitic', 'partofspeech', 'form']

# Collect matches over the loaded texts; 'ud_morph_reduced' is presumably the
# name of the gold standard layer (confirm against eval_utils.collect_matches).
word_matches = collect_matches(
    loaded_texts,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    focus_fields=focus_fields,
)

 Processed documents:                                 24
 Ambiguous words from total words:                    28766 / 344646 (8.35%)
 Ambiguous words successfully matched to gold morph:  26929 / 28766 (93.61%)


In [5]:
# Full lexicon from the collected matches: freq_threshold=-1 includes all analyses
write_out_freq_sorted_annotations(
    'et_edt-ud-train_sorted_analyses_full.csv',
    word_matches,
    focus_fields,
    freq_threshold=-1,
    encoding='utf-8',
)

In [6]:
# Reduced lexicon from the same matches, written with a frequency threshold of 5
write_out_freq_sorted_annotations(
    'et_edt-ud-train_sorted_analyses_cut_5.csv',
    word_matches,
    focus_fields,
    freq_threshold=5,
    encoding='utf-8',
)

### Create lexicons based on train+dev data

In [7]:
# The previously loaded texts must already be present in loaded_texts.
assert loaded_texts, 'loaded_texts is empty: load the training texts first'
# Add dev data to loaded_texts.
# NOTE: this cell appends to loaded_texts, so running it a second time without
# rebuilding loaded_texts from scratch adds the dev documents twice.
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# the loaded documents reproducible across runs and file systems.
for fname in sorted( os.listdir( input_dir ) ):
    # Skip the train and test splits as well as any non-JSON entries
    if 'train' in fname or 'test' in fname or not fname.endswith('.json'):
        continue
    # Load Text with gold standard annotations
    text = json_to_text(file=os.path.join(input_dir, fname) )
    # Make sure the words layer has the 'normalized_form' attribute
    if 'normalized_form' not in text.words.attributes:
        add_normalized_form_to_words( text.words )
    assert 'normalized_form' in text.words.attributes
    # Add Vabamorf's default morph analysis
    text.tag_layer(['morph_analysis'])
    loaded_texts.append( text )
    print(' Loaded and pre-annotated ', fname)

 Loaded and pre-annotated  et_edt-ud-dev_000.json
 Loaded and pre-annotated  et_edt-ud-dev_001.json
 Loaded and pre-annotated  et_edt-ud-dev_002.json
 Loaded and pre-annotated  et_edt-ud-dev_003.json
 Loaded and pre-annotated  et_edt-ud-dev_004.json
 Loaded and pre-annotated  et_edt-ud-dev_005.json
 Loaded and pre-annotated  et_edt-ud-dev_006.json
 Loaded and pre-annotated  et_edt-ud-dev_007.json
 Loaded and pre-annotated  et_edt-ud-dev_008.json


In [8]:
# Morphological analysis attributes passed on as the fields of interest
focus_fields = ['lemma', 'ending', 'clitic', 'partofspeech', 'form']

# Collect matches again, now over everything currently in loaded_texts;
# 'ud_morph_reduced' is presumably the name of the gold standard layer
# (confirm against eval_utils.collect_matches).
word_matches = collect_matches(
    loaded_texts,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    focus_fields=focus_fields,
)

 Processed documents:                                 33
 Ambiguous words from total words:                    32510 / 389278 (8.35%)
 Ambiguous words successfully matched to gold morph:  30440 / 32510 (93.63%)


In [9]:
# Full train+dev lexicon: freq_threshold=-1 includes all analyses
write_out_freq_sorted_annotations(
    'et_edt-ud-train_and_dev_sorted_analyses_full.csv',
    word_matches,
    focus_fields,
    freq_threshold=-1,
    encoding='utf-8',
)

In [10]:
# Reduced train+dev lexicon, written with a frequency threshold of 5
write_out_freq_sorted_annotations(
    'et_edt-ud-train_and_dev_sorted_analyses_cut_5.csv',
    word_matches,
    focus_fields,
    freq_threshold=5,
    encoding='utf-8',
)

### A small test

In [11]:
# Build a short example text and annotate it with the default morph analysis
from estnltk import Text
t = Text('See toimus 1. mail, ütles üks.').tag_layer(['morph_analysis'])

# Show only the ambiguous words, i.e. spans carrying more than one annotation
t.morph_analysis[lambda span: len(span.annotations) > 1]

layer name,attributes,parent,enveloping,ambiguous,span count
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2

text,normalized_text,lemma,root,root_tokens,ending,clitic,form,partofspeech
mail,mail,maa,maa,['maa'],il,,pl ad,S
,mail,mai,mai,['mai'],l,,sg ad,S
üks,üks,üks,üks,['üks'],0,,sg n,N
,üks,üks,üks,['üks'],0,,sg n,P


In [12]:
# Try reordering: build a reorderer from the full train lexicon only
# (no part-of-speech or form frequency files are supplied) and apply it to t
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file=None,
    form_freq_csv_file=None,
)
morph_reorderer.retag(t)

# Show the ambiguous words again (spans carrying more than one annotation)
t.morph_analysis[lambda span: len(span.annotations) > 1]

layer name,attributes,parent,enveloping,ambiguous,span count
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2

text,normalized_text,lemma,root,root_tokens,ending,clitic,form,partofspeech
mail,mail,mai,mai,['mai'],l,,sg ad,S
,mail,maa,maa,['maa'],il,,pl ad,S
üks,üks,üks,üks,['üks'],0,,sg n,P
,üks,üks,üks,['üks'],0,,sg n,N


## Evaluation

### Training data: train || eval data: dev

In [13]:
# Reorderer backed by the full train lexicon; no part-of-speech or form
# frequency files are supplied
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file=None,
    form_freq_csv_file=None,
)

# Evaluation corpus directory
input_dir = 'UD_converted'
assert os.path.isdir(input_dir)

# exclude_strs presumably filters out files whose names contain 'train' or
# 'test', which leaves the dev files for evaluation (confirm in eval_utils)
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'test'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 9 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           3744
   -- correct analysis first:      1762 / 3744 (47.06%)
   -- correct analysis not first:  1781 / 3744 (47.57%)
   -- correct analysis not found:  233 / 3744 (6.22%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           3744
   -- correct analysis first:      2772 / 3744 (74.04%)
   -- correct analysis not first:  771 / 3744 (20.59%)
   -- correct analysis not found:  233 / 3744 (6.22%)


 Summary: correct analysis first:  1762 / 3744 (47.06%) ==> 2772 / 3744 (74.04%)


In [14]:
# Reorderer backed by the train lexicon written with frequency threshold 5;
# no part-of-speech or form frequency files are supplied
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_cut_5.csv',
    postag_freq_csv_file=None,
    form_freq_csv_file=None,
)

# Evaluation corpus directory
input_dir = 'UD_converted'
assert os.path.isdir(input_dir)

# exclude_strs presumably filters out files whose names contain 'train' or
# 'test', which leaves the dev files for evaluation (confirm in eval_utils)
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'test'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 9 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           3744
   -- correct analysis first:      1762 / 3744 (47.06%)
   -- correct analysis not first:  1781 / 3744 (47.57%)
   -- correct analysis not found:  233 / 3744 (6.22%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           3744
   -- correct analysis first:      2715 / 3744 (72.52%)
   -- correct analysis not first:  828 / 3744 (22.12%)
   -- correct analysis not found:  233 / 3744 (6.22%)


 Summary: correct analysis first:  1762 / 3744 (47.06%) ==> 2715 / 3744 (72.52%)


### Training data: train || eval data: test

In [16]:
# Reorderer backed by the full train lexicon; no part-of-speech or form
# frequency files are supplied
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file=None,
    form_freq_csv_file=None,
)

# Evaluation corpus directory
input_dir = 'UD_converted'
assert os.path.isdir(input_dir)

# exclude_strs presumably filters out files whose names contain 'train' or
# 'dev', which leaves the test files for evaluation (confirm in eval_utils)
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4170
   -- correct analysis first:      2131 / 4170 (51.10%)
   -- correct analysis not first:  1887 / 4170 (45.25%)
   -- correct analysis not found:  204 / 4170 (4.89%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4170
   -- correct analysis first:      3069 / 4170 (73.60%)
   -- correct analysis not first:  949 / 4170 (22.76%)
   -- correct analysis not found:  204 / 4170 (4.89%)


 Summary: correct analysis first:  2131 / 4170 (51.10%) ==> 3069 / 4170 (73.60%)


In [17]:
# Reorderer backed by the train lexicon written with frequency threshold 5;
# no part-of-speech or form frequency files are supplied
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_cut_5.csv',
    postag_freq_csv_file=None,
    form_freq_csv_file=None,
)

# Evaluation corpus directory
input_dir = 'UD_converted'
assert os.path.isdir(input_dir)

# exclude_strs presumably filters out files whose names contain 'train' or
# 'dev', which leaves the test files for evaluation (confirm in eval_utils)
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4170
   -- correct analysis first:      2131 / 4170 (51.10%)
   -- correct analysis not first:  1887 / 4170 (45.25%)
   -- correct analysis not found:  204 / 4170 (4.89%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4170
   -- correct analysis first:      3040 / 4170 (72.90%)
   -- correct analysis not first:  978 / 4170 (23.45%)
   -- correct analysis not found:  204 / 4170 (4.89%)


 Summary: correct analysis first:  2131 / 4170 (51.10%) ==> 3040 / 4170 (72.90%)


### Training data: train and dev || eval data: test

In [18]:
# Reorderer backed by the full train+dev lexicon; no part-of-speech or form
# frequency files are supplied
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_and_dev_sorted_analyses_full.csv',
    postag_freq_csv_file=None,
    form_freq_csv_file=None,
)

# Evaluation corpus directory
input_dir = 'UD_converted'
assert os.path.isdir(input_dir)

# exclude_strs presumably filters out files whose names contain 'train' or
# 'dev', which leaves the test files for evaluation (confirm in eval_utils)
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4170
   -- correct analysis first:      2131 / 4170 (51.10%)
   -- correct analysis not first:  1887 / 4170 (45.25%)
   -- correct analysis not found:  204 / 4170 (4.89%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4170
   -- correct analysis first:      3075 / 4170 (73.74%)
   -- correct analysis not first:  943 / 4170 (22.61%)
   -- correct analysis not found:  204 / 4170 (4.89%)


 Summary: correct analysis first:  2131 / 4170 (51.10%) ==> 3075 / 4170 (73.74%)


In [19]:
# Reorderer backed by the train+dev lexicon written with frequency threshold 5;
# no part-of-speech or form frequency files are supplied
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_and_dev_sorted_analyses_cut_5.csv',
    postag_freq_csv_file=None,
    form_freq_csv_file=None,
)

# Evaluation corpus directory
input_dir = 'UD_converted'
assert os.path.isdir(input_dir)

# exclude_strs presumably filters out files whose names contain 'train' or
# 'dev', which leaves the test files for evaluation (confirm in eval_utils)
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4170
   -- correct analysis first:      2131 / 4170 (51.10%)
   -- correct analysis not first:  1887 / 4170 (45.25%)
   -- correct analysis not found:  204 / 4170 (4.89%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4170
   -- correct analysis first:      3044 / 4170 (73.00%)
   -- correct analysis not first:  974 / 4170 (23.36%)
   -- correct analysis not found:  204 / 4170 (4.89%)


 Summary: correct analysis first:  2131 / 4170 (51.10%) ==> 3044 / 4170 (73.00%)


## Summary


Share of ambiguous words whose correct analysis is ranked first: default ordering --> after reordering (change in percentage points in parentheses).

           lexicon file:                               eval: dev              eval: test

    et_edt-ud-train_sorted_analyses_full.csv       47.06% --> 74.04%       51.10% --> 73.60%
                                                   (+26.98)                (+22.50)

    et_edt-ud-train_sorted_analyses_cut_5.csv      47.06% --> 72.52%       51.10% --> 72.90%
                                                   (+25.46)                (+21.80)

    et_edt-ud-train_and_dev_sorted_analyses_full.csv    --------           51.10% --> 73.74%
                                                                           (+22.64)

    et_edt-ud-train_and_dev_sorted_analyses_cut_5.csv   --------           51.10% --> 73.00%
                                                                           (+21.90)