## Experiments on using morph_analysis category frequency lexicons for reordering

In [1]:
from collections import defaultdict
import os, os.path
from estnltk.converters import json_to_text
from estnltk.taggers import MorphAnalysisReorderer

from eval_utils import GoldStandard
from eval_utils import add_normalized_form_to_words
from eval_utils import write_out_freq_sorted_categories
from eval_utils import collect_category_stats
from eval_utils import evaluate_reorderer

In [2]:
# Directory with the corpus files that carry gold standard annotations
input_dir = 'UD_converted'
# Fail early and report which path was looked up (and from where) if the
# corpus directory is missing, instead of raising a bare AssertionError.
assert os.path.isdir( input_dir ), \
    'Missing input directory {!r} (current working directory: {!r})'.format( input_dir, os.getcwd() )

### Create lexicons based on train data

In [3]:
# Load gold standard texts and add pre-annotations.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# keep the processing order (and the log printed below) reproducible.
loaded_texts = []
for fname in sorted( os.listdir( input_dir ) ):
    # Lexicons are created from train data: skip the dev and test files
    if 'dev' in fname or 'test' in fname:
        continue
    # Only JSON files are loaded
    if not fname.endswith('.json'):
        continue
    # Load Text with gold standard annotations
    text = json_to_text(file=os.path.join(input_dir, fname) )
    # Add 'normalized_form' to the words layer when the file does not have it
    if 'normalized_form' not in text.words.attributes:
        add_normalized_form_to_words( text.words )
    assert 'normalized_form' in text.words.attributes
    # Add Vabamorf's default morph analysis
    text.tag_layer(['morph_analysis'])
    loaded_texts.append( text )
    print(' Loaded and pre-annotated ', fname)

 Loaded and pre-annotated  et_edt-ud-train_000.json
 Loaded and pre-annotated  et_edt-ud-train_001.json
 Loaded and pre-annotated  et_edt-ud-train_002.json
 Loaded and pre-annotated  et_edt-ud-train_003.json
 Loaded and pre-annotated  et_edt-ud-train_004.json
 Loaded and pre-annotated  et_edt-ud-train_005.json
 Loaded and pre-annotated  et_edt-ud-train_006.json
 Loaded and pre-annotated  et_edt-ud-train_007.json
 Loaded and pre-annotated  et_edt-ud-train_008.json
 Loaded and pre-annotated  et_edt-ud-train_009.json
 Loaded and pre-annotated  et_edt-ud-train_010.json
 Loaded and pre-annotated  et_edt-ud-train_011.json
 Loaded and pre-annotated  et_edt-ud-train_012.json
 Loaded and pre-annotated  et_edt-ud-train_013.json
 Loaded and pre-annotated  et_edt-ud-train_014.json
 Loaded and pre-annotated  et_edt-ud-train_015.json
 Loaded and pre-annotated  et_edt-ud-train_016.json
 Loaded and pre-annotated  et_edt-ud-train_017.json
 Loaded and pre-annotated  et_edt-ud-train_018.json
 Loaded and 

#### Collect category frequencies from all words

In [4]:
# Collect part-of-speech and form statistics from all words of the loaded
# training texts (collect_only_from_ambiguous=False), then pass each result to
# write_out_freq_sorted_categories together with its target CSV file name.
pos_freq, form_freq = collect_category_stats(
    loaded_texts,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    collect_only_from_ambiguous=False,
)
write_out_freq_sorted_categories( 'et_edt-ud-train_cat_postag_freq_all.csv', pos_freq, 'partofspeech' )
write_out_freq_sorted_categories( 'et_edt-ud-train_cat_form_freq_all.csv', form_freq, 'form' )

 Punctuation was excluded.
 Processed documents:                                 24
 Ambiguous words from total words:                    28766 / 288329 (9.98%)
 Words successfully matched to gold morph:            260157 / 288329 (90.23%)


#### Collect category frequencies only from ambiguous words

In [5]:
# Same collection as for all words, but restricted to ambiguous words
# (collect_only_from_ambiguous=True); results go to the '*_amb.csv' files.
pos_freq_a, form_freq_a = collect_category_stats(
    loaded_texts,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    collect_only_from_ambiguous=True,
)
write_out_freq_sorted_categories( 'et_edt-ud-train_cat_postag_freq_amb.csv', pos_freq_a, 'partofspeech' )
write_out_freq_sorted_categories( 'et_edt-ud-train_cat_form_freq_amb.csv', form_freq_a, 'form' )

 Stats collected only from ambiguous words.
 Punctuation was excluded.
 Processed documents:                                 24
 Ambiguous words from total words:                    28766 / 28766 (100.00%)
 Words successfully matched to gold morph:            26929 / 28766 (93.61%)


### A small test

In [6]:
# Example: a single word tagged with the default morph_analysis layer
from estnltk import Text
t = Text( 'viidanud' ).tag_layer( ['morph_analysis'] )

# Display only the spans that have more than one annotation (ambiguities)
t.morph_analysis[ lambda span: len( span.annotations ) > 1 ]

layer name,attributes,parent,enveloping,ambiguous,span count
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,1

text,normalized_text,lemma,root,root_tokens,ending,clitic,form,partofspeech
viidanud,viidanud,viidanud,viida=nud,['viidanud'],0,,,A
,viidanud,viidanud,viida=nud,['viidanud'],0,,sg n,A
,viidanud,viidanud,viida=nud,['viidanud'],d,,pl n,A
,viidanud,viitama,viita,['viita'],nud,,nud,V


In [7]:
# Try reordering: only the POS-tag frequency lexicon collected from all words
# is given; the reorderings and form frequency lexicons are passed as None.
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file=None,
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_all.csv',
    form_freq_csv_file=None,
)
morph_reorderer.retag( t )

# Display the ambiguous spans again to inspect the order after retagging
t.morph_analysis[ lambda span: len( span.annotations ) > 1 ]

layer name,attributes,parent,enveloping,ambiguous,span count
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,1

text,normalized_text,lemma,root,root_tokens,ending,clitic,form,partofspeech
viidanud,viidanud,viitama,viita,['viita'],nud,,nud,V
,viidanud,viidanud,viida=nud,['viidanud'],0,,,A
,viidanud,viidanud,viida=nud,['viidanud'],0,,sg n,A
,viidanud,viidanud,viida=nud,['viidanud'],d,,pl n,A


## Evaluation

### Training data: train || eval data: test

In [8]:
# Configuration: '*_sorted_analyses_full.csv' reorderings lexicon
#                + POS-tag frequencies from ambiguous words ('*_postag_freq_amb.csv');
#                no form frequency lexicon.
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_amb.csv',
    form_freq_csv_file=None,
)

input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# Evaluate on test data: 'train' and 'dev' are passed as exclude_strs
# (presumably matched against file names -- confirm in eval_utils).
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4170
   -- correct analysis first:      2131 / 4170 (51.10%)
   -- correct analysis not first:  1887 / 4170 (45.25%)
   -- correct analysis not found:  204 / 4170 (4.89%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4170
   -- correct analysis first:      3082 / 4170 (73.91%)
   -- correct analysis not first:  936 / 4170 (22.45%)
   -- correct analysis not found:  204 / 4170 (4.89%)


 Summary: correct analysis first:  2131 / 4170 (51.10%) ==> 3082 / 4170 (73.91%)


In [9]:
# Configuration: '*_sorted_analyses_full.csv' reorderings lexicon
#                + POS-tag frequencies from all words ('*_postag_freq_all.csv');
#                no form frequency lexicon.
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_all.csv',
    form_freq_csv_file=None,
)

input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# Evaluate on test data: 'train' and 'dev' are passed as exclude_strs
# (presumably matched against file names -- confirm in eval_utils).
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4170
   -- correct analysis first:      2131 / 4170 (51.10%)
   -- correct analysis not first:  1887 / 4170 (45.25%)
   -- correct analysis not found:  204 / 4170 (4.89%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4170
   -- correct analysis first:      3082 / 4170 (73.91%)
   -- correct analysis not first:  936 / 4170 (22.45%)
   -- correct analysis not found:  204 / 4170 (4.89%)


 Summary: correct analysis first:  2131 / 4170 (51.10%) ==> 3082 / 4170 (73.91%)


In [10]:
# Configuration: '*_sorted_analyses_cut_5.csv' reorderings lexicon
#                + POS-tag frequencies from ambiguous words ('*_postag_freq_amb.csv');
#                no form frequency lexicon.
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_cut_5.csv',
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_amb.csv',
    form_freq_csv_file=None,
)

input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# Evaluate on test data: 'train' and 'dev' are passed as exclude_strs
# (presumably matched against file names -- confirm in eval_utils).
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4170
   -- correct analysis first:      2131 / 4170 (51.10%)
   -- correct analysis not first:  1887 / 4170 (45.25%)
   -- correct analysis not found:  204 / 4170 (4.89%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4170
   -- correct analysis first:      3073 / 4170 (73.69%)
   -- correct analysis not first:  945 / 4170 (22.66%)
   -- correct analysis not found:  204 / 4170 (4.89%)


 Summary: correct analysis first:  2131 / 4170 (51.10%) ==> 3073 / 4170 (73.69%)


In [11]:
# Configuration: '*_sorted_analyses_cut_5.csv' reorderings lexicon
#                + POS-tag frequencies from all words ('*_postag_freq_all.csv');
#                no form frequency lexicon.
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_cut_5.csv',
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_all.csv',
    form_freq_csv_file=None,
)

input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# Evaluate on test data: 'train' and 'dev' are passed as exclude_strs
# (presumably matched against file names -- confirm in eval_utils).
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4170
   -- correct analysis first:      2131 / 4170 (51.10%)
   -- correct analysis not first:  1887 / 4170 (45.25%)
   -- correct analysis not found:  204 / 4170 (4.89%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4170
   -- correct analysis first:      3073 / 4170 (73.69%)
   -- correct analysis not first:  945 / 4170 (22.66%)
   -- correct analysis not found:  204 / 4170 (4.89%)


 Summary: correct analysis first:  2131 / 4170 (51.10%) ==> 3073 / 4170 (73.69%)


### Results I : used only word-to-reorderings and postag_freq lexicons

                 used lexicons:                                    eval: test data
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.60%)
      (baseline)
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.91%)
     'et_edt-ud-train_cat_postag_freq_amb.csv'
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.91%) (+)
     'et_edt-ud-train_cat_postag_freq_all.csv'
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_cut_5.csv'                 (51.10%) ==> (73.69%)
     'et_edt-ud-train_cat_postag_freq_amb.csv'
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_cut_5.csv'                 (51.10%) ==> (73.69%)
     'et_edt-ud-train_cat_postag_freq_all.csv'
     ------------------------------------------------------------------------------------
      

In [12]:
# Configuration: '*_sorted_analyses_full.csv' reorderings lexicon
#                + POS-tag frequencies from ambiguous words ('*_postag_freq_amb.csv')
#                + form frequencies from ambiguous words ('*_form_freq_amb.csv').
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_amb.csv',
    form_freq_csv_file='et_edt-ud-train_cat_form_freq_amb.csv',
)

input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# Evaluate on test data: 'train' and 'dev' are passed as exclude_strs
# (presumably matched against file names -- confirm in eval_utils).
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4170
   -- correct analysis first:      2131 / 4170 (51.10%)
   -- correct analysis not first:  1887 / 4170 (45.25%)
   -- correct analysis not found:  204 / 4170 (4.89%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4170
   -- correct analysis first:      3075 / 4170 (73.74%)
   -- correct analysis not first:  943 / 4170 (22.61%)
   -- correct analysis not found:  204 / 4170 (4.89%)


 Summary: correct analysis first:  2131 / 4170 (51.10%) ==> 3075 / 4170 (73.74%)


In [13]:
# Configuration: '*_sorted_analyses_full.csv' reorderings lexicon
#                + POS-tag frequencies from all words ('*_postag_freq_all.csv')
#                + form frequencies from all words ('*_form_freq_all.csv').
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_all.csv',
    form_freq_csv_file='et_edt-ud-train_cat_form_freq_all.csv',
)

input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# Evaluate on test data: 'train' and 'dev' are passed as exclude_strs
# (presumably matched against file names -- confirm in eval_utils).
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4170
   -- correct analysis first:      2131 / 4170 (51.10%)
   -- correct analysis not first:  1887 / 4170 (45.25%)
   -- correct analysis not found:  204 / 4170 (4.89%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4170
   -- correct analysis first:      3019 / 4170 (72.40%)
   -- correct analysis not first:  999 / 4170 (23.96%)
   -- correct analysis not found:  204 / 4170 (4.89%)


 Summary: correct analysis first:  2131 / 4170 (51.10%) ==> 3019 / 4170 (72.40%)


In [14]:
# Configuration: '*_sorted_analyses_full.csv' reorderings lexicon
#                + POS-tag frequencies from ambiguous words ('*_postag_freq_amb.csv')
#                + form frequencies from all words ('*_form_freq_all.csv').
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_amb.csv',
    form_freq_csv_file='et_edt-ud-train_cat_form_freq_all.csv',
)

input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# Evaluate on test data: 'train' and 'dev' are passed as exclude_strs
# (presumably matched against file names -- confirm in eval_utils).
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4170
   -- correct analysis first:      2131 / 4170 (51.10%)
   -- correct analysis not first:  1887 / 4170 (45.25%)
   -- correct analysis not found:  204 / 4170 (4.89%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4170
   -- correct analysis first:      3019 / 4170 (72.40%)
   -- correct analysis not first:  999 / 4170 (23.96%)
   -- correct analysis not found:  204 / 4170 (4.89%)


 Summary: correct analysis first:  2131 / 4170 (51.10%) ==> 3019 / 4170 (72.40%)


In [15]:
# Configuration: '*_sorted_analyses_full.csv' reorderings lexicon
#                + POS-tag frequencies from all words ('*_postag_freq_all.csv')
#                + form frequencies from ambiguous words ('*_form_freq_amb.csv').
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_all.csv',
    form_freq_csv_file='et_edt-ud-train_cat_form_freq_amb.csv',
)

input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# Evaluate on test data: 'train' and 'dev' are passed as exclude_strs
# (presumably matched against file names -- confirm in eval_utils).
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4170
   -- correct analysis first:      2131 / 4170 (51.10%)
   -- correct analysis not first:  1887 / 4170 (45.25%)
   -- correct analysis not found:  204 / 4170 (4.89%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4170
   -- correct analysis first:      3075 / 4170 (73.74%)
   -- correct analysis not first:  943 / 4170 (22.61%)
   -- correct analysis not found:  204 / 4170 (4.89%)


 Summary: correct analysis first:  2131 / 4170 (51.10%) ==> 3075 / 4170 (73.74%)


In [16]:
# Configuration: '*_sorted_analyses_full.csv' reorderings lexicon
#                + form frequencies from all words ('*_form_freq_all.csv');
#                no POS-tag frequency lexicon.
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file=None,
    form_freq_csv_file='et_edt-ud-train_cat_form_freq_all.csv',
)

input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# Evaluate on test data: 'train' and 'dev' are passed as exclude_strs
# (presumably matched against file names -- confirm in eval_utils).
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4170
   -- correct analysis first:      2131 / 4170 (51.10%)
   -- correct analysis not first:  1887 / 4170 (45.25%)
   -- correct analysis not found:  204 / 4170 (4.89%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4170
   -- correct analysis first:      3018 / 4170 (72.37%)
   -- correct analysis not first:  1000 / 4170 (23.98%)
   -- correct analysis not found:  204 / 4170 (4.89%)


 Summary: correct analysis first:  2131 / 4170 (51.10%) ==> 3018 / 4170 (72.37%)


In [17]:
# Configuration: '*_sorted_analyses_full.csv' reorderings lexicon
#                + form frequencies from ambiguous words ('*_form_freq_amb.csv');
#                no POS-tag frequency lexicon.
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file=None,
    form_freq_csv_file='et_edt-ud-train_cat_form_freq_amb.csv',
)

input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# Evaluate on test data: 'train' and 'dev' are passed as exclude_strs
# (presumably matched against file names -- confirm in eval_utils).
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4170
   -- correct analysis first:      2131 / 4170 (51.10%)
   -- correct analysis not first:  1887 / 4170 (45.25%)
   -- correct analysis not found:  204 / 4170 (4.89%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4170
   -- correct analysis first:      3017 / 4170 (72.35%)
   -- correct analysis not first:  1001 / 4170 (24.00%)
   -- correct analysis not found:  204 / 4170 (4.89%)


 Summary: correct analysis first:  2131 / 4170 (51.10%) ==> 3017 / 4170 (72.35%)


### Results II : used word-to-reorderings, postag_freq and form_freq lexicons 

                 used lexicons:                                    eval: test data
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.60%)
     None
     None
      (baseline 1)
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.91%)
     'et_edt-ud-train_cat_postag_freq_all.csv'
     None
      (baseline 2)
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (72.37%) (--)
     None
     'et_edt-ud-train_cat_form_freq_all.csv'
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (72.35%) (--)
     None
     'et_edt-ud-train_cat_form_freq_amb.csv' 
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.74%) (-)
     'et_edt-ud-train_cat_postag_freq_all.csv'
     'et_edt-ud-train_cat_form_freq_amb.csv'      
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (72.40%) (--)
     'et_edt-ud-train_cat_postag_freq_amb.csv'
     'et_edt-ud-train_cat_form_freq_all.csv'      
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (72.40%) (--)
     'et_edt-ud-train_cat_postag_freq_all.csv'
     'et_edt-ud-train_cat_form_freq_all.csv'  
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.74%) (-)
     'et_edt-ud-train_cat_postag_freq_amb.csv'
     'et_edt-ud-train_cat_form_freq_amb.csv'  
     ------------------------------------------------------------------------------------

---

## Summary



                 used lexicons:                                    eval: test data

     ---------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.60%)
     None
     None
      (baseline)
     ---------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.91%) (+)
     'et_edt-ud-train_cat_postag_freq_all.csv'
     None
      (best model 1)
     ---------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.74%) (-)
     'et_edt-ud-train_cat_postag_freq_all.csv'
     'et_edt-ud-train_cat_form_freq_amb.csv'
      (best model 2)
     ---------------------------------------------------------------------------------------
    

## Inspecting morph_analysis_reorderer's diffs

In [3]:
# Names imported by this cell, gathered in one place.
# NOTE(review): evaluate_reorderer is imported but not called in this cell.
import os, os.path
from eval_utils import diff_reorderer
from eval_utils import evaluate_reorderer, GoldStandard

# Reorderer 1: no category frequency lexicons.
# Reorderer 2: POS-tag frequencies collected from all words.
# reorderings_csv_file is not passed to either, so both use the class default.
morph_reorderer1 = MorphAnalysisReorderer(
    postag_freq_csv_file=None,
    form_freq_csv_file=None,
)
morph_reorderer2 = MorphAnalysisReorderer(
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_all.csv',
    form_freq_csv_file=None,
)

input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# Compare the two reorderers; 'train' and 'test' are passed as exclude_strs
# (presumably leaving the dev files -- confirm in eval_utils).
diff_reorderer(
    morph_reorderer1,
    morph_reorderer2,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'test'],
    debug_take_first=False,
    show_fnames=False,
    show_all_diffs=True,
)

Loading evaluation texts (UD_CORPUS)...
 Total 9 texts loaded for evaluation. 

Showing differences for all words (including reocurring ones).
lükatud
    [('lükatud', 'A', ''), ('lükatud', 'A', 'sg n'), ('lükatud', 'A', 'pl n'), ('lükkama', 'V', 'tud')]
    --> [('lükkama', 'V', 'tud'), ('lükatud', 'A', ''), ('lükatud', 'A', 'sg n'), ('lükatud', 'A', 'pl n')] (-)
lükatud
    [('lükatud', 'A', ''), ('lükatud', 'A', 'sg n'), ('lükatud', 'A', 'pl n'), ('lükkama', 'V', 'tud')]
    --> [('lükkama', 'V', 'tud'), ('lükatud', 'A', ''), ('lükatud', 'A', 'sg n'), ('lükatud', 'A', 'pl n')] (+)
vaevelnud
    [('vaevelnud', 'A', ''), ('vaevelnud', 'A', 'sg n'), ('vaevelnud', 'A', 'pl n'), ('vaevlema', 'V', 'nud')]
    --> [('vaevlema', 'V', 'nud'), ('vaevelnud', 'A', ''), ('vaevelnud', 'A', 'sg n'), ('vaevelnud', 'A', 'pl n')] (+)
torganud
    [('torganud', 'A', ''), ('torganud', 'A', 'sg n'), ('torganud', 'A', 'pl n'), ('torkama', 'V', 'nud')]
    --> [('torkama', 'V', 'nud'), ('torganud', 'A', '