## Experiments on using morph_analysis category frequency lexicons for reordering

In [1]:
import os, os.path
from collections import defaultdict

from estnltk import Text
from estnltk.converters import json_to_text
from estnltk.taggers import MorphAnalysisReorderer
from estnltk.taggers import VabamorfTagger

from eval_utils import GoldStandard
from eval_utils import add_normalized_form_to_words
from eval_utils import write_out_freq_sorted_categories
from eval_utils import collect_category_stats
from eval_utils import evaluate_reorderer
from eval_utils import diff_reorderer

In [2]:
# Directory of the corpus with gold standard annotations
# (its json files are loaded in the cells below)
input_dir = 'UD_converted'
# Fail early, and name the missing path, if the corpus has not been prepared
assert os.path.isdir( input_dir ), \
    'Missing gold standard corpus directory: {!r}'.format( input_dir )

### Create lexicons based on train data

In [3]:
# Load gold standard texts and add pre-annotations.
# Only json files whose names contain neither 'dev' nor 'test' are used.
loaded_texts = []
# os.listdir() returns names in arbitrary order; sort them so that the
# loading order (and the progress log below) is the same on every run
for fname in sorted( os.listdir( input_dir ) ):
    if not fname.endswith('.json'):
        continue
    if 'dev' in fname or 'test' in fname:
        continue
    # Load Text with gold standard annotations
    text = json_to_text(file=os.path.join(input_dir, fname) )
    # Make sure the words layer carries the 'normalized_form' attribute
    if 'normalized_form' not in text.words.attributes:
        add_normalized_form_to_words( text.words )
    assert 'normalized_form' in text.words.attributes
    # Add Vabamorf's default morph analysis
    text.tag_layer('morph_analysis')
    loaded_texts.append( text )
    print(' Loaded and pre-annotated ', fname)

 Loaded and pre-annotated  et_edt-ud-train_015.json
 Loaded and pre-annotated  et_edt-ud-train_016.json
 Loaded and pre-annotated  et_edt-ud-train_017.json
 Loaded and pre-annotated  et_edt-ud-train_018.json
 Loaded and pre-annotated  et_edt-ud-train_019.json
 Loaded and pre-annotated  et_edt-ud-train_020.json
 Loaded and pre-annotated  et_edt-ud-train_021.json
 Loaded and pre-annotated  et_edt-ud-train_022.json
 Loaded and pre-annotated  et_edt-ud-train_023.json
 Loaded and pre-annotated  et_edt-ud-train_024.json
 Loaded and pre-annotated  et_edt-ud-train_025.json
 Loaded and pre-annotated  et_edt-ud-train_026.json
 Loaded and pre-annotated  et_edt-ud-train_027.json
 Loaded and pre-annotated  et_edt-ud-train_028.json
 Loaded and pre-annotated  et_edt-ud-train_029.json
 Loaded and pre-annotated  et_edt-ud-train_030.json
 Loaded and pre-annotated  et_edt-ud-train_031.json
 Loaded and pre-annotated  et_edt-ud-train_032.json
 Loaded and pre-annotated  et_edt-ud-train_033.json
 Loaded and 

#### Collect category frequencies from all words

In [4]:
# Category frequencies over all words (collect_only_from_ambiguous=False)
pos_freq, form_freq = collect_category_stats(
    loaded_texts,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    collect_only_from_ambiguous=False,
)
# Write both frequency tables to csv files (used below as reorderer lexicons)
write_out_freq_sorted_categories(
    'et_edt-ud-train_cat_postag_freq_all.csv', pos_freq, 'partofspeech' )
write_out_freq_sorted_categories(
    'et_edt-ud-train_cat_form_freq_all.csv', form_freq, 'form' )

 Punctuation was excluded.
 Processed documents:                                 24
 Ambiguous words from total words:                    28613 / 288329 (9.92%)
 Words successfully matched to gold morph:            260156 / 288329 (90.23%)


#### Collect category frequencies only from ambiguous words

In [5]:
# Category frequencies over ambiguous words only (collect_only_from_ambiguous=True)
pos_freq_a, form_freq_a = collect_category_stats(
    loaded_texts,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    collect_only_from_ambiguous=True,
)
# Write both frequency tables to csv files (used below as reorderer lexicons)
write_out_freq_sorted_categories(
    'et_edt-ud-train_cat_postag_freq_amb.csv', pos_freq_a, 'partofspeech' )
write_out_freq_sorted_categories(
    'et_edt-ud-train_cat_form_freq_amb.csv', form_freq_a, 'form' )

 Stats collected only from ambiguous words.
 Punctuation was excluded.
 Processed documents:                                 24
 Ambiguous words from total words:                    28613 / 28613 (100.00%)
 Words successfully matched to gold morph:            26774 / 28613 (93.57%)


### A small test

In [7]:
# Example text: a single word that receives several morphological analyses
# (Text and VabamorfTagger are imported in the notebook's first cell)
t = Text('viidanud')
# Switch off applying reorderer by default
t.layer_resolver.update( VabamorfTagger(use_reorderer=False) )

# Add morph analysis without the reorderer
t.tag_layer('morph_analysis')

# Show only the words that have more than one analysis
t.morph_analysis[lambda span: len(span.annotations) > 1]

layer name,attributes,parent,enveloping,ambiguous,span count
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,1

text,normalized_text,lemma,root,root_tokens,ending,clitic,form,partofspeech
viidanud,viidanud,viidanud,viida=nud,['viidanud'],0,,,A
,viidanud,viidanud,viida=nud,['viidanud'],0,,sg n,A
,viidanud,viidanud,viida=nud,['viidanud'],d,,pl n,A
,viidanud,viitama,viita,['viita'],nud,,nud,V


In [8]:
# Try reordering: build a reorderer that uses only the postag frequency lexicon
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file=None,
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_all.csv',
    form_freq_csv_file=None,
)
# Apply it to the example text that already has a morph_analysis layer
morph_reorderer.retag( t )

# Show the ambiguous words again to see the new ordering of analyses
t.morph_analysis[lambda span: len(span.annotations) > 1]

layer name,attributes,parent,enveloping,ambiguous,span count
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,1

text,normalized_text,lemma,root,root_tokens,ending,clitic,form,partofspeech
viidanud,viidanud,viitama,viita,['viita'],nud,,nud,V
,viidanud,viidanud,viida=nud,['viidanud'],0,,,A
,viidanud,viidanud,viida=nud,['viidanud'],0,,sg n,A
,viidanud,viidanud,viida=nud,['viidanud'],d,,pl n,A


## Evaluation

### Training data: train || eval data: test

In [9]:
# Lexicons: full word-to-reorderings + postag frequencies from ambiguous words
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_amb.csv',
    form_freq_csv_file=None,
)

# Corpus with gold standard annotations
input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# 'train' and 'dev' are passed as exclude_strs; per the section heading,
# the evaluation data is the test set
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4139
   -- correct analysis first:      2099 / 4139 (50.71%)
   -- correct analysis not first:  1888 / 4139 (45.61%)
   -- correct analysis not found:  204 / 4139 (4.93%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4139
   -- correct analysis first:      3050 / 4139 (73.69%)
   -- correct analysis not first:  937 / 4139 (22.64%)
   -- correct analysis not found:  204 / 4139 (4.93%)


 Summary: correct analysis first:  2099 / 4139 (50.71%) ==> 3050 / 4139 (73.69%)


In [10]:
# Lexicons: full word-to-reorderings + postag frequencies from all words
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_all.csv',
    form_freq_csv_file=None,
)

# Corpus with gold standard annotations
input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# 'train' and 'dev' are passed as exclude_strs; per the section heading,
# the evaluation data is the test set
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4139
   -- correct analysis first:      2099 / 4139 (50.71%)
   -- correct analysis not first:  1888 / 4139 (45.61%)
   -- correct analysis not found:  204 / 4139 (4.93%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4139
   -- correct analysis first:      3050 / 4139 (73.69%)
   -- correct analysis not first:  937 / 4139 (22.64%)
   -- correct analysis not found:  204 / 4139 (4.93%)


 Summary: correct analysis first:  2099 / 4139 (50.71%) ==> 3050 / 4139 (73.69%)


In [11]:
# Lexicons: cut_5 word-to-reorderings + postag frequencies from ambiguous words
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_cut_5.csv',
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_amb.csv',
    form_freq_csv_file=None,
)

# Corpus with gold standard annotations
input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# 'train' and 'dev' are passed as exclude_strs; per the section heading,
# the evaluation data is the test set
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4139
   -- correct analysis first:      2099 / 4139 (50.71%)
   -- correct analysis not first:  1888 / 4139 (45.61%)
   -- correct analysis not found:  204 / 4139 (4.93%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4139
   -- correct analysis first:      3041 / 4139 (73.47%)
   -- correct analysis not first:  946 / 4139 (22.86%)
   -- correct analysis not found:  204 / 4139 (4.93%)


 Summary: correct analysis first:  2099 / 4139 (50.71%) ==> 3041 / 4139 (73.47%)


In [12]:
# Lexicons: cut_5 word-to-reorderings + postag frequencies from all words
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_cut_5.csv',
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_all.csv',
    form_freq_csv_file=None,
)

# Corpus with gold standard annotations
input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# 'train' and 'dev' are passed as exclude_strs; per the section heading,
# the evaluation data is the test set
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4139
   -- correct analysis first:      2099 / 4139 (50.71%)
   -- correct analysis not first:  1888 / 4139 (45.61%)
   -- correct analysis not found:  204 / 4139 (4.93%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4139
   -- correct analysis first:      3041 / 4139 (73.47%)
   -- correct analysis not first:  946 / 4139 (22.86%)
   -- correct analysis not found:  204 / 4139 (4.93%)


 Summary: correct analysis first:  2099 / 4139 (50.71%) ==> 3041 / 4139 (73.47%)


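### Results I : used only word-to-reorderings and postag_freq lexicons

**Note:** the percentages in the table below do not match the outputs of cells In [9] - In [12] above (those report 50.71% ==> 73.69% with the full reorderings lexicon and 50.71% ==> 73.47% with the cut_5 lexicon). TODO: re-check the table against a fresh run.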

                 used lexicons:                                    eval: test data
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.60%)
      (baseline)
      ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.91%) 
     'et_edt-ud-train_cat_postag_freq_amb.csv'
      ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.91%) (+)
     'et_edt-ud-train_cat_postag_freq_all.csv'
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_cut_5.csv'                 (51.10%) ==> (73.69%)
     'et_edt-ud-train_cat_postag_freq_amb.csv'
      ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_cut_5.csv'                 (51.10%) ==> (73.69%)
     'et_edt-ud-train_cat_postag_freq_all.csv'
      ------------------------------------------------------------------------------------
      

In [13]:
# Lexicons: full word-to-reorderings + postag and form frequencies,
# both collected from ambiguous words
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_amb.csv',
    form_freq_csv_file='et_edt-ud-train_cat_form_freq_amb.csv',
)

# Corpus with gold standard annotations
input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# 'train' and 'dev' are passed as exclude_strs; per the section heading,
# the evaluation data is the test set
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4139
   -- correct analysis first:      2099 / 4139 (50.71%)
   -- correct analysis not first:  1888 / 4139 (45.61%)
   -- correct analysis not found:  204 / 4139 (4.93%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4139
   -- correct analysis first:      3042 / 4139 (73.50%)
   -- correct analysis not first:  945 / 4139 (22.83%)
   -- correct analysis not found:  204 / 4139 (4.93%)


 Summary: correct analysis first:  2099 / 4139 (50.71%) ==> 3042 / 4139 (73.50%)


In [14]:
# Lexicons: full word-to-reorderings + postag and form frequencies,
# both collected from all words
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_all.csv',
    form_freq_csv_file='et_edt-ud-train_cat_form_freq_all.csv',
)

# Corpus with gold standard annotations
input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# 'train' and 'dev' are passed as exclude_strs; per the section heading,
# the evaluation data is the test set
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4139
   -- correct analysis first:      2099 / 4139 (50.71%)
   -- correct analysis not first:  1888 / 4139 (45.61%)
   -- correct analysis not found:  204 / 4139 (4.93%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4139
   -- correct analysis first:      2986 / 4139 (72.14%)
   -- correct analysis not first:  1001 / 4139 (24.18%)
   -- correct analysis not found:  204 / 4139 (4.93%)


 Summary: correct analysis first:  2099 / 4139 (50.71%) ==> 2986 / 4139 (72.14%)


In [15]:
# Lexicons: full word-to-reorderings + postag frequencies from ambiguous words
# + form frequencies from all words
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_amb.csv',
    form_freq_csv_file='et_edt-ud-train_cat_form_freq_all.csv',
)

# Corpus with gold standard annotations
input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# 'train' and 'dev' are passed as exclude_strs; per the section heading,
# the evaluation data is the test set
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4139
   -- correct analysis first:      2099 / 4139 (50.71%)
   -- correct analysis not first:  1888 / 4139 (45.61%)
   -- correct analysis not found:  204 / 4139 (4.93%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4139
   -- correct analysis first:      2986 / 4139 (72.14%)
   -- correct analysis not first:  1001 / 4139 (24.18%)
   -- correct analysis not found:  204 / 4139 (4.93%)


 Summary: correct analysis first:  2099 / 4139 (50.71%) ==> 2986 / 4139 (72.14%)


In [16]:
# Lexicons: full word-to-reorderings + postag frequencies from all words
# + form frequencies from ambiguous words
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_all.csv',
    form_freq_csv_file='et_edt-ud-train_cat_form_freq_amb.csv',
)

# Corpus with gold standard annotations
input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# 'train' and 'dev' are passed as exclude_strs; per the section heading,
# the evaluation data is the test set
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4139
   -- correct analysis first:      2099 / 4139 (50.71%)
   -- correct analysis not first:  1888 / 4139 (45.61%)
   -- correct analysis not found:  204 / 4139 (4.93%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4139
   -- correct analysis first:      3042 / 4139 (73.50%)
   -- correct analysis not first:  945 / 4139 (22.83%)
   -- correct analysis not found:  204 / 4139 (4.93%)


 Summary: correct analysis first:  2099 / 4139 (50.71%) ==> 3042 / 4139 (73.50%)


In [17]:
# Lexicons: full word-to-reorderings + form frequencies from all words
# (no postag frequency lexicon)
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file=None,
    form_freq_csv_file='et_edt-ud-train_cat_form_freq_all.csv',
)

# Corpus with gold standard annotations
input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# 'train' and 'dev' are passed as exclude_strs; per the section heading,
# the evaluation data is the test set
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4139
   -- correct analysis first:      2099 / 4139 (50.71%)
   -- correct analysis not first:  1888 / 4139 (45.61%)
   -- correct analysis not found:  204 / 4139 (4.93%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4139
   -- correct analysis first:      2985 / 4139 (72.12%)
   -- correct analysis not first:  1002 / 4139 (24.21%)
   -- correct analysis not found:  204 / 4139 (4.93%)


 Summary: correct analysis first:  2099 / 4139 (50.71%) ==> 2985 / 4139 (72.12%)


In [18]:
# Lexicons: full word-to-reorderings + form frequencies from ambiguous words
# (no postag frequency lexicon)
morph_reorderer = MorphAnalysisReorderer(
    reorderings_csv_file='et_edt-ud-train_sorted_analyses_full.csv',
    postag_freq_csv_file=None,
    form_freq_csv_file='et_edt-ud-train_cat_form_freq_amb.csv',
)

# Corpus with gold standard annotations
input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# 'train' and 'dev' are passed as exclude_strs; per the section heading,
# the evaluation data is the test set
evaluate_reorderer(
    morph_reorderer,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'dev'],
    show_fnames=False,
)

Loading evaluation texts (UD_CORPUS)...
 Total 6 texts loaded for evaluation. 

 Evaluation #1: Ambiguous analyses appear in their default ordering 

  Ambiguous words total:           4139
   -- correct analysis first:      2099 / 4139 (50.71%)
   -- correct analysis not first:  1888 / 4139 (45.61%)
   -- correct analysis not found:  204 / 4139 (4.93%)

 Evaluation #2: Ambiguous analyses have been reordered by the morph_reorderer

  Ambiguous words total:           4139
   -- correct analysis first:      2984 / 4139 (72.09%)
   -- correct analysis not first:  1003 / 4139 (24.23%)
   -- correct analysis not found:  204 / 4139 (4.93%)


 Summary: correct analysis first:  2099 / 4139 (50.71%) ==> 2984 / 4139 (72.09%)


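### Results II : used word-to-reorderings, postag_freq and form_freq lexicons 

**Note:** the percentages in the table below do not match the outputs of cells In [13] - In [18] above (those start from 50.71% and report 73.50% when a postag lexicon is combined with `form_freq_amb`, 72.14% when combined with `form_freq_all`, and 72.12% / 72.09% for `form_freq_all` / `form_freq_amb` alone). TODO: re-check the table against a fresh run.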

                 used lexicons:                                    eval: test data
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.60%)
     None
     None
      (baseline 1)
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.91%)
     'et_edt-ud-train_cat_postag_freq_all.csv'
     None
      (baseline 2)
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (72.37%) (--)
     None
     'et_edt-ud-train_cat_form_freq_all.csv'
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (72.35%) (--)
     None
     'et_edt-ud-train_cat_form_freq_amb.csv' 
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.74%) (-)
     'et_edt-ud-train_cat_postag_freq_all.csv'
     'et_edt-ud-train_cat_form_freq_amb.csv'      
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (72.40%) (--)
     'et_edt-ud-train_cat_postag_freq_amb.csv'
     'et_edt-ud-train_cat_form_freq_all.csv'      
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (72.40%) (--)
     'et_edt-ud-train_cat_postag_freq_all.csv'
     'et_edt-ud-train_cat_form_freq_all.csv'  
     ------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.74%) (-)
     'et_edt-ud-train_cat_postag_freq_amb.csv'
     'et_edt-ud-train_cat_form_freq_amb.csv'  
      ------------------------------------------------------------------------------------

---

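## Summary

**Note:** the figures below do not match the cell outputs in the Evaluation section (In [10] reports 50.71% ==> 73.69% for the "best model 1" configuration and In [16] reports 50.71% ==> 73.50% for "best model 2"). TODO: re-check against a fresh run.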



                 used lexicons:                                    eval: test data

     ---------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.60%)
     None
     None
      (baseline)
     ---------------------------------------------------------------------------------------
     'et_edt-ud-train_sorted_analyses_full.csv'                  (51.10%) ==> (73.91%) (+)
     'et_edt-ud-train_cat_postag_freq_all.csv'
     None
      (best model 1)
     ---------------------------------------------------------------------------------------
       'et_edt-ud-train_sorted_analyses_full.csv'                (51.10%) ==> (73.74%) (-)
       'et_edt-ud-train_cat_postag_freq_all.csv'
       'et_edt-ud-train_cat_form_freq_amb.csv'  
      (best model 2)
     ---------------------------------------------------------------------------------------
    

## Inspecting morph_analysis_reorderer's diffs

In [3]:
# All names used here (os, MorphAnalysisReorderer, GoldStandard, diff_reorderer)
# are imported in the notebook's first cell.

# Reorderer 1: no postag/form frequency lexicons
# (reorderings_csv_file is not passed, so the class default applies)
morph_reorderer1 = MorphAnalysisReorderer(
    postag_freq_csv_file=None,
    form_freq_csv_file=None,
)

# Reorderer 2: same, plus the postag frequency lexicon collected from all words
morph_reorderer2 = MorphAnalysisReorderer(
    postag_freq_csv_file='et_edt-ud-train_cat_postag_freq_all.csv',
    form_freq_csv_file=None,
)

# Corpus with gold standard annotations
input_dir = 'UD_converted'
assert os.path.isdir( input_dir )

# Show how the two reorderers differ; 'train' and 'test' are passed as
# exclude_strs (presumably leaving the dev files -- confirm in eval_utils)
diff_reorderer(
    morph_reorderer1,
    morph_reorderer2,
    input_dir,
    'ud_morph_reduced',
    gold_morph_type=GoldStandard.UD_CORPUS,
    exclude_strs=['train', 'test'],
    debug_take_first=False,
    show_fnames=False,
    show_all_diffs=True,
)

Loading evaluation texts (UD_CORPUS)...
 Total 9 texts loaded for evaluation. 

Showing differences for all words (including reocurring ones).
lükatud
    [('lükatud', 'A', ''), ('lükatud', 'A', 'sg n'), ('lükatud', 'A', 'pl n'), ('lükkama', 'V', 'tud')]
    --> [('lükkama', 'V', 'tud'), ('lükatud', 'A', ''), ('lükatud', 'A', 'sg n'), ('lükatud', 'A', 'pl n')] (-)
lükatud
    [('lükatud', 'A', ''), ('lükatud', 'A', 'sg n'), ('lükatud', 'A', 'pl n'), ('lükkama', 'V', 'tud')]
    --> [('lükkama', 'V', 'tud'), ('lükatud', 'A', ''), ('lükatud', 'A', 'sg n'), ('lükatud', 'A', 'pl n')] (+)
vaevelnud
    [('vaevelnud', 'A', ''), ('vaevelnud', 'A', 'sg n'), ('vaevelnud', 'A', 'pl n'), ('vaevlema', 'V', 'nud')]
    --> [('vaevlema', 'V', 'nud'), ('vaevelnud', 'A', ''), ('vaevelnud', 'A', 'sg n'), ('vaevelnud', 'A', 'pl n')] (+)
torganud
    [('torganud', 'A', ''), ('torganud', 'A', 'sg n'), ('torganud', 'A', 'pl n'), ('torkama', 'V', 'nud')]
    --> [('torkama', 'V', 'nud'), ('torganud', 'A', '