## Small test: how the new Vabamorf binary dictionaries perform compared to the old ones

In [1]:
from estnltk import Text
from estnltk.vabamorf.morf import Vabamorf as VabamorfInstance
from estnltk.taggers import VabamorfAnalyzer, VabamorfDisambiguator, VabamorfTagger

# Create morph analysers and taggers that use new dictionaries
def create_new_vm_tagger_components( lex_path, disamb_lex_path, output_layer='new_morph_analysis' ):
    """Create a Vabamorf analyser, disambiguator and tagger that use custom binary dictionaries.

    A single Vabamorf instance is built from the two dictionary files and is
    shared by all three components. All three components are configured with
    the same output layer name.

    For details on changing the binary dictionaries, see:
    https://github.com/estnltk/estnltk/blob/version_1.6/tutorials/nlp_pipeline/B_06_morphological_analysis.ipynb

    Parameters
    ----------
    lex_path : str
        Path to the dictionary file handed to Vabamorf as ``lex_path``.
    disamb_lex_path : str
        Path to the dictionary file handed to Vabamorf as ``disamb_lex_path``.
    output_layer : str, optional
        Layer name given to every created component. Defaults to
        'new_morph_analysis', which keeps the results apart from the
        default 'morph_analysis' layer.

    Returns
    -------
    tuple
        ``(analyser, disambiguator, tagger)``: a VabamorfAnalyzer, a
        VabamorfDisambiguator and a VabamorfTagger, in that order.
    """
    # One Vabamorf instance, loaded with the given dictionaries, backs every component
    vm_instance   = VabamorfInstance( lex_path=lex_path, disamb_lex_path=disamb_lex_path )
    analyser      = VabamorfAnalyzer( output_layer=output_layer, vm_instance=vm_instance )
    disambiguator = VabamorfDisambiguator( output_layer=output_layer, vm_instance=vm_instance )
    tagger        = VabamorfTagger( output_layer=output_layer, vm_instance=vm_instance )
    return analyser, disambiguator, tagger

# Setup steps required before running this cell:
#   1) Download the new binary dictionaries from:  https://github.com/Filosoft/vabamorf/tree/master/dct/binary
#   2) Place them in the current folder and rename them:
#        'et.dct'  --> '2019-10-15_et.dct'
#        'et3.dct' --> '2019-10-15_et3.dct'
#   3) Create the new morph analyser, disambiguator and tagger from the renamed files
new_analyser, new_disambiguator, new_tagger = create_new_vm_tagger_components(
    lex_path='2019-10-15_et.dct',
    disamb_lex_path='2019-10-15_et3.dct',
)

### Test on Estonian Web Treebank (web texts)

In [2]:
import os

from estnltk.converters.conll_importer import conll_to_texts_list

# download corpus from:  https://github.com/UniversalDependencies/UD_Estonian-EWT/ (exact commit: 6cd4d14)
eval_data_dir = 'UD_Estonian-EWT-master'

# Collect the texts of every .conllu file found in the corpus directory.
loaded_texts = []
# os.listdir() returns names in arbitrary order; sorting keeps the order of the
# loaded texts (and of the differences printed later) the same on every run.
for fname in sorted( os.listdir( eval_data_dir ) ):
    if fname.endswith('.conllu'):
        fpath = os.path.join( eval_data_dir, fname )
        texts = conll_to_texts_list(file = fpath, syntax_layer='ud_syntax')
        for text in texts:
            # remember which corpus file the text came from
            text.meta['file'] = fname
            loaded_texts.append( text )

In [3]:
# Run both the default (old) and the new-dictionary morphological analysis on
# every loaded text and report each word whose analyses differ.
differences = 0
total_words = 0
for doc in loaded_texts:
    # default/old analysis -> 'morph_analysis' layer
    doc.tag_layer(['morph_analysis'])
    # new analysis -> 'new_morph_analysis' layer
    new_tagger.tag(doc)
    for token in doc.words:
        old_span = token.morph_analysis
        new_span = token.new_morph_analysis
        if old_span != new_span:
            # Reduce both analyses to (root, partofspeech, form) triples for printing
            old_triples, new_triples = (
                [(ann.root, ann.partofspeech, ann.form) for ann in span.annotations]
                for span in (old_span, new_span)
            )
            # end='\n\n' leaves one blank line after each reported word
            print(token.text, old_triples, '-->', new_triples, end='\n\n')
            differences += 1
        total_words += 1
print('Total differences: ', differences, '/', total_words)

mürasem [('mürasem', 'S', 'sg n')] --> [('mürase=m', 'C', 'sg n')]

suht [('suht', 'S', 'sg n')] --> [('suht', 'D', '')]

miks [('miks', 'D', '')] --> [('miks', 'D', ''), ('miks', 'S', 'sg n')]

miks [('miks', 'D', '')] --> [('miks', 'D', ''), ('miks', 'S', 'sg n')]

miks [('miks', 'D', '')] --> [('miks', 'D', ''), ('miks', 'S', 'sg n')]

jura [('jura', 'V', 'o')] --> [('jura', 'S', 'sg p')]

miks [('miks', 'D', '')] --> [('miks', 'D', ''), ('miks', 'S', 'sg n')]

miks [('miks', 'D', '')] --> [('miks', 'D', ''), ('miks', 'S', 'sg n')]

ok [('ok', 'Y', '?')] --> [('ok', 'I', '')]

ok [('ok', 'Y', '?')] --> [('ok', 'I', '')]

miks [('miks', 'D', '')] --> [('miks', 'D', ''), ('miks', 'S', 'sg n')]

miks [('miks', 'D', '')] --> [('miks', 'D', ''), ('miks', 'S', 'sg n')]

ups [('ups', 'S', 'sg n')] --> [('ups', 'I', '')]

suht [('suht', 'S', 'sg n')] --> [('suht', 'D', '')]

miks [('miks', 'D', '')] --> [('miks', 'D', ''), ('miks', 'S', 'sg n')]

masuajal [('masua=ja', 'S', 'sg ad')] --> [(

### Test on Estonian UD corpus (more or less standard written language)

In [4]:
import os

from estnltk.converters.conll_importer import conll_to_texts_list

# download corpus from:  https://github.com/UniversalDependencies/UD_Estonian-EDT (version 2.4)
eval_data_dir = 'UD_Estonian-EDT-master'

# Collect the texts of the dev part of the corpus.
loaded_texts = []
# os.listdir() returns names in arbitrary order; sorting keeps the order of the
# loaded texts (and of the differences printed later) the same on every run.
for fname in sorted( os.listdir( eval_data_dir ) ):
    if fname.endswith('dev.conllu'):  # use only the dev part of the corpus
        fpath = os.path.join( eval_data_dir, fname )
        texts = conll_to_texts_list(file = fpath, syntax_layer='ud_syntax')
        for text in texts:
            # remember which corpus file the text came from
            text.meta['file'] = fname
            loaded_texts.append( text )

In [5]:
# Run both the default (old) and the new-dictionary morphological analysis on
# every loaded text and report each word whose analyses differ.
differences = 0
total_words = 0
for doc in loaded_texts:
    # default/old analysis -> 'morph_analysis' layer
    doc.tag_layer(['morph_analysis'])
    # new analysis -> 'new_morph_analysis' layer
    new_tagger.tag(doc)
    for token in doc.words:
        old_span = token.morph_analysis
        new_span = token.new_morph_analysis
        if old_span != new_span:
            # Reduce both analyses to (root, partofspeech, form) triples for printing
            old_triples, new_triples = (
                [(ann.root, ann.partofspeech, ann.form) for ann in span.annotations]
                for span in (old_span, new_span)
            )
            # end='\n\n' leaves one blank line after each reported word
            print(token.text, old_triples, '-->', new_triples, end='\n\n')
            differences += 1
        total_words += 1
print('Total differences: ', differences, '/', total_words)

millenniumi [('millennium', 'S', 'sg g'), ('millenniumi', 'S', 'sg g')] --> [('millennium', 'S', 'sg g')]

Manerism [('Manerism', 'H', 'sg n')] --> [('manerism', 'S', 'sg n')]

tselluliidile [('tselluliid', 'S', 'sg all'), ('tselluliit', 'S', 'sg all')] --> [('tselluliit', 'S', 'sg all')]

Miks [('miks', 'D', '')] --> [('miks', 'D', ''), ('miks', 'S', 'sg n')]

Miks [('miks', 'D', '')] --> [('miks', 'D', ''), ('miks', 'S', 'sg n')]

miks [('miks', 'D', '')] --> [('miks', 'D', ''), ('miks', 'S', 'sg n')]

ugrimugride [('ugrimugr', 'S', 'pl g'), ('ugrimugri', 'S', 'pl g')] --> [('ugri_mugri', 'S', 'pl g')]

venelikku [('vene_ligu', 'S', 'adt')] --> [('venelik', 'A', 'sg p')]

miks [('miks', 'D', '')] --> [('miks', 'D', ''), ('miks', 'S', 'sg n')]

udmurdi [('udmurt', 'S', 'sg g')] --> [('udmurdi', 'G', '')]

Kalašnikovi [('Kalašnikov', 'H', 'sg g')] --> [('kalašnikov', 'S', 'sg g')]

Udmurdi [('Udmurd', 'H', 'sg g'), ('Udmurdi', 'H', 'sg g'), ('Udmurt', 'H', 'sg g')] --> [('udmurdi', 'G'

Isheemilise [('Isheemilinen', 'H', 'sg g')] --> [('isheemiline', 'A', 'sg g')]

isheemilised [('isheemi=line', 'A', 'pl n')] --> [('isheemiline', 'A', 'pl n')]

isheemilised [('isheemi=line', 'A', 'pl n')] --> [('isheemiline', 'A', 'pl n')]

isheemilist [('isheemili', 'S', 'sg el')] --> [('isheemiline', 'A', 'sg p')]

sümptomaatika [('sümptoma_atika', 'G', '')] --> [('sümptomaatika', 'S', 'sg n')]

kolde-sümptomaatika [('kolde-sümptoma_atika', 'G', '')] --> [('kolde-sümptomaatika', 'S', 'sg g')]

sümptomaatika [('sümptoma_atika', 'G', '')] --> [('sümptomaatika', 'S', 'sg n')]

isheemilisest [('isheemi=line', 'A', 'sg el')] --> [('isheemiline', 'A', 'sg el')]

blokaatoril [('blokaatori', 'S', 'sg ad'), ('blokaator', 'S', 'sg ad')] --> [('blokaator', 'S', 'sg ad')]

koldesümptomaatika [('koldesümptoma_atika', 'G', '')] --> [('kolde_sümptomaatika', 'S', 'sg n')]

Amüloidnaastude [('Amüloid_naast', 'H', 'pl g')] --> [('amüloid_naast', 'S', 'pl g')]

Total differences:  109 / 44632


---