### VabamorfAnalyzer that allows multiple variants of a normalized word (spelling corrections)

In [1]:
from typing import MutableMapping

from estnltk import Annotation
from estnltk.text import Layer, Text
from estnltk.layer.ambiguous_span import AmbiguousSpan

from estnltk.taggers import Tagger
from estnltk.vabamorf.morf import Vabamorf

from estnltk.taggers.morph_analysis.morf_common import DEFAULT_PARAM_GUESS
from estnltk.taggers.morph_analysis.morf_common import DEFAULT_PARAM_PROPERNAME
from estnltk.taggers.morph_analysis.morf_common import DEFAULT_PARAM_PHONETIC
from estnltk.taggers.morph_analysis.morf_common import DEFAULT_PARAM_COMPOUND
from estnltk.taggers.morph_analysis.morf import VabamorfTagger

from estnltk.taggers.morph_analysis.morf_common import _get_word_text
from estnltk.taggers.morph_analysis.morf_common import _convert_vm_records_to_morph_analysis_records


# ===============================
#    VabamorfAnalyzer
# ===============================

class VabamorfAnalyzer( Tagger ):
    """Performs morphological analysis with Vabamorf's analyzer.

       A word may provide several normalized variants (e.g. alternative
       spelling corrections). Every variant is analysed and the analyses
       of all variants are merged under the word.
       Note: resulting analyses will be ambiguous."""
    output_layer      = 'morph_analysis'
    output_attributes = VabamorfTagger.output_attributes
    input_layers      = ['words', 'sentences']
    conf_param = [ # Configuration flags:
                   "guess",
                   "propername",
                   "compound",
                   "phonetic",
                   # Internal stuff:
                   '_vm_instance',
                   # Names of the specific input layers:
                   '_input_words_layer',
                   '_input_sentences_layer',
                   # For backward compatibility:
                   'depends_on', 'layer_name', 'attributes',
                   # Extra configuration flags:
                   'extra_attributes',
                 ]
    layer_name = output_layer       # <- For backward compatibility ...
    depends_on = input_layers       # <- For backward compatibility ...
    attributes = output_attributes  # <- For backward compatibility ...

    def __init__(self,
                 output_layer='morph_analysis',
                 input_words_layer='words',
                 input_sentences_layer='sentences',
                 extra_attributes=None,
                 vm_instance=None,
                 guess = DEFAULT_PARAM_GUESS,
                 propername = DEFAULT_PARAM_PROPERNAME,
                 compound = DEFAULT_PARAM_COMPOUND,
                 phonetic = DEFAULT_PARAM_PHONETIC ):
        """Initialize VabamorfAnalyzer class.

        Parameters
        ----------
        output_layer: str (default: 'morph_analysis')
            Name of the layer where morph analysis results
            will be stored.
        input_words_layer: str (default: 'words')
            Name of the input words layer;
        input_sentences_layer: str (default: 'sentences')
            Name of the input sentences layer;
        extra_attributes: list of str (default: None)
            List containing names of extra attributes that will be
            attached to Spans. All extra attributes will be
            initialized to None.
        vm_instance: estnltk.vabamorf.morf.Vabamorf
            An instance of Vabamorf that is to be used for analysing
            text morphologically. If not given, Vabamorf.instance()
            is used.
        propername: boolean (default: DEFAULT_PARAM_PROPERNAME)
            Propose additional analysis variants for proper names
            (a.k.a. proper name guessing).
        guess: boolean (default: DEFAULT_PARAM_GUESS)
            Use guessing in case of unknown words.
        compound: boolean (default: DEFAULT_PARAM_COMPOUND)
            Add compound word markers to root forms.
        phonetic: boolean (default: DEFAULT_PARAM_PHONETIC)
            Add phonetic information to root forms.
        """
        # Set input/output layer names
        self.output_layer = output_layer
        self._input_words_layer          = input_words_layer
        self._input_sentences_layer      = input_sentences_layer
        self.input_layers = [input_words_layer, input_sentences_layer]
        self.extra_attributes = extra_attributes
        if self.extra_attributes:
            # Extend the attribute list on the instance; the first '+='
            # rebinds self.output_attributes as an instance attribute
            for extra_attr in self.extra_attributes:
                self.output_attributes += (extra_attr,)
            self.attributes = self.output_attributes  # <- For backward compatibility ...
        if vm_instance:
            self._vm_instance = vm_instance
        else:
            self._vm_instance = Vabamorf.instance()
        # Set analysis parameters:
        self.guess = guess
        self.propername = propername
        self.compound = compound
        self.phonetic = phonetic
        # Other stuff
        self.layer_name = self.output_layer  # <- For backward compatibility ...
        self.depends_on = self.input_layers  # <- For backward compatibility ...

    def _make_layer(self, text: Text, layers: MutableMapping[str, Layer], status: dict) -> Layer:
        """Analyses given Text object morphologically.

        Note: disambiguation is not performed, so the results of
        analysis will (most likely) be ambiguous.

        Parameters
        ----------
        text: estnltk.text.Text
            Text object that is to be analysed morphologically.
            The Text object must have layers 'words', 'sentences'.

        layers: MutableMapping[str, Layer]
           Layers of the text. Contains mappings from the
           name of the layer to the Layer object. Must contain
           words, and sentences;

        status: dict
           This can be used to store metadata on layer tagging.
           (Not used by this method.)

        Returns
        -------
        Layer
           New ambiguous layer (child of the words layer) that holds
           the morphological analyses of every word.

        Raises
        ------
        ValueError
           If a word starts before the end of a sentence, but does not
           lie completely inside that sentence (words and sentences
           cannot be aligned).
        AssertionError
           If the number of obtained analyses does not match the
           number of words.
        """
        # Fetch parameters of the analysis
        current_kwargs = {
            "disambiguate": False,  # perform analysis without disambiguation
            "guess":        self.guess,
            "propername":   self.propername,
            "compound":     self.compound,
            "phonetic":     self.phonetic,
        }
        # Upper limit of word variants passed to Vabamorf in one call:
        #   if 149129 < len(wordlist) on Linux,
        #   if  15000 < len(wordlist) < 17500 on Windows,
        #   then Vabamorf's analyze(words=wordlist, ...) raises
        #   RuntimeError: CFSException: internal error with vabamorf
        max_words_per_call = 15000
        # --------------------------------------------
        #   Use Vabamorf for morphological analysis
        # --------------------------------------------
        # Perform morphological analysis sentence by sentence
        word_layer = layers[self._input_words_layer]
        word_span_id = 0
        analysis_results = []
        for sentence in layers[self._input_sentences_layer]:
            # A) Collect all words inside the sentence
            sentence_words = []
            sentence_words_count = 0
            while word_span_id < len(word_layer):
                span = word_layer[word_span_id]
                if sentence.start <= span.start and \
                    span.end <= sentence.end:
                    # > Word is inside the sentence
                    # Get the normalized variant(s)
                    _word = _get_word_text( span )
                    # If we have only one variant,
                    # package it into a list
                    if isinstance(_word, str):
                        _word = [ _word ]
                    assert isinstance(_word, list)
                    sentence_words.append( _word )
                    sentence_words_count += len( _word )
                    word_span_id += 1
                    if sentence_words_count > max_words_per_call:
                        # B) Buffer is full: analyse the collected words
                        #    and then empty the buffer
                        res = self._perform_vm_analysis( sentence_words, current_kwargs )
                        analysis_results.extend( res )
                        sentence_words = []
                        sentence_words_count = 0
                elif sentence.end <= span.start:
                    # > Word belongs to some following sentence
                    break
                else:
                    # > Word is neither inside nor after the sentence.
                    #   Without advancing, the loop would never end,
                    #   so fail explicitly instead of hanging.
                    raise ValueError(
                        '(!) Word span ({},{}) is not inside the sentence span ({},{}): '
                        'cannot align words with sentences.'.format(
                            span.start, span.end, sentence.start, sentence.end) )
            # B) Analyse what's left unanalysed in the sentence
            if sentence_words_count > 0:
                # The buffer is flushed only after exceeding the limit, so
                # the leftover can contain at most max_words_per_call items
                assert sentence_words_count <= max_words_per_call, \
                    '(!) Unexpected amount of unanalysed words left: {}'.format(sentence_words_count)
                res = self._perform_vm_analysis( sentence_words, current_kwargs )
                analysis_results.extend( res )

        # Assert that all words obtained an analysis
        # ( Note: there must be empty analyses for unknown
        #         words if guessing is not used )
        assert len(layers[ self._input_words_layer ]) == len(analysis_results), \
            '(!) Unexpectedly the number words ('+str(len(layers[ self._input_words_layer ]))+') '+\
            'does not match the number of obtained morphological analyses ('+str(len(analysis_results))+').'

        # --------------------------------------------
        #   Store analysis results in a new layer
        # --------------------------------------------
        # A) Create layer
        current_attributes = self.output_attributes
        morph_layer = Layer(name  =self.output_layer,
                            parent=self._input_words_layer,
                            text_object=text,
                            ambiguous=True,
                            attributes=current_attributes )
        morph_layer._base = self._input_words_layer
        # B) Populate layer
        for word, analyses_dict in zip(layers[ self._input_words_layer ], analysis_results):
            # Convert from Vabamorf dict to a list of records
            records = _convert_vm_records_to_morph_analysis_records(analyses_dict, layer_attributes=current_attributes,
                                                                    sort_analyses=False)
            # Attach annotations (if word has morphological analyses)
            for record in records:
                # the analyses here are not always unique
                morph_layer.add_annotation(word.base_span, **record)
            if not records:
                # if word has no morphological analyses (e.g.
                # it is an unknown word), then attach an
                # empty annotation as a placeholder
                morph_layer.add_annotation(word.base_span)

        # C) Return the layer
        return morph_layer


    def _perform_vm_analysis( self, sentence_words, analysis_kwargs ):
        """Analyses given list of words with Vabamorf. (Only for class-internal usage)

        Parameters
        ----------
        sentence_words: list of list of str
            For every word, the list of its normalized variants.
        analysis_kwargs: dict
            Keyword arguments passed on to Vabamorf's analyze().

        Returns
        -------
        list of dict
            One record per input word; the record's 'analysis' list
            contains the analyses of all variants of the word.
        """
        # Unpack the words: flatten the input list
        flat_words = [w for word_variants in sentence_words for w in word_variants]
        # Analyse with Vabamorf
        initial_results = self._vm_instance.analyze(words=flat_words, **analysis_kwargs)
        # Pack the words: merge all analyses of a word into a single list of analyses
        return self._pack_expanded_analysis_results( initial_results, sentence_words )


    def _pack_expanded_analysis_results( self, analysis_results, initial_sentence_words, sort_analyses=True ):
        """Packs expanded analysis results. (Only for class-internal usage)

        Parameters
        ----------
        analysis_results: list of dict
            Analysis records of the flattened word variants; each record
            has keys 'text' and 'analysis'. Records must be in the same
            order as the variants in initial_sentence_words.
        initial_sentence_words: list of list of str
            For every word, the list of its normalized variants.
        sort_analyses: boolean (default: True)
            If set, analyses of every variant are sorted (in place) to
            assure a fixed order, e.g. for testing purposes.

        Returns
        -------
        list of dict
            One {'analysis': [...]} record per word, containing the
            concatenated analyses of all variants of the word.
        """
        merged_analysis_results = []
        analysis_index = 0
        for word_variants in initial_sentence_words:
            merged_morph_record = { 'analysis':[] }
            for initial_word in word_variants:
                current_analysis_dict = analysis_results[analysis_index]
                # Sanity check
                assert current_analysis_dict['text'] == initial_word
                if sort_analyses:
                    # Sort analyses (to assure a fixed order, e.g. for testing purposes)
                    current_analysis_dict['analysis'] = sorted(current_analysis_dict['analysis'],
                                           key=lambda x: x['root']+x['ending']+x['clitic']+x['partofspeech']+x['form'],
                                           reverse=False )
                merged_morph_record['analysis'].extend( current_analysis_dict['analysis'] )
                analysis_index += 1
            merged_analysis_results.append( merged_morph_record )
        return merged_analysis_results


### Usage examples

In [2]:
# Guessing switched on: create an analyzer with the default parameters
vm_analyser = VabamorfAnalyzer()

In [3]:
from estnltk import Text

# Hand-made spelling corrections: surface form -> normalized variant(s)
corrections = {
    'isaand': ['isand', 'issand'],
    'juuuubbeee': ['jube'],
}

text = Text('''isaand kui juuuubbeee ... ''')
text.tag_layer(['words', 'sentences'])

for word in text.words:
    variants = corrections.get(word.text)
    if variants is not None:
        # every word gets its own list of variants
        word.annotations[0].normalized_form = list(variants)

# Analyse all variants and display the resulting layer
vm_analyser.tag(text)
text.morph_analysis

layer name,attributes,parent,enveloping,ambiguous,span count
morph_analysis,"lemma, root, root_tokens, ending, clitic, form...",words,,True,4

text,lemma,root,root_tokens,ending,clitic,form,partofspeech
isaand,isand,isand,"('isand',)",0.0,,sg n,S
,issand,issand,"('issand',)",0.0,,,I
,issand,issand,"('issand',)",0.0,,sg n,S
kui,kui,kui,"('kui',)",0.0,,,D
,kui,kui,"('kui',)",0.0,,,J
juuuubbeee,jube,jube,"('jube',)",0.0,,sg n,A
,jube,jube,"('jube',)",0.0,,,D
...,...,...,"('...',)",,,,Z


In [4]:
from estnltk import Text

# Hand-made spelling corrections: surface form -> normalized variant(s)
corrections = {
    'teee': ['tee', 'te'],
    'ppole': ['pole'],
}

text = Text('''lihtsalt ei teee välja , see ppole tema asi ...''')
text.tag_layer(['words', 'sentences'])

for word in text.words:
    variants = corrections.get(word.text)
    if variants is not None:
        # every word gets its own list of variants
        word.annotations[0].normalized_form = list(variants)

# Analyse all variants and display the resulting layer
vm_analyser.tag(text)
text.morph_analysis

layer name,attributes,parent,enveloping,ambiguous,span count
morph_analysis,"lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,10

text,lemma,root,root_tokens,ending,clitic,form,partofspeech
lihtsalt,lihtne,lihtne,"('lihtne',)",lt,,sg abl,A
,lihtsalt,lihtsalt,"('lihtsalt',)",0,,,D
ei,ei,ei,"('ei',)",0,,,D
,ei,ei,"('ei',)",0,,neg,V
teee,tee,tee,"('tee',)",0,,sg g,S
,tee,tee,"('tee',)",0,,sg n,S
,tegema,tege,"('tege',)",0,,o,V
,sina,sina,"('sina',)",0,,pl g,P
,sina,sina,"('sina',)",0,,pl n,P
välja,väli,väli,"('väli',)",0,,adt,S


In [5]:
# Text with multiple sentences
from estnltk import Text

text = Text('''neet eesti naised , kes lähevad välismaaa meestele mehale ... Ja siis on oops ....''')
# Add the 'words' and 'sentences' layers (the default input layers of VabamorfAnalyzer)
text.tag_layer(['words', 'sentences'])
# Display the sentences layer
text.sentences

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,2

text
"['neet', 'eesti', 'naised', ',', 'kes', 'lähevad', 'välismaaa', 'meestele', 'mehale', '...']"
"['Ja', 'siis', 'on', 'oops', '....']"


In [6]:
# Hand-made spelling corrections: surface form -> normalized variant(s)
corrections = {
    'neet': ['neet', 'need'],
    'välismaaa': ['välismaa'],
    'mehale': ['mehele', 'mehale'],
    'oops': ['ops', 'ups'],
}
for word in text.words:
    variants = corrections.get(word.text)
    if variants is not None:
        # every word gets its own list of variants
        word.annotations[0].normalized_form = list(variants)
# Analyse all variants and display the resulting layer
vm_analyser.tag(text)
text.morph_analysis

layer name,attributes,parent,enveloping,ambiguous,span count
morph_analysis,"lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,15

text,lemma,root,root_tokens,ending,clitic,form,partofspeech
neet,neet,neet,"('neet',)",0,,sg n,S
,see,see,"('see',)",d,,pl n,P
eesti,eesti,eesti,"('eesti',)",0,,,G
naised,naine,naine,"('naine',)",d,,pl n,S
",",",",",","(',',)",,,,Z
kes,kes,kes,"('kes',)",0,,pl n,P
,kes,kes,"('kes',)",0,,sg n,P
lähevad,minema,mine,"('mine',)",vad,,vad,V
välismaaa,välismaa,välis_maa,"('välis', 'maa')",0,,sg g,S
,välismaa,välis_maa,"('välis', 'maa')",0,,sg n,S


In [7]:
# Guessing switched off: create an analyzer that uses neither
# unknown-word guessing nor proper name guessing
vm_analyser_wo_guesser = VabamorfAnalyzer(guess = False, propername=False)

In [8]:
from estnltk import Text

# Hand-made spelling corrections: surface form -> normalized variant(s)
corrections = {
    'appppi': ['appi', 'abi'],
    'niiiii': ['nii'],
    'naq': ['nagu'],
}

text = Text('''appppi ma niiiii niiii õnnnelik , naq jessss ...''')
text.tag_layer(['words', 'sentences'])

for word in text.words:
    variants = corrections.get(word.text)
    if variants is not None:
        # every word gets its own list of variants
        word.annotations[0].normalized_form = list(variants)
# Analyse with the guesser switched off and display the resulting layer
vm_analyser_wo_guesser.tag(text)
text.morph_analysis

layer name,attributes,parent,enveloping,ambiguous,span count
morph_analysis,"lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,9

text,lemma,root,root_tokens,ending,clitic,form,partofspeech
appppi,abi,abi,"('abi',)",0.0,,adt,S
,appi,appi,"('appi',)",0.0,,,I
,abi,abi,"('abi',)",0.0,,sg g,S
,abi,abi,"('abi',)",0.0,,sg n,S
,abi,abi,"('abi',)",0.0,,sg p,S
ma,mina,mina,"('mina',)",0.0,,sg n,P
niiiii,nii,nii,"('nii',)",0.0,,,D
niiii,,,,,,,
õnnnelik,õnnnelik,õnn_nelik,"('õnn', 'nelik')",0.0,,sg n,S
",",,,,,,,
