# MorphMultiwordProperNamesCorrector

In [1]:
from estnltk import Text, Layer, Annotation

from estnltk.taggers import Retagger
from estnltk.taggers import VabamorfTagger
from estnltk.taggers import MorphAnalysisReorderer


class MorphMultiwordProperNamesCorrector(Retagger):
    """Reanalyses sentences that contain multiword propername phrases with VabamorfTagger.

    A multiword proper name is detected as two consecutive words of the same
    sentence that both have a proper name ('H') part of speech among their
    analyses. Every sentence containing such a pair is re-tokenized and
    re-analysed on its own (VabamorfTagger followed by MorphAnalysisReorderer),
    and the fresh analyses replace the old ones on the proper name words only.

    After retagging, the morph layer's ``meta`` holds two counters:
    ``reanalysed_words`` and ``reanalysed_sentences``.
    """
    conf_param = ('vabamorftagger', 'morph_reorderer')

    def __init__(self, output_layer='morph_analysis', input_layers=('morph_analysis', 'sentences')):
        """Initialise the corrector.

        Parameters
        ----------
        output_layer
            Name of the morphological analysis layer that is corrected in place.
        input_layers
            Names of the required layers. The second item must be the name of
            the sentences layer (it is looked up by position).
        """
        self.output_layer = output_layer
        # Copy into a fresh list: the default must not be a mutable object
        # shared by every instance, and a caller's list must not be aliased.
        self.input_layers = list(input_layers)
        self.output_attributes = VabamorfTagger.output_attributes
        self.vabamorftagger    = VabamorfTagger()
        self.morph_reorderer   = MorphAnalysisReorderer()

    def _change_layer(self, raw_text, layers, status):
        """Replace analyses of multiword proper names with sentence-level reanalyses.

        Parameters
        ----------
        raw_text
            Not used by this method.
        layers
            Mapping from layer name to layer; must contain ``self.output_layer``
            and the sentences layer named by ``self.input_layers[1]``.
        status
            Not used by this method.

        The morph layer is modified in place; nothing is returned.
        """
        # Analyse input sentence by sentence
        morph_layer = layers[self.output_layer]
        sentences_layer = layers[self.input_layers[1]]
        # Index of the next unvisited word; it only moves forward, so the
        # words and sentences are traversed together in a single pass.
        morph_span_id = 0
        reanalysed_sentences = 0
        reanalysed_words     = 0
        for sentence in sentences_layer:
            multiword_propernames_locations = set()
            last_partofspeech = None
            words_in_sentence = []
            local_word_id = 0
            while morph_span_id < len(morph_layer):
                morph_span = morph_layer[morph_span_id]
                if sentence.start <= morph_span.start and \
                        morph_span.end <= sentence.end:
                    # a) word is in this sentence
                    is_propername = 'H' in morph_span.partofspeech
                    if is_propername and last_partofspeech == 'H':
                        # Remember locations of multiword propernames
                        multiword_propernames_locations.add(local_word_id - 1)
                        multiword_propernames_locations.add(local_word_id)
                    words_in_sentence.append(morph_span)
                    morph_span_id += 1
                    local_word_id += 1
                    # Any 'H' reading counts as a proper name; otherwise the
                    # first analysis decides the remembered part of speech.
                    if is_propername:
                        last_partofspeech = 'H'
                    else:
                        last_partofspeech = morph_span.annotations[0]['partofspeech']
                elif sentence.end <= morph_span.start:
                    # b) word is in the next sentence
                    break
                else:
                    # c) word starts before this sentence or crosses its
                    #    boundary: it cannot be assigned to the sentence.
                    #    Skip it, otherwise this loop would never terminate.
                    morph_span_id += 1
            # Reanalyse the sentence that contains multiword proper names
            if multiword_propernames_locations:
                # Make a new Text for sentence; add tokenization
                sentence_str = sentence.enclosing_text
                sentence_text = Text(sentence_str).tag_layer(['words', 'sentences'])
                # If everything is ok with tokenization, then reapply morph.
                # (Word indexes are only comparable if the word counts match.)
                if len(sentence_text['words']) == len(words_in_sentence):
                    # Add morph + reordering
                    self.vabamorftagger.tag(sentence_text)
                    self.morph_reorderer.retag(sentence_text)
                    # Transfer new analyses to multiword propernames
                    for wid, old_morph_span in enumerate(words_in_sentence):
                        if wid not in multiword_propernames_locations:
                            continue
                        new_morph_annotations = sentence_text['morph_analysis'][wid].annotations
                        old_morph_span.clear_annotations()
                        for annotation in new_morph_annotations:
                            new_annotation_dict = annotation.to_record(with_text=False)
                            # Positions refer to the stand-alone sentence Text,
                            # so they must not be carried over to the old span.
                            new_annotation_dict.pop('start', None)
                            new_annotation_dict.pop('end', None)
                            old_morph_span.add_annotation(Annotation(old_morph_span, **new_annotation_dict))
                        reanalysed_words += 1
                    reanalysed_sentences += 1
        # Record stats
        morph_layer.meta['reanalysed_words'] = reanalysed_words
        morph_layer.meta['reanalysed_sentences'] = reanalysed_sentences


## Example

In this example, MorphMultiwordProperNamesCorrector is applied to a simple, morphologically analysed Text.

(!) However, to get the full effect, you should apply it only after the text has been analysed with VabamorfCorpusTagger and corrected with MorphAnalysisReorderer -- it should be the final post-correction step of the morphological analysis.

In [2]:
# Sample input: five sentences, some of which contain runs of capitalised
# names (e.g. "Haapsalu Noorte Huvikeskuse", "Edgar Savisaar") and one that
# contains none ("Siin lauses pole midagi analüüsida.").
t=Text('''
Vormi mängude ja materjali valiku eest lavastusele " Nähtamatu poiss " Haapsalu Noorte Huvikeskuse loovus- ja 
draamaring " Üks " juhendajad Anne Suislep ja Triin Reemann Energilise lastepärase huumori ja loo jutustamise
eest lavastuses " Ull Jaak ehk küll on häda kui pea on mäda " juhendaja Kadi Kronberg Lasva Rahvamaja laste 
näitering ( Võrumaa ). Preemia väljaandjaks on MTÜ Vilde Teater. Edgar Savisaar. Siin lauses pole midagi analüüsida. 
See on Tõlkebüroo Filosoof.''')
       
# tag_layer() is called without arguments; the summary displayed below lists
# the layers present afterwards (sentences, tokens, compound_tokens, words,
# morph_analysis).
t.tag_layer()
# Bare expression: the notebook displays the Text and its layer summary.
t

text
"Vormi mängude ja materjali valiku eest lavastusele "" Nähtamatu poiss "" Haapsalu Noorte Huvikeskuse loovus- ja draamaring "" Üks "" juhendajad Anne Suislep ja Triin Reemann Energilise lastepärase huumori ja loo jutustamiseeest lavastuses "" Ull Jaak ehk küll on häda kui pea on mäda "" juhendaja Kadi Kronberg Lasva Rahvamaja laste näitering ( Võrumaa ). Preemia väljaandjaks on MTÜ Vilde Teater. Edgar Savisaar. Siin lauses pole midagi analüüsida. See on Tõlkebüroo Filosoof."

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,5
tokens,,,,False,79
compound_tokens,"type, normalized",,tokens,False,1
words,normalized_form,,,True,78
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,78


In [3]:
# Create the corrector with its default layer names
# ('morph_analysis' as output, 'morph_analysis' and 'sentences' as inputs).
corrector = MorphMultiwordProperNamesCorrector()
# retag() comes from the Retagger base class (not shown here); presumably it
# dispatches to _change_layer, which rewrites the morph_analysis layer of t.
corrector.retag(t)
# Display the corrected layer; its meta carries the reanalysed_sentences and
# reanalysed_words counters shown in the output below.
t.morph_analysis

0,1
reanalysed_sentences,2
reanalysed_words,12

layer name,attributes,parent,enveloping,ambiguous,span count
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,78

text,normalized_text,lemma,root,root_tokens,ending,clitic,form,partofspeech
Vormi,Vormi,vorm,vorm,['vorm'],0,,sg g,S
mängude,mängude,mäng,mäng,['mäng'],de,,pl g,S
ja,ja,ja,ja,['ja'],0,,,J
materjali,materjali,materjal,materjal,['materjal'],0,,sg g,S
valiku,valiku,valik,valik,['valik'],0,,sg g,S
eest,eest,eest,eest,['eest'],0,,,K
lavastusele,lavastusele,lavastus,lavastus,['lavastus'],le,,sg all,S
"""","""","""","""","['""']",,,,Z
Nähtamatu,Nähtamatu,Nähtamatu,Nähtamatu,['Nähtamatu'],0,,sg g,H
poiss,poiss,poiss,poiss,['poiss'],0,,sg n,S
