# ENC processing

## MorphMultiwordProperNamesCorrector

In [1]:
from estnltk import Text, Layer, Annotation

from estnltk.taggers import Retagger
from estnltk.taggers import VabamorfTagger
from estnltk.taggers import MorphAnalysisReorderer
from estnltk.taggers import VabamorfCorpusTagger

class MorphMultiwordProperNamesCorrector(Retagger):
    """Reanalyses sentences that contain multiword propername phrases with VabamorfTagger.

    A multiword proper name is detected as two or more consecutive words of a
    sentence whose 'morph_analysis' annotations include the part of speech 'H'.
    Every sentence containing such a phrase is re-tokenized as a stand-alone
    Text, analysed with VabamorfTagger and reordered with
    MorphAnalysisReorderer; the new analyses replace the old ones, but only
    on the words that belong to a multiword proper name.

    The number of rewritten words and re-analysed sentences is recorded in
    the meta of the output layer under the keys 'reanalysed_words' and
    'reanalysed_sentences'.
    """
    conf_param = ('vabamorftagger', 'morph_reorderer')

    def __init__(self, output_layer='morph_analysis', input_layers=('morph_analysis', 'sentences')):
        """Configure the layers and create the taggers used for re-analysis.

        Parameters
        ----------
        output_layer
            Name of the morphological layer that is corrected in place.
        input_layers
            Sequence of input layer names; the second item must be the
            sentences layer (it is the one iterated in `_change_layer`).
        """
        self.output_layer = output_layer
        # Copy into a fresh list, so that instances never share (and cannot
        # accidentally mutate) one default object.
        self.input_layers = list(input_layers)
        self.output_attributes = VabamorfTagger.output_attributes
        # addition:
        # use slang_lex to enable extended morph lexicon
        self.vabamorftagger    = VabamorfTagger(slang_lex=True)
        self.morph_reorderer   = MorphAnalysisReorderer()

    def _change_layer(self, raw_text, layers, status):
        """Replace analyses of multiword proper names in layers[self.output_layer].

        `raw_text` and `status` are not used. The method returns nothing;
        the layer is modified in place and the statistics are written to its
        meta.
        """
        # Analyse input sentence by sentence
        morph_layer = layers[self.output_layer]
        sentences_layer = layers[self.input_layers[1]]
        total_words = len(morph_layer)
        # Index of the next unconsumed word; it is shared by all sentences,
        # so the morph layer is traversed only once.
        morph_span_id = 0
        reanalysed_sentences = 0
        reanalysed_words     = 0
        for sentence in sentences_layer:
            multiword_propernames_locations = set()
            last_partofspeech = None
            words_in_sentence = []
            while morph_span_id < total_words:
                morph_span = morph_layer[morph_span_id]
                if sentence.start <= morph_span.start and \
                        morph_span.end <= sentence.end:
                    # a) word is in this sentence
                    is_propername = 'H' in morph_span.partofspeech
                    local_word_id = len(words_in_sentence)
                    if is_propername and last_partofspeech == 'H':
                        # Remember locations of multiword propernames
                        multiword_propernames_locations.add( local_word_id - 1 )
                        multiword_propernames_locations.add( local_word_id )
                    words_in_sentence.append( morph_span )
                    morph_span_id += 1
                    last_partofspeech = 'H' if is_propername \
                        else morph_span.annotations[0]['partofspeech']
                elif sentence.end <= morph_span.start:
                    # b) word is in the next sentence
                    break
                else:
                    # c) word starts before this sentence or crosses its end:
                    #    it cannot be assigned to the sentence. Skip it,
                    #    otherwise the loop would never advance.
                    morph_span_id += 1
            if not multiword_propernames_locations:
                continue
            # Reanalyse the sentence that contains multiword proper names:
            # make a new Text for sentence; add tokenization
            sentence_str = sentence.enclosing_text
            sentence_text = Text(sentence_str).tag_layer(['words', 'sentences'])
            # If everything is ok with tokenization, then reapply morph
            # (word indexes must line up one-to-one with the old words)
            if len(sentence_text['words']) != len(words_in_sentence):
                continue
            # Add morph + reordering
            self.vabamorftagger.tag( sentence_text )
            self.morph_reorderer.retag( sentence_text )
            # Transfer new analyses to multiword propernames
            for wid, old_morph_span in enumerate( words_in_sentence ):
                if wid not in multiword_propernames_locations:
                    continue
                new_morph_annotations = sentence_text['morph_analysis'][wid].annotations
                old_morph_span.clear_annotations()
                for annotation in new_morph_annotations:
                    new_annotation_dict = annotation.to_record(with_text=False)
                    # Positions refer to the temporary sentence Text, not to
                    # the original text, so they must not be carried over.
                    new_annotation_dict.pop('start', None)
                    new_annotation_dict.pop('end', None)
                    old_morph_span.add_annotation( Annotation(old_morph_span, **new_annotation_dict) )
                reanalysed_words += 1
            reanalysed_sentences += 1
        # Record stats
        morph_layer.meta['reanalysed_words'] = reanalysed_words
        morph_layer.meta['reanalysed_sentences'] = reanalysed_sentences

INFO:utils.py:157: NumExpr defaulting to 4 threads.


## Stanza

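First, download the Stanza models from https://entu.keeleressursid.ee/public-document/entity-9862/2021-05-29 — the cells below expect to find them under the `stanza_resources` directory (in `et/depparse/`).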

In [2]:
# For the best performance with StanzaSyntaxTagger, use input_type='morph_extended'.
# This requires that the 'morph_extended' layer has been computed.
#
import os
from estnltk.taggers.syntax.stanza_tagger.stanza_tagger import StanzaSyntaxTagger
# Build the path from its components: a plain 'a\b\c' literal only works on
# Windows and relies on invalid escape sequences (\e, \d, \m) being kept as-is.
stanza_model = os.path.join('stanza_resources', 'et', 'depparse', 'morph_extended.pt')
stanza_tagger = StanzaSyntaxTagger( input_type='morph_extended', 
                                    input_morph_layer='morph_extended',  
                                    depparse_path=stanza_model,
                                    add_parent_and_children=True  # add parent and children spans explicitly
                                   )  # Note: add use_gpu=True, if you can use GPU

# Which layers does the tagger need as input?
stanza_tagger.input_layers

('sentences', 'morph_extended', 'words')

In [3]:
# StanzaSyntaxEnsembleTagger is generally preferable to StanzaSyntaxTagger,
# but it is approx. 10x slower, because it runs 10 Stanza models.
# The difference in LAS scores is about 0.5 percentage points.
# Additionally, it is more stable and depends less on the training set.
#
import os
from estnltk.taggers.syntax.stanza_tagger.ensemble_tagger import StanzaSyntaxEnsembleTagger
# Build the path from its components: a plain 'a\b\c' literal only works on
# Windows and relies on invalid escape sequences (\e, \d) being kept as-is.
models_dir = os.path.join('stanza_resources', 'et', 'depparse', 'ensemble_models')
model_paths = []
# os.listdir returns names in arbitrary order; sort them, so that the models
# are always listed and passed to the tagger in the same order.
for fname in sorted(os.listdir(models_dir)):
    if fname.endswith('.pt'):
        print(fname)
        model_paths.append( os.path.join(models_dir, fname) )
#print(model_paths)
stanza_ensemble_tagger = StanzaSyntaxEnsembleTagger( model_paths=model_paths,
                                                     add_parent_and_children=True  # add parent and children spans explicitly
                                                    )  # Note: add use_gpu=True, if you can use GPU

# Which layers does the tagger need as input?
stanza_ensemble_tagger.input_layers

model_1.pt
model_10.pt
model_2.pt
model_3.pt
model_4.pt
model_5.pt
model_6.pt
model_7.pt
model_8.pt
model_9.pt


('sentences', 'morph_extended', 'words')

## 1. Apply VabamorfCorpusTagger + MorphMultiwordProperNamesCorrector

The first part of the pipeline should stay as it is, because analyses with Vabamorf's categories are still needed in the output.

In [4]:
# Example input: the first sentences contain several multiword proper names
# (e.g. person and organisation names), the last ones contain none.
text_str='''
Vormi mängude ja materjali valiku eest lavastusele " Nähtamatu poiss " Haapsalu Noorte Huvikeskuse loovus- ja 
draamaring " Üks " juhendajad Anne Suislep ja Triin Reemann Energilise lastepärase huumori ja loo jutustamise
eest lavastuses " Ull Jaak ehk küll on häda kui pea on mäda " juhendaja Kadi Kronberg Lasva Rahvamaja laste 
näitering ( Võrumaa ). Preemia väljaandjaks on MTÜ Vilde Teater. Edgar Savisaar. Siin lauses pole midagi analüüsida. 
See on Tõlkebüroo Filosoof. Suur koer jälitas kiiret kassi.'''

# Both taggers are created once and can be reused for every text.
disambiguator = VabamorfCorpusTagger(slang_lex=True)
corrector = MorphMultiwordProperNamesCorrector()

# The corpus tagger takes a list of Text objects that already have the
# 'words' and 'sentences' layers; here the list holds a single text.
disambiguated_texts = disambiguator.tag([ Text(text_str).tag_layer(['words', 'sentences']) ])
t = disambiguated_texts[0]
# Display the text and its layers
t

text
"Vormi mängude ja materjali valiku eest lavastusele "" Nähtamatu poiss "" Haapsalu Noorte Huvikeskuse loovus- ja draamaring "" Üks "" juhendajad Anne Suislep ja Triin Reemann Energilise lastepärase huumori ja loo jutustamiseeest lavastuses "" Ull Jaak ehk küll on häda kui pea on mäda "" juhendaja Kadi Kronberg Lasva Rahvamaja laste näitering ( Võrumaa ). Preemia väljaandjaks on MTÜ Vilde Teater. Edgar Savisaar. Siin lauses pole midagi analüüsida. See on Tõlkebüroo Filosoof. Suur koer jälitas kiiret kassi."

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,6
tokens,,,,False,85
compound_tokens,"type, normalized",,tokens,False,1
words,normalized_form,,,True,84
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,84


In [5]:
# Correct the analyses of multiword proper names in place; the counts of
# re-analysed words and sentences are stored in the meta of the layer.
corrector.retag(t)
# Display the corrected layer
t.morph_analysis

0,1
reanalysed_sentences,4
reanalysed_words,19

layer name,attributes,parent,enveloping,ambiguous,span count
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,84

text,normalized_text,lemma,root,root_tokens,ending,clitic,form,partofspeech
Vormi,Vormi,vorm,vorm,['vorm'],0,,sg g,S
mängude,mängude,mäng,mäng,['mäng'],de,,pl g,S
ja,ja,ja,ja,['ja'],0,,,J
materjali,materjali,materjal,materjal,['materjal'],0,,sg g,S
valiku,valiku,valik,valik,['valik'],0,,sg g,S
eest,eest,eest,eest,['eest'],0,,,K
lavastusele,lavastusele,lavastus,lavastus,['lavastus'],le,,sg all,S
"""","""","""","""","['""']",,,,Z
Nähtamatu,Nähtamatu,nähtamatu,nähtamatu,['nähtamatu'],0,,sg n,A
poiss,poiss,poiss,poiss,['poiss'],0,,sg n,S


## 2. Apply Syntax

In [6]:
# Add syntax: the 'morph_extended' layer must be created first, because
# stanza_tagger was configured to read its input from it.
t.tag_layer('morph_extended')
stanza_tagger.tag(t)
#
# Or use:  stanza_ensemble_tagger.tag(t)
#          then the layer name will be 'stanza_ensemble_syntax'
# Display the resulting syntax layer
t.stanza_syntax

layer name,attributes,parent,enveloping,ambiguous,span count
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_extended,,False,84

text,id,lemma,upostag,xpostag,feats,head,deprel,deps,misc,parent_span,children
Vormi,1,vorm,S,S,"OrderedDict([('com', 'com'), ('sg', 'sg'), ('gen', 'gen')])",2,nmod,_,_,"Span('mängude', [{'id': 2, 'lemma': 'mäng', 'upostag': 'S', 'xpostag': 'S', 'fea ..., type: <class 'estnltk.layer.span.Span'>",()
mängude,2,mäng,S,S,"OrderedDict([('com', 'com'), ('pl', 'pl'), ('gen', 'gen')])",5,nmod,_,_,"Span('valiku', [{'id': 5, 'lemma': 'valik', 'upostag': 'S', 'xpostag': 'S', 'fea ..., type: <class 'estnltk.layer.span.Span'>","(""Span('Vormi', [{'id': 1, 'lemma': 'vorm', 'upostag': 'S', 'xpostag': 'S', 'fea ..., type: <class 'tuple'>, length: 2"
ja,3,ja,J,J,"OrderedDict([('sub', 'sub'), ('crd', 'crd')])",4,cc,_,_,"Span('materjali', [{'id': 4, 'lemma': 'materjal', 'upostag': 'S', 'xpostag': 'S' ..., type: <class 'estnltk.layer.span.Span'>",()
materjali,4,materjal,S,S,"OrderedDict([('com', 'com'), ('sg', 'sg'), ('gen', 'gen')])",2,conj,_,_,"Span('mängude', [{'id': 2, 'lemma': 'mäng', 'upostag': 'S', 'xpostag': 'S', 'fea ..., type: <class 'estnltk.layer.span.Span'>","(""Span('ja', [{'id': 3, 'lemma': 'ja', 'upostag': 'J', 'xpostag': 'J', 'feats': ..., type: <class 'tuple'>, length: 1"
valiku,5,valik,S,S,"OrderedDict([('com', 'com'), ('sg', 'sg'), ('gen', 'gen')])",7,nmod,_,_,"Span('lavastusele', [{'id': 7, 'lemma': 'lavastus', 'upostag': 'S', 'xpostag': ' ..., type: <class 'estnltk.layer.span.Span'>","(""Span('mängude', [{'id': 2, 'lemma': 'mäng', 'upostag': 'S', 'xpostag': 'S', 'f ..., type: <class 'tuple'>, length: 2"
eest,6,eest,K,K,"OrderedDict([('post', 'post')])",5,case,_,_,"Span('valiku', [{'id': 5, 'lemma': 'valik', 'upostag': 'S', 'xpostag': 'S', 'fea ..., type: <class 'estnltk.layer.span.Span'>",()
lavastusele,7,lavastus,S,S,"OrderedDict([('com', 'com'), ('sg', 'sg'), ('all', 'all')])",34,nmod,_,_,"Span('lavastuses', [{'id': 34, 'lemma': 'lavastus', 'upostag': 'S', 'xpostag': ' ..., type: <class 'estnltk.layer.span.Span'>","(""Span('valiku', [{'id': 5, 'lemma': 'valik', 'upostag': 'S', 'xpostag': 'S', 'f ..., type: <class 'tuple'>, length: 2"
"""",8,"""",Z,Z,OrderedDict(),10,punct,_,_,"Span('poiss', [{'id': 10, 'lemma': 'poiss', 'upostag': 'S', 'xpostag': 'S', 'fea ..., type: <class 'estnltk.layer.span.Span'>",()
Nähtamatu,9,nähtamatu,A,A,"OrderedDict([('pos', 'pos'), ('sg', 'sg'), ('nom', 'nom')])",10,amod,_,_,"Span('poiss', [{'id': 10, 'lemma': 'poiss', 'upostag': 'S', 'xpostag': 'S', 'fea ..., type: <class 'estnltk.layer.span.Span'>",()
poiss,10,poiss,S,S,"OrderedDict([('com', 'com'), ('sg', 'sg'), ('nom', 'nom')])",7,appos,_,_,"Span('lavastusele', [{'id': 7, 'lemma': 'lavastus', 'upostag': 'S', 'xpostag': ' ..., type: <class 'estnltk.layer.span.Span'>","('Span(\'""\', [{\'id\': 8, \'lemma\': \'""\', \'upostag\': \'Z\', \'xpostag\': \' ..., type: <class 'tuple'>, length: 4"


## 3. Assemble output

In [7]:
# Put it all together
def print_token(token, fp, syntax_layer='stanza_syntax'):
    """Output one word as a single tab-separated row of 21 fields.

    Columns, in order: word, longtag, lempos, features, root_tokens, root,
    ending, clitic, extended_feat, punctuation_type, pronoun_type,
    finite_verb, subcat, syn_id, syn_head, syn_rel, head_word, head_lemma,
    head_tag, head_features, head_syn_rel.

    Only the first annotation of the word is used, both for 'morph_analysis'
    and for 'morph_extended'. Newlines inside the row are replaced by spaces.

    Parameters
    ----------
    token
        A word span that gives access to the 'morph_analysis',
        'morph_extended' and `syntax_layer` layers.
    fp
        Object with a `write` method that receives the row followed by a
        newline. If it is falsy (e.g. None), the row is printed to stdout.
    syntax_layer
        Name of the syntax layer to read, 'stanza_syntax' by default. Pass
        'stanza_ensemble_syntax' for the output of
        StanzaSyntaxEnsembleTagger.
    """
    #  ****  Old fields
    ana = token.morph_analysis.annotations[0]
    tag = ana.partofspeech
    form_parts = ana.form.split()
    word = token.text
    features = '_'.join(form_parts)
    longtag = '.'.join([tag] + form_parts)
    lempos = f'{ana.lemma}-{tag.lower()}'
    root = ana.root
    root_tokens = ' '.join(ana.root_tokens)
    clitic = ana.clitic
    ending = ana.ending

    #  ****  New fields
    # First, there are morphosyntactic annotations
    # from the 'morph_extended' layer (this info is
    # also used by stanza)
    morph_extended = token.morph_extended.annotations[0]
    extended_feat = '_'.join(morph_extended.form.split())
    # There are some fields, which may not be filled in
    # for every word. Check that values are not None
    # before including them
    punctuation_type = pronoun_type = finite_verb = subcat = ''
    if morph_extended.punctuation_type is not None:
        punctuation_type = morph_extended.punctuation_type
    if morph_extended.pronoun_type is not None:
        pronoun_type = '_'.join(morph_extended.pronoun_type)
    if morph_extended.fin is not None:
        finite_verb = 'fin' if morph_extended.fin else 'inf'
    if morph_extended.subcat is not None:
        subcat = '_'.join(morph_extended.subcat)

    # Syntactic info from the selected syntax layer
    syntax = getattr(token, syntax_layer)
    syn_id = syntax.id
    syn_head = syntax.head
    syn_rel = syntax.deprel

    # Finally, add information about the syntactic head / parent
    # (if the word is syntactic root, these fields will be empty)
    head_word = head_lemma = head_tag = head_features = head_syn_rel = ''
    parent_token = syntax.parent_span
    if parent_token is not None:
        # Get features of the parent token:
        parent_analysis = parent_token.morph_analysis.annotations[0]
        head_word = parent_token.text
        head_lemma = parent_analysis.lemma
        head_tag = parent_analysis.partofspeech
        head_features = '_'.join(parent_analysis.form.split())
        head_syn_rel = getattr(parent_token, syntax_layer).deprel

    fields = (
        word, longtag, lempos, features, root_tokens, root, ending, clitic,
        extended_feat, punctuation_type, pronoun_type, finite_verb, subcat,
        syn_id, syn_head, syn_rel,
        head_word, head_lemma, head_tag, head_features, head_syn_rel,
    )
    # A row must stay on one line, so newlines inside values become spaces
    out_str = '\t'.join(str(field) for field in fields).replace('\n', ' ')
    if fp:
        fp.write( out_str + '\n' )
    else:
        print( out_str )

In [8]:
# Print one tab-separated row per word; with fp=None the rows go to stdout
for token in t.words:
    print_token(token, None)

Vormi	S.sg.g	vorm-s	sg_g	vorm	vorm	0		com_sg_gen					1	2	nmod	mängude	mäng	S	pl_g	nmod
mängude	S.pl.g	mäng-s	pl_g	mäng	mäng	de		com_pl_gen					2	5	nmod	valiku	valik	S	sg_g	nmod
ja	J	ja-j		ja	ja	0		sub_crd					3	4	cc	materjali	materjal	S	sg_g	conj
materjali	S.sg.g	materjal-s	sg_g	materjal	materjal	0		com_sg_gen					4	2	conj	mängude	mäng	S	pl_g	nmod
valiku	S.sg.g	valik-s	sg_g	valik	valik	0		com_sg_gen					5	7	nmod	lavastusele	lavastus	S	sg_all	nmod
eest	K	eest-k		eest	eest	0		post				gen	6	5	case	valiku	valik	S	sg_g	nmod
lavastusele	S.sg.all	lavastus-s	sg_all	lavastus	lavastus	le		com_sg_all					7	34	nmod	lavastuses	lavastus	S	sg_in	nmod
"	Z	"-z		"	"				Quo				8	10	punct	poiss	poiss	S	sg_n	appos
Nähtamatu	A.sg.n	nähtamatu-a	sg_n	nähtamatu	nähtamatu	0		pos_sg_nom					9	10	amod	poiss	poiss	S	sg_n	appos
poiss	S.sg.n	poiss-s	sg_n	poiss	poiss	0		com_sg_nom					10	7	appos	lavastusele	lavastus	S	sg_all	nmod
"	Z	"-z		"	"				Quo				11	10	punct	poiss	poiss	S	sg_n	appos
Haapsalu	G	haapsalu-g		haap 

In [9]:
# Just for testing: separate rows with ===
# (a line of 100 '=' characters is printed after every row, which makes it
#  easy to see where one row ends and the next one starts)
for token in t.words:
    print_token(token, None)
    print('='*100)

Vormi	S.sg.g	vorm-s	sg_g	vorm	vorm	0		com_sg_gen					1	2	nmod	mängude	mäng	S	pl_g	nmod
mängude	S.pl.g	mäng-s	pl_g	mäng	mäng	de		com_pl_gen					2	5	nmod	valiku	valik	S	sg_g	nmod
ja	J	ja-j		ja	ja	0		sub_crd					3	4	cc	materjali	materjal	S	sg_g	conj
materjali	S.sg.g	materjal-s	sg_g	materjal	materjal	0		com_sg_gen					4	2	conj	mängude	mäng	S	pl_g	nmod
valiku	S.sg.g	valik-s	sg_g	valik	valik	0		com_sg_gen					5	7	nmod	lavastusele	lavastus	S	sg_all	nmod
eest	K	eest-k		eest	eest	0		post				gen	6	5	case	valiku	valik	S	sg_g	nmod
lavastusele	S.sg.all	lavastus-s	sg_all	lavastus	lavastus	le		com_sg_all					7	34	nmod	lavastuses	lavastus	S	sg_in	nmod
"	Z	"-z		"	"				Quo				8	10	punct	poiss	poiss	S	sg_n	appos
Nähtamatu	A.sg.n	nähtamatu-a	sg_n	nähtamatu	nähtamatu	0		pos_sg_nom					9	10	amod	poiss	poiss	S	sg_n	appos
poiss	S.sg.n	poiss-s	sg_n	poiss	poiss	0		com_sg_nom					10	7	appos	lavastusele	lavastus	S	sg_all	nmod
"	Z	"-z		"	"				Quo				11	10	punct	poiss	poiss	S	sg_n	appos
Haapsalu	G	haapsalu-g		haap 