# Morphological analysis with premorph and postmorph

In [1]:
from estnltk import Text
from estnltk.resolve_layer_dag import make_resolver

Premorph and postmorph are tools to improve the morphological analysis given by vabamorf. Premorph normalizes the input before giving it to vabamorf, postmorph normalizes the output of vabamorf. By default, premorph and postmorph are both executed.

In [2]:
# Let's take a sentence that contains an unnecessary hyphen and an incorrectly declined number.
t = 'Mis lil-li müüs Tiit 10e krooniga?'
text = Text(t)
text

text
Mis lil-li müüs Tiit 10e krooniga?


In [3]:
text.tag_layer()

text
Mis lil-li müüs Tiit 10e krooniga?

layer,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,9
compound_tokens,type,,tokens,False,1
words,,,,False,7
normalized_words,normal,words,,False,1
morph_analysis,"lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,7
sentences,,,words,False,1


In [4]:
# Spelling out the resolver options explicitly is equivalent to the previous cell.
resolver = make_resolver(disambiguate=True,
                         guess=True,
                         propername=True,
                         phonetic=False,
                         compound=True)
text = Text(t)
text.tag_layer(resolver=resolver)

text
Mis lil-li müüs Tiit 10e krooniga?

layer,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,9
compound_tokens,type,,tokens,False,1
words,,,,False,7
normalized_words,normal,words,,False,1
morph_analysis,"lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,7
sentences,,,words,False,1


In [5]:
text['words']

text
Mis
lil-li
müüs
Tiit
10e
krooniga
?


In [6]:
text = Text('vä-ga.hea oli  sh. uus.')
text.tag_layer()

text
vä-ga.hea oli sh. uus.

layer,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,10
compound_tokens,type,,tokens,False,2
words,,,,False,7
normalized_words,normal,words,,False,1
morph_analysis,"lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,7
sentences,,,words,False,1


In [7]:
text['words']

text
vä-ga
.
hea
oli
sh.
uus
.


In [8]:
text['sentences']

text
vä-ga.hea oli sh. uus.


In [9]:
# And let's tag the default layers on the sentence.
text.tag_layer()

text
vä-ga.hea oli sh. uus.

layer,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,10
compound_tokens,type,,tokens,False,2
words,,,,False,7
normalized_words,normal,words,,False,1
morph_analysis,"lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,7
sentences,,,words,False,1


The default layer tagged by the tag_layer() method is the morphological analysis layer. To perform morphological analysis, the 'tokens', 'compound_tokens', 'words', 'normalized_words' and 'sentences' layers are tagged on the text first.

We can also see the existing layers with the Text class 'layers' attribute.

In [10]:
text.layers

{'compound_tokens': <estnltk.text.Layer at 0x7fb3079e4fd0>,
 'morph_analysis': <estnltk.text.Layer at 0x7fb307939630>,
 'normalized_words': <estnltk.text.Layer at 0x7fb3079bc898>,
 'sentences': <estnltk.text.Layer at 0x7fb3079bc518>,
 'tokens': <estnltk.text.Layer at 0x7fb3079e4f28>,
 'words': <estnltk.text.Layer at 0x7fb3079e4f98>}

In [11]:
# And now we can ask for morphological analysis of the sentence
text['morph_analysis']

text,lemma,root,root_tokens,ending,clitic,form,partofspeech
vä-ga,väga,väga,"(väga,)",0,,,D
.,.,.,"(.,)",,,,Z
hea,hea,hea,"(hea,)",0,,sg n,A
oli,olema,ole,"(ole,)",i,,s,V
sh.,sh,sh,"(sh,)",0,,?,Y
uus,uus,uus,"(uus,)",0,,sg n,A
.,.,.,"(.,)",,,,Z


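As we can see, the hyphen was removed from "vä-ga" and the correct lemma "väga" was found. In the same way, for the first example sentence the hyphen is removed and the correct form of the word "lilli" is found, and for 10e the form 'sg g' is given out (see the `morph_analysis` output for that sentence further below).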

In [12]:
#--------IMPORTANT-----------------#
# TODO: explain how to switch disambiguation and/or guessing on/off in Vabamorf.
# Open question: does this have to be done using VabamorfTagger, or can it be done with the tag_layer() method? How?

The previous code actually uses the WordNormalizingTagger, VabamorfTagger and VabamorfCorrectionRewriter tools. So, we can write it out as follows:

In [13]:
# Import the taggers and rewriter
from estnltk.taggers import WordNormalizingTagger
from estnltk.taggers import VabamorfTagger
from estnltk.rewriting import VabamorfCorrectionRewriter

An instance of VabamorfCorrectionRewriter is created that fixes the analysis of tokens containing numbers. This is the rewriter that is used for postmorph by default.
In our example, it processes the 10e token.
If we want, we can write our own rewriter and use that instead.

In [14]:
# Create an instance of VabamorfCorrectionRewriter 
vabamorf_corrector = VabamorfCorrectionRewriter(replace=True)
vabamorf_corrector

class,VabamorfCorrectionRewriter
_replace,True
_rules_file,/home/paul/workspace/estnltk/estnltk/rewriting/postmorph/rules_files/number_analysis_rules.csv
_pronoun_correction,True


In [15]:
# Let's take the same sentence as previously
text = Text(t)

In [16]:
# And tag the layer 'words' on it.
text.tag_layer(['words'])

text
Mis lil-li müüs Tiit 10e krooniga?

layer,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,9
compound_tokens,type,,tokens,False,1
words,,,,False,7


In [17]:
# Now we can ask for the words layer
text['words']

text
Mis
lil-li
müüs
Tiit
10e
krooniga
?


Now we can normalize the sentence with WordNormalizingTagger, which takes care of the unnecessary hyphens in words (TODO: list any other normalizations it performs). In our example, it normalizes the word "lil-li" to "lilli". It can be replaced with our own tagger if we decide to write one.

In [18]:
from estnltk.rewriting import MorphAnalyzedToken
mat = MorphAnalyzedToken('lil-li')
mat.normal

MorphAnalyzedToken('lilli')

In [19]:
# Normalize the sentence with WordNormalizingTagger
WordNormalizingTagger().tag(text)

text
Mis lil-li müüs Tiit 10e krooniga?

layer,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,9
compound_tokens,type,,tokens,False,1
words,,,,False,7
normalized_words,normal,words,,False,1


In [20]:
# And we can see the word that was changed
text['normalized_words']

text,normal
lil-li,lilli


Now we can use the VabamorfTagger on the normalized layer received from WordNormalizingTagger and ask for the created VabamorfCorrectionRewriter to be used after vabamorf.

In [21]:
# Tag the text with VabamorfTagger using default premorph and postmorph
# NOTE(review): the keyword is spelled `premorf_layer` here but `premorph_layer` in the cell that
# turns premorph off — confirm which spelling VabamorfTagger actually accepts.
VabamorfTagger(premorf_layer='normalized_words', postmorph_rewriter=vabamorf_corrector).tag(text)

text
Mis lil-li müüs Tiit 10e krooniga?

layer,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,9
compound_tokens,type,,tokens,False,1
words,,,,False,7
normalized_words,normal,words,,False,1
morph_analysis,"lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,7


In [22]:
# The same output is received as in the first example
text['morph_analysis']

text,lemma,root,root_tokens,ending,clitic,form,partofspeech
Mis,mis,mis,"(mis,)",0,,pl n,P
,mis,mis,"(mis,)",0,,sg n,P
lil-li,lill,lill,"(lill,)",i,,pl p,S
müüs,müüma,müü,"(müü,)",s,,s,V
Tiit,Tiit,Tiit,"(Tiit,)",0,,sg n,H
10e,10,10,[10],0,,sg g,N
krooniga,kroon,kroon,"(kroon,)",ga,,sg kom,S
?,?,?,"(?,)",,,,Z


As mentioned, we can customize premorph or postmorph.

To turn off postmorph for the same example, we need to set postmorph_rewriter to None:

In [23]:
text = Text(t)

In [24]:
text.tag_layer(['words'])

text
Mis lil-li müüs Tiit 10e krooniga?

layer,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,9
compound_tokens,type,,tokens,False,1
words,,,,False,7


In [25]:
WordNormalizingTagger().tag(text)

text
Mis lil-li müüs Tiit 10e krooniga?

layer,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,9
compound_tokens,type,,tokens,False,1
words,,,,False,7
normalized_words,normal,words,,False,1


In [26]:
# postmorph_rewriter = None says that we don't want to apply the default rewriter
# NOTE(review): the keyword is spelled `premorf_layer` here but `premorph_layer` in the cell that
# turns premorph off — confirm which spelling VabamorfTagger actually accepts.
VabamorfTagger(premorf_layer='normalized_words', postmorph_rewriter=None).tag(text)

text
Mis lil-li müüs Tiit 10e krooniga?

layer,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,9
compound_tokens,type,,tokens,False,1
words,,,,False,7
normalized_words,normal,words,,False,1
morph_analysis,"lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,7


In [27]:
text['morph_analysis']

text,lemma,root,root_tokens,ending,clitic,form,partofspeech
Mis,mis,mis,"(mis,)",0,,pl n,P
,mis,mis,"(mis,)",0,,sg n,P
lil-li,lill,lill,"(lill,)",i,,pl p,S
müüs,müüma,müü,"(müü,)",s,,s,V
Tiit,Tiit,Tiit,"(Tiit,)",0,,sg n,H
10e,10e,10e,"(10e,)",0,,?,Y
krooniga,kroon,kroon,"(kroon,)",ga,,sg kom,S
?,?,?,"(?,)",,,,Z


Now the token '10e' received only one analysis (partofspeech 'Y' with the unknown form '?') which, unfortunately, is not correct.

We can also turn off premorph:

In [28]:
text = Text(t)

In [29]:
text.tag_layer(['words'])

text
Mis lil-li müüs Tiit 10e krooniga?

layer,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,9
compound_tokens,type,,tokens,False,1
words,,,,False,7


In [30]:
# premorph_layer = None says that we don't want to use a normalized (premorph) layer;
# the postmorph rewriter created earlier is still applied.
# NOTE(review): the other cells spell this keyword `premorf_layer` — confirm which spelling
# VabamorfTagger actually accepts.
VabamorfTagger(premorph_layer=None, postmorph_rewriter=vabamorf_corrector).tag(text)

text
Mis lil-li müüs Tiit 10e krooniga?

layer,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,9
compound_tokens,type,,tokens,False,1
words,,,,False,7
morph_analysis,"lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,7


In [31]:
text['morph_analysis']

text,lemma,root,root_tokens,ending,clitic,form,partofspeech
Mis,mis,mis,"(mis,)",0,,pl n,P
,mis,mis,"(mis,)",0,,sg n,P
lil-li,lil-li,lil-li,"(lil, li)",0,,?,Y
müüs,müüma,müü,"(müü,)",s,,s,V
Tiit,Tiit,Tiit,"(Tiit,)",0,,sg n,H
10e,10,10,[10],0,,sg g,N
krooniga,kroon,kroon,"(kroon,)",ga,,sg kom,S
?,?,?,"(?,)",,,,Z


In [32]:
text

text
Mis lil-li müüs Tiit 10e krooniga?

layer,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,9
compound_tokens,type,,tokens,False,1
words,,,,False,7
morph_analysis,"lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,7


Here we can see that the word "lil-li" was not normalized and therefore didn't receive the correct analysis.

In [33]:
# Can't we switch off premorph and postmorph some easier way?
# Somehow without explicitly creating the VabamorfTagger?

## Use the `morph_analysis` layer to create a `corrected_morph` layer


In [34]:
# What is the purpose of this?
# According to Sven, this shows how to tag your own layer with a custom postmorph rewriter
# in addition to the default layer.
# The example code is too complicated  and undocumented - impossible to understand.
# An easier example would be nice that wouldn't try to contain everything.

1. Create a text object.
2. Tag the `normalized_words` layer (the `words` layer that it depends on is tagged along with it).
3. Create a layer `_morph` that contains the data from the layers `morph_analysis` and `normalized_words`.
4. Rewrite the `_morph` layer and get the `corrected_morph` layer as a result.
5. Attach the `corrected_morph` layer to the text object.

Now `text.corrected_morph` is the same as `text.morph_analysis` in the first example where premorph and postmorph are executed.

In [35]:
# Import the necessary stuff
from estnltk.text import Span, Layer
from estnltk.rewriting.postmorph.vabamorf_corrector import VabamorfCorrectionRewriter

In [36]:
# Tag `normalized_words` (the layers it depends on are tagged as well, see the table below)
# and run VabamorfTagger with postmorph switched off, so the correction can be applied by hand.
# NOTE(review): the keyword is spelled `premorf_layer` here but `premorph_layer` in the cell that
# turns premorph off — confirm which spelling VabamorfTagger actually accepts.
text = Text('Mis lil-li müüs Tiit 10e krooniga?')
text.tag_layer(['normalized_words'])
VabamorfTagger(premorf_layer='normalized_words', postmorph_rewriter=None).tag(text)

text
Mis lil-li müüs Tiit 10e krooniga?

layer,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,9
compound_tokens,type,,tokens,False,1
words,,,,False,7
normalized_words,normal,words,,False,1
morph_analysis,"lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,7


In [37]:
# The helper layer carries every attribute of `morph_analysis` plus an extra `word_normal` attribute.
morph_attributes = text['morph_analysis'].attributes
attributes = morph_attributes + ['word_normal']
attributes

['lemma',
 'root',
 'root_tokens',
 'ending',
 'clitic',
 'form',
 'partofspeech',
 'word_normal']

In [38]:
# Build an empty, ambiguous layer that will hold the morph attributes plus `word_normal`.
# It is only kept in the variable `_morph`; this cell does not attach it to the text object.
# NOTE(review): the layer is named 'words' (the same as its parent) although the variable and the
# prose call it `_morph` — confirm whether this name is intentional before changing it.
_morph = Layer(name='words',
               parent='words',
               ambiguous=True,
               attributes=attributes
               )

In [39]:
# Copy every analysis of every word into `_morph` and add the word's normalized form
# (or its surface text when `normal` is falsy) as the extra `word_normal` attribute.
for word_span, word_analyses in zip(text.words, text.morph_analysis):
    for word_analysis in word_analyses:
        morph_span = _morph.add_span(Span(parent=word_span))
        for attr_name in morph_attributes:
            attr_value = getattr(word_analysis, attr_name)
            setattr(morph_span, attr_name, attr_value)
        morph_span.word_normal = word_span.normal or word_span.text

In [40]:
postmorph_rewriter = VabamorfCorrectionRewriter()

In [41]:
# Run the postmorph rewriter over `_morph`: it is given `attributes` (the morph attributes plus
# `word_normal`) as source attributes and only the morph attributes as target attributes.
# The result is requested as an ambiguous layer named 'corrected_morph'.
corrected_morph = _morph.rewrite(source_attributes=attributes,
                                 target_attributes=morph_attributes, 
                                 rules=postmorph_rewriter,
                                 name='corrected_morph',
                                 ambiguous=True)

In [42]:
text['corrected_morph'] = corrected_morph

In [43]:
text['corrected_morph']

text,lemma,root,root_tokens,ending,clitic,form,partofspeech
Mis,mis,mis,"(mis,)",0,,pl n,P
,mis,mis,"(mis,)",0,,sg n,P
lil-li,lill,lill,"(lill,)",i,,pl p,S
müüs,müüma,müü,"(müü,)",s,,s,V
Tiit,Tiit,Tiit,"(Tiit,)",0,,sg n,H
10e,10,10,[10],0,,sg g,N
krooniga,kroon,kroon,"(kroon,)",ga,,sg kom,S
?,?,?,"(?,)",,,,Z


The lemma, root and root tokens of an ordinal number end with a period.

In [44]:
text = Text('3-a')
text.tag_layer()
text['morph_analysis']

text,lemma,root,root_tokens,ending,clitic,form,partofspeech
3,3,3,[3],0.0,,?,N
-,-,-,"(-,)",,,,Z
a,a,a,"(a,)",0.0,,?,Y
