# Syntax preprocessing

This tutorial describes the tools needed to convert morphologically analysed Estonian text into a format that can be parsed with the **VISL CG3 parser**. VISL CG3 is a rule-based syntactic parser that has thousands of handcrafted, Estonian-specific rules for tagging syntactic functions and dependencies. However, the parser needs slightly more information than the standard morphological analyser provides (e.g. pronoun types, verb subcategorization), and it expects its input in a different format. Therefore, several taggers, rewriters, and an exporter are needed; they are described in the rest of this tutorial.

## Syntax preprocessing taggers and rewriters


|tagger|rewriters|source attributes|target attributes|values|
|------|---------|-----------------|-----------------|------|
|&nbsp;| PunctuationTypeRewriter | partofspeech, root|punctuation_type|```None```, 'Fst', 'Com', 'Col', ... |
|&nbsp;| MorphToSyntaxMorphRewriter|partofspeech, form|partofspeech, form||
|PronounTypeTagger|PronounTypeRewriter|root, ending, clitic, partofspeech|pronoun_type| ```None```, ('det',), ('pers ps3',), ('pos', 'det', 'refl'), ... |
|&nbsp;|RemoveDuplicateAnalysesRewriter|root, ending, clitic, partofspeech, form|&nbsp;|&nbsp;|
|&nbsp;|RemoveAdpositionAnalysesRewriter|partofspeech, form|&nbsp;|&nbsp;|
|&nbsp;|LetterCaseRewriter|word_text|letter_case|```None```, 'cap'|
|FiniteFormTagger|MorphToSyntaxMorphRewriter, FiniteFormRewriter|partofspeech, form|fin|```None```, ```True```, ```False```|
|VerbExtensionSuffixTagger|VerbExtensionSuffixRewriter|root|verb_extension_suffix|```None```,'tud','nud','mine','nu','tu','v','tav','mata','ja'|
|SubcatTagger|MorphToSyntaxMorphRewriter, SubcatRewriter|root, partofspeech, form|subcat|```None```, 'Intr', 'Part', 'gen', ...|
|MorphExtendedTagger|PunctuationTypeRewriter, MorphToSyntaxMorphRewriter, PronounTypeRewriter, RemoveDuplicateAnalysesRewriter, RemoveAdpositionAnalysesRewriter, LetterCaseRewriter, FiniteFormRewriter, VerbExtensionSuffixRewriter, SubcatRewriter|root, ending, clitic, partofspeech, form|partofspeech, form, punctuation_type, pronoun_type, letter_case, fin, verb_extension_suffix, subcat ||

In [1]:
from estnltk import Text
from estnltk.core import abs_path

## PronounTypeRetagger

Create an instance of `PronounTypeRetagger`.

In [2]:
# NBVAL_IGNORE_OUTPUT
from estnltk.taggers import PronounTypeRetagger

retagger = PronounTypeRetagger('morph_analysis')
retagger

name,output layer,output attributes,input layers
PronounTypeRetagger,morph_analysis,"('pronoun_type',)","('morph_analysis',)"

0,1
check_output_consistency,False
pronoun_types,"defaultdict(<class 'tuple'>, {'emb-kumb': ('det',), 'eikeegi': ('indef',), 'eimi ..., type: <class 'collections.defaultdict'>, length: 98"


Run the retagger and observe that the `pronoun_type` attribute is added to the `morph_analysis` layer.

In [3]:
text = Text('Kumb, sina või mina?').tag_layer()

retagger.retag(text)
text.morph_analysis

layer name,attributes,parent,enveloping,ambiguous,span count
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech, pronoun_type",words,,True,6

text,normalized_text,lemma,root,root_tokens,ending,clitic,form,partofspeech,pronoun_type
Kumb,Kumb,kumb,kumb,['kumb'],0.0,,sg n,P,"('rel',)"
",",",",",",",","[',']",,,,Z,
sina,sina,sina,sina,['sina'],0.0,,sg n,P,"('ps2',)"
või,või,või,või,['või'],0.0,,,J,
mina,mina,mina,mina,['mina'],0.0,,sg n,S,
?,?,?,?,['?'],,,,Z,


## FiniteFormTagger

In [4]:
from estnltk.taggers import FiniteFormTagger

fsToSyntFulesFile = abs_path('taggers/syntax_preprocessing/rules_files/tmorftrtabel.txt')

tagger = FiniteFormTagger('morph_analysis', fs_to_synt_rules_file=fsToSyntFulesFile)
tagger

name,output layer,output attributes,input layers
FiniteFormTagger,morph_extended,"('normalized_text', 'lemma', 'root', 'root_tokens', 'ending', 'clitic', 'form', 'partofspeech', 'fin')","('morph_analysis',)"

0,1
morph_to_syntax_morph_tagger,"MorphToSyntaxMorphRetagger(('morph_analysis',)->morph_extended)"
finite_form_retagger,"FiniteFormRetagger(('morph_extended',)->morph_extended)"


In [5]:
text = Text('laulma hüpelnud tantsija').tag_layer()

tagger.tag(text)

text.morph_extended

layer name,attributes,parent,enveloping,ambiguous,span count
morph_extended,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech, fin",morph_analysis,,True,3

text,normalized_text,lemma,root,root_tokens,ending,clitic,form,partofspeech,fin
laulma,laulma,laulma,laul,['laul'],ma,,mod sup ps ill,V,False
,laulma,laulma,laul,['laul'],ma,,aux sup ps ill,V,False
,laulma,laulma,laul,['laul'],ma,,main sup ps ill,V,False
hüpelnud,hüpelnud,hüplema,hüple,['hüple'],nud,,mod indic impf ps neg,V,True
,hüpelnud,hüplema,hüple,['hüple'],nud,,mod partic past ps,V,False
,hüpelnud,hüplema,hüple,['hüple'],nud,,aux indic impf ps neg,V,True
,hüpelnud,hüplema,hüple,['hüple'],nud,,aux partic past ps,V,False
,hüpelnud,hüplema,hüple,['hüple'],nud,,main indic impf ps neg,V,True
,hüpelnud,hüplema,hüple,['hüple'],nud,,main partic past ps,V,False
,hüpelnud,hüpelnud,hüpel=nud,['hüpelnud'],0,,pos,A,


## VerbExtensionSuffixRetagger

In [6]:
# NBVAL_IGNORE_OUTPUT
from estnltk.taggers import VerbExtensionSuffixRetagger

retagger = VerbExtensionSuffixRetagger('morph_analysis')
retagger

name,output layer,output attributes,input layers
VerbExtensionSuffixRetagger,morph_analysis,"('verb_extension_suffix',)","('morph_analysis',)"

0,1
check_output_consistency,False
function,<function rewrite at 0x000001E06F487C80>


In [7]:
text = Text('Laulev hüpelnud tantsija').tag_layer()
retagger.retag(text)
text.morph_analysis

layer name,attributes,parent,enveloping,ambiguous,span count
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech, verb_extension_suffix",words,,True,3

text,normalized_text,lemma,root,root_tokens,ending,clitic,form,partofspeech,verb_extension_suffix
Laulev,Laulev,laulev,laulev,['laulev'],0,,sg n,A,()
hüpelnud,hüpelnud,hüplema,hüple,['hüple'],nud,,nud,V,()
,hüpelnud,hüpelnud,hüpel=nud,['hüpelnud'],0,,,A,"('nud',)"
,hüpelnud,hüpelnud,hüpel=nud,['hüpelnud'],0,,sg n,A,"('nud',)"
,hüpelnud,hüpelnud,hüpel=nud,['hüpelnud'],d,,pl n,A,"('nud',)"
tantsija,tantsija,tantsija,tantsija,['tantsija'],0,,sg n,S,()


## SubcatTagger

In [8]:
from os.path import relpath
from estnltk.taggers import SubcatTagger

fsToSyntFulesFile = relpath(abs_path('taggers/syntax_preprocessing/rules_files/tmorftrtabel.txt'))
subcatFile = relpath(abs_path('taggers/syntax_preprocessing/rules_files/abileksikon06utf.lx'))

tagger = SubcatTagger('morph_analysis', 'morph_extended',
                        fs_to_synt_rules_file=fsToSyntFulesFile, 
                        subcat_rules_file=subcatFile)
tagger

name,output layer,output attributes,input layers
SubcatTagger,morph_extended,"('normalized_text', 'lemma', 'root', 'root_tokens', 'ending', 'clitic', 'form', 'partofspeech', 'subcat')","('morph_analysis',)"

0,1
morph_to_syntax_morph_tagger,"MorphToSyntaxMorphRetagger(('morph_analysis',)->morph_extended)"
subcat_retagger,"SubcatRetagger(('morph_extended',)->morph_extended)"


In [9]:
text = Text('Järel juurduma').tag_layer()
tagger.tag(text)
text.morph_extended

layer name,attributes,parent,enveloping,ambiguous,span count
morph_extended,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech, subcat",morph_analysis,,True,2

text,normalized_text,lemma,root,root_tokens,ending,clitic,form,partofspeech,subcat
Järel,Järel,järel,järel,['järel'],0,,post,K,"('gen',)"
,Järel,järel,järel,['järel'],0,,pre,K,
juurduma,juurduma,juurduma,juurdu,['juurdu'],ma,,mod sup ps ill,V,"('Intr',)"
,juurduma,juurduma,juurdu,['juurdu'],ma,,aux sup ps ill,V,"('Intr',)"
,juurduma,juurduma,juurdu,['juurdu'],ma,,main sup ps ill,V,"('Intr',)"


## MorphExtendedTagger

In [10]:
from estnltk.taggers import MorphExtendedTagger

fsToSyntFulesFile = relpath(abs_path('taggers/syntax_preprocessing/rules_files/tmorftrtabel.txt'))
subcat_file = relpath(abs_path('taggers/syntax_preprocessing/rules_files/abileksikon06utf.lx'))

tagger = MorphExtendedTagger(fs_to_synt_rules_file=fsToSyntFulesFile, 
                             allow_to_remove_all=False, 
                             subcat_rules_file=subcat_file)
tagger

name,output layer,output attributes,input layers
MorphExtendedTagger,morph_extended,"('normalized_text', 'lemma', 'root', 'root_tokens', 'ending', 'clitic', 'form', 'partofspeech', 'punctuation_type', 'pronoun_type', 'letter_case', 'fin', 'verb_extension_suffix', 'subcat')","('morph_analysis',)"

0,1
punctuation_type_retagger,"PunctuationTypeRetagger(('morph_extended',)->morph_extended)"
morph_to_syntax_morph_retagger,"MorphToSyntaxMorphRetagger(('morph_analysis',)->morph_extended)"
pronoun_type_retagger,"PronounTypeRetagger(('morph_extended',)->morph_extended)"
letter_case_retagger,"LetterCaseRetagger(('morph_extended',)->morph_extended)"
remove_adposition_analyses_retagger,"RemoveAdpositionAnalysesRetagger(('morph_extended',)->morph_extended)"
finite_form_retagger,"FiniteFormRetagger(('morph_extended',)->morph_extended)"
verb_extension_suffix_retagger,"VerbExtensionSuffixRetagger(('morph_extended',)->morph_extended)"
subcat_retagger,"SubcatRetagger(('morph_extended',)->morph_extended)"


In [11]:
text = Text('Ta on rääkinud!').tag_layer()
tagger.tag(text)
text['morph_extended']

layer name,attributes,parent,enveloping,ambiguous,span count
morph_extended,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech, punctuation_type, pronoun_type, letter_case, fin, verb_extension_suffix, subcat",morph_analysis,,True,4

text,normalized_text,lemma,root,root_tokens,ending,clitic,form,partofspeech,punctuation_type,pronoun_type,letter_case,fin,verb_extension_suffix,subcat
Ta,Ta,tema,tema,['tema'],0,,sg nom,P,,['ps3'],cap,,[],
on,on,olema,ole,['ole'],0,,mod indic pres ps3 sg ps af,V,,,,True,[],['Intr']
,on,olema,ole,['ole'],0,,aux indic pres ps3 sg ps af,V,,,,True,[],['Intr']
,on,olema,ole,['ole'],0,,main indic pres ps3 sg ps af,V,,,,True,[],['Intr']
,on,olema,ole,['ole'],0,,mod indic pres ps3 pl ps af,V,,,,True,[],['Intr']
,on,olema,ole,['ole'],0,,aux indic pres ps3 pl ps af,V,,,,True,[],['Intr']
,on,olema,ole,['ole'],0,,main indic pres ps3 pl ps af,V,,,,True,[],['Intr']
rääkinud,rääkinud,rääkima,rääki,['rääki'],nud,,mod indic impf ps neg,V,,,,True,[],"['Part-P', 'El']"
,rääkinud,rääkima,rääki,['rääki'],nud,,mod partic past ps,V,,,,False,[],"['Part-P', 'El']"
,rääkinud,rääkima,rääki,['rääki'],nud,,aux indic impf ps neg,V,,,,True,[],"['Part-P', 'El']"


## CG3 exporter

In [12]:
from estnltk.converters.CG3_exporter import export_CG3
text = Text('Lähme! Ta on rääkinud.')
text.analyse('syntax_preprocessing')
export_CG3(text)

['"<s>"',
 '"<Lähme>"',
 '    "mine" Lme V mod indic pres ps1 pl ps af cap <FinV>',
 '    "mine" Lme V aux indic pres ps1 pl ps af cap <FinV>',
 '    "mine" Lme V main indic pres ps1 pl ps af cap <FinV>',
 '"<!>"',
 '    "!" Z Exc',
 '"</s>"',
 '"<s>"',
 '"<Ta>"',
 '    "tema" L0 P pers ps3 sg nom cap',
 '"<on>"',
 '    "ole" L0 V mod indic pres ps3 sg ps af <FinV> <Intr>',
 '    "ole" L0 V aux indic pres ps3 sg ps af <FinV> <Intr>',
 '    "ole" L0 V main indic pres ps3 sg ps af <FinV> <Intr>',
 '    "ole" L0 V mod indic pres ps3 pl ps af <FinV> <Intr>',
 '    "ole" L0 V aux indic pres ps3 pl ps af <FinV> <Intr>',
 '    "ole" L0 V main indic pres ps3 pl ps af <FinV> <Intr>',
 '"<rääkinud>"',
 '    "rääki" Lnud V mod indic impf ps neg <FinV> <Part-P> <El>',
 '    "rääki" Lnud V mod partic past ps <Part-P> <El>',
 '    "rääki" Lnud V aux indic impf ps neg <FinV> <Part-P> <El>',
 '    "rääki" Lnud V aux partic past ps <Part-P> <El>',
 '    "rääki" Lnud V main indic impf ps neg <FinV> <Par