# Syntax preprocessing taggers, rewriters and exporter


|tagger|rewriters|source attributes|target attributes|values|
|------|---------|-----------------|-----------------|------|
||PunctuationTypeRewriter|partofspeech, root|punctuation_type|```None```, 'Fst', 'Com', 'Col', ... |
||MorphToSyntaxMorphRewriter|partofspeech, form|partofspeech, form||
|PronounTypeTagger|PronounTypeRewriter|root, ending, clitic, partofspeech|pronoun_type| ```None```, ('det',), ('pers ps3',), ('pos', 'det', 'refl'), ... |
||RemoveDuplicateAnalysesRewriter|root, ending, clitic, partofspeech, form|||
||RemoveAdpositionAnalysesRewriter|partofspeech, form|||
||LetterCaseRewriter|word_text|letter_case|```None```, 'cap'|
|FiniteFormTagger|MorphToSyntaxMorphRewriter, FiniteFormRewriter|partofspeech, form|fin|```None```, ```True```, ```False```|
|VerbExtensionSuffixTagger|VerbExtensionSuffixRewriter|root|verb_extension_suffix|```None```,'tud','nud','mine','nu','tu','v','tav','mata','ja'|
|SubcatTagger|MorphToSyntaxMorphRewriter, SubcatRewriter|root, partofspeech, form|subcat|```None```, 'Intr', 'Part', 'gen', ...|
|MorphExtendedTagger|PunctuationTypeRewriter, MorphToSyntaxMorphRewriter, PronounTypeRewriter, RemoveDuplicateAnalysesRewriter, RemoveAdpositionAnalysesRewriter, LetterCaseRewriter, FiniteFormRewriter, VerbExtensionSuffixRewriter, SubcatRewriter|root, ending, clitic, partofspeech, form|partofspeech, form, punctuation_type, pronoun_type, letter_case, fin, verb_extension_suffix, subcat ||

In [1]:
from estnltk import Text
from estnltk.core import abs_path

## PronounTypeTagger

In [2]:
from estnltk.taggers import PronounTypeTagger

# Build the tagger with its default settings; later cells reuse `tagger`.
tagger = PronounTypeTagger()
# Bare last expression: the notebook renders the tagger's summary
# (name, output layer, attributes, dependencies) as the cell output.
tagger

name,layer,attributes,depends_on
PronounTypeTagger,pronoun_type,"(lemma, root, root_tokens, ending, clitic, form, partofspeech, pronoun_type)",[morph_analysis]


In [3]:
# The tagger lists `morph_analysis` as a dependency; tag_layer() with no
# arguments is presumably what creates it (default layers) — TODO confirm.
text = Text('Kumb, sina või mina?').tag_layer()
# Adds the `pronoun_type` layer to `text`; the return value is not used here.
tagger.tag(text)
# Display the freshly added layer as the cell output.
text['pronoun_type']

layer name,attributes,parent,enveloping,ambiguous,span count
pronoun_type,"lemma, root, root_tokens, ending, clitic, form, partofspeech, pronoun_type",morph_analysis,,True,6

text,lemma,root,root_tokens,ending,clitic,form,partofspeech,pronoun_type
Kumb,kumb,kumb,"('kumb',)",0.0,,sg n,P,"('rel',)"
",",",",",","(',',)",,,,Z,
sina,sina,sina,"('sina',)",0.0,,sg n,P,"('ps2',)"
või,või,või,"('või',)",0.0,,,J,
mina,mina,mina,"('mina',)",0.0,,sg n,S,
?,?,?,"('?',)",,,,Z,


## FiniteFormTagger

In [4]:
from estnltk.taggers import FiniteFormTagger

# Rules file handed to the tagger through its `fs_to_synt_rules_file` keyword.
# abs_path() comes from estnltk.core (imported in the first cell); it
# presumably resolves the path against the installed estnltk package — TODO confirm.
fs_to_synt_rules_file = abs_path('rewriting/syntax_preprocessing/rules_files/tmorftrtabel.txt')

tagger = FiniteFormTagger(fs_to_synt_rules_file=fs_to_synt_rules_file)
# Bare last expression: the notebook renders the tagger's summary as the cell output.
tagger

name,layer,attributes,depends_on
FiniteFormTagger,finite_form,"(lemma, root, root_tokens, ending, clitic, form, partofspeech, fin)",[morph_analysis]


In [5]:
# The tagger lists `morph_analysis` as a dependency; tag_layer() with no
# arguments is presumably what creates it (default layers) — TODO confirm.
text = Text('laulma hüpelnud tantsija').tag_layer()
# Adds the `finite_form` layer to `text`; the return value is not used here.
tagger.tag(text)
# Display the layer; its `fin` column holds True/False for verb analyses and
# is empty for the other analyses in the output below.
text['finite_form']

layer name,attributes,parent,enveloping,ambiguous,span count
finite_form,"lemma, root, root_tokens, ending, clitic, form, partofspeech, fin",morph_analysis,,True,3

text,lemma,root,root_tokens,ending,clitic,form,partofspeech,fin
laulma,laulma,laul,"('laul',)",ma,,mod sup ps ill,V,False
,laulma,laul,"('laul',)",ma,,aux sup ps ill,V,False
,laulma,laul,"('laul',)",ma,,main sup ps ill,V,False
hüpelnud,hüpelnud,hüpel=nud,"('hüpelnud',)",0,,pos,A,
,hüpelnud,hüpel=nud,"('hüpelnud',)",0,,pos sg nom,A,
,hüpelnud,hüpel=nud,"('hüpelnud',)",d,,pos pl nom,A,
,hüplema,hüple,"('hüple',)",nud,,mod indic impf ps neg,V,True
,hüplema,hüple,"('hüple',)",nud,,mod partic past ps,V,False
,hüplema,hüple,"('hüple',)",nud,,aux indic impf ps neg,V,True
,hüplema,hüple,"('hüple',)",nud,,aux partic past ps,V,False


## VerbExtensionSuffixTagger

In [6]:
from estnltk.taggers import VerbExtensionSuffixTagger

# Build the tagger with its default settings; no rules files are passed here.
tagger = VerbExtensionSuffixTagger()
# Bare last expression: the notebook renders the tagger's summary as the cell output.
tagger

name,layer,attributes,depends_on
VerbExtensionSuffixTagger,verb_extension_suffix,"(lemma, root, root_tokens, ending, clitic, form, partofspeech, verb_extension_suffix)",[morph_analysis]


In [7]:
# The tagger lists `morph_analysis` as a dependency; tag_layer() with no
# arguments is presumably what creates it (default layers) — TODO confirm.
text = Text('Laulev hüpelnud tantsija').tag_layer()
# Adds the `verb_extension_suffix` layer to `text`; the return value is not used here.
tagger.tag(text)
# Display the layer; in the output below the suffix is a tuple such as
# ('nud',), or an empty tuple when no suffix was found.
text['verb_extension_suffix']

layer name,attributes,parent,enveloping,ambiguous,span count
verb_extension_suffix,"lemma, root, root_tokens, ending, clitic, form, partofspeech, verb_extension_suffix",morph_analysis,,True,3

text,lemma,root,root_tokens,ending,clitic,form,partofspeech,verb_extension_suffix
Laulev,laulev,laulev,"('laulev',)",0,,sg n,A,()
hüpelnud,hüpelnud,hüpel=nud,"('hüpelnud',)",0,,,A,"('nud',)"
,hüpelnud,hüpel=nud,"('hüpelnud',)",0,,sg n,A,"('nud',)"
,hüpelnud,hüpel=nud,"('hüpelnud',)",d,,pl n,A,"('nud',)"
,hüplema,hüple,"('hüple',)",nud,,nud,V,()
tantsija,tantsija,tantsija,"('tantsija',)",0,,sg n,S,()


## SubcatTagger

In [8]:
from os.path import relpath
from estnltk.taggers import SubcatTagger

# Rules files the tagger needs. relpath() rewrites each path relative to the
# current working directory, so the configuration table rendered below shows
# a relative path instead of an absolute one.
fs_to_synt_rules_file = relpath(abs_path('rewriting/syntax_preprocessing/rules_files/tmorftrtabel.txt'))
subcat_file = relpath(abs_path('rewriting/syntax_preprocessing/rules_files/abileksikon06utf.lx'))

tagger = SubcatTagger(fs_to_synt_rules_file=fs_to_synt_rules_file,
                      subcat_rules_file=subcat_file)
# Bare last expression: the notebook renders the tagger's summary and its
# configuration (the two file paths) as the cell output.
tagger

name,layer,attributes,depends_on
SubcatTagger,subcat,"(lemma, root, root_tokens, ending, clitic, form, partofspeech, subcat)",[morph_analysis]

0,1
fs_to_synt_rules_file,../../estnltk/rewriting/syntax_preprocessing/rules_files/tmorftrtabel.txt
subcat_rules_file,../../estnltk/rewriting/syntax_preprocessing/rules_files/abileksikon06utf.lx


In [9]:
# The tagger lists `morph_analysis` as a dependency; tag_layer() with no
# arguments is presumably what creates it (default layers) — TODO confirm.
text = Text('Järel juurduma').tag_layer()
# Adds the `subcat` layer to `text`; the return value is not used here.
tagger.tag(text)
# Display the layer; `subcat` is a tuple such as ('gen',) or ('Intr',) and is
# empty for analyses without a subcategorisation entry in the output below.
text['subcat']

layer name,attributes,parent,enveloping,ambiguous,span count
subcat,"lemma, root, root_tokens, ending, clitic, form, partofspeech, subcat",morph_analysis,,True,2

text,lemma,root,root_tokens,ending,clitic,form,partofspeech,subcat
Järel,järel,järel,"('järel',)",0,,post,K,"('gen',)"
,järel,järel,"('järel',)",0,,pre,K,
juurduma,juurduma,juurdu,"('juurdu',)",ma,,mod sup ps ill,V,"('Intr',)"
,juurduma,juurdu,"('juurdu',)",ma,,aux sup ps ill,V,"('Intr',)"
,juurduma,juurdu,"('juurdu',)",ma,,main sup ps ill,V,"('Intr',)"


## MorphExtendedTagger

In [10]:
from os.path import relpath
from estnltk.taggers import MorphExtendedTagger

# Rules files the tagger needs. relpath() rewrites each path relative to the
# current working directory, so the configuration table rendered below shows
# a relative path instead of an absolute one.
fs_to_synt_rules_file = relpath(abs_path('rewriting/syntax_preprocessing/rules_files/tmorftrtabel.txt'))
subcat_file = relpath(abs_path('rewriting/syntax_preprocessing/rules_files/abileksikon06utf.lx'))

# NOTE(review): `allow_to_remove_all` presumably controls whether the
# analysis-removing rewriters may drop every analysis of a word — confirm
# against the tagger's documentation.
tagger = MorphExtendedTagger(fs_to_synt_rules_file=fs_to_synt_rules_file,
                             allow_to_remove_all=False,
                             subcat_rules_file=subcat_file)
# Bare last expression: the notebook renders the tagger's summary and its
# configuration as the cell output.
tagger

name,layer,attributes,depends_on
MorphExtendedTagger,morph_extended,"(lemma, root, root_tokens, ending, clitic, form, partofspeech, punctuation_type, pronoun_type, letter_case, fin, verb_extension_suffix, subcat)",[morph_analysis]

0,1
allow_to_remove_all,False
fs_to_synt_rules_file,../../estnltk/rewriting/syntax_preprocessing/rules_files/tmorftrtabel.txt
subcat_rules_file,../../estnltk/rewriting/syntax_preprocessing/rules_files/abileksikon06utf.lx


In [11]:
# The tagger lists `morph_analysis` as a dependency; tag_layer() with no
# arguments is presumably what creates it (default layers) — TODO confirm.
text = Text('Ta on rääkinud!').tag_layer()
# Adds the `morph_extended` layer to `text`; the return value is not used here.
tagger.tag(text)
# Display the layer, which carries all the extra attributes in one table
# (punctuation_type, pronoun_type, letter_case, fin, verb_extension_suffix, subcat).
text['morph_extended']

layer name,attributes,parent,enveloping,ambiguous,span count
morph_extended,"lemma, root, root_tokens, ending, clitic, form, partofspeech, punctuation_type, pronoun_type, letter_case, fin, verb_extension_suffix, subcat",morph_analysis,,True,4

text,lemma,root,root_tokens,ending,clitic,form,partofspeech,punctuation_type,pronoun_type,letter_case,fin,verb_extension_suffix,subcat
Ta,tema,tema,"('tema',)",0,,sg nom,P,,"('ps3',)",cap,,(),
on,olema,ole,"('ole',)",0,,mod indic pres ps3 sg ps af,V,,,,True,(),"('Intr',)"
,olema,ole,"('ole',)",0,,aux indic pres ps3 sg ps af,V,,,,True,(),"('Intr',)"
,olema,ole,"('ole',)",0,,main indic pres ps3 sg ps af,V,,,,True,(),"('Intr',)"
,olema,ole,"('ole',)",0,,mod indic pres ps3 pl ps af,V,,,,True,(),"('Intr',)"
,olema,ole,"('ole',)",0,,aux indic pres ps3 pl ps af,V,,,,True,(),"('Intr',)"
,olema,ole,"('ole',)",0,,main indic pres ps3 pl ps af,V,,,,True,(),"('Intr',)"
rääkinud,rääkinud,rääki=nud,"('rääkinud',)",0,,pos,A,,,,,"('nud',)",
,rääkinud,rääki=nud,"('rääkinud',)",0,,pos sg nom,A,,,,,"('nud',)",
,rääkinud,rääki=nud,"('rääkinud',)",d,,pos pl nom,A,,,,,"('nud',)",


# CG3 exporter

In [12]:
from estnltk.converters.CG3_exporter import export_CG3
text = Text('Lähme! Ta on rääkinud.')
# NOTE(review): unlike the cells above, this uses analyse('syntax_preprocessing')
# rather than tag_layer(); presumably it creates the layers export_CG3 reads — confirm.
text.analyse('syntax_preprocessing')
# Bare last expression: the output below is a list of strings, with "<s>"/"</s>"
# sentence markers, one '"<word>"' line per token and one indented line per analysis.
export_CG3(text)

['"<s>"',
 '"<Lähme>"',
 '    "mine" Lme V mod indic pres ps1 pl ps af cap <FinV>',
 '    "mine" Lme V aux indic pres ps1 pl ps af cap <FinV>',
 '    "mine" Lme V main indic pres ps1 pl ps af cap <FinV>',
 '"<!>"',
 '    "!" Z Exc',
 '"</s>"',
 '"<s>"',
 '"<Ta>"',
 '    "tema" L0 P pers ps3 sg nom cap',
 '"<on>"',
 '    "ole" L0 V mod indic pres ps3 sg ps af <FinV> <Intr>',
 '    "ole" L0 V aux indic pres ps3 sg ps af <FinV> <Intr>',
 '    "ole" L0 V main indic pres ps3 sg ps af <FinV> <Intr>',
 '    "ole" L0 V mod indic pres ps3 pl ps af <FinV> <Intr>',
 '    "ole" L0 V aux indic pres ps3 pl ps af <FinV> <Intr>',
 '    "ole" L0 V main indic pres ps3 pl ps af <FinV> <Intr>',
 '"<rääkinud>"',
 '    "rääki=nud" L0 A pos partic <nud>',
 '    "rääki=nud" L0 A pos sg nom partic <nud>',
 '    "rääki=nud" Ld A pos pl nom partic <nud>',
 '    "rääki" Lnud V mod indic impf ps neg <FinV> <Part-P> <El>',
 '    "rääki" Lnud V mod partic past ps <Part-P> <El>',
 '    "rääki" Lnud V aux indic impf p