# Visl row analyzer

In [31]:
from estnltk import Text
from estnltk.taggers import Tagger
from estnltk.layer.layer import Layer
from estnltk.converters.CG3_exporter import export_CG3
from estnltk.taggers.syntax.vislcg3_syntax import VISLCG3Pipeline
from estnltk import PACKAGE_PATH
import os
import re

In [32]:
# Lookup table: category name -> list of tags that belong to that category.
# Key order is significant for readers of the output, because lookups walk the
# categories in insertion order.
# NOTE: the tag 'pos' is listed under both 'pronoun_type' and 'adjective_type'.
cats = {
    'case': [
        'nom', 'gen', 'part', 'ill', 'in', 'el', 'all', 'ad', 'abl',
        'tr', 'term', 'es', 'abes', 'kom', 'adit',
    ],
    'number': ['sg', 'pl'],
    'voice': ['imps', 'ps'],
    'tense': ['pres', 'past', 'impf'],
    'mood': ['indic', 'cond', 'imper', 'quot'],
    'person': ['ps1', 'ps2', 'ps3'],
    'negation': ['af', 'neg'],
    'inf_form': ['sup', 'inf', 'ger', 'partic'],
    'pronoun_type': [
        'pos', 'det', 'refl', 'dem', 'inter_rel', 'pers', 'rel', 'rec', 'indef',
    ],
    'adjective_type': ['pos', 'comp', 'super'],
    'verb_type': ['main', 'mod', 'aux'],
    'substantive_type': ['prop', 'com'],
    'numeral_type': ['card', 'ord'],
    'number_format': ['l', 'roman', 'digit'],
    'adposition_type': ['pre', 'post'],
    'conjunction_type': ['crd', 'sub'],
    'abbreviation_type': ['adjectival', 'adverbial', 'nominal', 'verbal'],
    'punctuation_type': [
        'Col', 'Com', 'Cpr', 'Cqu', 'Csq', 'Dsd', 'Dsh', 'Ell',
        'Els', 'Exc', 'Fst', 'Int', 'Opr', 'Oqu', 'Osq', 'Quo',
        'Scl', 'Sla', 'Sml',
    ],
    'capitalized': ['cap'],
}

def get_analyzed_forms(forms):
    """Group the given tags by the category they are listed under in ``cats``.

    Each tag in ``forms`` is looked up in every category of the module-level
    ``cats`` table. A tag that is listed under several categories is recorded
    under each of them, and tags not listed anywhere are dropped.

    Parameters:
        forms: iterable of tag strings.

    Returns:
        dict mapping category name to the matching tag; when several tags of
        the same category occur they are joined with a single space, in the
        order they were encountered.
    """
    grouped = {}
    for tag in forms:
        for category, members in cats.items():
            if tag not in members:
                continue
            if category in grouped:
                # A second (or later) tag of this category: append to the value.
                grouped[category] = grouped[category] + ' ' + tag
            else:
                grouped[category] = tag
    return grouped

# Indented analysis line:  <whitespace>"lemma" <rest of the analysis>
_PAT_ANALYSIS_LINE = re.compile(r'^\s+"(.+)"\s([^"]+)$')
# Three shapes of the analysis part (everything after the quoted lemma):
#   ending + postag + forms
_PAT_ENDING_POS_FORM = re.compile(r'^L\S+\s+\S\s+([^#@]+).*$')
#   postag + forms (no ending)
_PAT_POS_FORM = re.compile(r'^\S\s+([^#@]+).*$')
#   (optional ending +) postag only, with or without trailing @/# annotations
_PAT_ENDING_POS = re.compile(r'^(L\S+\s+)?\S\s+[#@].+$')
_PAT_ENDING_POS2 = re.compile(r'^(L\S+\s+)?\S$')
# Leading "L<ending>" token of the analysis part.
_PAT_ENDING = re.compile(r'^L(\w+)')


def process_visl_line(line, error_on_unexp=False):
    """Parse one line of VISL CG3 output into a dict of its fields.

    Only analysis lines are parsed, i.e. lines that start with two spaces or
    a tab and have the shape ``"lemma" <analysis>``. Every other line (token
    lines such as ``"<word>"``, sentence markers, malformed indented lines)
    yields ``None``.

    Parameters:
        line: the line to parse; must be a ``str``.
        error_on_unexp: what to do when the analysis part matches none of the
            known shapes. If true, an ``Exception`` is raised; otherwise
            (default) a warning is printed to ``sys.stderr`` and the line is
            returned with part of speech ``'X'`` and no features.

    Returns:
        ``None`` for non-analysis lines. For analysis lines containing ``#``::

            {'ending', 'partofspeech', 'feats', 'deprel', 'head'}

        where ``deprel`` is the first ``@...`` token (``'xxx'`` if absent) and
        ``head`` is the first ``#n->m`` marker (``None`` if absent).
        For analysis lines without ``#``::

            {'ending', 'partofspeech', 'feats', 'visl_info'}

        where ``visl_info`` is the list of ``<...>`` tags found on the line.
        ``feats`` is the result of ``get_analyzed_forms`` in both cases.

    Raises:
        Exception: if ``line`` is not a string, or if the analysis has an
            unexpected format and ``error_on_unexp`` is true.
    """
    if not isinstance(line, str):
        raise Exception('(!) Unexpected type of input argument! Expected a string.')
    if not line.startswith(('  ', '\t')):
        return None
    analysis_match = _PAT_ANALYSIS_LINE.match(line)
    if not analysis_match:
        return None

    # Named `analysis` (not `cats`) so the module-level `cats` table is not shadowed.
    analysis = analysis_match.group(2)

    # Punctuation analyses start directly with 'Z'; otherwise the part of
    # speech is taken from the second whitespace-separated token.
    if analysis.startswith('Z '):
        postag = 'Z'
    else:
        tokens = analysis.split()
        postag = tokens[1] if len(tokens) > 1 else 'X'

    ending_match = _PAT_ENDING.match(analysis)
    ending = ending_match.group(1) if ending_match else ''

    form_match = _PAT_ENDING_POS_FORM.match(analysis) or _PAT_POS_FORM.match(analysis)
    if form_match:
        forms = form_match.group(1).split()
    elif _PAT_ENDING_POS.match(analysis) or _PAT_ENDING_POS2.match(analysis):
        forms = ['_']  # no form (in case of adpositions and adverbs)
    elif error_on_unexp:
        raise Exception('(!) Unexpected format of analysis line: ' + line)
    else:
        import sys  # local import: only needed on this rarely taken path
        postag = 'X'
        forms = ['_']
        print('(!) Unexpected format of analysis line: ' + line, file=sys.stderr)

    feats = get_analyzed_forms(forms)
    if '#' in analysis:
        deprels = re.findall(r'(@\S+)', analysis)
        heads = re.findall(r'#\d+\s*->\s*\d+', analysis)
        return {'ending': ending,
                'partofspeech': postag,
                'feats': feats,
                'deprel': deprels[0] if deprels else 'xxx',
                # A '#' without a well-formed "#n->m" marker no longer raises IndexError.
                'head': heads[0] if heads else None}
    return {'ending': ending,
            'partofspeech': postag,
            'feats': feats,
            'visl_info': re.findall(r'(<[^>]+>)', analysis)}

## Visl row using vislcg3 parser

In [34]:
# Build a Text, run the 'syntax_preprocessing' analysis on it and export it in CG3 format.
text = Text('Marile meeldib see raamat.')
text.analyse('syntax_preprocessing')
results2 = export_CG3(text)
# Rules directory, expressed relative to the current working directory.
vislcgRulesDir = os.path.relpath(os.path.join(PACKAGE_PATH, 'taggers', 'syntax', 'files'))
# NOTE(review): hard-coded location of the vislcg3 executable; adjust for the local system.
vislcg_path = '/usr/bin/vislcg3'
pipeline2 = VISLCG3Pipeline(rules_dir=vislcgRulesDir, vislcg_cmd=vislcg_path)
# process_lines output is split on newlines and empty pieces are dropped.
# TODO: the result of process_lines could be a list already.
results2 = list(filter(None, pipeline2.process_lines(results2).split('\n')))

In [35]:
results2  # bare expression: the notebook displays the lines produced by the pipeline

['"<s>"',
 '"<Marile>"',
 '\t"mari" Lle S com sg all cap @ADVL #1->2',
 '"<meeldib>"',
 '\t"meeldi" Lb V main indic pres ps3 sg ps af @FMV #2->0',
 '"<see>"',
 '\t"see" L0 P dem sg nom @NN> #3->4',
 '"<raamat>"',
 '\t"raamat" L0 S com sg nom @SUBJ #4->2',
 '"<.>"',
 '\t"." Z Fst CLB #5->5',
 '"</s>"']

In [36]:
# Print every analysis line together with its parsed form.
for line in results2:
    processed_line = process_visl_line(line)
    if processed_line is None:
        # None means the line was a token line or a marker such as <s>.
        continue
    # One item per output line; the trailing '\n' item leaves two blank lines.
    print(line, processed_line, '\n', sep='\n')

	"mari" Lle S com sg all cap @ADVL #1->2
{'ending': 'le', 'feats': {'number': 'sg', 'case': 'all', 'substantive_type': 'com', 'capitalized': 'cap'}, 'deprel': '@ADVL', 'partofspeech': 'S', 'head': '#1->2'}


	"meeldi" Lb V main indic pres ps3 sg ps af @FMV #2->0
{'ending': 'b', 'feats': {'number': 'sg', 'negation': 'af', 'voice': 'ps', 'tense': 'pres', 'person': 'ps3', 'verb_type': 'main', 'mood': 'indic'}, 'deprel': '@FMV', 'partofspeech': 'V', 'head': '#2->0'}


	"see" L0 P dem sg nom @NN> #3->4
{'ending': '0', 'feats': {'number': 'sg', 'pronoun_type': 'dem', 'case': 'nom'}, 'deprel': '@NN>', 'partofspeech': 'P', 'head': '#3->4'}


	"raamat" L0 S com sg nom @SUBJ #4->2
{'ending': '0', 'feats': {'number': 'sg', 'case': 'nom', 'substantive_type': 'com'}, 'deprel': '@SUBJ', 'partofspeech': 'S', 'head': '#4->2'}


	"." Z Fst CLB #5->5
{'ending': '', 'feats': {'punctuation_type': 'Fst'}, 'deprel': 'xxx', 'partofspeech': 'Z', 'head': '#5->5'}




## Visl row without parser

In [37]:
# Same sentence as before, but only preprocessed and exported in CG3 format;
# the vislcg3 pipeline is not run on the result.
text = Text('Marile meeldib see raamat.')
text.analyse('syntax_preprocessing')
results2 = export_CG3(text)

In [38]:
results2  # bare expression: the notebook displays the exported CG3 lines

['"<s>"',
 '"<Marile>"',
 '    "mari" Lle S com sg all cap',
 '"<meeldib>"',
 '    "meeldi" Lb V mod indic pres ps3 sg ps af <FinV> <Intr> <All>',
 '    "meeldi" Lb V aux indic pres ps3 sg ps af <FinV> <Intr> <All>',
 '    "meeldi" Lb V main indic pres ps3 sg ps af <FinV> <Intr> <All>',
 '"<see>"',
 '    "see" L0 P dem sg nom',
 '"<raamat>"',
 '    "raamat" L0 S com sg nom',
 '"<.>"',
 '    "." Z Fst',
 '"</s>"']

In [39]:
# Print every analysis line together with its parsed form.
for line in results2:
    processed_line = process_visl_line(line)
    if processed_line is None:
        # None means the line was a token line or a marker such as <s>.
        continue
    # One item per output line; the trailing '\n' item leaves two blank lines.
    print(line, processed_line, '\n', sep='\n')

    "mari" Lle S com sg all cap
{'visl_info': [], 'ending': 'le', 'feats': {'number': 'sg', 'case': 'all', 'substantive_type': 'com', 'capitalized': 'cap'}, 'partofspeech': 'S'}


    "meeldi" Lb V mod indic pres ps3 sg ps af <FinV> <Intr> <All>
{'visl_info': ['<FinV>', '<Intr>', '<All>'], 'ending': 'b', 'feats': {'number': 'sg', 'negation': 'af', 'voice': 'ps', 'tense': 'pres', 'person': 'ps3', 'verb_type': 'mod', 'mood': 'indic'}, 'partofspeech': 'V'}


    "meeldi" Lb V aux indic pres ps3 sg ps af <FinV> <Intr> <All>
{'visl_info': ['<FinV>', '<Intr>', '<All>'], 'ending': 'b', 'feats': {'number': 'sg', 'negation': 'af', 'voice': 'ps', 'tense': 'pres', 'person': 'ps3', 'verb_type': 'aux', 'mood': 'indic'}, 'partofspeech': 'V'}


    "meeldi" Lb V main indic pres ps3 sg ps af <FinV> <Intr> <All>
{'visl_info': ['<FinV>', '<Intr>', '<All>'], 'ending': 'b', 'feats': {'number': 'sg', 'negation': 'af', 'voice': 'ps', 'tense': 'pres', 'person': 'ps3', 'verb_type': 'main', 'mood': 'indic'}, '