In [1]:
from estnltk import Text

from pandas import read_csv
from estnltk.taggers import RegexTagger

from estnltk.spans import Span, SpanList
from estnltk.layer import Layer
from estnltk.finite_grammar.layer_graph import print_nodes
import csv
from estnltk.finite_grammar import PhraseListTagger

from estnltk.taggers import GapTagger
from estnltk.taggers import EnvelopingGapTagger
from estnltk.taggers import MergeTagger

from estnltk.layer_operations.flatten import flatten

from estnltk.finite_grammar.layer_graph import layer_to_graph, plot_graph
#from estnltk.finite_grammar.grammar import parse_graph
#from estnltk.finite_grammar.layer_graph import graph_to_parse_trees

import re
from collections import defaultdict
import pickle

from estnltk.resolve_layer_dag import make_resolver

In [2]:
# Phrase list handed to the 'specification' PhraseListTagger:
# every entry is a one-element tuple holding a single lemma.
specification = [
    (lemma,)
    for lemma in ('parem', 'vasak', 'eesmine', 'tagumine', 'külgmine')
]

In [3]:
# Phrase list handed to the 'bodypart' PhraseListTagger:
# every entry is a one-element tuple holding a single lemma.
bodypart = [
    (lemma,)
    for lemma in ('neer', 'põlv', 'kops', 'aju', 'külgvatsake')
]

In [4]:
# Phrase list handed to the 'location' PhraseListTagger:
# every entry is a one-element tuple holding a single lemma.
location = [(lemma,) for lemma in ('tagasein',)]

In [5]:
def location_decorator(text, span, phrase):
    """Build the attribute dict for a LOCATION match.

    Args:
        text: not used by this decorator.
        span: object whose ``form`` and ``partofspeech`` attributes are copied
            into the result unchanged.
        phrase: stored as-is under the 'match' key.

    Returns:
        dict with keys 'match', 'grammar_symbol' (always 'LOCATION'),
        'form' and 'partofspeech'.
    """
    annotation = {'match': phrase, 'grammar_symbol': 'LOCATION'}
    annotation['form'] = span.form
    annotation['partofspeech'] = span.partofspeech
    return annotation

In [6]:
# Tags lemma matches from the `location` phrase list into the 'location' layer;
# location_decorator supplies the attribute values for each match.
location_tagger = PhraseListTagger(
    layer_name='location',
    input_layer='morph_analysis',
    input_attribute='lemma',
    phrase_list=location,
    decorator=location_decorator,
    attributes=('match', 'grammar_symbol', 'form', 'partofspeech'),
    conflict_resolving_strategy='MAX',
)

In [7]:
def specification_decorator(text, span, phrase):
    """Build the attribute dict for a SPECIFICATION match.

    Args:
        text: not used by this decorator.
        span: object whose ``form`` and ``partofspeech`` attributes are copied
            into the result unchanged.
        phrase: stored as-is under the 'match' key.

    Returns:
        dict with keys 'match', 'grammar_symbol' (always 'SPECIFICATION'),
        'form' and 'partofspeech'.
    """
    annotation = {'match': phrase, 'grammar_symbol': 'SPECIFICATION'}
    annotation['form'] = span.form
    annotation['partofspeech'] = span.partofspeech
    return annotation

In [8]:
# Tags lemma matches from the `specification` phrase list into the
# 'specification' layer; specification_decorator supplies the attribute values.
specification_tagger = PhraseListTagger(
    layer_name='specification',
    input_layer='morph_analysis',
    input_attribute='lemma',
    phrase_list=specification,
    decorator=specification_decorator,
    attributes=('match', 'grammar_symbol', 'form', 'partofspeech'),
    conflict_resolving_strategy='MAX',
)

In [9]:
def bodypart_decorator(text, span, phrase):
    """Build the attribute dict for a BODYPART match.

    Args:
        text: not used by this decorator.
        span: object whose ``form`` and ``partofspeech`` attributes are copied
            into the result unchanged.
        phrase: stored as-is under the 'match' key.

    Returns:
        dict with keys 'match', 'grammar_symbol' (always 'BODYPART'),
        'form' and 'partofspeech'.
    """
    annotation = {'match': phrase, 'grammar_symbol': 'BODYPART'}
    annotation['form'] = span.form
    annotation['partofspeech'] = span.partofspeech
    return annotation

In [10]:
# Tags lemma matches from the `bodypart` phrase list into the 'bodypart' layer;
# bodypart_decorator supplies the attribute values for each match.
bodypart_tagger = PhraseListTagger(
    layer_name='bodypart',
    input_layer='morph_analysis',
    input_attribute='lemma',
    phrase_list=bodypart,
    decorator=bodypart_decorator,
    attributes=('match', 'grammar_symbol', 'form', 'partofspeech'),
    conflict_resolving_strategy='MAX',
)

In [11]:
# Phrase taggers keyed by the layer they produce; tag_sent runs them in this
# insertion order.
taggers = {
    'bodypart': bodypart_tagger,
    'specification': specification_tagger,
    'location': location_tagger,
}

In [12]:
def gaps_decorator(text:str):
    """Build the attribute dict for a gap.

    Args:
        text: the gap content; only its ``len()`` is used.
            NOTE(review): annotated as ``str``, but this function is passed to
            EnvelopingGapTagger — confirm what that tagger actually passes in.

    Returns:
        dict with 'gap_length' (``len(text)``) and 'grammar_symbol'
        (always 'RANDOM_TEXT').
    """
    gap_length = len(text)
    return dict(gap_length=gap_length, grammar_symbol='RANDOM_TEXT')

In [13]:
# Produces the 'gaps' layer from whatever the three phrase layers leave
# uncovered, enveloping 'morph_analysis' spans.
# NOTE(review): gaps_decorator also returns 'gap_length', which is not listed
# in `attributes` here — confirm that dropping it is intended.
gaps_tagger = EnvelopingGapTagger(
    layer_name='gaps',
    input_layers=['bodypart', 'location', 'specification'],
    enveloped_layer='morph_analysis',
    decorator=gaps_decorator,
    attributes=['grammar_symbol'],
)

In [14]:
# Merges the phrase layers and the gaps layer into one 'grammar_tags' layer,
# which is the token layer the grammar parser reads.
# NOTE(review): the input layers define 'match', not 'value'; in the recorded
# output the 'value' column is empty — confirm which attribute is wanted.
merge_tagger = MergeTagger(
    layer_name='grammar_tags',
    input_layers=['bodypart', 'location', 'specification', 'gaps'],
    attributes=('grammar_symbol', 'value', 'form', 'partofspeech'),
)

In [15]:
def tag_sent(sent):
    """Wrap ``sent`` in a Text, analyse it and apply every tagger in order.

    Args:
        sent: value passed straight to the ``Text`` constructor.

    Returns:
        The ``Text`` object after morphology analysis, all taggers in
        ``taggers``, ``gaps_tagger`` and ``merge_tagger`` have been applied.
    """
    text = Text(sent)
    text.analyse('morphology')

    # Phrase taggers run first: gaps_tagger and merge_tagger name their
    # layers as inputs.
    for phrase_tagger in taggers.values():
        phrase_tagger.tag(text)

    gaps_tagger.tag(text)
    merge_tagger.tag(text)
    return text

In [16]:
from estnltk.finite_grammar.grammar import Rule, Grammar

# Grammar rules, listed in the order they are handed to Grammar.
rules = [
    Rule('COMPLEMENT', 'SPECIFICATION'),
    # Both MAIN alternatives share the same group and priority.
    Rule('MAIN', 'BODYPART', group='g1', priority=1),
    Rule('MAIN', 'LOCATION', group='g1', priority=1),
    Rule('COMP_MAIN', 'MAIN MAIN'),
]

grammar = Grammar(
    start_symbols=['COMP_MAIN'],
    rules=rules,
    legal_attributes=['form', 'type', 'symboltype', 'partofspeech'],
)
# Bare last expression so the notebook displays the grammar summary.
grammar



Grammar:
	start: COMP_MAIN
	terminals: BODYPART, LOCATION, SPECIFICATION
	nonterminals: COMPLEMENT, COMP_MAIN, MAIN
	legal attributes: frozenset({'partofspeech', 'type', 'form', 'symboltype'})
	depth_limit: inf
	width_limit: inf
Rules:
	COMPLEMENT -> SPECIFICATION	: 0, val: default_validator, dec: default_decorator, scoring: default_scoring
	MAIN -> BODYPART	: 1, val: default_validator, dec: default_decorator, scoring: default_scoring
	MAIN -> LOCATION	: 1, val: default_validator, dec: default_decorator, scoring: default_scoring
	COMP_MAIN -> MAIN MAIN	: 0, val: default_validator, dec: default_decorator, scoring: default_scoring

In [17]:
from estnltk.taggers import GrammarParsingTagger
# NOTE(review): `parsing_tagger` is assigned again in the next cell, which
# replaces this instance before it is ever used.
parsing_tagger = GrammarParsingTagger(
    grammar=grammar,
    layer_of_tokens='grammar_tags',
    name_attribute='grammar_symbol',  # the default
    layer_name='parse',  # the default
    attributes=[  # default: ()
        'form', 'partofspeech', 'type', 'grammar_symbol',
        'span_count', 'name', '_priority_', '_group_',
    ],
    output_nodes=None,  # by default grammar.start_symbols are used
    resolve_support_conflicts=True,  # the default
    resolve_start_end_conflicts=True,  # the default
    resolve_terminals_conflicts=True,  # the default
)

In [18]:
from estnltk.taggers.grammar_parsing.grammar_parsing_tagger import GrammarParsingTagger
# Parser over the 'grammar_tags' layer; results go to the 'parse' layer.
# NOTE(review): 'COMP_' is not among the terminals or nonterminals shown in
# the grammar summary — confirm whether it is intentional or a leftover.
parsing_tagger = GrammarParsingTagger(
    layer_name='parse',
    attributes=['form', 'type', 'symboltype', 'partofspeech'],
    layer_of_tokens='grammar_tags',
    grammar=grammar,
    output_nodes={'COMP_MAIN', 'COMPLEMENT', 'MAIN', 'COMP_'},
)

In [19]:
# Sample input string that is passed to tag_sent.
i = 'põlve tagasein'

In [20]:
# j is the Text returned by tag_sent: morphology plus all tagger layers.
j = tag_sent(i)

In [21]:
# Display the merged 'grammar_tags' layer created by merge_tagger.
j.grammar_tags

layer name,attributes,parent,enveloping,ambiguous,span count
grammar_tags,"grammar_symbol, value, form, partofspeech",,morph_analysis,False,2

text,start,end,grammar_symbol,value,form,partofspeech
põlve,0,5,BODYPART,,"[[sg g, sg g]]","[[S, S]]"
tagasein,6,14,LOCATION,,[[sg n]],[[S]]


In [22]:
# NOTE(review): in the recorded run this call fails with
# "TypeError: unhashable type: 'list'". The 'form' and 'partofspeech' values
# shown for grammar_tags are nested lists — possibly the cause; TODO confirm.
parsing_tagger.tag(j)

TypeError: unhashable type: 'list'