# Introduction to finite grammar

In [1]:
from estnltk import Text

# Create the example text and tag its 'words' layer.
text = Text('Veski 5, Elva, Tartumaa.').tag_layer(['words'])

In [2]:
from estnltk.taggers import AddressPartTagger

# Tag the address parts of the text. The result is available as the
# 'address_parts' layer, which the last line displays.
address_part_tagger = AddressPartTagger()
address_part_tagger.tag(text)
text.address_parts

layer name,attributes,parent,enveloping,ambiguous,span count
address_parts,"grammar_symbol, type",,,True,4

text,grammar_symbol,type
Veski,ASULA,asula
,ASULA,asula
,TÄNAV,tänav
5,MAJA,
Elva,ASULA,asula
,TÄNAV,tänav
Tartumaa,MAAKOND,maakond


## Rule
Creating rules.

`SEQ` rules are demonstrated in the "`SEQ` rules" section at the end.

In [3]:
from estnltk.finite_grammar import Rule, Grammar


# A rule with left-hand side 'ADDRESS' and right-hand side 'TÄNAV MAJA ASULA'.
Rule('ADDRESS', 'TÄNAV MAJA ASULA', group='g0', priority=3)

ADDRESS -> TÄNAV MAJA ASULA	: 3, val: default_validator, dec: default_decorator, scoring: default_scoring

## Grammar

In [4]:
# Create an empty grammar whose start symbol is 'ADDRESS'. The remaining
# arguments are spelled out with their default values for reference.
grammar = Grammar(start_symbols=['ADDRESS'], 
                  rules=None, # the default, deprecated
                  depth_limit=float('inf'), # the default
                  width_limit=float('inf'), # the default
                  legal_attributes=None # the default
                  )

# Rules are added one at a time; both rules produce the start symbol.
grammar.add(Rule('ADDRESS', 'TÄNAV MAJA ASULA', group='g0', priority=3))
grammar.add(Rule('ADDRESS', 'TÄNAV MAJA',       group='g0', priority=3))
# Display the grammar summary.
grammar


Grammar:
	start: ADDRESS
	terminals: ASULA, MAJA, TÄNAV
	nonterminals: ADDRESS
	legal attributes: frozenset()
	depth_limit: inf
	width_limit: inf
Rules:
	ADDRESS -> TÄNAV MAJA ASULA	: 3, val: default_validator, dec: default_decorator, scoring: default_scoring
	ADDRESS -> TÄNAV MAJA	: 3, val: default_validator, dec: default_decorator, scoring: default_scoring

## GrammarParsingTagger

In [5]:
from estnltk.taggers import GrammarParsingTagger

# Configure a tagger that parses the 'address_parts' layer with the grammar
# and writes the result to a new layer named 'addresses_1'.
tagger = GrammarParsingTagger(grammar=grammar,
                              layer_of_tokens='address_parts',
                              name_attribute='grammar_symbol', # the default
                              layer_name='addresses_1', # default: 'parse'
                              attributes=(), # default: ()
                              output_ambiguous=True # default False, True recommended
                              )
# Display the tagger configuration.
tagger

name,output layer,output attributes,input layers
GrammarParsingTagger,addresses_1,(),['address_parts']

0,1
grammar,"\nGrammar:\n\tstart: ADDRESS\n\tterminals: ASULA, MAJA, TÄNAV\n\tnonterminals: ADDRESS\n ..., type: <class 'estnltk.finite_grammar.grammar.Grammar'>"
name_attribute,grammar_symbol
output_nodes,{'ADDRESS'}
resolve_support_conflicts,True
resolve_start_end_conflicts,True
resolve_terminals_conflicts,True
ambiguous,True
gap_validator,


In [6]:
# Apply the grammar tagger; the layer listing below includes the new 'addresses_1' layer.
tagger.tag(text)

text
"Veski 5, Elva, Tartumaa."

layer name,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,7
words,normalized_form,,,False,7
address_parts,"grammar_symbol, type",,,True,4
addresses_1,,,address_parts,True,2


In [7]:
# Show the spans of the layer created by the grammar tagger.
text.addresses_1

layer name,attributes,parent,enveloping,ambiguous,span count
addresses_1,,,address_parts,True,2

text
"['Veski', '5']"
"['Veski', '5', 'Elva']"


## Priorities

In [8]:
# Same two rules as in the previous grammar, except that the longer rule now
# has priority=2 while the shorter rule keeps priority=3.
# NOTE(review): judging from the output below, only the parse of the
# priority=2 rule survives, i.e. the smaller value appears to win -- confirm
# against the Rule documentation.
grammar = Grammar(start_symbols=['ADDRESS'], 
                  rules=None, # the default, deprecated
                  depth_limit=float('inf'), # the default
                  width_limit=float('inf'), # the default
                  legal_attributes=None # the default
                  )

grammar.add(Rule('ADDRESS', 'TÄNAV MAJA ASULA', group='g0', priority=2))
grammar.add(Rule('ADDRESS', 'TÄNAV MAJA',       group='g0', priority=3))
tagger = GrammarParsingTagger(grammar=grammar,
                              layer_of_tokens='address_parts',
                              name_attribute='grammar_symbol', # the default
                              layer_name='addresses_2', # default: 'parse'
                              attributes=(), # default: ()
                              output_ambiguous=True # default False
                              )
tagger.tag(text)
text.addresses_2

layer name,attributes,parent,enveloping,ambiguous,span count
addresses_2,,,address_parts,True,1

text
"['Veski', '5', 'Elva']"


## Decorators

In [9]:
def address_decorator(nodes):
    """Collect the texts of address-part nodes into an attribute dict.

    Every node whose ``name`` is one of 'ASULA', 'TÄNAV', 'INDEKS',
    'MAAKOND' or 'MAJA' stores its ``text`` under that key; a later node
    overwrites an earlier one with the same name. Keys without a matching
    node keep the empty string. Nodes with any other name are ignored.

    :param nodes: iterable of objects with ``name`` and ``text`` attributes
    :return: dict with 'grammar_symbol' set to 'ADDRESS' plus one entry
        per address part
    """
    parts = dict.fromkeys(('ASULA', 'TÄNAV', 'INDEKS', 'MAAKOND', 'MAJA'), '')
    for part in nodes:
        symbol = part.name
        if symbol in parts:
            parts[symbol] = part.text
    result = {'grammar_symbol': 'ADDRESS'}
    result.update(parts)
    return result

# The keys returned by address_decorator are listed both as the grammar's
# legal_attributes and as the tagger's output attributes.
grammar = Grammar(start_symbols=['ADDRESS'], 
                  rules=None, # the default, deprecated
                  depth_limit=float('inf'), # the default
                  width_limit=float('inf'), # the default
                  legal_attributes=['INDEKS', 'grammar_symbol', 'MAJA', 'TÄNAV', 'MAAKOND', 'ASULA']
                  )

# Both rules use the same decorator.
grammar.add(Rule('ADDRESS', 'TÄNAV MAJA ASULA', group='g0', priority=3, decorator=address_decorator))
grammar.add(Rule('ADDRESS', 'TÄNAV MAJA',       group='g0', priority=3, decorator=address_decorator))
tagger = GrammarParsingTagger(grammar=grammar,
                              layer_of_tokens='address_parts',
                              name_attribute='grammar_symbol',
                              layer_name='addresses_3',
                              attributes=('INDEKS', 'grammar_symbol', 'MAJA', 'TÄNAV', 'MAAKOND', 'ASULA'),
                              output_nodes=None,
                              resolve_support_conflicts=True,
                              resolve_start_end_conflicts=True,
                              resolve_terminals_conflicts=True,
                              output_ambiguous=False # default False
                              )
tagger.tag(text)
# The decorator's values appear as attribute columns of the new layer.
text.addresses_3

layer name,attributes,parent,enveloping,ambiguous,span count
addresses_3,"INDEKS, grammar_symbol, MAJA, TÄNAV, MAAKOND, ASULA",,address_parts,False,2

text,INDEKS,grammar_symbol,MAJA,TÄNAV,MAAKOND,ASULA
"['Veski', '5']",,ADDRESS,5,Veski,,
"['Veski', '5', 'Elva']",,ADDRESS,5,Veski,,Elva


## Validators

In [10]:
# The sentence is split with adjacent string literals rather than a backslash
# continuation inside the literal, so the indentation of the second source
# line does not end up inside the analysed text.
text = Text('Inimesed, kes töötavad Tartus Ülikooli 5, Elva haiglas '
            'ja Tõravere observatooriumis, söövad esmaspäeviti õunu.').tag_layer(['words'])
# Add the 'address_parts' layer that the grammar tagger parses.
address_part_tagger.tag(text)

# Baseline without a validator: as the output below shows, both candidate
# parses are produced, including 'Ülikooli 5 Elva'.
grammar = Grammar(start_symbols=['ADDRESS'],
                  legal_attributes=['INDEKS', 'grammar_symbol', 'MAJA', 'TÄNAV', 'MAAKOND', 'ASULA']
                  )

grammar.add(Rule('ADDRESS', 'TÄNAV MAJA ASULA', group='g0', priority=3, decorator=address_decorator))
grammar.add(Rule('ADDRESS', 'TÄNAV MAJA',       group='g0', priority=3, decorator=address_decorator))
tagger = GrammarParsingTagger(grammar=grammar,
                              layer_of_tokens='address_parts',
                              name_attribute='grammar_symbol',
                              layer_name='addresses_4',
                              attributes=('INDEKS', 'grammar_symbol', 'MAJA', 'TÄNAV', 'MAAKOND', 'ASULA')
                              )
tagger.tag(text)
text.addresses_4

layer name,attributes,parent,enveloping,ambiguous,span count
addresses_4,"INDEKS, grammar_symbol, MAJA, TÄNAV, MAAKOND, ASULA",,address_parts,False,2

text,INDEKS,grammar_symbol,MAJA,TÄNAV,MAAKOND,ASULA
"['Ülikooli', '5']",,ADDRESS,5,Ülikooli,,
"['Ülikooli', '5', 'Elva']",,ADDRESS,5,Ülikooli,,Elva


In [11]:
# A fresh Text is created, so the 'addresses_4' layer name can be reused below.
# The sentence is split with adjacent string literals rather than a backslash
# continuation inside the literal, so the indentation of the second source
# line does not end up inside the analysed text.
text = Text('Inimesed, kes töötavad Tartus Ülikooli 5, Elva haiglas '
            'ja Tõravere observatooriumis, söövad esmaspäeviti õunu.').tag_layer(['words'])
# Add the 'address_parts' layer that the grammar tagger parses.
address_part_tagger.tag(text)

# Street names listed for each town; towns not listed here are not checked.
town_streets = {
    'Elva': {'Veski', 'Tuletõrje'},
    'Tartu': {'Veski', 'Ülikooli'},
}


def validator(node):
    """Reject a parse whose street is not listed for its town.

    The street text is read from ``node[0]`` and the town text from
    ``node[2]``. A town that is missing from ``town_streets`` is always
    accepted.

    :return: False only if the town is listed and the street is not among
        its streets; True otherwise
    """
    street, town = node[0].text, node[2].text
    known_streets = town_streets.get(town)
    if known_streets is None:
        return True
    return street in known_streets

# Only the three-symbol rule gets the validator; the two-symbol rule has no
# town to check. As the output below shows, only 'Ülikooli 5' remains.
grammar = Grammar(start_symbols=['ADDRESS'], 
                  legal_attributes=['INDEKS', 'grammar_symbol', 'MAJA', 'TÄNAV', 'MAAKOND', 'ASULA']
                  )

grammar.add(Rule('ADDRESS', 'TÄNAV MAJA ASULA', group='g0', priority=3, decorator=address_decorator, validator=validator))
grammar.add(Rule('ADDRESS', 'TÄNAV MAJA',       group='g0', priority=3, decorator=address_decorator))
tagger = GrammarParsingTagger(grammar=grammar,
                              layer_of_tokens='address_parts',
                              name_attribute='grammar_symbol',
                              layer_name='addresses_4',
                              attributes=('INDEKS', 'grammar_symbol', 'MAJA', 'TÄNAV', 'MAAKOND', 'ASULA'),
                              output_ambiguous=True
                              )
tagger.tag(text)
text.addresses_4

layer name,attributes,parent,enveloping,ambiguous,span count
addresses_4,"INDEKS, grammar_symbol, MAJA, TÄNAV, MAAKOND, ASULA",,address_parts,True,1

text,INDEKS,grammar_symbol,MAJA,TÄNAV,MAAKOND,ASULA
"['Ülikooli', '5']",,ADDRESS,5,Ülikooli,,


## `SEQ` rules

In [12]:
def address_decorator(nodes):
    """Collect address-part texts into an attribute dict, with ``SEQ(MAJA)`` support.

    A node named 'ASULA', 'TÄNAV', 'INDEKS', 'MAAKOND' or 'MAJA' stores its
    ``text`` under that key. A node named 'SEQ(MAJA)' stores under 'MAJA'
    the list of texts of the items in its ``support``. A later node
    overwrites an earlier value for the same key; keys without a matching
    node keep the empty string. Nodes with any other name are ignored.

    :param nodes: iterable of objects with a ``name`` attribute and either
        a ``text`` or (for 'SEQ(MAJA)') a ``support`` attribute
    :return: dict with 'grammar_symbol' set to 'ADDRESS' plus one entry
        per address part
    """
    fields = {'ASULA': '', 'TÄNAV': '', 'INDEKS': '', 'MAAKOND': '', 'MAJA': ''}
    for item in nodes:
        label = item.name
        if label == 'SEQ(MAJA)':
            fields['MAJA'] = [member.text for member in item.support]
        elif label in fields:
            fields[label] = item.text
    return {'grammar_symbol': 'ADDRESS', **fields}

# Example sentence that lists several house numbers after the street name.
text = Text('Veekatkestus Tartu Veski tänava majades 3, 5, 7.').tag_layer(['words'])
# Add the 'address_parts' layer that the grammar tagger parses.
address_part_tagger.tag(text)

# New grammar for the SEQ example; its rule is added further below.
grammar = Grammar(start_symbols=['ADDRESS'],
                  legal_attributes=['INDEKS', 'grammar_symbol', 'MAJA', 'TÄNAV', 'MAAKOND', 'ASULA']
                  )
def scoring(node):
    """Return the number of items in the support of ``node[2]``.

    With the rule 'ASULA TÄNAV SEQ(MAJA)' used in this cell, index 2 is the
    ``SEQ(MAJA)`` part.
    """
    house_sequence = node[2]
    return len(house_sequence.support)

# SEQ(MAJA) stands for a sequence of MAJA symbols. As the output below shows,
# with output_ambiguous=True the parses with one, two and three house numbers
# are all kept.
grammar.add(Rule('ADDRESS', 'ASULA TÄNAV SEQ(MAJA)', group='g0', priority=3, decorator=address_decorator, scoring=scoring))
tagger = GrammarParsingTagger(grammar=grammar,
                              layer_of_tokens='address_parts',
                              name_attribute='grammar_symbol',
                              layer_name='addresses_4',
                              attributes=('INDEKS', 'grammar_symbol', 'MAJA', 'TÄNAV', 'MAAKOND', 'ASULA'),
                              output_ambiguous=True
                              )
tagger.tag(text)
text.addresses_4

layer name,attributes,parent,enveloping,ambiguous,span count
addresses_4,"INDEKS, grammar_symbol, MAJA, TÄNAV, MAAKOND, ASULA",,address_parts,True,3

text,INDEKS,grammar_symbol,MAJA,TÄNAV,MAAKOND,ASULA
"['Tartu', 'Veski', '3']",,ADDRESS,['3'],Veski,,Tartu
"['Tartu', 'Veski', '3', '5']",,ADDRESS,"['3', '5']",Veski,,Tartu
"['Tartu', 'Veski', '3', '5', '7']",,ADDRESS,"['3', '5', '7']",Veski,,Tartu
