In [1]:
import csv
from collections import Counter
import re
from grammarextractor.taggers import RobustDateNumberTagger
from estnltk.taggers import GapTagger
from estnltk.finite_grammar import Rule, Grammar
from estnltk.taggers import GrammarParsingTagger
from estnltk.taggers import RegexTagger
from estnltk import Text

In [2]:
# Sample inputs: a number followed by an abbreviation. Per the cell outputs
# in this notebook, 'p' is not picked up by the POS pattern while 'pos' is.
examples = ['5 p', '5 pos']

In [3]:
# Vocabulary handed to RegexTagger: one dict per token type to recognise.
# 'grammar_symbol' is the terminal name the grammar rules below refer to;
# 'value' is a callable that receives the regex match object.
vocabulary = [
    # NUMBER: an integer, optionally followed by a decimal part whose
    # separator is ',' or '.' with optional whitespace around it.
    # Group 1 is a guard (start of text or one character that is neither a
    # digit nor a dot); group 2 is the number itself.
    {
        'grammar_symbol': 'NUMBER',
        'regex_type': 'anynumber',
        '_regex_pattern_': r'(^|[^0-9.])([0-9]+(\s?[,.]\s?[0-9]+)?)',
        # NOTE(review): presumably '_group_' selects the regex group that
        # defines the tagged span. If so, group 0 also covers the guard
        # character from group 1 (e.g. a preceding space) - confirm whether
        # group 2 was intended.
        '_group_': 0,
        '_priority_': 1,
        '_validator_': lambda m: True,
        # Normalise the decimal separator to a plain '.'. The pattern is a
        # raw string: '\s' in a normal literal is an invalid escape sequence
        # and triggers a warning on modern Python versions.
        'value': lambda m: re.sub(r'\s?[.,]\s?', '.', m.group(2)),
    },
    # POS: a "positive" marker in any of the listed spellings, or a '+'.
    {
        'grammar_symbol': 'POS',
        'regex_type': 'pos',
        '_regex_pattern_': r'(POSITIVE|POSITIIVNE|POSIT|POS|[Pp]ositiivne|[Pp]osit|[Pp]os|\+)',
        '_group_': 0,
        '_priority_': 0,
        '_validator_': lambda m: True,
        # The value is the matched text, unchanged.
        'value': lambda m: m.group(0),
    },
]

In [4]:
# Tagger that applies `vocabulary` to a Text and stores the matches in the
# 'numbers' layer, exposing the four listed attributes on every span.
# NOTE(review): the exact meaning of conflict_resolving_strategy='MAX' and
# overlapped=True is defined by EstNLTK's RegexTagger - see its documentation.
regex_tagger = RegexTagger(
    output_layer='numbers',
    vocabulary=vocabulary,
    output_attributes=('grammar_symbol', 'regex_type', 'value', '_priority_'),
    priority_attribute='_priority_',
    conflict_resolving_strategy='MAX',
    overlapped=True,
    ambiguous=True,
)

In [5]:
def trim(t: str) -> str:
    """Clean the edges of a text fragment.

    Three passes are applied in order: surrounding whitespace is removed,
    then surrounding commas, then any whitespace exposed by removing the
    commas. Only one layer of commas is peeled off, so an input such as
    ', , x' comes back as ', x'.
    """
    without_outer_space = t.strip()
    without_outer_commas = without_outer_space.strip(',')
    return without_outer_commas.strip()

In [6]:
# Tagger that writes a 'gaps' layer derived from the 'numbers' layer; the
# `trim` function is supplied as its trimming callback.
gap_tagger = GapTagger(
    trim=trim,
    input_layers=['numbers'],
    output_layer='gaps',
)

In [7]:
# Quick check of the regex stage only: tag each example and print the raw
# text followed by the texts of the spans found in the 'numbers' layer.
for ex in examples:
    text = Text(ex)
    regex_tagger.tag(text)
    print(text.text)
    # List of matched span texts, e.g. ['5', 'pos'] for '5 pos'.
    print(text.numbers.text)
    print('----------------------')

5 p
['5']
----------------------
5 pos
['5', 'pos']
----------------------


In [8]:
# Grammar with two start symbols; rules are registered on it separately via
# grammar.add_rule(...).
grammar = Grammar(
    legal_attributes=('TYPE',),  # the default
    start_symbols=['MEASUREMENT', 'EVAL_MEASUREMENT'],
)

In [9]:
def dec1(nodes):
    """Grammar-rule decorator that labels a match as a plain measurement.

    `nodes` is accepted to satisfy the decorator call signature but is not
    inspected. Returns a fresh dict ``{'TYPE': 'MEASUREMENT'}`` on each call.
    """
    attributes = dict(TYPE='MEASUREMENT')
    return attributes

In [10]:
def dec2(nodes):
    """Grammar-rule decorator that labels a match as an evaluated measurement.

    `nodes` is accepted to satisfy the decorator call signature but is not
    inspected. Returns a fresh dict ``{'TYPE': 'EVAL_MEASUREMENT'}`` on each
    call.
    """
    attributes = dict(TYPE='EVAL_MEASUREMENT')
    return attributes

In [11]:
# Register the grammar rules (all in group 'g0').
# NUM is an intermediate symbol that wraps the number terminals.
# NOTE(review): the vocabulary visible in this notebook only emits NUMBER and
# POS; NUMBER2 / NUMBER3 are presumably left over from a larger grammar -
# confirm whether these two rules are still needed.
grammar.add_rule('NUM', 'NUMBER',  group = 'g0', priority = 1, decorator = dec1)
grammar.add_rule('NUM', 'NUMBER2',  group = 'g0', priority = 1, decorator = dec1)
grammar.add_rule('NUM', 'NUMBER3',  group = 'g0', priority = 1, decorator = dec1)
# A bare number is a MEASUREMENT.
grammar.add_rule('MEASUREMENT', 'NUM',  group = 'g0', priority = 1, decorator = dec1)
# A number followed by a POS token is an EVAL_MEASUREMENT. It has priority 0
# versus 1 for the other rules; the notebook output for '5 pos' shows only the
# EVAL_MEASUREMENT parse, so presumably the lower value wins - confirm.
grammar.add_rule('EVAL_MEASUREMENT', 'NUM POS',  group = 'g0', priority = 0, decorator = dec2)

In [12]:
# Tagger that parses the tokens of the 'numbers' layer with `grammar` and
# writes the result to the 'measurements' layer with a single TYPE attribute.
gram_tagger = GrammarParsingTagger(
    grammar=grammar,
    layer_of_tokens='numbers',
    layer_name='measurements',
    attributes=('TYPE',),
    output_ambiguous=True,  # default False, True recommended
)

In [13]:
# Run the full pipeline (regex -> gaps -> grammar) on every example and keep
# the tagged Text objects, in the same order as `examples`.
gram_tagged = []
for ex in examples:
    # NOTE: `ex` is rebound from the raw string to its Text wrapper here.
    ex = Text(ex)
    regex_tagger.tag(ex)
    gap_tagger.tag(ex)
    gram_tagger.tag(ex)
    gram_tagged.append(ex)

In [14]:
# Display the 'numbers' layer of the first example ('5 p').
gram_tagged[0].numbers

layer name,attributes,parent,enveloping,ambiguous,span count
numbers,"grammar_symbol, regex_type, value, _priority_",,,True,1

text,grammar_symbol,regex_type,value,_priority_
5,NUMBER,anynumber,5,1


In [15]:
# Display the 'measurements' layer of the first example ('5 p').
gram_tagged[0].measurements

layer name,attributes,parent,enveloping,ambiguous,span count
measurements,TYPE,,numbers,True,1

text,TYPE
['5'],MEASUREMENT


In [16]:
# Display the 'numbers' layer of the second example ('5 pos').
gram_tagged[1].numbers

layer name,attributes,parent,enveloping,ambiguous,span count
numbers,"grammar_symbol, regex_type, value, _priority_",,,True,2

text,grammar_symbol,regex_type,value,_priority_
5,NUMBER,anynumber,5,1
pos,POS,pos,pos,0


In [17]:
# Display the 'measurements' layer of the second example ('5 pos').
gram_tagged[1].measurements

layer name,attributes,parent,enveloping,ambiguous,span count
measurements,TYPE,,numbers,True,1

text,TYPE
"['5', 'pos']",EVAL_MEASUREMENT
