In [1]:
from estnltk import Text
from estnltk.taggers import RegexTagger, SpanTagger, PhraseTagger
import csv
import re
from collections import defaultdict, Counter
from estnltk.finite_grammar import Rule, Grammar
from estnltk.taggers import Vocabulary
from estnltk.taggers import Atomizer
from estnltk.taggers import MergeTagger

In [2]:
# Test data: the unique values of the first CSV column, in first-seen order.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used; confirm the file's encoding and pin it if addresses are non-ASCII.
hosp_addresses = []
_seen_addresses = set()  # O(1) duplicate check; the list keeps file order
# newline="" is what the csv module documents for files given to csv.reader.
with open("hospital_addresses.csv", "r", newline="") as fin:
    reader = csv.reader(fin)
    header = next(reader, None)  # header row; None if the file is empty
    for row in reader:
        if not row:
            continue  # csv.reader yields [] for a blank line; nothing to read
        address = row[0]
        if address not in _seen_addresses:
            _seen_addresses.add(address)
            hosp_addresses.append(address)

In [3]:
# House number: one to three groups of "1-3 digits, optional letter a-k/A-K,
# optional slash", optionally followed by a dash and another 1-3 digit number.
_HOUSE_NR_PATTERN = r'([0-9]{1,3}([abcdefghijkABCDEFGHIJK])?/?){1,3}(\s*-\s*[0-9]{1,3})?'


def _house_nr_is_valid(m):
    """Accept a match only if its text has no run of four or more digits."""
    return not re.search(r'[0-9]{4,}', m.group(0))


def _house_nr_value(m):
    """Return what the house-number pattern finds inside the matched text."""
    return re.search(_HOUSE_NR_PATTERN, m.group(0)).group(0)


# Vocabulary with a single rule, consumed by the RegexTagger built from it.
house_nr_voc = [
    dict(
        grammar_symbol='HOUSE',
        regex_type='house_nr',
        _regex_pattern_=_HOUSE_NR_PATTERN,
        _group_=0,
        _priority_=1,
        _validator_=_house_nr_is_valid,
        value=_house_nr_value,
    ),
]



In [4]:
# Regex-based tagger for house numbers, driven by the house_nr_voc rules.
house_nr_tagger = RegexTagger(
    vocabulary=house_nr_voc,
    output_attributes=('grammar_symbol', 'regex_type', 'value'),
)

In [5]:
# Path of the CSV vocabulary handed to the place-name PhraseTagger.
vocabulary_file = 'name_vocabulary.csv'

In [6]:
# Phrase tagger that matches vocabulary entries against the 'text' attribute
# of the 'words' layer and writes its results to the 'phrases' layer.
# NOTE: priority_attribute='_priority_' was tried here and is currently disabled.
place_name_tagger = PhraseTagger(
    output_layer='phrases',
    input_layer='words',
    input_attribute='text',
    vocabulary=vocabulary_file,
    output_attributes=['type', 'grammar_symbol'],
    conflict_resolving_strategy='ALL',
)

In [7]:
# Path of the CSV vocabulary handed to the special-word SpanTagger.
spec_word_vocabulary = 'spec_word_voc.csv'

In [8]:
# Span tagger that matches vocabulary entries against the 'text' attribute
# of the 'words' layer and writes its results to the 'spec_word' layer.
spec_voc_tagger = SpanTagger(
    output_layer='spec_word',
    input_layer='words',
    input_attribute='text',
    output_attributes=('type', 'grammar_symbol'),
    vocabulary=spec_word_vocabulary,
)

In [9]:
def tag_sent(sent):
    """Segment an address string and run the three address taggers on it.

    Args:
        sent: Raw text to analyse (an address string in this notebook).

    Returns:
        The Text object after segmentation analysis, with the house-number,
        place-name and special-word taggers applied in that order.
    """
    tagged_text = Text(sent).analyse('segmentation')

    # Each tagger adds its own layer to the same Text object.
    house_nr_tagger.tag(tagged_text)
    place_name_tagger.tag(tagged_text)
    spec_voc_tagger.tag(tagged_text)

    return tagged_text

In [10]:
# Sample input: the address at index 5 of the de-duplicated test data.
b = hosp_addresses[5]

In [11]:
# Segment and tag the sample address; c is the Text object used by later cells.
c = tag_sent(b)

In [12]:
# Builds 'some_layer' from the 'regexes' layer (the house-number matches),
# carrying over the listed attributes.
atomizer = Atomizer(
    output_layer='some_layer',
    input_layer='regexes',
    output_attributes=['grammar_symbol', 'regex_type', 'value'],
    enveloping=None,  # same as the default
)

In [13]:
# Builds 'some_layer2' from the 'phrases' layer (the place-name matches),
# carrying over the listed attributes.
atomizer2 = Atomizer(
    output_layer='some_layer2',
    input_layer='phrases',
    output_attributes=['grammar_symbol', 'type'],
    enveloping=None,  # same as the default
)

In [14]:
# Builds 'some_layer3' from the 'spec_word' layer (the special-word matches),
# carrying over the listed attributes.
atomizer3 = Atomizer(
    output_layer='some_layer3',
    input_layer='spec_word',
    output_attributes=['grammar_symbol', 'type'],
    enveloping=None,  # same as the default
)

In [15]:
# Apply the first atomizer to c; the output below lists the new 'some_layer'.
atomizer(c)

text
Pärnu mnt.102C

layer name,attributes,parent,enveloping,ambiguous,span count
paragraphs,,,sentences,False,1
sentences,,,words,False,2
words,normalized_form,,,False,4
phrases,"type, grammar_symbol",,words,False,1
regexes,"grammar_symbol, regex_type, value",,,False,1
some_layer,"grammar_symbol, regex_type, value",,,False,1
spec_word,"type, grammar_symbol",words,,False,1


In [16]:
# Apply the second atomizer to c; the output below lists the new 'some_layer2'.
atomizer2(c)

text
Pärnu mnt.102C

layer name,attributes,parent,enveloping,ambiguous,span count
paragraphs,,,sentences,False,1
sentences,,,words,False,2
words,normalized_form,,,False,4
phrases,"type, grammar_symbol",,words,False,1
regexes,"grammar_symbol, regex_type, value",,,False,1
some_layer,"grammar_symbol, regex_type, value",,,False,1
some_layer2,"grammar_symbol, type",,,False,1
spec_word,"type, grammar_symbol",words,,False,1


In [17]:
# Apply the third atomizer to c; the output below lists the new 'some_layer3'.
atomizer3(c)

text
Pärnu mnt.102C

layer name,attributes,parent,enveloping,ambiguous,span count
paragraphs,,,sentences,False,1
sentences,,,words,False,2
words,normalized_form,,,False,4
phrases,"type, grammar_symbol",,words,False,1
regexes,"grammar_symbol, regex_type, value",,,False,1
some_layer,"grammar_symbol, regex_type, value",,,False,1
some_layer2,"grammar_symbol, type",,,False,1
some_layer3,"grammar_symbol, type",,,False,1
spec_word,"type, grammar_symbol",words,,False,1


In [18]:
# Display the Text object: its text and a summary table of all layers so far.
c

text
Pärnu mnt.102C

layer name,attributes,parent,enveloping,ambiguous,span count
paragraphs,,,sentences,False,1
sentences,,,words,False,2
words,normalized_form,,,False,4
phrases,"type, grammar_symbol",,words,False,1
regexes,"grammar_symbol, regex_type, value",,,False,1
some_layer,"grammar_symbol, regex_type, value",,,False,1
some_layer2,"grammar_symbol, type",,,False,1
some_layer3,"grammar_symbol, type",,,False,1
spec_word,"type, grammar_symbol",words,,False,1


In [19]:
# Merges the three atomized layers into a single 'grammar_tags' layer that
# keeps only the grammar_symbol attribute.
merge_tagger = MergeTagger(
    output_layer='grammar_tags',
    input_layers=['some_layer', 'some_layer2', 'some_layer3'],
    output_attributes=('grammar_symbol',),
)

In [20]:
# Run the merge on c; the output below lists the new 'grammar_tags' layer.
merge_tagger.tag(c)

text
Pärnu mnt.102C

layer name,attributes,parent,enveloping,ambiguous,span count
paragraphs,,,sentences,False,1
sentences,,,words,False,2
words,normalized_form,,,False,4
grammar_tags,grammar_symbol,,,False,3
phrases,"type, grammar_symbol",,words,False,1
regexes,"grammar_symbol, regex_type, value",,,False,1
some_layer,"grammar_symbol, regex_type, value",,,False,1
some_layer2,"grammar_symbol, type",,,False,1
some_layer3,"grammar_symbol, type",,,False,1
spec_word,"type, grammar_symbol",words,,False,1


In [21]:
# Inspect the merged layer: one grammar_symbol per tagged span of the address.
c.grammar_tags

layer name,attributes,parent,enveloping,ambiguous,span count
grammar_tags,grammar_symbol,,,False,3

text,grammar_symbol
Pärnu,LINN
mnt,SPEC
102C,HOUSE
