Use [RISE](https://github.com/damianavila/RISE) to display the slides.

```bash
pip install RISE
```

kuu - month <br>
kärbes - fly <br>
mees - man <br>
mesi - honey <br>
oli - was <br>
päev - day <br>
tund - hour <br>
toakärbes - housefly

<H2 style="text-align: center;">
Estonian Natural Language Toolkit
</H2>


<H1 style="text-align: center;">
Paul Tammo
</H1>

<H4 style="text-align: center;">
University of Tartu, STACC, Fujitsu
</H4>

<H4 style="text-align: center;">
PyCon Oct 3, 2019, Tallinn
</H4>

# Source

https://github.com/estnltk/estnltk/

# Installation
Optionally create and activate a conda environment. Use Python 3.5 or 3.6.
```bash
conda create -n estnltk python=3.6 -y
conda activate estnltk
```
Install EstNLTK 1.6 using the precompiled packages from <br> https://anaconda.org/estnltk/estnltk
```bash
conda install -c estnltk -c conda-forge estnltk
```

# Structure of the `Text` object

<center><img src='text_object.png'></center>

In [1]:
from estnltk import Text

# Build a Text and run the 'morphology' analysis; the chained result is bound to `text`.
text = Text('Mees oli kärbes.').analyse('morphology')
# Bare last expression: the notebook renders the morph_analysis layer as a table.
text.morph_analysis

layer name,attributes,parent,enveloping,ambiguous,span count
morph_analysis,"normalized_text, lemma, root, root_tokens, end...",words,,True,4

text,normalized_text,lemma,root,root_tokens,ending,clitic,form,partofspeech
Mees,Mees,mees,mees,['mees'],0,,sg n,S
oli,oli,olema,ole,['ole'],i,,s,V
kärbes,kärbes,kärbes,kärbes,['kärbes'],0,,sg n,S
.,.,.,.,['.'],,,,Z



# Longer example

from https://et.wikipedia.org/wiki/Harilik_toakärbes

In [2]:
# The adjacent string literals are concatenated into one multi-sentence string.
text = Text(
'Emane toakärbes muneb kõdunevale aluspinnale munad – 100–120 tükki korraga. '
'Toakärbse munade areng vältab vahel vaid 8 tundi, harva 3 päeva või kauem. '
'Mõne päeva möödudes on vaglad 8–12 mm pikad. '
'Täiskasvanud toakärbse eluiga on 15–20 päeva, kuid nad võivad elada kuni 2 kuud.')

# After this call the text carries the sentences, words and morph_analysis layers
# (see the layer table in the cell output).
text.analyse('morphology')

# Bare last expression: renders the text together with its layer summary.
text
"Emane toakärbes muneb kõdunevale aluspinnale munad – 100–120 tükki korraga. Toakärbse munade areng vältab vahel vaid 8 tundi, harva 3 päeva või kauem. Mõne päeva möödudes on vaglad 8–12 mm pikad. Täiskasvanud toakärbse eluiga on 15–20 päeva, kuid nad võivad elada kuni 2 kuud."

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,4
words,normalized_form,,,True,56
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,56


# Iterate text and search

In [3]:
# Print every sentence that contains the exact word form 'toakärbes'
# (each matching sentence is printed once, however many matches it has).
# Conditions to try instead inside any(...):
#   'toakärbse' == word.text.lower()
#   'toakärb' in word.text
#   'toakärbes' in word.lemma
for sentence in text.sentences:
    if any('toakärbes' == word.text for word in sentence):
        print(sentence.enclosing_text)

Emane toakärbes muneb kõdunevale aluspinnale munad – 100–120 tükki korraga.


# Taggers create layers

In [4]:
from estnltk import Layer
from estnltk.taggers import Tagger, GrammarParsingTagger

class TemporalTokenTagger(Tagger):
    """Tag tokens of temporal expressions

    Reads the 'morph_analysis' layer and writes the 'temporal_tokens' layer,
    whose single attribute 'token_type' is one of:

    * 'UNIT'   - some analysis of the span has a time-unit lemma,
    * 'NUMBER' - some analysis of the span has a numeric lemma,
    * 'GAP'    - none of the span's analyses matched.

    The analyses of a span are inspected in order and the first one that
    matches decides the type.
    """
    conf_param = []

    def __init__(self):
        self.input_layers = ['morph_analysis']
        self.output_layer = 'temporal_tokens'
        self.output_attributes = ['token_type']

    def _make_layer(self, text, layers, status):
        """Create the 'temporal_tokens' layer with one annotation per morph_analysis span."""
        unit_lemmas = {'sekund', 'minut', 'tund', 'päev', 'nädal', 'kuu',
                       'aasta', 'sajand'}
        layer = Layer(name=self.output_layer, attributes=self.output_attributes, text_object=text)

        for span in layers['morph_analysis']:
            # Default when no analysis of this span is a unit or a number.
            token_type = 'GAP'
            for annotation in span.annotations:
                lemma = annotation['lemma']
                if lemma in unit_lemmas:
                    token_type = 'UNIT'
                    break
                if lemma.isnumeric():
                    token_type = 'NUMBER'
                    break
            layer.add_annotation(span.base_span, token_type=token_type)
        return layer

In [5]:
temporal_token_tagger = TemporalTokenTagger()
# tag() attaches the new 'temporal_tokens' layer to `text` (see the layer table in the output).
temporal_token_tagger.tag(text)

# Bare last expression: renders the text with its updated layer summary.
text
"Emane toakärbes muneb kõdunevale aluspinnale munad – 100–120 tükki korraga. Toakärbse munade areng vältab vahel vaid 8 tundi, harva 3 päeva või kauem. Mõne päeva möödudes on vaglad 8–12 mm pikad. Täiskasvanud toakärbse eluiga on 15–20 päeva, kuid nad võivad elada kuni 2 kuud."

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,4
words,normalized_form,,,True,56
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,56
temporal_tokens,token_type,,,False,56


In [6]:
def token_highlighter(x):
    """Pick a background colour from the token type of a span.

    `x[1]` must expose `annotations`; the 'token_type' of its first
    annotation selects the colour: GAP -> white, NUMBER -> aqua,
    UNIT -> yellow. Any other token type raises KeyError.
    """
    token_type = x[1].annotations[0]['token_type']
    colours = {'GAP': 'white', 'NUMBER': 'aqua', 'UNIT': 'yellow'}
    return colours[token_type]

# Render the layer, using token_highlighter to choose each span's background colour.
text.temporal_tokens.display(mapping_dict={'background': token_highlighter})

# GrammarParsingTagger
Tag temporal expressions with `GrammarParsingTagger`

In [7]:
from estnltk.finite_grammar import Grammar
from estnltk.taggers import GrammarParsingTagger

# One-rule grammar: the start symbol TEMP is the sequence NUMBER, UNIT.
grammar = Grammar(start_symbols=['TEMP'])
grammar.add_rule('TEMP', ['NUMBER', 'UNIT'])

# Grammar symbols are taken from the 'token_type' attribute of the 'temporal_tokens'
# layer; the tagger's output goes to a new 'temporal_expressions' layer.
temporal_expression_tagger = GrammarParsingTagger(grammar=grammar, 
                              layer_of_tokens='temporal_tokens',
                              name_attribute='token_type',
                              output_layer='temporal_expressions')
temporal_expression_tagger.tag(text)
# Highlight every span of the new layer in yellow.
text.temporal_expressions.display(mapping_dict={'background': lambda s: 'yellow'})

# That was just a demo

# TimexTagger

In [8]:
from estnltk.taggers import TimexTagger

timex_tagger = TimexTagger()

# tag() adds the 'timexes' layer to `text`; every timex is then highlighted in yellow.
timex_tagger.tag(text)
text.timexes.display(mapping_dict={'background': lambda s: 'yellow'})

In [9]:
# Bare expression: renders the timexes layer and its attributes as a table.
text.timexes

layer name,attributes,parent,enveloping,ambiguous,span count
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,words,False,4

text,tid,type,value,temporal_function,anchor_time_id,mod,quant,freq,begin_point,end_point,part_of_interval
"['8', 'tundi']",t1,DURATION,PT8H,False,,,,,,,
"['3', 'päeva']",t2,DURATION,P3D,False,,,,,,,
"['20', 'päeva']",t3,DURATION,P20D,False,,,,,,,
"['2', 'kuud']",t4,DURATION,P2M,False,,,,,,,


# PostgreSQL interface

# Setup

In [10]:
from estnltk.storage.postgres import PostgresStorage, create_schema, delete_schema

# No password is passed in code; presumably it is looked up in the given
# pgpass file instead — confirm against the PostgresStorage documentation.
storage = PostgresStorage(dbname='test_db',
                          password=None,
                          pgpass_file='~/.pgpass',
                          schema='my_schema')
create_schema(storage)
# Creates a new, empty collection named 'my_collection' (see the log output).
collection = storage['my_collection'].create()

INFO:storage.py:42: connecting to host: 'localhost', port: '5432', dbname: 'test_db', user: 'pault'
INFO:storage.py:58: schema: 'my_schema', temporary: False, role: 'pault'
INFO:collection.py:107: new empty collection 'my_collection' created


# Populate the collection

In [11]:
from estnltk.layer_operations import split_by_sentences

# The file 'housefly.txt' contains plain text from https://et.wikipedia.org/wiki/Harilik_toakärbes
# Decode it explicitly as UTF-8 (assumed encoding of the file — confirm): the text is
# Estonian (ä, õ, ü, ...) and the platform default encoding is not UTF-8 everywhere,
# so relying on the default can silently corrupt or fail on these characters.
with open('housefly.txt', encoding='utf-8') as text_file:
    text = Text(text_file.read())

text.analyse('morphology')

# Split the analysed document into sentence-level texts (presumably one Text
# per sentence — confirm) for insertion.
texts = split_by_sentences(text)

# Insert every piece into the collection. The loop variable has its own name so
# that it does not rebind the document-level `text` defined above.
with collection.insert() as collection_insert:
    for sentence_text in texts:
        collection_insert(sentence_text)

INFO:collection.py:325: inserted 100 texts into the collection 'my_collection'


In [12]:
# Bare expression: renders the table of layers currently in the collection.
collection

Unnamed: 0,layer_type,attributes,ambiguous,parent,enveloping,meta
morph_analysis,attached,"(normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech)",True,words,,[]
sentences,attached,(),False,,words,[]
words,attached,"(normalized_form,)",True,,,[]


# Tag temporal expressions

In [13]:
# Creates the tagger's output layer ('temporal_tokens') in the collection; see the log output.
collection.create_layer(tagger=temporal_token_tagger)

INFO:collection.py:817: collection: 'my_collection'
INFO:collection.py:836: preparing to create a new layer: 'temporal_tokens'
INFO:collection.py:869: inserting data into the 'temporal_tokens' layer table
INFO:collection.py:904: layer created: 'temporal_tokens'


In [14]:
# Creates the tagger's output layer ('temporal_expressions') in the collection; see the log output.
collection.create_layer(tagger=temporal_expression_tagger)

INFO:collection.py:817: collection: 'my_collection'
INFO:collection.py:836: preparing to create a new layer: 'temporal_expressions'
INFO:collection.py:869: inserting data into the 'temporal_expressions' layer table
INFO:collection.py:904: layer created: 'temporal_expressions'


In [15]:
# Bare expression: renders the layer table again, now including the two new layers.
collection

Unnamed: 0,layer_type,attributes,ambiguous,parent,enveloping,meta
morph_analysis,attached,"(normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech)",True,words,,[]
sentences,attached,(),False,,words,[]
temporal_expressions,detached,(),False,,temporal_tokens,[]
temporal_tokens,detached,"(token_type,)",False,,,[]
words,attached,"(normalized_form,)",True,,,[]


# Iterate the collection

In [16]:
# Lazy generator: nothing is evaluated until next() is called on it.
# select() is iterated as pairs; only the second item (the text) is kept,
# and only when its temporal_expressions layer is non-empty.
examples = (text for _, text in collection.select(layers=['temporal_expressions']) 
            if len(text.temporal_expressions) > 0)

In [17]:
# Take the next matching text; raises StopIteration once the generator is exhausted.
# Re-running this cell steps through further examples.
text = next(examples)
text.temporal_expressions.display(mapping_dict={'background': lambda s: 'yellow'})

In [18]:
from estnltk.storage.postgres import JsonbTextQuery

# Two lemma queries on the morph_analysis layer combined with `|`
# (presumably a logical OR of the two conditions — confirm).
q = JsonbTextQuery('morph_analysis', lemma='kärbes') | \
    JsonbTextQuery('morph_analysis', lemma='toakärbes')

In [19]:
def fly_highlighter(segment):
    """Return 'yellow' for fly-related spans and 'white' for everything else.

    A span counts as fly-related when 'toakärbes' or 'kärbes' is `in`
    `segment[1].lemma`. Presumably `lemma` is a sequence of lemmas, making
    this a membership test; for a plain string `in` is a substring test.
    """
    lemmas = segment[1].lemma
    is_fly = any(target in lemmas for target in ('toakärbes', 'kärbes'))
    return 'yellow' if is_fly else 'white'

# Lazy generator over the texts selected by query `q`, loaded with their morph_analysis layer.
examples = (text for _, text in collection.select(query=q, layers=['morph_analysis']))

In [20]:
# Take the next matching text; raises StopIteration once the generator is exhausted.
text = next(examples)
# Background colours come from fly_highlighter.
text.morph_analysis.display(mapping_dict={'background': fly_highlighter})

# Teardown

In [21]:
# Drop the demo schema. The storage is closed in `finally` so that a failure
# while deleting the schema does not leave the database connection open.
try:
    delete_schema(storage)
finally:
    storage.close()

# Project manager
Sven Laur

# Developers

Rasmus Maide,
Siim Orasmaa,
Timo Petmanson,
Uku Raudvere,
Dage Särg,
Paul Tammo,
Aleksandr Tkatšenko

# Consulting
Heiki-Jaan Kaalep,
Kadri Muischnek,
Kairit Sirts,
Tarmo Vaino

<font size="7">
$$\lim_{n \to \infty} \textrm C(++)^n = \textrm{Python}$$
</font>