In [1]:
from estnltk.text import Layer, Span, Text

# Low-level layer operations
## Create a layer

In [2]:
# An ambiguous layer whose spans are attached to spans of the 'words' layer;
# every span carries a single attribute, 'a'.
layer = Layer(
    name='test_layer',
    attributes=['a'],
    parent='words',
    ambiguous=True,
)

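A layer must be given a name, even though the constructor signature defaults `name` to `None`. The default values of the constructor arguments are: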
```python
name=None,
attributes:Union[Tuple, List]=tuple(),
parent:str=None,
enveloping:str=None,
ambiguous:bool=None
```

## Add spans and attributes

In [3]:
# tag_layer() without arguments is relied on to produce the 'words' layer used below.
text = Text('Tere maailm!').tag_layer()

# Attach two spans to every word; they differ only in the value of attribute 'a'.
for word in text.words:
    for value in (1, 2):
        layer.add_span(Span(parent=word)).a = value

# Register the layer on the text and display it.
text['test_layer'] = layer
text.test_layer

SL[SL[Span(Tere, {'a': 1}),
Span(Tere, {'a': 2})],
SL[Span(maailm, {'a': 1}),
Span(maailm, {'a': 2})],
SL[Span(!, {'a': 1}),
Span(!, {'a': 2})]]

# Rewrite

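A rewriter is an object that provides a `rewrite` method. The method is called with a record (a dictionary with keys such as `start`) and returns the record for the new layer. If `rewrite` returns `None`, the corresponding span is not created.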

In [4]:
class TestRewriter:
    """Rewriter that drops the record starting at offset 0 and keeps every other one."""

    def rewrite(self, record):
        """Return ``None`` when ``record['start']`` is 0, otherwise ``record`` unchanged."""
        return None if record['start'] == 0 else record

# Only the 'words' layer is requested; it is the source of the rewrite.
t = Text('Tere maailm!')
t.tag_layer(('words',))

# No attributes are read or written here: TestRewriter only decides
# which spans make it into the new layer.
test_layer = t['words'].rewrite(
    source_attributes=[],
    target_attributes=[],
    rules=TestRewriter(),
    name='test_layer',
)
t['test_layer'] = test_layer
t.test_layer

SL[Span(maailm, {}),
Span(!, {})]

# Tag phrases

In [5]:
import itertools

from estnltk import Text
from estnltk.text import Layer


class UppercasePhraseTagger:
    """Demo tagger that marks runs of consecutive uppercase words within a sentence.

    Every run of at least two adjacent uppercase words is added as one span of
    the enveloping layer 'uppercasephrase', with the attributes 'phrasetext'
    (the lowercased, space-joined text of the words) and 'tag' (the running
    index of the phrase).
    """

    def tag(self, text:Text) -> Text:
        """Add the 'uppercasephrase' layer to ``text`` and return ``text``.

        The method reads ``text.sentences.words``, so the 'words' and
        'sentences' layers have to be tagged beforehand.
        """
        phrases = []
        for sentence in text.sentences.words:
            run = []
            for word in sentence.words:
                # Uppercase word: unchanged by upper() but changed by lower(),
                # which excludes punctuation and digits.
                if word.text.upper() == word.text and word.text.lower() != word.text:
                    run.append(word)
                    continue
                # A non-uppercase word ends the current run; keep runs of 2+ words.
                if len(run) > 1:
                    phrases.append(run)
                run = []
            # Close the run at the end of the sentence so that a phrase never
            # continues into the next sentence.
            if len(run) > 1:
                phrases.append(run)

        layer = Layer(enveloping='words', name='uppercasephrase', attributes=['phrasetext', 'tag'])

        for idx, words in enumerate(phrases):
            span = layer._add_spans_to_enveloping(words)
            span.phrasetext = ' '.join(word.text for word in words).lower()
            span.tag = idx
        text._add_layer(layer)

        return text


w = UppercasePhraseTagger()
t = w.tag(Text('Minu KARU ON PUNANE. MIS värvi SINU KARU on? Kuidas PALUN?').tag_layer(['words', 'sentences']))
t.tag_layer(['morf_analysis'])
print(t.uppercasephrase.get_attributes(['phrasetext', 'text']))
#[[('karu on punane', 'KARU'), ('karu on punane', 'ON'), ('karu on punane', 'PUNANE')], [('sinu karu', 'SINU'), ('sinu karu', 'KARU')]]

print(t.phrasetext)
#['karu on punane', 'sinu karu']

print(t.uppercasephrase.lemma)
# [[['karu'], ['olema', 'olema'], ['punane']], [['sina'], ['karu']]]

# Flatten the phrases into one list of word spans once, instead of rebuilding
# the list for every word of the text inside the comprehensions below.
phrase_words = list(itertools.chain(*t.uppercasephrase.spans))

print([i.text for i in t.words if i not in phrase_words])
# ['Minu', '.', 'MIS', 'värvi', 'on', '?', 'Kuidas', 'PALUN', '?']

# Map every word of a phrase to the (first) phrase span that contains it.
mapping = {}
for phrase in t.uppercasephrase.spans:
    for member in phrase:
        mapping.setdefault(member, phrase)
print([i.text for i in t.words if i not in phrase_words])
# ['Minu', '.', 'MIS', 'värvi', 'on', '?', 'Kuidas', 'PALUN', '?']

# Replace the words that belong to a phrase with the tag of that phrase.
print([i.text if i not in mapping else mapping[i].tag for i in t.words])
# ['Minu', 0, 0, 0, '.', 'MIS', 'värvi', 1, 1, 'on', '?', 'Kuidas', 'PALUN', '?']

[[('karu on punane', 'KARU'), ('karu on punane', 'ON'), ('karu on punane', 'PUNANE')], [('sinu karu', 'SINU'), ('sinu karu', 'KARU')]]
['karu on punane', 'sinu karu']
[[['karu'], ['olema', 'olema'], ['punane']], [['sina'], ['karu']]]
['Minu', '.', 'MIS', 'värvi', 'on', '?', 'Kuidas', 'PALUN', '?']
['Minu', '.', 'MIS', 'värvi', 'on', '?', 'Kuidas', 'PALUN', '?']
['Minu', 0, 0, 0, '.', 'MIS', 'värvi', 1, 1, 'on', '?', 'Kuidas', 'PALUN', '?']
