In [1]:
from estnltk import Text
from estnltk.text import Span, SpanList, Layer

# Low-level layer operations
## Create a layer
```python
Layer(
      name:str=None,
      attributes:Union[Tuple, List]=tuple(),
      parent:str=None,
      enveloping:str=None,
      ambiguous:bool=False
     )
```
**name**: the name of the layer, required argument<br/>
**attributes**: the list of the attributes of the layer<br/>
**parent**: the name of the parent layer<br/>
**enveloping**: the name of the layer whose spans this layer envelops<br/>
**ambiguous**: the layer is ambiguous

### Has a parent, is not enveloping, is ambiguous

In [2]:
# An ambiguous layer with a single attribute 'a' whose parent is 'words'.
layer = Layer(
    name='test_layer',
    parent='words',
    attributes=['a'],
    ambiguous=True,
)

Add spans and attributes

In [3]:
# Tag the words first; every span added below takes one of them as its parent.
text = Text('Tere maailm!').tag_layer(['words'])

# Attach two spans to each word, carrying a=1 and a=2 respectively.
for word in text.words:
    for attribute_value in (1, 2):
        new_span = layer.add_span(Span(parent=word))
        new_span.a = attribute_value

# Store the layer on the text and display it as the cell output.
text['test_layer'] = layer
text['test_layer']

text,a
Tere,1
,2
maailm,1
,2
!,1
,2


### Does not have a parent, is enveloping, is not ambiguous

In [4]:
t = 'Kolmkümmend kolm on suurem kui kaheksateist.'
text = Text(t)
text.tag_layer(['words'])

# A non-ambiguous layer that envelops spans of the 'words' layer.
layer = Layer(
    name='test_layer',
    enveloping='words',
    attributes=['value'],
    ambiguous=False,
)

# Each entry pairs a slice of the word list with the value it spells out.
for word_range, numeric_value in ((slice(0, 2), '33'), (slice(5, 6), '18')):
    spl = SpanList()
    spl.spans = text.words[word_range]
    spl.value = numeric_value
    layer.add_span(spl)

# Store the layer on the text and display it as the cell output.
text['test_layer'] = layer
text['test_layer']

text,value
Kolmkümmend kolm,33
kaheksateist,18


The next demo phrase tagger marks every run of two or more consecutive uppercase words inside a sentence.

In [5]:
import itertools

class UppercasePhraseTagger:
    """Demo tagger that marks runs of consecutive uppercase words.

    A word counts as uppercase when upper-casing leaves its text unchanged
    while lower-casing changes it. Runs of at least two such words become
    spans of a new layer named 'uppercasephrase' that envelops 'words'.
    """

    def tag(self, text:Text) -> Text:
        """Add the 'uppercasephrase' layer to ``text`` and return ``text``.

        Reads ``text.sentences``; each resulting span gets two attributes:
        ``phrasetext`` (the words' texts joined by spaces, lower-cased) and
        ``tag`` (the zero-based index of the phrase).
        """
        from itertools import groupby

        # Collect (running word index, word) for every uppercase word. The
        # running index continues across sentences via ``offset``.
        numbered_uppercase = []
        offset = 0
        for sentence in text.sentences.words:
            for position, word in enumerate(sentence.words):
                # Unchanged by upper() but changed by lower(): this rejects
                # tokens without cased characters, such as punctuation.
                is_uppercase = word.text.upper() == word.text and word.text.lower() != word.text
                if is_uppercase:
                    numbered_uppercase.append((position + offset, word))
            offset += len(sentence)

        def run_key(ranked):
            # Inside a run of consecutive word indices, the difference between
            # the rank in the list and the word index stays constant.
            rank, (word_index, _) = ranked
            return rank - word_index

        runs = [
            [numbered for _, numbered in members]
            for _, members in groupby(enumerate(numbered_uppercase), run_key)
        ]

        # Keep only runs of two or more words and drop the indices.
        phrases = [[word for _, word in run] for run in runs if len(run) > 1]

        phrase_layer = Layer(name='uppercasephrase', enveloping='words', attributes=['phrasetext', 'tag'])
        for tag_id, phrase in enumerate(phrases):
            enveloping_span = phrase_layer._add_spans_to_enveloping(phrase)
            enveloping_span.phrasetext = ' '.join(word.text for word in phrase).lower()
            enveloping_span.tag = tag_id
        text._add_layer(phrase_layer)

        return text


# Build the sample text with the layers the tagger reads: words and sentences.
t = Text('Minu KARU ON PUNANE. MIS värvi SINU KARU on? Kuidas PALUN?')
t.tag_layer(['words', 'sentences'])

# Run the demo tagger; the returned text is displayed as the cell output.
w = UppercasePhraseTagger()
w.tag(t)

text
Minu KARU ON PUNANE. MIS värvi SINU KARU on? Kuidas PALUN?

layer,attributes,parent,enveloping,ambiguous,number of spans
tokens,,,,False,14
compound_tokens,type,,tokens,False,0
words,,,,False,14
sentences,,,words,False,3
uppercasephrase,"phrasetext, tag",,words,False,2


In [6]:
# Look up the 'uppercasephrase' layer on the text by name and display it.
t['uppercasephrase']

text,phrasetext,tag
KARU ON PUNANE,karu on punane,0
SINU KARU,sinu karu,1


In [7]:
t.tag_layer(['morph_analysis'])
phrase_layer = t.uppercasephrase

# Requested attributes for every word, grouped by phrase.
print(phrase_layer.get_attributes(['phrasetext', 'text']))
#[[('karu on punane', 'KARU'), ('karu on punane', 'ON'), ('karu on punane', 'PUNANE')], [('sinu karu', 'SINU'), ('sinu karu', 'KARU')]]

# A layer attribute read directly from the text object.
print(t.phrasetext)
#['karu on punane', 'sinu karu']

# 'lemma' is not one of this layer's own attributes; presumably it is
# resolved through the morph_analysis layer tagged above.
print(phrase_layer.lemma)
# [[['karu'], ['olema', 'olema'], ['punane']], [['sina'], ['karu']]]

# Flatten the phrases into one list of the words they cover.
phrase_words = list(itertools.chain.from_iterable(phrase_layer.spans))

print([word.text for word in t.words if word not in phrase_words])
# ['Minu', '.', 'MIS', 'värvi', 'on', '?', 'Kuidas', 'PALUN', '?']

# Map every covered word to the first phrase that contains it.
mapping = {
    word: next(phrase for phrase in phrase_layer.spans if word in phrase)
    for word in phrase_words
}
print([word.text for word in t.words if word not in phrase_words])
# ['Minu', '.', 'MIS', 'värvi', 'on', '?', 'Kuidas', 'PALUN', '?']

# Replace each covered word by the tag of its phrase; keep the other texts.
print([mapping[word].tag if word in mapping else word.text for word in t.words])
# ['Minu', 0, 0, 0, '.', 'MIS', 'värvi', 1, 1, 'on', '?', 'Kuidas', 'PALUN', '?']

[[('karu on punane', 'KARU'), ('karu on punane', 'ON'), ('karu on punane', 'PUNANE')], [('sinu karu', 'SINU'), ('sinu karu', 'KARU')]]
['karu on punane', 'sinu karu']
[[['karu'], ['olema', 'olema'], ['punane']], [['sina'], ['karu']]]
['Minu', '.', 'MIS', 'värvi', 'on', '?', 'Kuidas', 'PALUN', '?']
['Minu', '.', 'MIS', 'värvi', 'on', '?', 'Kuidas', 'PALUN', '?']
['Minu', 0, 0, 0, '.', 'MIS', 'värvi', 1, 1, 'on', '?', 'Kuidas', 'PALUN', '?']


# Rewrite

A rewriter is a class that provides a `rewrite` method. If `rewrite` returns `None`, the corresponding span is not created.

In [8]:
class TestRewriter:
    """Example rewriter that rejects the record whose 'start' is 0."""

    def rewrite(self, record):
        """Return ``record`` unchanged, or ``None`` when ``record['start']`` is 0.

        :param record: mapping with at least a 'start' key
        :return: the same ``record`` object, or ``None`` to reject it
        """
        starts_at_zero = record['start'] == 0
        return None if starts_at_zero else record

t = Text('Tere maailm!')
t.tag_layer(('words',))

# Derive a new layer from 'words', passing TestRewriter as the rules.
test_layer = t['words'].rewrite(
    name='test_layer',
    rules=TestRewriter(),
    source_attributes=[],
    target_attributes=[],
)

# Store the layer on the text and display it as the cell output.
t['test_layer'] = test_layer
t['test_layer']

text
maailm
!
