In [1]:
from estnltk import EnvelopingBaseSpan
from estnltk import Annotation, Span, EnvelopingSpan, Layer, Text, Tagger

# Low-level layer operations
## Create a layer
```python
Layer(name: str,
      attributes: Sequence[str] = (),
      text_object: Text = None,
      parent: str = None,
      enveloping: str = None,
      ambiguous: bool = False,
      default_values: dict = None
      )
```
**name**: the name of the layer, required argument<br/>
**attributes**: the list of the attributes of the layer<br/>
**text_object**: the `Text` object of the layer<br/>
**parent**: the name of the parent layer<br/>
**enveloping**: the name of the layer that this layer envelops<br/>
**ambiguous**: whether the layer is ambiguous<br/>
**default_values**: default values of the attributes

If `parent is not None`, then `enveloping is None`.<br>
If `enveloping is not None`, then `parent is None`.

### Has no parent, is not enveloping, is not ambiguous

In [2]:
text = Text('Tere, maailm!')

# A plain layer: no parent, not enveloping, not ambiguous.
layer = Layer('test_layer', attributes=['attr_1', 'attr_2'], text_object=text)

# (start, end) character offsets paired with the attribute values of the span.
# The last span gets no attr_2 value; it is shown empty in the output table.
annotations = [
    ((0, 4), {'attr_1': 'TERE', 'attr_2': 'lambda a: a.layer[1]'}),
    ((6, 12), {'attr_1': 'MAAILM', 'attr_2': 'lambda a: 12+88'}),
    ((12, 13), {'attr_1': '!'}),
]
for base_span, attribute_values in annotations:
    layer.add_annotation(base_span, **attribute_values)

layer

layer name,attributes,parent,enveloping,ambiguous,span count
test_layer,"attr_1, attr_2",,,False,3

text,attr_1,attr_2
Tere,TERE,lambda a: a.layer[1]
maailm,MAAILM,lambda a: 12+88
!,!,


### Has no parent, is not enveloping, is ambiguous
The key difference from the code above is one extra argument:
```python
ambiguous=True
```
Note that in this case equal spans are grouped together.

In [3]:
text = Text('Tere, maailm!')

layer = Layer('test_layer', attributes=['attr_1'], text_object=text, ambiguous=True)

# The span (6, 12) is annotated twice; both annotations end up under one span
# in the output table.
for base_span, value in [((0, 4), 'TERE'), ((6, 12), 'MAAILM'), ((6, 12), 'MaaIlm')]:
    layer.add_annotation(base_span, attr_1=value)

layer

layer name,attributes,parent,enveloping,ambiguous,span count
test_layer,attr_1,,,True,2

text,attr_1
Tere,TERE
maailm,MAAILM
,MaaIlm


### Has a parent, is not enveloping, is ambiguous

In [4]:
text = Text('Tere, maailm!')
text.tag_layer(['words'])

layer = Layer('test_layer',
              attributes=['a'],
              text_object=text,
              parent='words',
              ambiguous=True)

# One span per word, sharing the word's base span; every span receives two
# annotations (a=1 and a=2).
for word in text.words:
    span = Span(base_span=word.base_span, layer=layer)
    for value in (1, 2):
        span.add_annotation(Annotation(span, a=value))
    layer.add_span(span)

layer

layer name,attributes,parent,enveloping,ambiguous,span count
test_layer,a,words,,True,4

text,a
Tere,1
,2
",",1
,2
maailm,1
,2
!,1
,2


### Does not have a parent, is enveloping, is not ambiguous

In [5]:
t = 'Kolmkümmend kolm on suurem kui kaheksateist.'
text = Text(t)
text.tag_layer(['words'])

layer = Layer('test_layer',
              attributes=['value'],
              text_object=text,
              enveloping='words',
              ambiguous=False)

# Base spans of all words, in text order.
word_base_spans = [word.base_span for word in text.words]

# Each entry: the base spans of the enveloped words and the annotation value.
# The enveloped words do not have to be adjacent (second entry), and the
# value type is not fixed (int vs. str).
enveloped = [
    (word_base_spans[0:2], 33),
    ([word_base_spans[1], word_base_spans[4]], 0),
    ((word_base_spans[5],), '18'),
]
for base_spans, value in enveloped:
    span = EnvelopingSpan(EnvelopingBaseSpan(base_spans), layer=layer)
    span.add_annotation(Annotation(span, value=value))
    layer.add_span(span)

layer

layer name,attributes,parent,enveloping,ambiguous,span count
test_layer,value,,words,False,3

text,value
"['Kolmkümmend', 'kolm']",33
"['kolm', 'kui']",0
['kaheksateist'],18


The next demo phrase tagger marks all runs of consecutive uppercase words inside a sentence.

In [6]:
import itertools

class UppercasePhraseTagger(Tagger):
    """Tag phrases of two or more consecutive uppercase words.

    A phrase never crosses a sentence boundary. The output layer envelops
    the ``words`` layer and has two attributes:

    * ``phrasetext`` -- the words of the phrase joined by single spaces
      and lowercased;
    * ``tag`` -- the running number of the phrase in the text, from 0.
    """
    conf_param = []

    def __init__(self):
        self.input_layers = ['words', 'sentences']
        self.output_layer = 'uppercasephrase'
        self.output_attributes = ['phrasetext', 'tag']

    def _make_layer(self, text: Text, layers, status) -> Layer:
        """Create the uppercase phrase layer for ``text``.

        :param text: Text object; its ``sentences`` layer (and the words
            enveloped by each sentence) is read.
        :param layers: not used here; the input is read from ``text``.
        :param status: not used here.
        :return: a new layer enveloping ``words`` with one span per phrase.
        """
        layer = Layer(name=self.output_layer,
                      text_object=text,
                      enveloping='words',
                      attributes=self.output_attributes)

        def is_uppercase(word):
            # upper() == text alone would also accept punctuation and digits;
            # lower() != text requires at least one cased character.
            return word.text.upper() == word.text and word.text.lower() != word.text

        phrases = []
        for sentence in text.sentences:
            # Grouping inside the sentence loop keeps a run from continuing
            # into the next sentence when one sentence ends and the following
            # one starts with an uppercase word.
            for uppercase, run in itertools.groupby(sentence.words, key=is_uppercase):
                words = list(run)
                # A single uppercase word is not a phrase.
                if uppercase and len(words) > 1:
                    phrases.append(words)

        for tag, words in enumerate(phrases):
            phrasetext = ' '.join(word.text for word in words).lower()
            layer.add_annotation(words, phrasetext=phrasetext, tag=tag)

        return layer


# Prepare a text with the layers the tagger reads, then run the tagger on it.
text = Text('Minu KARU ON PUNANE. MIS värvi SINU KARU on? Kuidas PALUN?').tag_layer(['words', 'sentences'])
tagger = UppercasePhraseTagger()
tagger.tag(text)

text
Minu KARU ON PUNANE. MIS värvi SINU KARU on? Kuidas PALUN?

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,3
tokens,,,,False,14
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,14
uppercasephrase,"phrasetext, tag",,words,False,2


In [7]:
# Display the layer that UppercasePhraseTagger attached to the text.
text.uppercasephrase

layer name,attributes,parent,enveloping,ambiguous,span count
uppercasephrase,"phrasetext, tag",,words,False,2

text,phrasetext,tag
"['KARU', 'ON', 'PUNANE']",karu on punane,0
"['SINU', 'KARU']",sinu karu,1


In [8]:
text.tag_layer(['morph_analysis'])

# Attribute values of the phrase layer, one per phrase.
print(text.phrasetext)
#['karu on punane', 'sinu karu']

# 'lemma' is not an attribute of uppercasephrase itself; the values printed
# are the lemmas of the enveloped words, one list per phrase.
print(text.uppercasephrase.lemma)
# [[['karu'], ['olema', 'olema'], ['punane']], [['sina'], ['karu']]]

# Words that are outside every uppercase phrase. The list of phrase words
# does not depend on the word being tested, so it is built once instead of
# once per word.
phrase_words = list(itertools.chain(*text.uppercasephrase.spans))
print([i.text for i in text.words if i not in phrase_words])
# ['Minu', '.', 'MIS', 'värvi', 'on', '?', 'Kuidas', 'PALUN', '?']

# The same selection expressed through base spans: collect the base spans
# enveloped by the phrases and test each word's base span against that set.
phrase_base_spans = {i for j in text.uppercasephrase for i in j.base_span}
print([i.text for i in text.words if i.base_span not in phrase_base_spans])
# ['Minu', '.', 'MIS', 'värvi', 'on', '?', 'Kuidas', 'PALUN', '?']

# Word base span -> enveloping phrase; words inside a phrase are replaced by
# the phrase's tag, all other words are kept as text.
mapping = {i: j for j in text.uppercasephrase for i in j.base_span}
print([mapping[i.base_span].tag if i.base_span in mapping else i.text for i in text.words])
# ['Minu', 0, 0, 0, '.', 'MIS', 'värvi', 1, 1, 'on', '?', 'Kuidas', 'PALUN', '?']

['karu on punane', 'sinu karu']
[AmbiguousAttributeList([['karu'], ['olema', 'olema'], ['punane']], ('lemma',)), AmbiguousAttributeList([['sina'], ['karu']], ('lemma',))]
['Minu', '.', 'MIS', 'värvi', 'on', '?', 'Kuidas', 'PALUN', '?']
['Minu', '.', 'MIS', 'värvi', 'on', '?', 'Kuidas', 'PALUN', '?']
['Minu', 0, 0, 0, '.', 'MIS', 'värvi', 1, 1, 'on', '?', 'Kuidas', 'PALUN', '?']


Here is an example of how to create a new layer on top of an enveloping layer.

In [9]:
text = Text('Esimene lause? Teine lause! Kolmas lause.').analyse('segmentation')

# Final punctuation mark of a sentence -> sentence type.
sentence_types = {'.': 'statement', '?': 'question', '!': 'exclamation'}

layer = Layer('sentence_type',
              attributes=('type',),
              text_object=text,
              parent='sentences')

for sentence in text['sentences']:
    last_word = sentence[-1]
    # .get() leaves the type as None when the sentence ends with something
    # that is not listed in sentence_types.
    layer.add_annotation(sentence, type=sentence_types.get(last_word.text))

layer

layer name,attributes,parent,enveloping,ambiguous,span count
sentence_type,type,sentences,,False,3

text,type
"['Esimene', 'lause', '?']",question
"['Teine', 'lause', '!']",exclamation
"['Kolmas', 'lause', '.']",statement


## Descendant layers and ancestor layers

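Descendant layers are found by following the `parent` and `enveloping` relations downward from a layer; ancestor layers are found by following the same relations upward. In the example below, the descendant of `layer_2` is the layer it is built on (`layer_1`), and its ancestors are the layers built on top of it, directly or indirectly.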

In [10]:
text = Text('')

# layer_1 <- layer_2 (parent) <- layer_3 (parent)
#                             <- layer_4 (enveloping)
#                             <- layer_5 (enveloping) <- layer_6 (parent)
layer_1 = Layer(name='layer_1', text_object=text)
layer_2 = Layer(name='layer_2', parent='layer_1', text_object=text)
layer_3 = Layer(name='layer_3', parent='layer_2', text_object=text)
layer_4 = Layer(name='layer_4', enveloping='layer_2', text_object=text)
layer_5 = Layer(name='layer_5', enveloping='layer_2', text_object=text)
layer_6 = Layer(name='layer_6', parent='layer_5', text_object=text)

# Attach the layers in the order they were created.
for new_layer in (layer_1, layer_2, layer_3, layer_4, layer_5, layer_6):
    text.add_layer(new_layer)

layer_2.descendant_layers()

['layer_1']

In [11]:
# Names of the ancestor layers of layer_2; in the output these include
# layer_6, which refers to layer_2 only indirectly (through layer_5).
layer_2.ancestor_layers()

['layer_3', 'layer_4', 'layer_5', 'layer_6']

# Attribute lists
Immutable lists for layer attribute value representation.

In [12]:
from estnltk.layer import AttributeList
from estnltk.layer import AmbiguousAttributeList
from estnltk.layer import AttributeTupleList
from estnltk.layer import AmbiguousAttributeTupleList

In [13]:
# Four values listed under a single attribute name, 'attr'.
al = AttributeList([1,2,3,4], 'attr')
al

Unnamed: 0,attr
0,1
1,2
2,3
3,4


In [14]:
# Nested input: each inner list is rendered as one numbered group of values
# in the output (presumably one group per ambiguous span -- TODO confirm).
aal = AmbiguousAttributeList([[1,2], [3,4], [5]], 'attr')
aal

Unnamed: 0,attr
0.0,1
,2
1.0,3
,4
2.0,5


In [15]:
# Each inner list holds the values of attr_1, attr_2 and attr_3 for one row.
atl = AttributeTupleList([[1,2,3], [4,5,6], [7,8,9]], ['attr_1', 'attr_2', 'attr_3'])
# NOTE(review): presumably controls whether the index column is shown in the
# rendered table -- confirm against the AttributeTupleList implementation.
atl.index = False
atl

Unnamed: 0,attr_1,attr_2,attr_3
0,1,2,3
1,4,5,6
2,7,8,9


In [16]:
# Two levels of nesting: each outer element is rendered as one numbered group,
# and each innermost list holds the (attr_1, attr_2) values of one row.
aatl = AmbiguousAttributeTupleList([[[1,2], [3,4]], [[5,6], [7,8], [9,10]], [[11,12]]],
                                   ['attr_1', 'attr_2'])
aatl

Unnamed: 0,attr_1,attr_2
0.0,1,2
,3,4
1.0,5,6
,7,8
,9,10
2.0,11,12
