In [1]:
from estnltk import Span, EnvelopingSpan, Layer
from estnltk import Text

# Low-level layer operations
## Create a layer
```python
Layer(
      name:str=None,
      attributes:Union[Tuple, List]=tuple(),
      parent:str=None,
      enveloping:str=None,
      ambiguous:bool=False
     )
```
**name**: the name of the layer, required argument<br/>
**attributes**: the list of the attributes of the layer<br/>
**parent**: the name of the parent layer<br/>
**enveloping**: the name of the layer whose spans this layer envelops<br/>
**ambiguous**: whether a span of the layer may carry several annotations

If `parent is not None`, then `enveloping is None`.<br>
If `enveloping is not None`, then `parent is None`.

### Has no parent, is not enveloping, is not ambiguous

In [2]:
# The raw text to annotate; the span offsets below are character positions in it.
text = Text('Tere, maailm!')

# A plain layer: parent, enveloping and ambiguous are left at their defaults.
layer = Layer(name='test_layer',
              attributes=['attr_1']
              )

# One span per annotated stretch of text. Each span lists the attributes it
# is allowed to carry and sets the value of 'attr_1'.
layer.add_span(Span(start= 0, end= 4, legal_attributes=['attr_1'], attr_1='TERE'))
layer.add_span(Span(start= 6, end=12, legal_attributes=['attr_1'], attr_1='MAAILM'))
layer.add_span(Span(start=12, end=13, legal_attributes=['attr_1'], attr_1='!'))

# Attach the layer to the text under its own name and display it.
text['test_layer'] = layer
text['test_layer']

layer name,attributes,parent,enveloping,ambiguous,span count
test_layer,attr_1,,,False,3

text,attr_1
Tere,TERE
maailm,MAAILM
!,!


### Has no parent, is not enveloping, is ambiguous
The layer definition below differs from the one above by a single extra argument:
```python
ambiguous=True
```
Note that in this case spans with equal start and end positions are grouped together into one span with several annotations.

In [3]:
# The raw text to annotate; the span offsets below are character positions in it.
text = Text('Tere, maailm!')

# Same layer as before, but declared ambiguous.
layer = Layer(name='test_layer',
              attributes=['attr_1'],
              ambiguous=True
              )
layer.add_span(Span(start=0, end= 4, legal_attributes=['attr_1'], attr_1='TERE'))
# The next two spans cover the same position (6-12), so they end up as two
# annotations of a single span: the layer reports 2 spans, not 3.
layer.add_span(Span(start=6, end=12, legal_attributes=['attr_1'], attr_1='MAAILM'))
layer.add_span(Span(start=6, end=12, legal_attributes=['attr_1'], attr_1='MaaIlm'))

# Attach the layer to the text under its own name and display it.
text['test_layer'] = layer
text['test_layer']

layer name,attributes,parent,enveloping,ambiguous,span count
test_layer,attr_1,,,True,2

text,attr_1
Tere,TERE
maailm,MAAILM
,MaaIlm


### Has a parent, is not enveloping, is ambiguous

In [5]:
# The parent layer ('words') must exist on the text before child spans can refer to it.
text = Text('Tere, maailm!').tag_layer(['words'])

# A layer whose spans are tied to the spans of the 'words' layer.
layer = Layer(name='test_layer',
              attributes=['a'],
              parent='words', 
              ambiguous=True
              )
for word in text.words:
    # Variant 1: create the span and set its attribute in a single call.
    layer.add_span(Span(parent=word, legal_attributes=['a'], a=1))
    # Variant 2: the same in 3 lines -- create the span, set the attribute, add it.
    # Both variants point at the same parent word, so every word ends up
    # with two annotations (a=1 and a=2).
    span = Span(parent=word)
    setattr(span, 'a', 2)
    layer.add_span(span)

# Attach the layer to the text under its own name and display it.
text['test_layer'] = layer
text['test_layer']

layer name,attributes,parent,enveloping,ambiguous,span count
test_layer,a,words,,True,4

text,a
Tere,1
,2
",",1
,2
maailm,1
,2
!,1
,2


### Has no parent, is enveloping, is not ambiguous

In [6]:
t = 'Kolmkümmend kolm on suurem kui kaheksateist.'
text = Text(t).tag_layer(['words'])

layer = Layer(name='test_layer', enveloping='words', attributes=['value'], ambiguous=False)

# Each entry pairs the words to envelop with the value stored on the new span.
# The enveloped words need not be adjacent (see the second entry).
phrases = [
    (text.words[0:2], '33'),
    ([text.words[1], text.words[4]], 0),
    (text.words[5:6], '18'),
]
for enveloped, value in phrases:
    spl = EnvelopingSpan(spans=enveloped)
    spl.value = value
    layer.add_span(spl)

# Attach the layer to the text under its own name and display it.
text['test_layer'] = layer
text['test_layer']

layer name,attributes,parent,enveloping,ambiguous,span count
test_layer,value,,words,False,3

text,value
"['Kolmkümmend', 'kolm']",33
"['kolm', 'kui']",0
['kaheksateist'],18


The next demo phrase tagger marks all consecutive uppercase words inside a sentence.

In [7]:
import itertools

class UppercasePhraseTagger:
    """Demo tagger that marks runs of consecutive uppercase words.

    A word counts as uppercase when upper-casing leaves it unchanged while
    lower-casing changes it, so tokens without letters (punctuation, digits)
    never qualify. Only runs of at least two words become phrases, and a run
    never continues across a sentence boundary.
    """

    def tag(self, text:Text) -> Text:
        """Attach an 'uppercasephrase' layer to ``text`` and return ``text``.

        The method iterates ``text.sentences`` and the words of every
        sentence, so those layers must already be present on ``text``.
        The new layer envelops 'words' and has two attributes:
        'phrasetext' (the lower-cased words joined with spaces) and
        'tag' (the running number of the phrase, starting from 0).
        """
        phrases = []
        for sentence in text.sentences:
            run = []
            for word in sentence.words:
                if word.text.upper() == word.text and word.text.lower() != word.text:
                    run.append(word)
                    continue
                # A non-uppercase word closes the current run.
                if len(run) > 1:
                    phrases.append(run)
                run = []
            # Close the run at the end of the sentence so that it cannot merge
            # with uppercase words that open the next sentence.
            if len(run) > 1:
                phrases.append(run)

        layer = Layer(name='uppercasephrase', enveloping='words', attributes=['phrasetext', 'tag'])

        for idx, words in enumerate(phrases):
            # Attributes are set on the span returned by add_span, as before.
            span = layer.add_span(EnvelopingSpan(spans=words))
            span.phrasetext = ' '.join([word.text for word in words]).lower()
            span.tag = idx
        text['uppercasephrase'] = layer

        return text


# The tagger reads sentences and their words, so both layers are created first.
t = Text('Minu KARU ON PUNANE. MIS värvi SINU KARU on? Kuidas PALUN?').tag_layer(['words', 'sentences'])
w = UppercasePhraseTagger()
w.tag(t)

text
Minu KARU ON PUNANE. MIS värvi SINU KARU on? Kuidas PALUN?

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,3
tokens,,,,False,14
words,normalized_form,,,False,14
uppercasephrase,"phrasetext, tag",,words,False,2


In [8]:
# Display the layer that UppercasePhraseTagger attached to the text.
t['uppercasephrase']

layer name,attributes,parent,enveloping,ambiguous,span count
uppercasephrase,"phrasetext, tag",,words,False,2

text,phrasetext,tag
"['KARU', 'ON', 'PUNANE']",karu on punane,0
"['SINU', 'KARU']",sinu karu,1


In [9]:
t.tag_layer(['morph_analysis'])
print(t.uppercasephrase.get_attributes(['phrasetext', 'text']))
#[[('karu on punane', 'KARU'), ('karu on punane', 'ON'), ('karu on punane', 'PUNANE')], [('sinu karu', 'SINU'), ('sinu karu', 'KARU')]]

print(t.phrasetext)
#['karu on punane', 'sinu karu']

print(t.uppercasephrase.lemma)
# [[['karu'], ['olema', 'olema'], ['punane']], [['sina'], ['karu']]]

# Flatten the phrases into their words once, instead of rebuilding the list
# for every word that is tested against it.
phrase_words = list(itertools.chain(*t.uppercasephrase.spans))

print([i.text for i in t.words if i not in phrase_words])
# ['Minu', '.', 'MIS', 'värvi', 'on', '?', 'Kuidas', 'PALUN', '?']

# Map every phrase word to the (first) phrase that contains it.
mapping = {}
for phrase in t.uppercasephrase.spans:
    for word in phrase:
        mapping.setdefault(word, phrase)
# Printed a second time, unchanged, so the cell output matches the recorded run.
print([i.text for i in t.words if i not in phrase_words])
# ['Minu', '.', 'MIS', 'värvi', 'on', '?', 'Kuidas', 'PALUN', '?']

# Replace every phrase word with the tag of its phrase; keep other words as text.
print([mapping[i].tag if i in mapping else i.text for i in t.words])
# ['Minu', 0, 0, 0, '.', 'MIS', 'värvi', 1, 1, 'on', '?', 'Kuidas', 'PALUN', '?']

[[('karu on punane', 'KARU'), ('karu on punane', 'ON'), ('karu on punane', 'PUNANE')], [('sinu karu', 'SINU'), ('sinu karu', 'KARU')]]
['karu on punane', 'sinu karu']
[[['karu'], ['olema', 'olema'], ['punane']], [['sina'], ['karu']]]
['Minu', '.', 'MIS', 'värvi', 'on', '?', 'Kuidas', 'PALUN', '?']
['Minu', '.', 'MIS', 'värvi', 'on', '?', 'Kuidas', 'PALUN', '?']
['Minu', 0, 0, 0, '.', 'MIS', 'värvi', 1, 1, 'on', '?', 'Kuidas', 'PALUN', '?']


# Rewriting

Rewriter is a class that contains a `rewrite` method. This method is used to convert the input layer to the resulting layer span by span. If the layer is not ambiguous, the span data is passed as a dict, and if the layer is ambiguous, the span data is passed as a list of dicts. If `rewrite` returns `None`, the corresponding span is not created.

Rewriting a layer `L` creates a new layer `N` such that the parent of `N` is `L`.

In the next example 'test_layer' contains spans with uppercase values in the 'upper' attribute for all but the first word.

In [10]:
class TestRewriter:
    """Demo rewriter: drops the span at offset 0 and upper-cases the rest."""

    def rewrite(self, record):
        """Return ``record`` with an added 'upper' key, or None to skip the span.

        The record that starts at position 0 is skipped. Every other record
        is updated in place with the upper-cased value of its 'text' key
        and returned.
        """
        starts_at_zero = record['start'] == 0
        if not starts_at_zero:
            record['upper'] = record['text'].upper()
            return record
        return None

t = Text('Tere maailm!').tag_layer(['words'])

# Rewrite the 'words' layer span by span: the rewriter reads 'text'
# and the resulting layer keeps only the 'upper' attribute.
test_layer = t['words'].rewrite(
    source_attributes=('text',),
    target_attributes=('upper',),
    rules=TestRewriter(),
    name='test_layer',
)
t['test_layer'] = test_layer
t['test_layer']

layer name,attributes,parent,enveloping,ambiguous,span count
test_layer,upper,words,,False,2

text,upper
maailm,MAAILM
!,!


The previous example demonstrates the case when the rewritten layer and the resulting layer are not ambiguous.

In the next example an ambiguous layer is rewritten and the result is also an ambiguous layer. The demo rewriter makes a copy of the first analysis of every word.

In [11]:
class AmbiguousLayerRewriter:
    """Demo rewriter for ambiguous layers: repeats the first analysis of a span."""

    def rewrite(self, records):
        """Append the first record to ``records`` and return the same list.

        ``records`` holds all analyses of one span. The list is extended
        in place, and the appended item is the very same object as
        ``records[0]``.
        """
        first_analysis = records[0]
        records.append(first_analysis)
        return records

t = Text('Tere maailm!').analyse('morphology')
input_layer = t['morph_analysis']

# Source and target attributes are the same, so the rewritten layer keeps
# every attribute of the morphological analysis.
demo_layer = input_layer.rewrite(
    source_attributes=input_layer.attributes,
    target_attributes=input_layer.attributes,
    rules=AmbiguousLayerRewriter(),
    name='demo',
    ambiguous=True,
)
t['demo'] = demo_layer
t['demo']

layer name,attributes,parent,enveloping,ambiguous,span count
demo,"lemma, root, root_tokens, ending, clitic, form, partofspeech",morph_analysis,,True,3

text,lemma,root,root_tokens,ending,clitic,form,partofspeech
Tere,tere,tere,"('tere',)",0.0,,,I
,tere,tere,"('tere',)",0.0,,,I
maailm,maailm,maa_ilm,"('maa', 'ilm')",0.0,,sg n,S
,maailm,maa_ilm,"('maa', 'ilm')",0.0,,sg n,S
!,!,!,"('!',)",,,,Z
,!,!,"('!',)",,,,Z


The rewriting mechanism does not work with enveloping layers. Here is an example of how to create a new layer on top of an enveloping layer.

In [12]:
# The final token of a sentence decides its type.
sentence_types = {'.': 'statement', '?': 'question', '!': 'exclamation'}

text = Text('Esimene lause? Teine lause! Kolmas lause.').analyse('segmentation')
layer = Layer(name='sentence_type', parent='sentences', attributes=('type',))
for sentence in text['sentences']:
    final_token = sentence[-1].text
    # The type is set on the sentence span itself, and that same span
    # is then added to the new layer.
    sentence.type = sentence_types.get(final_token)
    layer.add_span(sentence)

text['sentence_type'] = layer
text['sentence_type']

layer name,attributes,parent,enveloping,ambiguous,span count
sentence_type,type,sentences,,False,3

text,type
"['Esimene', 'lause', '?']",question
"['Teine', 'lause', '!']",exclamation
"['Kolmas', 'lause', '.']",statement


# Attribute lists
Immutable lists for layer attribute value representation.

In [13]:
from estnltk.layer import AttributeList
from estnltk.layer import AmbiguousAttributeList
from estnltk.layer import AttributeTupleList
from estnltk.layer import AmbiguousAttributeTupleList

In [14]:
# Values of a single attribute named 'attr', one value per row.
al = AttributeList([1,2,3,4], 'attr')
al

Unnamed: 0,attr
0,1
1,2
2,3
3,4


In [15]:
# Values of a single attribute named 'attr'; every inner list is rendered
# as one indexed group that may hold several values.
aal = AmbiguousAttributeList([[1,2], [3,4], [5]], 'attr')
aal

Unnamed: 0,attr
0.0,1
,2
1.0,3
,4
2.0,5


In [16]:
# Values of several attributes at once: every inner list is one row holding
# the values of 'attr_1', 'attr_2' and 'attr_3'.
atl = AttributeTupleList([[1,2,3], [4,5,6], [7,8,9]], ['attr_1', 'attr_2', 'attr_3'])
# NOTE(review): presumably controls whether the index column is rendered -- confirm.
atl.index = False
atl

Unnamed: 0,attr_1,attr_2,attr_3
0,1,2,3
1,4,5,6
2,7,8,9


In [17]:
# Values of several attributes at once, with several rows per indexed group:
# the outer list holds the groups, and every innermost list holds one row of
# ('attr_1', 'attr_2') values.
aatl = AmbiguousAttributeTupleList([[[1,2], [3,4]], [[5,6], [7,8], [9,10]], [[11,12]]],
                                   ['attr_1', 'attr_2'])
aatl

Unnamed: 0,attr_1,attr_2
0.0,1,2
,3,4
1.0,5,6
,7,8
,9,10
2.0,11,12
