In [1]:
from estnltk import Text
from estnltk.converters.conll_importer import conll_to_text, add_layer_from_conll

In [2]:
# Read the same CoNLL file twice: first to create the Text object with a syntax
# layer named 'gold', then to attach a second syntax layer named 'parsed' to it.
conll_path = 'a.conll'
text = conll_to_text(conll_path, 'gold')
add_layer_from_conll(conll_path, text, 'parsed')

text
Milliseks kujuneb Riigikassa ja Ühispanga vahekord ? Minu arvates on Eesti pangandus tehnoloogiliselt maailma tasemel .

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,2
words,,,,False,16
gold,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",,,False,16
parsed,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",,,False,16


In [3]:
from estnltk.layer_operations import get_enclosing_spans


def is_boring(span):
    """Return True when the span is not a verb.

    A span counts as "boring" unless its ``xpostag`` attribute equals ``'V'``.
    """
    tag = span.xpostag
    return tag != 'V'


def get_fragment(span):
    """Return the span together with its direct children as one sorted list.

    Only the members of ``span.children`` are added; children of those
    children are not included.
    """
    members = [span]
    members.extend(span.children)
    members.sort()
    return members


# Walk over the 'gold' layer. Boring spans are skipped; every other span is
# printed together with its fragment and the sentence(s) it belongs to.
for span in text.gold:
    if not is_boring(span):
        word = text.words.get(span)
        print(word)

        fragment = get_fragment(span)
        print([member.text for member in fragment])

        # get_enclosing_spans is inefficient and is better avoided: it is more
        # sensible to iterate over the sentences and, for each sentence, over its words.
        for sentence in get_enclosing_spans(text.sentences, word):
            print(sentence)
            print()

Span(start=10, end=17, text='kujuneb')
['Milliseks', 'kujuneb', 'Riigikassa', 'vahekord']
ES[Span(start=0, end=9, text='Milliseks'),
Span(start=10, end=17, text='kujuneb'),
Span(start=18, end=28, text='Riigikassa'),
Span(start=29, end=31, text='ja'),
Span(start=32, end=41, text='Ühispanga'),
Span(start=42, end=50, text='vahekord'),
Span(start=51, end=52, text='?')]

Span(start=58, end=65, text='arvates')
['Minu', 'arvates']
ES[Span(start=53, end=57, text='Minu'),
Span(start=58, end=65, text='arvates'),
Span(start=66, end=68, text='on'),
Span(start=69, end=74, text='Eesti'),
Span(start=75, end=84, text='pangandus'),
Span(start=85, end=101, text='tehnoloogiliselt'),
Span(start=102, end=109, text='maailma'),
Span(start=110, end=117, text='tasemel'),
Span(start=118, end=119, text='.')]

Span(start=66, end=68, text='on')
['arvates', 'on', 'pangandus', 'tehnoloogiliselt', 'tasemel']
ES[Span(start=53, end=57, text='Minu'),
Span(start=58, end=65, text='arvates'),
Span(start=66, end=68, text='o

In [4]:
# Iterate over the sentences; for each sentence find the sub-layer of the
# 'gold' layer that corresponds to it and display that sub-layer.

for sentence in text.sentences:
    sentence_base_spans = list(sentence.base_span)
    syntax_tree = text.gold.get(sentence_base_spans)
    syntax_tree.display()

# After the loop, syntax_tree still refers to the sub-layer of the last sentence.
syntax_tree

layer name,attributes,parent,enveloping,ambiguous,span count
gold,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",,,False,9

text,id,lemma,upostag,xpostag,feats,head,deprel,deps,misc,parent_span,children
Minu,1,mina,P,Ppers,"OrderedDict([('ps1', ''), ('sg', ''), ('gen', '')])",2,@ADVL,,,Span(arvates),()
arvates,2,arva,V,V,"OrderedDict([('ger', '')])",3,@ADVL,,,Span(on),"('Span(Minu)',)"
on,3,ole,V,V,"OrderedDict([('indic', ''), ('pres', ''), ('ps3', ''), ('sg', '')])",0,ROOT,,,,"('Span(arvates)', 'Span(pangandus)', 'Span(tehnoloogiliselt)', 'Span(tasemel)')"
Eesti,4,Eesti,S,H,"OrderedDict([('sg', ''), ('gen', '')])",5,@NN>,,,Span(pangandus),()
pangandus,5,pangandus,S,S,"OrderedDict([('sg', ''), ('nom', '')])",3,@SUBJ,,,Span(on),"('Span(Eesti)',)"
tehnoloogiliselt,6,tehnoloogilise=lt,D,D,,3,@ADVL,,,Span(on),()
maailma,7,maa_ilm,S,S,"OrderedDict([('sg', ''), ('gen', '')])",8,@NN>,,,Span(tasemel),()
tasemel,8,tase,S,S,"OrderedDict([('sg', ''), ('ad', '')])",3,@ADVL,,,Span(on),"('Span(maailma)', 'Span(.)')"
.,9,.,Z,Z,"OrderedDict([('Fst', '')])",8,@Punc,,,Span(tasemel),()


In [5]:
# Iterate over the sentences and, inside each sentence, over its words; for
# every word look at the corresponding element of the 'parsed' layer.

parsed = text.parsed

for sentence in text.sentences:
    for word in sentence:
        span = parsed.get(word)
        columns = (span.text, span.head, span.deprel, span.xpostag)
        print(*columns, sep='\t\t')
    # Separator line after each sentence.
    print('-' * 80)

Milliseks		2		@ADVL		P
kujuneb		0		ROOT		V
Riigikassa		2		@SUBJ		H
ja		6		@J		Jc
Ühispanga		6		@NN>		S
vahekord		2		@SUBJ		S
?		6		@Punc		Z
--------------------------------------------------------------------------------
Minu		2		@ADVL		Ppers
arvates		3		@ADVL		V
on		0		ROOT		V
Eesti		5		@NN>		H
pangandus		3		@SUBJ		S
tehnoloogiliselt		3		@ADVL		D
maailma		8		@NN>		S
tasemel		3		@ADVL		S
.		8		@Punc		Z
--------------------------------------------------------------------------------


In [6]:
# Build a CoNLL string from one sentence and a CoNLL syntax layer.

from estnltk.converters.conll_exporter import sentence_to_conll

second_sentence = text.sentences[1]
conll_string = sentence_to_conll(sentence_span=second_sentence, conll_layer=text.parsed)
print(conll_string)

1	Minu	mina	P	Ppers	ps1|sg|gen	2	@ADVL	_	_
2	arvates	arva	V	V	ger	3	@ADVL	_	_
3	on	ole	V	V	indic|pres|ps3|sg	0	ROOT	_	_
4	Eesti	Eesti	S	H	sg|gen	5	@NN>	_	_
5	pangandus	pangandus	S	S	sg|nom	3	@SUBJ	_	_
6	tehnoloogiliselt	tehnoloogilise=lt	D	D	_	3	@ADVL	_	_
7	maailma	maa_ilm	S	S	sg|gen	8	@NN>	_	_
8	tasemel	tase	S	S	sg|ad	3	@ADVL	_	_
9	.	.	Z	Z	Fst	8	@Punc	_	_




In [7]:
# Display the 'parsed' layer of the text.
text.parsed

layer name,attributes,parent,enveloping,ambiguous,span count
parsed,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",,,False,16

text,id,lemma,upostag,xpostag,feats,head,deprel,deps,misc,parent_span,children
Milliseks,1,milline,P,P,"OrderedDict([('rel', ''), ('sg', ''), ('tr', '')])",2,@ADVL,,,Span(kujuneb),()
kujuneb,2,kujune,V,V,"OrderedDict([('indic', ''), ('pres', ''), ('ps3', ''), ('sg', '')])",0,ROOT,,,,"('Span(Milliseks)', 'Span(Riigikassa)', 'Span(vahekord)')"
Riigikassa,3,Riigi_kassa,S,H,"OrderedDict([('sg', ''), ('nom', '')])",2,@SUBJ,,,Span(kujuneb),()
ja,4,ja,J,Jc,,6,@J,,,Span(vahekord),()
Ühispanga,5,ühis_pank,S,S,"OrderedDict([('sg', ''), ('gen', '')])",6,@NN>,,,Span(vahekord),()
vahekord,6,vahe_kord,S,S,"OrderedDict([('sg', ''), ('nom', '')])",2,@SUBJ,,,Span(kujuneb),"('Span(ja)', 'Span(Ühispanga)', 'Span(?)')"
?,7,?,Z,Z,"OrderedDict([('Int', '')])",6,@Punc,,,Span(vahekord),()
Minu,1,mina,P,Ppers,"OrderedDict([('ps1', ''), ('sg', ''), ('gen', '')])",2,@ADVL,,,Span(arvates),()
arvates,2,arva,V,V,"OrderedDict([('ger', '')])",3,@ADVL,,,Span(on),"('Span(Minu)',)"
on,3,ole,V,V,"OrderedDict([('indic', ''), ('pres', ''), ('ps3', ''), ('sg', '')])",0,ROOT,,,,"('Span(arvates)', 'Span(pangandus)', 'Span(tehnoloogiliselt)', 'Span(tasemel)')"


In [8]:
from estnltk import Tagger, Layer, EnvelopingSpan

class FragmentTagger(Tagger):
    """
    Tags fragments on syntax layer.

    For every span of the input syntax layer that is not boring (see
    ``is_boring``), one enveloping annotation is added to the output layer.
    The annotation covers the spans returned by ``get_fragment`` for that span.
    Its ``attr1`` attribute holds the position of the span in the input layer.
    """
    conf_param = []
    input_layers = ['gold']
    output_layer = 'fragments'
    output_attributes = ['attr1']

    def __init__(self):
        """Create the tagger; it has no configuration parameters."""

    def _make_layer(self, text, layers, status):
        """Build the fragments layer.

        Parameters
        ----------
        text
            Text object that the new layer is attached to.
        layers
            Mapping from layer name to layer; must contain the layer named in
            ``input_layers[0]``.
        status
            Not used by this tagger.

        Returns
        -------
        Layer
            A layer named ``output_layer`` that envelops the input syntax layer.
        """
        input_layer_name = self.input_layers[0]
        # Take the input from the ``layers`` mapping under the declared input
        # layer name instead of a hard-coded ``text.gold``, so the declaration
        # in ``input_layers`` is the single source of truth.
        syntax_layer = layers[input_layer_name]

        layer = Layer(name=self.output_layer, attributes=self.output_attributes, text_object=text,
                      enveloping=input_layer_name)

        for index, span in enumerate(syntax_layer):
            if is_boring(span):
                continue
            layer.add_annotation(get_fragment(span), attr1=index)

        return layer
    
# Instantiate the tagger and display its summary
# (name, output layer, output attributes, input layers).
tagger = FragmentTagger()
tagger

name,output layer,output attributes,input layers
FragmentTagger,fragments,"('attr1',)","('gold',)"


In [9]:
# Run the tagger on the text and display the resulting 'fragments' layer.
# NOTE(review): re-running this cell on the same text presumably fails because
# the 'fragments' layer already exists — confirm.
tagger.tag(text)
text.fragments

layer name,attributes,parent,enveloping,ambiguous,span count
fragments,attr1,,gold,False,3

text,attr1
"['Milliseks', 'kujuneb', 'Riigikassa', 'vahekord']",1
"['Minu', 'arvates']",8
"['arvates', 'on', 'pangandus', 'tehnoloogiliselt', 'tasemel']",9


In [10]:
# Iterate over the 'gold' layer and check that the corresponding spans of the
# 'parsed' layer have the same 'head' value.

parsed_layer = text.parsed
for gold_span in text.gold:
    parsed_span = parsed_layer.get(gold_span)
    parsed_head, gold_head = parsed_span.head, gold_span.head
    assert parsed_head == gold_head

In [11]:
# Iterate over the 'fragments' layer; for every part of a fragment fetch the
# matching span of the 'gold' layer and print it.

for fragment in text.fragments:
    for token in fragment:
        print(text.gold.get(token))
    # Blank line between fragments.
    print()

Span(start=0, end=9, text='Milliseks')
Span(start=10, end=17, text='kujuneb')
Span(start=18, end=28, text='Riigikassa')
Span(start=42, end=50, text='vahekord')

Span(start=53, end=57, text='Minu')
Span(start=58, end=65, text='arvates')

Span(start=58, end=65, text='arvates')
Span(start=66, end=68, text='on')
Span(start=75, end=84, text='pangandus')
Span(start=85, end=101, text='tehnoloogiliselt')
Span(start=110, end=117, text='tasemel')



In [12]:
# Display the text object together with the list of its layers.
text

text
Milliseks kujuneb Riigikassa ja Ühispanga vahekord ? Minu arvates on Eesti pangandus tehnoloogiliselt maailma tasemel .

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,2
words,,,,False,16
fragments,attr1,,gold,False,3
gold,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",,,False,16
parsed,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",,,False,16


In [13]:
# This also works: display the sub-layer of 'gold' that each fragment covers.
for fragment in text.fragments:
    fragment_base_spans = list(fragment.base_span)
    fragment_tree = text.gold.get(fragment_base_spans)
    fragment_tree.display()

In [14]:
from estnltk.layer_operations import split_by

# Split the text object into pieces sentence by sentence and print the texts of
# those sentences whose 'fragments' layer is not empty.
# split_by is not an efficient function; avoid it where possible.

sentence_texts = split_by(text, layer='sentences', layers_to_keep=['fragments', 'words', 'gold'])
for sentence in sentence_texts:
    if len(sentence.fragments) > 0:
        print(sentence)

Text(text='Milliseks kujuneb Riigikassa ja Ühispanga vahekord ?')
Text(text='Minu arvates on Eesti pangandus tehnoloogiliselt maailma tasemel .')
