In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from estnltk.taggers.standard.syntax.phrase_extraction.phrase_extractor import PhraseExtractor

## Käsitsi märgendatud süntaksi fail

In [3]:
from estnltk.converters.conll.conll_importer import conll_to_text
from estnltk_core.layer_operations import split_by_sentences


In [4]:
# Extractor of "obl" phrases that reads the "conll_syntax" layer and writes
# its result to the "obl_phrases" layer.
phrase_tagger1 = PhraseExtractor(
    deprel="obl",
    input_type="conll_syntax",
    syntax_layer="conll_syntax",
    output_layer="obl_phrases",
    morph_layer="words",
)

In [5]:
# Step 1: read the CoNLL-U file into one Text object; the syntax annotation
# is stored in the 'conll_syntax' layer.
input_file = "./test_files/aja_ee199920_osa_1_ud211.conllu"
text_obj = conll_to_text(input_file, syntax_layer='conll_syntax')

# Step 2: cut the document into per-sentence texts, carrying over every layer
# that the imported text has.
texts2 = split_by_sentences(
    text=text_obj,
    layers_to_keep=list(text_obj.layers),
    trim_overlapping=True,
)

# Step 3: run the phrase extractor on each sentence text.
for txt in texts2:
    phrase_tagger1.tag(txt)


In [6]:
# Take the sentence text at index 5 and display its extracted 'obl_phrases' layer.
sentence1 = texts2[5]
sentence1.obl_phrases

layer name,attributes,parent,enveloping,ambiguous,span count
obl_phrases,"entity_type, free_entity, is_valid, root",,words,False,2

text,entity_type,free_entity,is_valid,root
['uurijatel'],,,,"Span('uurijatel', [{'id': 4, 'lemma': 'uurija', 'upostag': 'NOUN', 'xpostag': 'S ..., type: <class 'estnltk_core.layer.span.Span'>"
"['rahapesu', 'vastu']",,,,"Span('rahapesu', [{'id': 7, 'lemma': 'raha_pesu', 'upostag': 'NOUN', 'xpostag': ..., type: <class 'estnltk_core.layer.span.Span'>"


In [7]:
# Regression check for sentence 5: two obl phrases are expected,
# a single-word one followed by a two-word one.
obl_layer = sentence1.obl_phrases
assert len(obl_layer) == 2, len(obl_layer)

first_words = list(obl_layer[0].text)
assert first_words == ['uurijatel'], first_words

second_phrase = obl_layer[1]
assert len(second_phrase) == 2, len(second_phrase)
second_words = list(second_phrase.text)
assert second_words == ['rahapesu', 'vastu'], second_words

In [8]:
# Take the sentence text at index 100 and display its extracted 'obl_phrases' layer.
sentence2 = texts2[100]
sentence2.obl_phrases

layer name,attributes,parent,enveloping,ambiguous,span count
obl_phrases,"entity_type, free_entity, is_valid, root",,words,False,2

text,entity_type,free_entity,is_valid,root
"['29.', 'mail']",,,,"Span('mail', [{'id': 5, 'lemma': 'mai', 'upostag': 'NOUN', 'xpostag': 'S', 'feat ..., type: <class 'estnltk_core.layer.span.Span'>"
['Väike-Maarjas'],,,,"Span('Väike-Maarjas', [{'id': 6, 'lemma': 'Väike-Maarja', 'upostag': 'PROPN', 'x ..., type: <class 'estnltk_core.layer.span.Span'>"


In [9]:
# Regression check for sentence 100: two obl phrases are expected,
# a two-word one followed by a single-word one.
obl_layer = sentence2.obl_phrases
assert len(obl_layer) == 2, len(obl_layer)

first_words = list(obl_layer[0].text)
assert first_words == ['29.', 'mail'], first_words

second_phrase = obl_layer[1]
assert len(second_phrase) == 1, len(second_phrase)
second_words = list(second_phrase.text)
assert second_words == ['Väike-Maarjas'], second_words

## Koondkorpuse lausetega test

In [10]:
import os
# NOTE(review): presumably lowers TensorFlow's C++ log verbosity before the
# neural tagger import below runs — confirm that this ordering is required.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

from estnltk_neural.taggers.syntax.stanza_tagger.stanza_tagger import StanzaSyntaxTagger
from estnltk import Text

In [11]:
# Stanza syntax tagger that takes its input from the 'morph_extended' layer
# and loads its resources from the directory below.
model_path = r"./estnltk_neural/taggers/syntax/stanza_tagger/stanza_resources/"
input_type = "morph_extended"
stanza_tagger = StanzaSyntaxTagger(
    input_type=input_type,
    input_morph_layer=input_type,
    add_parent_and_children=True,
    resources_path=model_path,
)

In [12]:
# Extractor of "obl" phrases that reads the "stanza_syntax" layer and writes
# its result to the "obl_phrases" layer.
phrase_tagger2 = PhraseExtractor(
    deprel="obl",
    input_type="stanza_syntax",
    syntax_layer="stanza_syntax",
    output_layer="obl_phrases",
)

In [13]:
# Pipeline for the first test sentence: add the 'morph_extended' layer,
# then run the Stanza syntax tagger, then the obl phrase extractor.
# The last call is left as the cell's final expression so its result is displayed.
txt1 = Text('Kolme aastaga on Eminem  alias  Marshall Mathers III ( 30 ) kindlalt meie teadvusesse sööbinud .')
txt1.tag_layer('morph_extended')
stanza_tagger.tag( txt1 )
phrase_tagger2.tag( txt1 )

text
Kolme aastaga on Eminem alias Marshall Mathers III ( 30 ) kindlalt meie teadvusesse sööbinud .

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,16
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,16
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,16
morph_extended,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech, punctuation_type, pronoun_type, letter_case, fin, verb_extension_suffix, subcat",morph_analysis,,True,16
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_extended,,False,16
obl_phrases,"entity_type, free_entity, is_valid, root",,morph_analysis,False,2


In [14]:
# Regression check for txt1: two obl phrases, each made of two words.
obl_layer = txt1.obl_phrases
assert len(obl_layer) == 2, len(obl_layer)

first_phrase = obl_layer[0]
first_words = list(first_phrase.text)
assert first_words == ['Kolme', 'aastaga'], first_words
assert len(first_phrase) == 2, len(first_phrase)

second_phrase = obl_layer[1]
assert len(second_phrase) == 2, len(second_phrase)
second_words = list(second_phrase.text)
assert second_words == ['meie', 'teadvusesse'], second_words

In [15]:
# Pipeline for the second test sentence: add the 'morph_extended' layer,
# then run the Stanza syntax tagger, then the obl phrase extractor.
# The last call is left as the cell's final expression so its result is displayed.
txt2 = Text('Aga ma sain sellest nõiaringist välja .')
txt2.tag_layer('morph_extended')
stanza_tagger.tag( txt2 )
phrase_tagger2.tag( txt2 )

text
Aga ma sain sellest nõiaringist välja .

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,7
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,7
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,7
morph_extended,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech, punctuation_type, pronoun_type, letter_case, fin, verb_extension_suffix, subcat",morph_analysis,,True,7
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_extended,,False,7
obl_phrases,"entity_type, free_entity, is_valid, root",,morph_analysis,False,2


In [16]:
# Regression check for txt2: two obl phrases, each made of a single word.
obl_layer = txt2.obl_phrases
assert len(obl_layer) == 2, len(obl_layer)

first_phrase = obl_layer[0]
first_words = list(first_phrase.text)
assert first_words == ['sellest'], first_words
assert len(first_phrase) == 1, len(first_phrase)

second_phrase = obl_layer[1]
assert len(second_phrase) == 1, len(second_phrase)
second_words = list(second_phrase.text)
assert second_words == ['nõiaringist'], second_words

In [17]:
# Pipeline for the third test sentence: add the 'morph_extended' layer,
# then run the Stanza syntax tagger, then the obl phrase extractor.
# The last call is left as the cell's final expression so its result is displayed.
txt3 = Text('Oleksin võinud vangi sattuda .')
txt3.tag_layer('morph_extended')
stanza_tagger.tag( txt3 )
phrase_tagger2.tag( txt3 )

text
Oleksin võinud vangi sattuda .

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,5
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,5
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,5
morph_extended,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech, punctuation_type, pronoun_type, letter_case, fin, verb_extension_suffix, subcat",morph_analysis,,True,5
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_extended,,False,5
obl_phrases,"entity_type, free_entity, is_valid, root",,morph_analysis,False,1


In [18]:
# Regression check for txt3: exactly one obl phrase, made of a single word.
obl_layer = txt3.obl_phrases
assert len(obl_layer) == 1, len(obl_layer)

only_phrase = obl_layer[0]
only_words = list(only_phrase.text)
assert only_words == ['vangi'], only_words
assert len(only_phrase) == 1, len(only_phrase)