In [160]:
import csv
import pandas as pd
import time
import sklearn
import random
import sqlite3
from estnltk import Text
from estnltk_neural.taggers import StanzaSyntaxTagger
from estnltk.taggers import NerTagger
from estnltk.converters import text_to_json

In [2]:
from estnltk import Layer
from estnltk.taggers import Tagger
from collections import defaultdict
from collections import OrderedDict
import copy

In [3]:
def get_ner(ner_layer, word_layer, span):
        nertag = None
        if len(ner_layer) > 0:
            word = word_layer.get(span)
            for n in ner_layer:
                for part in n:
                    if part==word:
                        nertag=n.nertag
        if nertag:
            return nertag
        return 'OTHER'
    
def get_POS(word_layer, span):
    infinite_verb_forms = ['da', 'des', 'ma', 'maks', 'mas', 'mast', 'mata', 'nud', 'tav', 'tud', 'v']
    # if POS is ambiguous, only unique tags are kept, e.g. ['V', 'A', 'A'] -> ['V', 'A']
    pos_list = []
    word = word_layer.get(span)
    for i in range(len(word.morph_analysis['partofspeech'])):
        if word.morph_analysis['partofspeech'][i] == 'V':
            if word.morph_analysis['form'][i] in infinite_verb_forms:
                pos_list.append('V_inf')
            elif word.form[i] == 'neg':
                pos_list.append('V_neg')
            else:
                pos_list.append('V_fin')
        else:
            pos_list.append(word.morph_analysis['partofspeech'][i])
    
    if len(pos_list) > 1:
        char_unique = [char for indx, char in enumerate(pos_list) if char not in pos_list[:indx]]
        if len(char_unique) < 2:
            return char_unique[0]
        return '|'.join(char_unique)
    return pos_list[0]

In [139]:
class PhrasePatternTagger(Tagger):
    """Tags phrases that match given syntax and part-of-speech pattern rules, and their corresponding patterns.""" 
    
    conf_param = ['rules_file', 'ruleset_map']
    
    def __init__(self, rules_file: str,
                       output_layer='phrase_patterns',
                       morph_analysis_layer='morph_analysis',
                       words_layer='words',
                       syntax_layer='stanza_syntax',
                       ner_layer='ner'):
        
        self.input_layers = [morph_analysis_layer, words_layer, syntax_layer, ner_layer]
        self.output_layer = output_layer
        self.output_attributes = ['extraction_pattern', 'ner_pattern', 'pattern_id', 'score', 'phrase_pattern_id', 'phrase_class']
        self.rules_file = rules_file

        ruleset_map = defaultdict(list)
        
        with open(rules_file, encoding='UTF-8') as csv_file:
            reader = csv.DictReader(csv_file)
            for row in reader:
                info = [row['ID'], row['POS_pattern'], row['NER_pattern']]
                ruleset_map[row['tree']].append(info)
                
        self.ruleset_map = ruleset_map

    def _make_layer_template(self):
        layer = Layer(name=self.output_layer,
                      text_object=None,
                      attributes=self.output_attributes,
                      enveloping=self.input_layers[1],
                      ambiguous=True)
        return layer
        
    def _make_layer(self, text, layers, status):
        layer = self._make_layer_template()
        layer.text_object = text
        
        for i in range(len(layers[self.input_layers[2]])): # Iterate over 'stanza_syntax' layer
            pattern_spans = []
            ids = []

            pattern_spans.append(layers[self.input_layers[2]][i])
            ids.append([layers[self.input_layers[2]][i]['id'], layers[self.input_layers[2]][i]['head']])
                
            for j in range(i + 1, len(layers[self.input_layers[2]])):
                tree = []
                pos = []
                ner = []
                for k in range(len(pattern_spans)):
                    if layers[self.input_layers[2]][j] not in pattern_spans:
                        if layers[self.input_layers[2]][j] in pattern_spans[k]['children'] or pattern_spans[k] in layers[self.input_layers[2]][j]['children'] or layers[self.input_layers[2]][j]['parent_span'] != None and layers[self.input_layers[2]][j]['parent_span'] == pattern_spans[k]['parent_span']:
                            pattern_spans.append(layers[self.input_layers[2]][j])
                            ids.append([layers[self.input_layers[2]][j]['id'], layers[self.input_layers[2]][j]['head']])
                
                # fixing word and head ID values
                ids_for_pattern = copy.deepcopy(ids)
                for k in range(len(ids_for_pattern)):
                    temp = ids_for_pattern[k][0]
                    ids_for_pattern[k][0] = k+1
                    for l in range(len(ids)):
                        if ids[l][1] == temp:
                            ids_for_pattern[l][1] = ids_for_pattern[k][0]
            
                word_ids = [word_id[0] for word_id in ids_for_pattern]
                for k in range(len(ids_for_pattern)):
                    if ids_for_pattern[k][0] == ids_for_pattern[k][1]:
                        ids_for_pattern[k][1] = 0
                    elif ids_for_pattern[k][1] not in word_ids:
                        ids_for_pattern[k][1] = 0
                
                # finding the root of current pattern and setting its deprel value as such
                for k in range(len(pattern_spans)):
                    deprel = pattern_spans[k].deprel
                    if ids_for_pattern[k][1] == 0 and deprel != 'root':
                        deprel = 'root'
                    tree.append([str(ids_for_pattern[k][0]), str(ids_for_pattern[k][1]), deprel])
                    # POS-tag is taken from morph_analysis layer
                    pos.append(get_POS(layers[self.input_layers[1]], pattern_spans[k]))
                    # nertag is taken from ner layer
                    ner.append(get_ner(layers[self.input_layers[-1]], layers[self.input_layers[1]], pattern_spans[k]))                     
                    
                pattern = [" ".join(word_info) for word_info in tree]
                #print(pattern)
                # check if tree pattern exists in ruleset map
                if ",".join(pattern) in self.ruleset_map.keys():
                    #print(pattern, 'yes')
                    pos_pattern = "-".join(pos)
                    ner_pattern = "-".join(ner)
                    # check if POS-sequence and NER-sequence exist in ruleset map with given tree pattern
                    for el in self.ruleset_map[",".join(pattern)]:
                        #print(el[1], pos_pattern, el[2], ner_pattern)
                        if el[1] == pos_pattern and el[2] == ner_pattern:
                            #print(pattern, 'yesyes')
                            # add annotation
                            layer.add_annotation([span.base_span for span in pattern_spans], 
                                                 extraction_pattern=",".join([",".join(pattern), pos_pattern]),
                                                 ner_pattern=ner_pattern,
                                                 pattern_id=el[0],
                                                 score=None,
                                                 phrase_pattern_id=None,
                                                 phrase_class=None)
                            #tree = []
                            #pos = []
                            #ner = []

                #if j == len(layers[self.input_layers[2]]) - 1:
                    #pattern_spans = []
                    #ids = []
                    #pos = []
                    #ner = []
                    #break             
                
        return layer
    


In [140]:
pattern_tagger = PhrasePatternTagger(rules_file='indicator_patterns_ner_tree_pos_updated.csv')
pattern_tagger

name,output layer,output attributes,input layers
PhrasePatternTagger,phrase_patterns,"('extraction_pattern', 'ner_pattern', 'pattern_id', 'score', 'phrase_pattern_id', 'phrase_class')","('morph_analysis', 'words', 'stanza_syntax', 'ner')"

0,1
rules_file,indicator_patterns_ner_tree_pos_updated.csv
ruleset_map,"defaultdict(<class 'list'>, {'string': [['int64', 'string', 'string']], '1 2 nmo ..., type: <class 'collections.defaultdict'>, length: 7"


In [27]:
stanza_tagger = StanzaSyntaxTagger(input_type='morph_analysis', input_morph_layer='morph_analysis',
                                   add_parent_and_children=True)

Downloading resources index: 20.1kB [00:00, 20.1MB/s]


In [29]:
ner_tagger = NerTagger()

In [141]:
test_sent1 = Text('Kohtuasjade lahendamine avaldusega').tag_layer()
stanza_tagger.tag(test_sent1)
ner_tagger.tag(test_sent1)
pattern_tagger.tag(test_sent1)

text
Kohtuasjade lahendamine avaldusega

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
ner,nertag,,words,False,0
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,2


In [142]:
test_sent1.phrase_patterns

layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,2

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Kohtuasjade', 'lahendamine']","1 2 nmod,2 0 root,S-S",OTHER-OTHER,1,,,
"['Kohtuasjade', 'lahendamine', 'avaldusega']","1 2 nmod,2 0 root,3 2 nmod,S-S-S",OTHER-OTHER-OTHER,129,,,


In [143]:
test_sent2 = Text('suhete direktor Piret Mürk').tag_layer()
stanza_tagger.tag(test_sent2)
ner_tagger.tag(test_sent2)
pattern_tagger.tag(test_sent2)

text
suhete direktor Piret Mürk

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,4
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,4
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,4
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,4
ner,nertag,,words,False,0
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,2


In [144]:
test_sent2.phrase_patterns

layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,2

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['suhete', 'direktor']","1 2 nmod,2 0 root,S-S",OTHER-OTHER,1,,,
"['suhete', 'direktor', 'Piret', 'Mürk']","1 2 nmod,2 0 root,3 2 appos,4 3 flat,S-S-H-H",OTHER-OTHER-OTHER-OTHER,156,,,


In [145]:
test_df = pd.read_pickle('phrase_examples_filtered_sub10000/atomic_phrases.pkl')

In [146]:
test_df

Unnamed: 0,phrase,phrase_length,document_creation_time,sentence_id,document_id,sentence_startend,subcorpus,phrase_type,phrase_start_end,has_ner_netity,...,pos_sequence,graph,graph_code,graph_code_pos,graph_code_pos_ner_timex,pos_sequence_verb_info,graph_verb_info,graph_code_verb_info,graph_code_pos_verb_info,graph_code_pos_ner_timex_verb_info
0,Text(text='külmakuudega'),1,2024-03-10T00:43,5858921,312319,"(0, 226)",aja_EPL,obl_phrase,"(6, 18)",0,...,S,"(1, 0)","((0, 1, root))","(S,(0, 1, root))","(0,S,(0, 1, root))",S,"(1, 0)","((0, 1, root))","(S,(0, 1, root))","(0,S,(0, 1, root))"
1,Text(text='libedavastast graniidipuru'),2,2024-03-10T00:43,5858921,312319,"(0, 226)",aja_EPL,nsubj_phrase,"(22, 48)",0,...,A-S,"(1, 2, 0)","((0, 2, root),(2, 1, amod))","(S-A,(0, 2, root),(2, 1, amod))","(0-0,S-A,(0, 2, root),(2, 1, amod))",A-S,"(1, 2, 0)","((0, 2, root),(2, 1, amod))","(S-A,(0, 2, root),(2, 1, amod))","(0-0,S-A,(0, 2, root),(2, 1, amod))"
2,Text(text='juba vallid'),2,2024-03-10T00:43,5858921,312319,"(0, 226)",aja_EPL,nsubj_phrase,"(114, 125)",0,...,D-S,"(1, 2, 0)","((0, 2, root),(2, 1, advmod))","(S-D,(0, 2, root),(2, 1, advmod))","(0-0,S-D,(0, 2, root),(2, 1, advmod))",D-S,"(1, 2, 0)","((0, 2, root),(2, 1, advmod))","(S-D,(0, 2, root),(2, 1, advmod))","(0-0,S-D,(0, 2, root),(2, 1, advmod))"
3,Text(text='tänavapuhastajad'),1,2024-03-10T00:43,5858921,312319,"(0, 226)",aja_EPL,nsubj_phrase,"(139, 155)",0,...,S,"(1, 0)","((0, 1, root))","(S,(0, 1, root))","(0,S,(0, 1, root))",S,"(1, 0)","((0, 1, root))","(S,(0, 1, root))","(0,S,(0, 1, root))"
4,Text(text='külma'),1,2024-03-10T00:43,5858921,312319,"(0, 226)",aja_EPL,nsubj_phrase,"(211, 216)",0,...,A,"(1, 0)","((0, 1, root))","(A,(0, 1, root))","(0,A,(0, 1, root))",A,"(1, 0)","((0, 1, root))","(A,(0, 1, root))","(0,A,(0, 1, root))"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36060,Text(text='kooridele'),1,2024-03-10T01:35,4265502,224097,"(1430, 1538)",aja_EPL,obl_phrase,"(36, 45)",0,...,S,"(1, 0)","((0, 1, root))","(S,(0, 1, root))","(0,S,(0, 1, root))",S,"(1, 0)","((0, 1, root))","(S,(0, 1, root))","(0,S,(0, 1, root))"
36061,Text(text='mis'),1,2024-03-10T01:35,4265502,224097,"(1430, 1538)",aja_EPL,nsubj_phrase,"(59, 62)",0,...,P,"(1, 0)","((0, 1, root))","(P,(0, 1, root))","(0,P,(0, 1, root))",P,"(1, 0)","((0, 1, root))","(P,(0, 1, root))","(0,P,(0, 1, root))"
36062,Text(text='ka koorijuhi palgamäära'),3,2024-03-10T01:35,4265502,224097,"(1430, 1538)",aja_EPL,obj_phrase,"(83, 106)",0,...,D-S-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, advmod),(3, 2, nmod))","(S-D-S,(0, 3, root),(3, 1, advmod),(3, 2, nmod))","(0-0-0,S-D-S,(0, 3, root),(3, 1, advmod),(3, 2, nmod))",D-S-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, advmod),(3, 2, nmod))","(S-D-S,(0, 3, root),(3, 1, advmod),(3, 2, nmod))","(0-0-0,S-D-S,(0, 3, root),(3, 1, advmod),(3, 2, nmod))"
36063,Text(text='Laulupeo'),1,2024-03-10T01:35,4265502,224097,"(1430, 1538)",aja_EPL,nmod_phrase,"(0, 8)",0,...,H,"(1, 0)","((0, 1, root))","(H,(0, 1, root))","(0,H,(0, 1, root))",H,"(1, 0)","((0, 1, root))","(H,(0, 1, root))","(0,H,(0, 1, root))"


In [147]:
test_500 = []

for idx, row in test_df.iterrows():
    if row['phrase_length'] > 1:
        test_500.append(row['phrase'])
    if len(test_500) == 500:
        break

In [148]:
for phrase in test_500:
    pattern_tagger.tag(phrase)

In [149]:
n_found = 0
for phrase in test_500:
    try:
        if len(phrase.phrase_patterns) > 0 and len(phrase.ner) > 0:
            display(phrase)
            display(phrase.phrase_patterns)
            n_found+=1
    except:
        break
print(n_found)

text
ETA andmetel

0,1
document_creation_time,2024-03-10T00:43
document_id,86904
phrase_start_end,"(0, 12)"
phrase_type,obl_phrase
sentence_id,2039231
sentence_startend,"(3175, 3301)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['ETA', 'andmetel']","1 2 nmod,2 0 root,Y-S",ORG-OTHER,20,,,


text
Riia kesklinnas asuva kohviku kõrval

0,1
document_creation_time,2024-03-10T00:43
document_id,585094
phrase_start_end,"(0, 36)"
phrase_type,obl_phrase
sentence_id,13238380
sentence_startend,"(7, 77)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,5
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,5
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,5
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,5
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,2


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,2

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Riia', 'kesklinnas']","1 2 nmod,2 0 root,H-S",LOC-OTHER,11,,,
"['kesklinnas', 'asuva', 'kohviku']","1 2 obl,2 3 acl,3 0 root,S-A-S",OTHER-OTHER-OTHER,94,,,


text
Eesti sõjaväes

0,1
document_creation_time,2024-03-10T00:43
document_id,559734
phrase_start_end,"(30, 44)"
phrase_type,obl_phrase
sentence_id,12713570
sentence_startend,"(3856, 3934)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Eesti', 'sõjaväes']","1 2 nmod,2 0 root,H-S",LOC-OTHER,11,,,


text
Eesti infoserverite

0,1
document_creation_time,2024-03-10T00:43
document_id,3723
phrase_start_end,"(29, 48)"
phrase_type,nmod_phrase
sentence_id,302968
sentence_startend,"(3677, 3742)"
subcorpus,tea

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Eesti', 'infoserverite']","1 2 nmod,2 0 root,H-S",LOC-OTHER,11,,,


text
Krimmi esimeses eestlaste

0,1
document_creation_time,2024-03-10T00:43
document_id,525323
phrase_start_end,"(0, 25)"
phrase_type,obl_phrase
sentence_id,11951022
sentence_startend,"(0, 180)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['esimeses', 'eestlaste']","1 2 amod,2 0 root,O-S",OTHER-OTHER,72,,,


text
Eesti metsadest

0,1
document_creation_time,2024-03-10T00:44
document_id,381317
phrase_start_end,"(0, 15)"
phrase_type,obl_phrase
sentence_id,7658264
sentence_startend,"(6480, 6513)"
subcorpus,aja_ee

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Eesti', 'metsadest']","1 2 nmod,2 0 root,H-S",LOC-OTHER,11,,,


text
Viru tänavale ehitatava büroohoone puhul

0,1
document_creation_time,2024-03-10T00:44
document_id,431141
phrase_start_end,"(21, 61)"
phrase_type,obl_phrase
sentence_id,9487957
sentence_startend,"(2598, 2762)"
subcorpus,aja_pm

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,5
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,5
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,5
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,5
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,2


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,2

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Viru', 'tänavale']","1 2 nmod,2 0 root,H-S",LOC-OTHER,11,,,
"['tänavale', 'ehitatava', 'büroohoone']","1 2 obl,2 3 acl,3 0 root,S-A-S",OTHER-OTHER-OTHER,94,,,


text
Tallinna abilinnapeade kohale

0,1
document_creation_time,2024-03-10T00:44
document_id,50055
phrase_start_end,"(39, 68)"
phrase_type,nmod_phrase
sentence_id,1301691
sentence_startend,"(12, 170)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Tallinna', 'abilinnapeade']","1 2 nmod,2 0 root,H-S",LOC-OTHER,11,,,


text
Vene välisminister Igor Ivanov

0,1
document_creation_time,2024-03-10T00:44
document_id,343855
phrase_start_end,"(12, 42)"
phrase_type,nsubj_phrase
sentence_id,6449477
sentence_startend,"(234, 346)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,4
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,4
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,4
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,4
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,2


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,2

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Vene', 'välisminister']","1 2 nmod,2 0 root,H-S",LOC-OTHER,11,,,
"['Vene', 'välisminister', 'Igor', 'Ivanov']","1 2 nmod,2 0 root,3 2 appos,4 3 flat,H-S-H-H",LOC-OTHER-OTHER-OTHER,160,,,


text
Gruusia valimissohi

0,1
document_creation_time,2024-03-10T00:44
document_id,343855
phrase_start_end,"(65, 84)"
phrase_type,nmod_phrase
sentence_id,6449477
sentence_startend,"(234, 346)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Gruusia', 'valimissohi']","1 2 nmod,2 0 root,G-S",LOC-OTHER,30,,,


text
Norra kuningliku perekonna

0,1
document_creation_time,2024-03-10T00:44
document_id,514574
phrase_start_end,"(0, 26)"
phrase_type,nmod_phrase
sentence_id,11729628
sentence_startend,"(190, 293)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['kuningliku', 'perekonna']","1 2 amod,2 0 root,A-S",OTHER-OTHER,64,,,


text
Eesti meistrivõistluste

0,1
document_creation_time,2024-03-10T00:44
document_id,600800
phrase_start_end,"(0, 23)"
phrase_type,nmod_phrase
sentence_id,13580753
sentence_startend,"(1544, 1715)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Eesti', 'meistrivõistluste']","1 2 nmod,2 0 root,H-S",LOC-OTHER,11,,,


text
valitsuse otsust paigutada muuseumlaev Suur Tõll Tallinna reisisadama Admiraliteedibasseini

0,1
document_creation_time,2024-03-10T00:44
document_id,556469
phrase_start_end,"(24, 115)"
phrase_type,obj_phrase
sentence_id,12643943
sentence_startend,"(179, 296)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,9
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,9
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,9
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,2
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,9
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,4


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,4

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['valitsuse', 'otsust']","1 2 nmod,2 0 root,S-S",OTHER-OTHER,1,,,
"['Suur', 'Tõll']","1 2 nmod,2 0 root,H-H",PER-PER,27,,,
"['Tallinna', 'reisisadama']","1 2 nmod,2 0 root,H-S",LOC-OTHER,11,,,
"['reisisadama', 'Admiraliteedibasseini']","1 2 nmod,2 0 root,S-S",OTHER-OTHER,1,,,


text
Vene õigeusu

0,1
document_creation_time,2024-03-10T00:44
document_id,44022
phrase_start_end,"(27, 39)"
phrase_type,nmod_phrase
sentence_id,1188139
sentence_startend,"(2593, 2766)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Vene', 'õigeusu']","1 2 nmod,2 0 root,H-S",LOC-OTHER,11,,,


text
Vene õigeusu

0,1
document_creation_time,2024-03-10T00:44
document_id,44022
phrase_start_end,"(74, 86)"
phrase_type,nmod_phrase
sentence_id,1188139
sentence_startend,"(2593, 2766)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Vene', 'õigeusu']","1 2 nmod,2 0 root,H-S",LOC-OTHER,11,,,


text
sadade Eesti elanike

0,1
document_creation_time,2024-03-10T00:44
document_id,96905
phrase_start_end,"(49, 69)"
phrase_type,nmod_phrase
sentence_id,2241879
sentence_startend,"(876, 1078)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Eesti', 'elanike']","1 2 nmod,2 0 root,H-S",LOC-OTHER,11,,,


text
Maailma Malefondi nimel

0,1
document_creation_time,2024-03-10T00:44
document_id,439118
phrase_start_end,"(148, 171)"
phrase_type,nmod_phrase
sentence_id,9732461
sentence_startend,"(4099, 4368)"
subcorpus,aja_pm

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Maailma', 'Malefondi']","1 2 nmod,2 0 root,S-S",ORG-ORG,3,,,


text
NASA teadlased

0,1
document_creation_time,2024-03-10T00:44
document_id,573206
phrase_start_end,"(0, 14)"
phrase_type,nsubj_phrase
sentence_id,12987931
sentence_startend,"(634, 747)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['NASA', 'teadlased']","1 2 nmod,2 0 root,Y-S",ORG-OTHER,20,,,


text
Eesti Energias keskkonnaosakonna inspektorina töötav Lehtla

0,1
document_creation_time,2024-03-10T00:44
document_id,322939
phrase_start_end,"(61, 120)"
phrase_type,nsubj_phrase
sentence_id,6048777
sentence_startend,"(992, 1161)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,6
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,6
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,6
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,6
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,3


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,3

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Eesti', 'Energias']","1 2 nmod,2 0 root,H-S",ORG-ORG,12,,,
"['keskkonnaosakonna', 'inspektorina']","1 2 nmod,2 0 root,S-S",OTHER-OTHER,1,,,
"['inspektorina', 'töötav', 'Lehtla']","1 2 obl,2 3 acl,3 0 root,S-A-S",OTHER-OTHER-OTHER,94,,,


text
Sarapi sõnul

0,1
document_creation_time,2024-03-10T00:44
document_id,4040
phrase_start_end,"(21, 33)"
phrase_type,obl_phrase
sentence_id,348780
sentence_startend,"(3645, 3826)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Sarapi', 'sõnul']","1 2 nmod,2 0 root,H-S",PER-OTHER,15,,,


text
Saksamaa tulevane liidukantsler Angela Merkel

0,1
document_creation_time,2024-03-10T00:44
document_id,494320
phrase_start_end,"(0, 45)"
phrase_type,nsubj_phrase
sentence_id,11258567
sentence_startend,"(0, 132)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,5
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,5
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,5
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,5
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['tulevane', 'liidukantsler']","1 2 amod,2 0 root,A-S",OTHER-OTHER,64,,,


text
Eesti kapitalil

0,1
document_creation_time,2024-03-10T00:44
document_id,319650
phrase_start_end,"(115, 130)"
phrase_type,obl_phrase
sentence_id,5990094
sentence_startend,"(514, 646)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Eesti', 'kapitalil']","1 2 nmod,2 0 root,H-S",LOC-OTHER,11,,,


text
"seegi , et Kaitseliidu Fond likvideeritakse"

0,1
document_creation_time,2024-03-10T00:44
document_id,553342
phrase_start_end,"(19, 62)"
phrase_type,nsubj_phrase
sentence_id,12575574
sentence_startend,"(2283, 2347)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,6
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,6
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,6
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,6
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Kaitseliidu', 'Fond']","1 2 nmod,2 0 root,S-S",ORG-ORG,3,,,


text
Kaitsepolitsei süüdistuse järgi

0,1
document_creation_time,2024-03-10T00:44
document_id,649894
phrase_start_end,"(0, 31)"
phrase_type,obl_phrase
sentence_id,15270447
sentence_startend,"(517, 698)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Kaitsepolitsei', 'süüdistuse']","1 2 nmod,2 0 root,H-S",ORG-OTHER,14,,,


text
Eesti meistriks

0,1
document_creation_time,2024-03-10T00:44
document_id,213061
phrase_start_end,"(30, 45)"
phrase_type,xcomp_phrase
sentence_id,4096934
sentence_startend,"(258, 335)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Eesti', 'meistriks']","1 2 nmod,2 0 root,H-S",LOC-OTHER,11,,,


25


In [10]:
# older version output
n_found = 0
for phrase in test_500:
    try:
        if len(phrase.phrase_patterns) > 0 and len(phrase.ner) > 0:
            display(phrase)
            display(phrase.phrase_patterns)
            n_found+=1
    except:
        break
print(n_found)

text
ETA andmetel

0,1
document_creation_time,2024-03-10T00:43
document_id,86904
phrase_start_end,"(0, 12)"
phrase_type,obl_phrase
sentence_id,2039231
sentence_startend,"(3175, 3301)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['ETA', 'andmetel']","1 2 nmod,2 0 root,Y-S,ORG-OTHER",20,,,


text
Riia kesklinnas asuva kohviku kõrval

0,1
document_creation_time,2024-03-10T00:43
document_id,585094
phrase_start_end,"(0, 36)"
phrase_type,obl_phrase
sentence_id,13238380
sentence_startend,"(7, 77)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,5
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,5
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,5
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,5
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Riia', 'kesklinnas']","1 2 nmod,2 0 root,H-S,LOC-OTHER",11,,,


text
Eesti sõjaväes

0,1
document_creation_time,2024-03-10T00:43
document_id,559734
phrase_start_end,"(30, 44)"
phrase_type,obl_phrase
sentence_id,12713570
sentence_startend,"(3856, 3934)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Eesti', 'sõjaväes']","1 2 nmod,2 0 root,H-S,LOC-OTHER",11,,,


text
Eesti infoserverite

0,1
document_creation_time,2024-03-10T00:43
document_id,3723
phrase_start_end,"(29, 48)"
phrase_type,nmod_phrase
sentence_id,302968
sentence_startend,"(3677, 3742)"
subcorpus,tea

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Eesti', 'infoserverite']","1 2 nmod,2 0 root,H-S,LOC-OTHER",11,,,


text
Krimmi esimeses eestlaste

0,1
document_creation_time,2024-03-10T00:43
document_id,525323
phrase_start_end,"(0, 25)"
phrase_type,obl_phrase
sentence_id,11951022
sentence_startend,"(0, 180)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['esimeses', 'eestlaste']","1 2 amod,2 0 root,O-S,OTHER-OTHER",72,,,


text
Eesti metsadest

0,1
document_creation_time,2024-03-10T00:44
document_id,381317
phrase_start_end,"(0, 15)"
phrase_type,obl_phrase
sentence_id,7658264
sentence_startend,"(6480, 6513)"
subcorpus,aja_ee

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Eesti', 'metsadest']","1 2 nmod,2 0 root,H-S,LOC-OTHER",11,,,


text
Viru tänavale ehitatava büroohoone puhul

0,1
document_creation_time,2024-03-10T00:44
document_id,431141
phrase_start_end,"(21, 61)"
phrase_type,obl_phrase
sentence_id,9487957
sentence_startend,"(2598, 2762)"
subcorpus,aja_pm

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,5
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,5
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,5
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,5
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Viru', 'tänavale']","1 2 nmod,2 0 root,H-S,LOC-OTHER",11,,,


text
Tallinna abilinnapeade kohale

0,1
document_creation_time,2024-03-10T00:44
document_id,50055
phrase_start_end,"(39, 68)"
phrase_type,nmod_phrase
sentence_id,1301691
sentence_startend,"(12, 170)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Tallinna', 'abilinnapeade']","1 2 nmod,2 0 root,H-S,LOC-OTHER",11,,,


text
Vene välisminister Igor Ivanov

0,1
document_creation_time,2024-03-10T00:44
document_id,343855
phrase_start_end,"(12, 42)"
phrase_type,nsubj_phrase
sentence_id,6449477
sentence_startend,"(234, 346)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,4
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,4
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,4
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,4
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Vene', 'välisminister']","1 2 nmod,2 0 root,H-S,LOC-OTHER",11,,,


text
Gruusia valimissohi

0,1
document_creation_time,2024-03-10T00:44
document_id,343855
phrase_start_end,"(65, 84)"
phrase_type,nmod_phrase
sentence_id,6449477
sentence_startend,"(234, 346)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Gruusia', 'valimissohi']","1 2 nmod,2 0 root,G-S,LOC-OTHER",30,,,


text
Norra kuningliku perekonna

0,1
document_creation_time,2024-03-10T00:44
document_id,514574
phrase_start_end,"(0, 26)"
phrase_type,nmod_phrase
sentence_id,11729628
sentence_startend,"(190, 293)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['kuningliku', 'perekonna']","1 2 amod,2 0 root,A-S,OTHER-OTHER",64,,,


text
Eesti meistrivõistluste

0,1
document_creation_time,2024-03-10T00:44
document_id,600800
phrase_start_end,"(0, 23)"
phrase_type,nmod_phrase
sentence_id,13580753
sentence_startend,"(1544, 1715)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Eesti', 'meistrivõistluste']","1 2 nmod,2 0 root,H-S,LOC-OTHER",11,,,


text
valitsuse otsust paigutada muuseumlaev Suur Tõll Tallinna reisisadama Admiraliteedibasseini

0,1
document_creation_time,2024-03-10T00:44
document_id,556469
phrase_start_end,"(24, 115)"
phrase_type,obj_phrase
sentence_id,12643943
sentence_startend,"(179, 296)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,9
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,9
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,9
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,2
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,9
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,4


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,4

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['valitsuse', 'otsust']","1 2 nmod,2 0 root,S-S,OTHER-OTHER",1,,,
"['Suur', 'Tõll']","1 2 nmod,2 0 root,H-H,PER-PER",27,,,
"['Tallinna', 'reisisadama']","1 2 nmod,2 0 root,H-S,LOC-OTHER",11,,,
"['reisisadama', 'Admiraliteedibasseini']","1 2 nmod,2 0 root,S-S,OTHER-OTHER",1,,,


text
Vene õigeusu

0,1
document_creation_time,2024-03-10T00:44
document_id,44022
phrase_start_end,"(27, 39)"
phrase_type,nmod_phrase
sentence_id,1188139
sentence_startend,"(2593, 2766)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Vene', 'õigeusu']","1 2 nmod,2 0 root,H-S,LOC-OTHER",11,,,


text
Vene õigeusu

0,1
document_creation_time,2024-03-10T00:44
document_id,44022
phrase_start_end,"(74, 86)"
phrase_type,nmod_phrase
sentence_id,1188139
sentence_startend,"(2593, 2766)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Vene', 'õigeusu']","1 2 nmod,2 0 root,H-S,LOC-OTHER",11,,,


text
sadade Eesti elanike

0,1
document_creation_time,2024-03-10T00:44
document_id,96905
phrase_start_end,"(49, 69)"
phrase_type,nmod_phrase
sentence_id,2241879
sentence_startend,"(876, 1078)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Eesti', 'elanike']","1 2 nmod,2 0 root,H-S,LOC-OTHER",11,,,


text
Maailma Malefondi nimel

0,1
document_creation_time,2024-03-10T00:44
document_id,439118
phrase_start_end,"(148, 171)"
phrase_type,nmod_phrase
sentence_id,9732461
sentence_startend,"(4099, 4368)"
subcorpus,aja_pm

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Maailma', 'Malefondi']","1 2 nmod,2 0 root,S-S,ORG-ORG",3,,,


text
NASA teadlased

0,1
document_creation_time,2024-03-10T00:44
document_id,573206
phrase_start_end,"(0, 14)"
phrase_type,nsubj_phrase
sentence_id,12987931
sentence_startend,"(634, 747)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['NASA', 'teadlased']","1 2 nmod,2 0 root,Y-S,ORG-OTHER",20,,,


text
Eesti Energias keskkonnaosakonna inspektorina töötav Lehtla

0,1
document_creation_time,2024-03-10T00:44
document_id,322939
phrase_start_end,"(61, 120)"
phrase_type,nsubj_phrase
sentence_id,6048777
sentence_startend,"(992, 1161)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,6
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,6
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,6
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,6
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,2


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,2

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Eesti', 'Energias']","1 2 nmod,2 0 root,H-S,ORG-ORG",12,,,
"['keskkonnaosakonna', 'inspektorina']","1 2 nmod,2 0 root,S-S,OTHER-OTHER",1,,,


text
Sarapi sõnul

0,1
document_creation_time,2024-03-10T00:44
document_id,4040
phrase_start_end,"(21, 33)"
phrase_type,obl_phrase
sentence_id,348780
sentence_startend,"(3645, 3826)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Sarapi', 'sõnul']","1 2 nmod,2 0 root,H-S,PER-OTHER",15,,,


text
Saksamaa tulevane liidukantsler Angela Merkel

0,1
document_creation_time,2024-03-10T00:44
document_id,494320
phrase_start_end,"(0, 45)"
phrase_type,nsubj_phrase
sentence_id,11258567
sentence_startend,"(0, 132)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,5
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,5
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,5
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,5
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['tulevane', 'liidukantsler']","1 2 amod,2 0 root,A-S,OTHER-OTHER",64,,,


text
Eesti kapitalil

0,1
document_creation_time,2024-03-10T00:44
document_id,319650
phrase_start_end,"(115, 130)"
phrase_type,obl_phrase
sentence_id,5990094
sentence_startend,"(514, 646)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Eesti', 'kapitalil']","1 2 nmod,2 0 root,H-S,LOC-OTHER",11,,,


text
"seegi , et Kaitseliidu Fond likvideeritakse"

0,1
document_creation_time,2024-03-10T00:44
document_id,553342
phrase_start_end,"(19, 62)"
phrase_type,nsubj_phrase
sentence_id,12575574
sentence_startend,"(2283, 2347)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,6
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,6
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,6
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,6
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Kaitseliidu', 'Fond']","1 2 nmod,2 0 root,S-S,ORG-ORG",3,,,


text
Kaitsepolitsei süüdistuse järgi

0,1
document_creation_time,2024-03-10T00:44
document_id,649894
phrase_start_end,"(0, 31)"
phrase_type,obl_phrase
sentence_id,15270447
sentence_startend,"(517, 698)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Kaitsepolitsei', 'süüdistuse']","1 2 nmod,2 0 root,H-S,ORG-OTHER",14,,,


text
Eesti meistriks

0,1
document_creation_time,2024-03-10T00:44
document_id,213061
phrase_start_end,"(30, 45)"
phrase_type,xcomp_phrase
sentence_id,4096934
sentence_startend,"(258, 335)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Eesti', 'meistriks']","1 2 nmod,2 0 root,H-S,LOC-OTHER",11,,,


25


In [150]:
con = sqlite3.connect('correct_noun_phrases.db')

In [151]:
cur = con.cursor()
cur.execute("SELECT * FROM correct_phrase_patterns")
rows = cur.fetchall()

In [152]:
tagged_phrases = defaultdict(list)

random.shuffle(rows)

start = time.time()

for row in rows:
    text = Text(row[8]).tag_layer('morph_analysis')
    ner_tagger.tag(text)
    stanza_tagger.tag(text)
    pattern_tagger.tag(text)
    if len(text.phrase_patterns) > 0:
        for pattern in text.phrase_patterns:
            if len(tagged_phrases[pattern['extraction_pattern'][0]]) < 100:
                tagged_phrases[pattern['extraction_pattern'][0]].append(text)
                
print(f"{len(rows)} fraasi märgendamiseks kulus PhrasePatternTaggeril {time.time()-start} sekundit")

21492 fraasi märgendamiseks kulus PhrasePatternTaggeril 802.0159752368927 sekundit


In [153]:
for el in tagged_phrases:
    print(el, tagged_phrases[el])

1 2 nmod,2 0 root,H-Y [Text(text='Pärnu KEKi'), Text(text='Kangelaste pst'), Text(text='Société a'), Text(text='Tallinna JK'), Text(text='Société a'), Text(text='Vene NFSV'), Text(text='Shell AS'), Text(text='Palkehituse AS'), Text(text='Budapesti EMil'), Text(text='Vene NFSV'), Text(text='Tartu VK'), Text(text='Budapesti EMil'), Text(text='Soome MM'), Text(text='Eesti ASi'), Text(text='Vene NFSV'), Text(text='Société a'), Text(text='Pärnu KEKi'), Text(text='Brüsseli GP-etapi'), Text(text='Punase RET-i'), Text(text='Tallinna FC'), Text(text='Eesti AS'), Text(text='Eesti NATO-püüdlusi'), Text(text='Eesti ASi'), Text(text='Eesti NSV-s'), Text(text='Soome MM'), Text(text='Piima pH'), Text(text='Eesti NATO-püüdlusi'), Text(text='Iraaki vms'), Text(text='Tartu VK')]
1 2 amod,2 0 root,C-S [Text(text='rikkamate rokistaaridega'), Text(text='konkurentsivõimelisemaks muutmisel'), Text(text='üldisemat seisukohavõttu'), Text(text='soodsamale hinnale'), Text(text='varasematele arusaamadele'), Text(

In [154]:
for el in tagged_phrases:
    print(el, len(tagged_phrases[el]))

1 2 nmod,2 0 root,H-Y 29
1 2 amod,2 0 root,C-S 100
1 2 nmod,2 0 root,S-S 100
1 2 amod,2 0 root,O-S 100
1 2 nmod,2 0 root,P-S 100
1 2 obl,2 3 acl,3 0 root,S-A|V_inf-S 100
1 2 amod,2 0 root,A-S 100
1 2 nmod,2 0 root,H-S 100
1 2 nmod,2 0 root,3 2 appos,4 3 flat,H-S-H-H 100
1 2 nmod,2 0 root,3 2 nmod,S-S-S 100
1 2 amod,2 0 root,G-S 100
1 2 nmod,2 0 root,Y-S 100
1 2 obl,2 3 acl,3 0 root,H-A-S 51
1 2 nmod,2 0 root,3 2 appos,4 3 flat,S-S-H-H 100
1 2 nmod,2 0 root,H-H 100
1 2 obl,2 3 acl,3 0 root,S-V_inf|A-S 85
1 2 nmod,2 0 root,Y-N 40
1 2 amod,2 0 root,P-S 100
1 2 nmod,2 0 root,3 2 nmod,H-S-H 12
1 2 obl,2 3 acl,3 0 root,S-A-S 100
1 2 nmod,2 0 root,Y-Y 5
1 2 amod,2 0 root,U-S 100
1 2 amod,2 0 root,O|P-S 14
1 2 obl,2 3 acl,3 0 root,S-A|V_inf-A 4
1 2 nmod,2 0 root,G-S 33
1 0 root,2 1 flat,3 1 flat,H-H-H 64
1 2 amod,2 0 root,H-H 3
1 2 amod,2 0 root,A-H 58
1 2 nmod,2 0 root,S-N 6
1 2 nmod,2 0 root,H-N 4
1 2 nmod,2 0 root,3 2 nmod,P-S-Y 4
1 2 amod,2 0 root,P|O-S 100
1 2 nmod,2 0 root,3 2 nmod,P-S-S

In [155]:
con.close()

In [156]:
ruleset_map = defaultdict(list)
        
with open('indicator_patterns_ner_tree_pos_updated.csv', encoding='UTF-8') as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        info = [row['ID'], row['POS_pattern'], row['NER_pattern']]
        ruleset_map[row['tree']].append(info)

In [157]:
len(ruleset_map)

7

In [158]:
for el in ruleset_map:
    print(el)

string
1 2 nmod,2 0 root
1 2 amod,2 0 root
1 2 obl,2 3 acl,3 0 root
1 2 nmod,2 0 root,3 2 nmod
1 2 nmod,2 0 root,3 2 appos,4 3 flat
1 0 root,2 1 flat,3 1 flat


In [159]:
con = sqlite3.connect("tagged_noun_phrases.db")
cur = con.cursor()
cur.execute('pragma encoding=UTF8')
cur.execute("CREATE TABLE tagged_phrases(ID INTEGER PRIMARY KEY, extraction_pattern TEXT, ner_pattern TEXT, pattern_id INTEGER, phrase TEXT)")

<sqlite3.Cursor at 0x23a21c3dec0>

In [165]:
for el in tagged_phrases:
    for i in range(len(tagged_phrases[el])):
        phrase_json = text_to_json(tagged_phrases[el][i])
        cur.execute("""INSERT INTO tagged_phrases
                                (extraction_pattern, ner_pattern, pattern_id, phrase)
                                VALUES (?, ?, ?, ?);""", (el, tagged_phrases[el][i].phrase_patterns['ner_pattern'][0][0], tagged_phrases[el][i].phrase_patterns['pattern_id'][0][0], phrase_json))
    
        con.commit()

con.close()

In [13]:
df_10000 = pd.read_pickle('data_subset_10000.pkl')

In [14]:
df_10000

Unnamed: 0,phrase,phrase_length,document_creation_time,sentence_id,document_id,sentence_startend,subcorpus,phrase_type,phrase_start_end,has_ner_netity,...,pos_sequence,graph,graph_code,graph_code_pos,graph_code_pos_ner_timex,pos_sequence_verb_info,graph_verb_info,graph_code_verb_info,graph_code_pos_verb_info,graph_code_pos_ner_timex_verb_info
0,Text(text='külmakuudega'),1,2024-03-10T00:43,5858921,312319,"(0, 226)",aja_EPL,obl_phrase,"(6, 18)",0,...,S,"(1, 0)","((0, 1, root))","(S,(0, 1, root))","(0,S,(0, 1, root))",S,"(1, 0)","((0, 1, root))","(S,(0, 1, root))","(0,S,(0, 1, root))"
1,Text(text='neist teede äärtesse'),3,2024-03-10T00:43,5858921,312319,"(0, 226)",aja_EPL,obl_phrase,"(84, 104)",0,...,P-S-S,"(1, 2, 3, 0)","((0, 3, root),(2, 1, det),(3, 2, nmod))","(S-P-S,(0, 3, root),(2, 1, det),(3, 2, nmod))","(0-0-0,S-P-S,(0, 3, root),(2, 1, det),(3, 2, nmod))",P-S-S,"(1, 2, 3, 0)","((0, 3, root),(2, 1, det),(3, 2, nmod))","(S-P-S,(0, 3, root),(2, 1, det),(3, 2, nmod))","(0-0-0,S-P-S,(0, 3, root),(2, 1, det),(3, 2, nmod))"
2,Text(text='libedavastast graniidipuru'),2,2024-03-10T00:43,5858921,312319,"(0, 226)",aja_EPL,nsubj_phrase,"(22, 48)",0,...,A-S,"(1, 2, 0)","((0, 2, root),(2, 1, amod))","(S-A,(0, 2, root),(2, 1, amod))","(0-0,S-A,(0, 2, root),(2, 1, amod))",A-S,"(1, 2, 0)","((0, 2, root),(2, 1, amod))","(S-A,(0, 2, root),(2, 1, amod))","(0-0,S-A,(0, 2, root),(2, 1, amod))"
3,Text(text='juba vallid'),2,2024-03-10T00:43,5858921,312319,"(0, 226)",aja_EPL,nsubj_phrase,"(114, 125)",0,...,D-S,"(1, 2, 0)","((0, 2, root),(2, 1, advmod))","(S-D,(0, 2, root),(2, 1, advmod))","(0-0,S-D,(0, 2, root),(2, 1, advmod))",D-S,"(1, 2, 0)","((0, 2, root),(2, 1, advmod))","(S-D,(0, 2, root),(2, 1, advmod))","(0-0,S-D,(0, 2, root),(2, 1, advmod))"
4,Text(text='tänavapuhastajad'),1,2024-03-10T00:43,5858921,312319,"(0, 226)",aja_EPL,nsubj_phrase,"(139, 155)",0,...,S,"(1, 0)","((0, 1, root))","(S,(0, 1, root))","(0,S,(0, 1, root))",S,"(1, 0)","((0, 1, root))","(S,(0, 1, root))","(0,S,(0, 1, root))"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44990,Text(text='mis'),1,2024-03-10T01:35,4265502,224097,"(1430, 1538)",aja_EPL,nsubj_phrase,"(59, 62)",0,...,P,"(1, 0)","((0, 1, root))","(P,(0, 1, root))","(0,P,(0, 1, root))",P,"(1, 0)","((0, 1, root))","(P,(0, 1, root))","(0,P,(0, 1, root))"
44991,"Text(text='kategooria , mis määrab edaspidiseks ka koorijuhi palgamäära')",8,2024-03-10T01:35,4265502,224097,"(1430, 1538)",aja_EPL,obj_phrase,"(46, 106)",0,...,S-Z-P-V-A-D-S-S,"(1, 0, 2, 4, 3, 5, 6, 8, 7)","((0, 1, root),(1, 4, acl:relcl),(4, 2, punct),(4, 3, nsubj),(4, 5, xcomp),(4, 8, obj),(8, 6, advmod),(8, 7, nmod))","(S-V-Z-P-A-S-D-S,(0, 1, root),(1, 4, acl:relcl),(4, 2, punct),(4, 3, nsubj),(4, 5, xcomp),(4, 8, obj),(8, 6, advmod),(8, 7, nmod))","(0-0-0-0-0-0-0-0,S-V-Z-P-A-S-D-S,(0, 1, root),(1, 4, acl:relcl),(4, 2, punct),(4, 3, nsubj),(4, 5, xcomp),(4, 8, obj),(8, 6, advmod),(8, 7, nmod))",S-Z-P-V_fin-A-D-S-S,"(1, 0, 2, 4, 3, 5, 6, 8, 7)","((0, 1, root),(1, 4, acl:relcl),(4, 2, punct),(4, 3, nsubj),(4, 5, xcomp),(4, 8, obj),(8, 6, advmod),(8, 7, nmod))","(S-V_fin-Z-P-A-S-D-S,(0, 1, root),(1, 4, acl:relcl),(4, 2, punct),(4, 3, nsubj),(4, 5, xcomp),(4, 8, obj),(8, 6, advmod),(8, 7, nmod))","(0-0-0-0-0-0-0-0,S-V_fin-Z-P-A-S-D-S,(0, 1, root),(1, 4, acl:relcl),(4, 2, punct),(4, 3, nsubj),(4, 5, xcomp),(4, 8, obj),(8, 6, advmod),(8, 7, nmod))"
44992,Text(text='ka koorijuhi palgamäära'),3,2024-03-10T01:35,4265502,224097,"(1430, 1538)",aja_EPL,obj_phrase,"(83, 106)",0,...,D-S-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, advmod),(3, 2, nmod))","(S-D-S,(0, 3, root),(3, 1, advmod),(3, 2, nmod))","(0-0-0,S-D-S,(0, 3, root),(3, 1, advmod),(3, 2, nmod))",D-S-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, advmod),(3, 2, nmod))","(S-D-S,(0, 3, root),(3, 1, advmod),(3, 2, nmod))","(0-0-0,S-D-S,(0, 3, root),(3, 1, advmod),(3, 2, nmod))"
44993,Text(text='Laulupeo'),1,2024-03-10T01:35,4265502,224097,"(1430, 1538)",aja_EPL,nmod_phrase,"(0, 8)",0,...,H,"(1, 0)","((0, 1, root))","(H,(0, 1, root))","(0,H,(0, 1, root))",H,"(1, 0)","((0, 1, root))","(H,(0, 1, root))","(0,H,(0, 1, root))"


In [15]:
df_shuffled = sklearn.utils.shuffle(df_10000).reset_index(drop=True)

In [10]:
test_500 = []

for idx, row in df_shuffled.iterrows():
    if row['phrase_length'] > 1:
        test_500.append(row['phrase'])
    if len(test_500) == 500:
        break

In [11]:
start = time.time()

for text in test_500:
    pattern_tagger.tag(text)
    
print(f"500 fraasi märgendamiseks kulus PhrasePatternTaggeril {time.time()-start} sekundit")

500 fraasi märgendamiseks kulus PhrasePatternTaggeril 156.0999767780304 sekundit


In [16]:
test_600 = []

for idx, row in df_shuffled.iterrows():
    if row['phrase_length'] > 1:
        test_600.append(row['phrase'])
    if len(test_600) == 600:
        break

In [17]:
start = time.time()

for text in test_600:
    pattern_tagger.tag(text)
    
print(f"600 fraasi märgendamiseks kulus PhrasePatternTaggeril {time.time()-start} sekundit")

600 fraasi märgendamiseks kulus PhrasePatternTaggeril 2559.332098007202 sekundit
