In [1]:
import csv
import pandas as pd

In [2]:
from estnltk import Layer
from estnltk.taggers import Tagger
from collections import defaultdict
import copy

In [3]:
def get_ner(ner_layer, word_layer, span):
        nertag = None
        if len(ner_layer) > 0:
            word = word_layer.get(span)
            for n in ner_layer:
                for part in n:
                    if part==word:
                        nertag=n.nertag
        if nertag:
            return nertag
        return 'OTHER'
    
def get_POS(word_layer, span):
    infinite_verb_forms = ['da', 'des', 'ma', 'maks', 'mas', 'mast', 'mata', 'nud', 'tav', 'tud', 'v']
    # if POS is ambiguous, only unique tags are kept, e.g. ['V', 'A', 'A'] -> ['V', 'A']
    pos_list = []
    word = word_layer.get(span)
    for i in range(len(word.morph_analysis['partofspeech'])):
        if word.morph_analysis['partofspeech'][i] == 'V':
            if word.morph_analysis['form'][i] in infinite_verb_forms:
                pos_list.append('V_inf')
            elif word.form[i] == 'neg':
                pos_list.append('V_neg')
            else:
                pos_list.append('V_fin')
        else:
            pos_list.append(word.morph_analysis['partofspeech'][i])
    
    if len(pos_list) > 1:
        char_unique = [char for indx, char in enumerate(pos_list) if char not in pos_list[:indx]]
        if len(char_unique) < 2:
            return char_unique[0]
        return '|'.join(char_unique)
    return pos_list[0]

In [40]:
class PhrasePatternTagger(Tagger):
    """Tags phrases that match given syntax and part-of-speech pattern rules, and their corresponding patterns.""" 
    
    conf_param = ['rules_file', 'ruleset_map']
    
    def __init__(self, rules_file: str,
                       output_layer='phrase_patterns',
                       morph_analysis_layer='morph_analysis',
                       words_layer='words',
                       syntax_layer='stanza_syntax',
                       ner_layer='ner'):
        
        self.input_layers = [morph_analysis_layer, words_layer, syntax_layer, ner_layer]
        self.output_layer = output_layer
        self.output_attributes = ['extraction_pattern', 'ner_pattern']
        self.rules_file = rules_file

        ruleset_map = defaultdict(list)
        
        with open(rules_file, encoding='UTF-8') as csv_file:
            reader = csv.DictReader(csv_file)
            for row in reader:
                ruleset_map[row['tree']].append(row['POS_pattern'])
                
        self.ruleset_map = ruleset_map

    def _make_layer_template(self):
        layer = Layer(name=self.output_layer,
                      text_object=None,
                      attributes=self.output_attributes,
                      enveloping=self.input_layers[1],
                      ambiguous=True)
        return layer
        
    def _make_layer(self, text, layers, status):
        layer = self._make_layer_template()
        layer.text_object = text
        
        for i in range(len(layers[self.input_layers[2]])): # Iterate over 'stanza_syntax' layer
            pattern_spans = []
            tree = []
            ids = []
            pos = []
            ner = []

            pattern_spans.append(layers[self.input_layers[2]][i])
            ids.append([layers[self.input_layers[2]][i]['id'], layers[self.input_layers[2]][i]['head']])
                
            for j in range(i + 1, len(layers[self.input_layers[2]])):
                for k in range(len(pattern_spans)):
                    if layers[self.input_layers[2]][j] in pattern_spans[k]['children'] or pattern_spans[k] in layers[self.input_layers[2]][j]['children'] or layers[self.input_layers[2]][j]['parent_span'] != None and layers[self.input_layers[2]][j]['parent_span'] == pattern_spans[k]['parent_span']:
                        pattern_spans.append(layers[self.input_layers[2]][j])
                        ids.append([layers[self.input_layers[2]][j]['id'], layers[self.input_layers[2]][j]['head']])
        
                ids_for_pattern = copy.deepcopy(ids)
                for k in range(len(ids_for_pattern)):
                    temp = ids_for_pattern[k][0]
                    ids_for_pattern[k][0] = k+1
                    for l in range(len(ids)):
                        if ids[l][1] == temp:
                            ids_for_pattern[l][1] = ids_for_pattern[k][0]
            
                word_ids = [word_id[0] for word_id in ids_for_pattern]
                for k in range(len(ids_for_pattern)):
                    if ids_for_pattern[k][0] == ids_for_pattern[k][1]:
                        ids_for_pattern[k][1] = 0
                    elif ids_for_pattern[k][1] not in word_ids:
                        ids_for_pattern[k][1] = 0
            
                for k in range(len(pattern_spans)):
                    deprel = pattern_spans[k].deprel
                    if ids_for_pattern[k][1] == 0 and deprel != 'root':
                        deprel = 'root'
                    tree.append([str(ids_for_pattern[k][0]), str(ids_for_pattern[k][1]), deprel])
                    # POS-tag is taken from morph_analysis layer
                    pos.append(get_POS(layers[self.input_layers[1]], pattern_spans[k]))
                    # nertag is taken from ner layer
                    ner.append(get_ner(layers[self.input_layers[-1]], layers[self.input_layers[1]], pattern_spans[k]))                     
                    
                pattern = [" ".join(word_info) for word_info in tree]
                # check if tree pattern exists in ruleset map
                if ",".join(pattern) in self.ruleset_map.keys():
                    pos_pattern = "-".join(pos)
                    # check if POS-sequence exists in ruleset map with given tree pattern
                    for p in self.ruleset_map[",".join(pattern)]:
                        if p == pos_pattern:
                            # at the moment, ner pattern is not specified in ruleset map, so it will not be checked
                            ner_pattern = "-".join(ner)
                            # add annotation
                            layer.add_annotation([span.base_span for span in pattern_spans], 
                                                 extraction_pattern=",".join([",".join(pattern), pos_pattern]), 
                                                 ner_pattern=ner_pattern)                  

                if j == len(layers[self.input_layers[2]]) - 1:
                    pattern_spans = []
                    tree = []
                    ids = []
                    pos = []
                    ner = []
                    break             
                
        return layer
    


In [41]:
pattern_tagger = PhrasePatternTagger(rules_file='indicator_patterns_ner_tree_pos_updated.csv')
pattern_tagger

name,output layer,output attributes,input layers
PhrasePatternTagger,phrase_patterns,"('extraction_pattern', 'ner_pattern')","('morph_analysis', 'words', 'stanza_syntax', 'ner')"

0,1
rules_file,indicator_patterns_ner_tree_pos_updated.csv
ruleset_map,"defaultdict(<class 'list'>, {'string': ['string'], '1 2 nmod,2 0 root': ['S-S', ..., type: <class 'collections.defaultdict'>, length: 7"


In [42]:
test_df = pd.read_pickle('phrase_examples_filtered_sub10000/atomic_phrases.pkl')

In [43]:
test_df

Unnamed: 0,phrase,phrase_length,document_creation_time,sentence_id,document_id,sentence_startend,subcorpus,phrase_type,phrase_start_end,has_ner_netity,...,pos_sequence,graph,graph_code,graph_code_pos,graph_code_pos_ner_timex,pos_sequence_verb_info,graph_verb_info,graph_code_verb_info,graph_code_pos_verb_info,graph_code_pos_ner_timex_verb_info
0,Text(text='külmakuudega'),1,2024-03-10T00:43,5858921,312319,"(0, 226)",aja_EPL,obl_phrase,"(6, 18)",0,...,S,"(1, 0)","((0, 1, root))","(S,(0, 1, root))","(0,S,(0, 1, root))",S,"(1, 0)","((0, 1, root))","(S,(0, 1, root))","(0,S,(0, 1, root))"
1,Text(text='libedavastast graniidipuru'),2,2024-03-10T00:43,5858921,312319,"(0, 226)",aja_EPL,nsubj_phrase,"(22, 48)",0,...,A-S,"(1, 2, 0)","((0, 2, root),(2, 1, amod))","(S-A,(0, 2, root),(2, 1, amod))","(0-0,S-A,(0, 2, root),(2, 1, amod))",A-S,"(1, 2, 0)","((0, 2, root),(2, 1, amod))","(S-A,(0, 2, root),(2, 1, amod))","(0-0,S-A,(0, 2, root),(2, 1, amod))"
2,Text(text='juba vallid'),2,2024-03-10T00:43,5858921,312319,"(0, 226)",aja_EPL,nsubj_phrase,"(114, 125)",0,...,D-S,"(1, 2, 0)","((0, 2, root),(2, 1, advmod))","(S-D,(0, 2, root),(2, 1, advmod))","(0-0,S-D,(0, 2, root),(2, 1, advmod))",D-S,"(1, 2, 0)","((0, 2, root),(2, 1, advmod))","(S-D,(0, 2, root),(2, 1, advmod))","(0-0,S-D,(0, 2, root),(2, 1, advmod))"
3,Text(text='tänavapuhastajad'),1,2024-03-10T00:43,5858921,312319,"(0, 226)",aja_EPL,nsubj_phrase,"(139, 155)",0,...,S,"(1, 0)","((0, 1, root))","(S,(0, 1, root))","(0,S,(0, 1, root))",S,"(1, 0)","((0, 1, root))","(S,(0, 1, root))","(0,S,(0, 1, root))"
4,Text(text='külma'),1,2024-03-10T00:43,5858921,312319,"(0, 226)",aja_EPL,nsubj_phrase,"(211, 216)",0,...,A,"(1, 0)","((0, 1, root))","(A,(0, 1, root))","(0,A,(0, 1, root))",A,"(1, 0)","((0, 1, root))","(A,(0, 1, root))","(0,A,(0, 1, root))"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36060,Text(text='kooridele'),1,2024-03-10T01:35,4265502,224097,"(1430, 1538)",aja_EPL,obl_phrase,"(36, 45)",0,...,S,"(1, 0)","((0, 1, root))","(S,(0, 1, root))","(0,S,(0, 1, root))",S,"(1, 0)","((0, 1, root))","(S,(0, 1, root))","(0,S,(0, 1, root))"
36061,Text(text='mis'),1,2024-03-10T01:35,4265502,224097,"(1430, 1538)",aja_EPL,nsubj_phrase,"(59, 62)",0,...,P,"(1, 0)","((0, 1, root))","(P,(0, 1, root))","(0,P,(0, 1, root))",P,"(1, 0)","((0, 1, root))","(P,(0, 1, root))","(0,P,(0, 1, root))"
36062,Text(text='ka koorijuhi palgamäära'),3,2024-03-10T01:35,4265502,224097,"(1430, 1538)",aja_EPL,obj_phrase,"(83, 106)",0,...,D-S-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, advmod),(3, 2, nmod))","(S-D-S,(0, 3, root),(3, 1, advmod),(3, 2, nmod))","(0-0-0,S-D-S,(0, 3, root),(3, 1, advmod),(3, 2, nmod))",D-S-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, advmod),(3, 2, nmod))","(S-D-S,(0, 3, root),(3, 1, advmod),(3, 2, nmod))","(0-0-0,S-D-S,(0, 3, root),(3, 1, advmod),(3, 2, nmod))"
36063,Text(text='Laulupeo'),1,2024-03-10T01:35,4265502,224097,"(1430, 1538)",aja_EPL,nmod_phrase,"(0, 8)",0,...,H,"(1, 0)","((0, 1, root))","(H,(0, 1, root))","(0,H,(0, 1, root))",H,"(1, 0)","((0, 1, root))","(H,(0, 1, root))","(0,H,(0, 1, root))"


In [44]:
test_500 = []

for idx, row in test_df.iterrows():
    if row['phrase_length'] > 1:
        test_500.append(row['phrase'])
    if len(test_500) == 500:
        break

In [45]:
for phrase in test_500:
    pattern_tagger.tag(phrase)

In [46]:
n_found = 0
for phrase in test_500:
    try:
        if len(phrase.phrase_patterns) > 0 and len(phrase.ner) > 0:
            display(phrase)
            display(phrase.phrase_patterns)
            n_found+=1
    except:
        break
print(n_found)

text
ETA andmetel

0,1
document_creation_time,2024-03-10T00:43
document_id,86904
phrase_start_end,"(0, 12)"
phrase_type,obl_phrase
sentence_id,2039231
sentence_startend,"(3175, 3301)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['ETA', 'andmetel']","1 2 nmod,2 0 root,Y-S",ORG-OTHER


text
Riia kesklinnas asuva kohviku kõrval

0,1
document_creation_time,2024-03-10T00:43
document_id,585094
phrase_start_end,"(0, 36)"
phrase_type,obl_phrase
sentence_id,13238380
sentence_startend,"(7, 77)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,5
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,5
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,5
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,5
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['Riia', 'kesklinnas']","1 2 nmod,2 0 root,H-S",LOC-OTHER


text
Eesti sõjaväes

0,1
document_creation_time,2024-03-10T00:43
document_id,559734
phrase_start_end,"(30, 44)"
phrase_type,obl_phrase
sentence_id,12713570
sentence_startend,"(3856, 3934)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['Eesti', 'sõjaväes']","1 2 nmod,2 0 root,H-S",LOC-OTHER


text
Eesti infoserverite

0,1
document_creation_time,2024-03-10T00:43
document_id,3723
phrase_start_end,"(29, 48)"
phrase_type,nmod_phrase
sentence_id,302968
sentence_startend,"(3677, 3742)"
subcorpus,tea

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['Eesti', 'infoserverite']","1 2 nmod,2 0 root,H-S",LOC-OTHER


text
Krimmi esimeses eestlaste

0,1
document_creation_time,2024-03-10T00:43
document_id,525323
phrase_start_end,"(0, 25)"
phrase_type,obl_phrase
sentence_id,11951022
sentence_startend,"(0, 180)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['esimeses', 'eestlaste']","1 2 amod,2 0 root,O-S",OTHER-OTHER


text
Eesti metsadest

0,1
document_creation_time,2024-03-10T00:44
document_id,381317
phrase_start_end,"(0, 15)"
phrase_type,obl_phrase
sentence_id,7658264
sentence_startend,"(6480, 6513)"
subcorpus,aja_ee

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['Eesti', 'metsadest']","1 2 nmod,2 0 root,H-S",LOC-OTHER


text
Viru tänavale ehitatava büroohoone puhul

0,1
document_creation_time,2024-03-10T00:44
document_id,431141
phrase_start_end,"(21, 61)"
phrase_type,obl_phrase
sentence_id,9487957
sentence_startend,"(2598, 2762)"
subcorpus,aja_pm

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,5
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,5
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,5
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,5
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['Viru', 'tänavale']","1 2 nmod,2 0 root,H-S",LOC-OTHER


text
Tallinna abilinnapeade kohale

0,1
document_creation_time,2024-03-10T00:44
document_id,50055
phrase_start_end,"(39, 68)"
phrase_type,nmod_phrase
sentence_id,1301691
sentence_startend,"(12, 170)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['Tallinna', 'abilinnapeade']","1 2 nmod,2 0 root,H-S",LOC-OTHER


text
Vene välisminister Igor Ivanov

0,1
document_creation_time,2024-03-10T00:44
document_id,343855
phrase_start_end,"(12, 42)"
phrase_type,nsubj_phrase
sentence_id,6449477
sentence_startend,"(234, 346)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,4
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,4
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,4
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,4
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['Vene', 'välisminister']","1 2 nmod,2 0 root,H-S",LOC-OTHER


text
Gruusia valimissohi

0,1
document_creation_time,2024-03-10T00:44
document_id,343855
phrase_start_end,"(65, 84)"
phrase_type,nmod_phrase
sentence_id,6449477
sentence_startend,"(234, 346)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['Gruusia', 'valimissohi']","1 2 nmod,2 0 root,G-S",LOC-OTHER


text
Norra kuningliku perekonna

0,1
document_creation_time,2024-03-10T00:44
document_id,514574
phrase_start_end,"(0, 26)"
phrase_type,nmod_phrase
sentence_id,11729628
sentence_startend,"(190, 293)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['kuningliku', 'perekonna']","1 2 amod,2 0 root,A-S",OTHER-OTHER


text
Eesti meistrivõistluste

0,1
document_creation_time,2024-03-10T00:44
document_id,600800
phrase_start_end,"(0, 23)"
phrase_type,nmod_phrase
sentence_id,13580753
sentence_startend,"(1544, 1715)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['Eesti', 'meistrivõistluste']","1 2 nmod,2 0 root,H-S",LOC-OTHER


text
valitsuse otsust paigutada muuseumlaev Suur Tõll Tallinna reisisadama Admiraliteedibasseini

0,1
document_creation_time,2024-03-10T00:44
document_id,556469
phrase_start_end,"(24, 115)"
phrase_type,obj_phrase
sentence_id,12643943
sentence_startend,"(179, 296)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,9
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,9
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,9
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,2
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,9
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,4


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,4

text,extraction_pattern,ner_pattern
"['valitsuse', 'otsust']","1 2 nmod,2 0 root,S-S",OTHER-OTHER
"['Suur', 'Tõll']","1 2 nmod,2 0 root,H-H",PER-PER
"['Tallinna', 'reisisadama']","1 2 nmod,2 0 root,H-S",LOC-OTHER
"['reisisadama', 'Admiraliteedibasseini']","1 2 nmod,2 0 root,S-S",OTHER-OTHER


text
Vene õigeusu

0,1
document_creation_time,2024-03-10T00:44
document_id,44022
phrase_start_end,"(27, 39)"
phrase_type,nmod_phrase
sentence_id,1188139
sentence_startend,"(2593, 2766)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['Vene', 'õigeusu']","1 2 nmod,2 0 root,H-S",LOC-OTHER


text
Vene õigeusu

0,1
document_creation_time,2024-03-10T00:44
document_id,44022
phrase_start_end,"(74, 86)"
phrase_type,nmod_phrase
sentence_id,1188139
sentence_startend,"(2593, 2766)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['Vene', 'õigeusu']","1 2 nmod,2 0 root,H-S",LOC-OTHER


text
sadade Eesti elanike

0,1
document_creation_time,2024-03-10T00:44
document_id,96905
phrase_start_end,"(49, 69)"
phrase_type,nmod_phrase
sentence_id,2241879
sentence_startend,"(876, 1078)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['Eesti', 'elanike']","1 2 nmod,2 0 root,H-S",LOC-OTHER


text
Maailma Malefondi nimel

0,1
document_creation_time,2024-03-10T00:44
document_id,439118
phrase_start_end,"(148, 171)"
phrase_type,nmod_phrase
sentence_id,9732461
sentence_startend,"(4099, 4368)"
subcorpus,aja_pm

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['Maailma', 'Malefondi']","1 2 nmod,2 0 root,S-S",ORG-ORG


text
NASA teadlased

0,1
document_creation_time,2024-03-10T00:44
document_id,573206
phrase_start_end,"(0, 14)"
phrase_type,nsubj_phrase
sentence_id,12987931
sentence_startend,"(634, 747)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['NASA', 'teadlased']","1 2 nmod,2 0 root,Y-S",ORG-OTHER


text
Eesti Energias keskkonnaosakonna inspektorina töötav Lehtla

0,1
document_creation_time,2024-03-10T00:44
document_id,322939
phrase_start_end,"(61, 120)"
phrase_type,nsubj_phrase
sentence_id,6048777
sentence_startend,"(992, 1161)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,6
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,6
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,6
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,6
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,2


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,2

text,extraction_pattern,ner_pattern
"['Eesti', 'Energias']","1 2 nmod,2 0 root,H-S",ORG-ORG
"['keskkonnaosakonna', 'inspektorina']","1 2 nmod,2 0 root,S-S",OTHER-OTHER


text
Sarapi sõnul

0,1
document_creation_time,2024-03-10T00:44
document_id,4040
phrase_start_end,"(21, 33)"
phrase_type,obl_phrase
sentence_id,348780
sentence_startend,"(3645, 3826)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['Sarapi', 'sõnul']","1 2 nmod,2 0 root,H-S",PER-OTHER


text
Saksamaa tulevane liidukantsler Angela Merkel

0,1
document_creation_time,2024-03-10T00:44
document_id,494320
phrase_start_end,"(0, 45)"
phrase_type,nsubj_phrase
sentence_id,11258567
sentence_startend,"(0, 132)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,5
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,5
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,5
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,5
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['tulevane', 'liidukantsler']","1 2 amod,2 0 root,A-S",OTHER-OTHER


text
Eesti kapitalil

0,1
document_creation_time,2024-03-10T00:44
document_id,319650
phrase_start_end,"(115, 130)"
phrase_type,obl_phrase
sentence_id,5990094
sentence_startend,"(514, 646)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['Eesti', 'kapitalil']","1 2 nmod,2 0 root,H-S",LOC-OTHER


text
"seegi , et Kaitseliidu Fond likvideeritakse"

0,1
document_creation_time,2024-03-10T00:44
document_id,553342
phrase_start_end,"(19, 62)"
phrase_type,nsubj_phrase
sentence_id,12575574
sentence_startend,"(2283, 2347)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,6
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,6
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,6
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,6
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['Kaitseliidu', 'Fond']","1 2 nmod,2 0 root,S-S",ORG-ORG


text
nn Kesk-Euroopa

0,1
document_creation_time,2024-03-10T00:44
document_id,331001
phrase_start_end,"(25, 40)"
phrase_type,nsubj_phrase
sentence_id,6201479
sentence_startend,"(1609, 1745)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,4
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,4
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,4
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,4
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['nn', 'Kesk']","1 2 nmod,2 0 root,Y-H",OTHER-LOC


text
Kaitsepolitsei süüdistuse järgi

0,1
document_creation_time,2024-03-10T00:44
document_id,649894
phrase_start_end,"(0, 31)"
phrase_type,obl_phrase
sentence_id,15270447
sentence_startend,"(517, 698)"
subcorpus,aja_sloleht

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['Kaitsepolitsei', 'süüdistuse']","1 2 nmod,2 0 root,H-S",ORG-OTHER


text
Eesti meistriks

0,1
document_creation_time,2024-03-10T00:44
document_id,213061
phrase_start_end,"(30, 45)"
phrase_type,xcomp_phrase
sentence_id,4096934
sentence_startend,"(258, 335)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,2
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,2
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,2
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,2
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1


layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern",,words,True,1

text,extraction_pattern,ner_pattern
"['Eesti', 'meistriks']","1 2 nmod,2 0 root,H-S",LOC-OTHER


26
