In [1]:
import sys
sys.path.append("..\\methods")

In [9]:
import os

from estnltk import Text

from estnltk_neural.taggers import StanzaSyntaxTagger

from noun_phrase_extraction import create_df
from POS_sequence import *
from graph_methods import *

import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

import csv

In [3]:
FILE_NAME = '100000_random_sentences.csv'

In [4]:
def get_data(file_path):
    with open(file_path, encoding='UTF-8') as csv_file:
        rows = []
        reader = csv.DictReader(csv_file)
        for row in reader:
            rows.append(row)
            
    return rows
        

In [5]:
rows = get_data('100000_random_sentences.csv')

In [6]:
MAX_LEN = len(rows)

In [7]:
stanza_tagger = stanza_tagger = StanzaSyntaxTagger(input_type='morph_analysis', input_morph_layer='morph_analysis',
                                   add_parent_and_children=True)

In [8]:
STEP_SIZE = 5000
file_nr = 1
for i in range(0, len(rows), STEP_SIZE):
    print(f'file range {i}-{i+STEP_SIZE}; pickle file nr {file_nr}')
    if i + STEP_SIZE < MAX_LEN:
        rows = get_data(FILE_NAME)[i:i+STEP_SIZE]
    else:
        rows = get_data(FILE_NAME)[i:]
    df = pd.DataFrame()
    for row in rows:
        text = Text(row['text']).tag_layer('morph_analysis')
        text.meta['sentence_id'] = row['sentence_id']
        text.meta['document_id'] = row['document_id']
        text.meta['sentence_startend'] = tuple([row['sentence_start'], row['sentence_end']])
        text.meta['subcorpus'] = row['subcorpus']
        stanza_tagger.tag(text)
        temp = create_df(text)
        df = pd.concat([df, temp], ignore_index=True)
    df.to_pickle(f"./processed_data_pickle/{file_nr}_noun_phrase_data_100000.pkl")
    file_nr += 1

file range 0-5000; pickle file nr 1
file range 5000-10000; pickle file nr 2
file range 10000-15000; pickle file nr 3
file range 15000-20000; pickle file nr 4
file range 20000-25000; pickle file nr 5
file range 25000-30000; pickle file nr 6
file range 30000-35000; pickle file nr 7
file range 35000-40000; pickle file nr 8
file range 40000-45000; pickle file nr 9
file range 45000-50000; pickle file nr 10
file range 50000-55000; pickle file nr 11
file range 55000-60000; pickle file nr 12
file range 60000-65000; pickle file nr 13
file range 65000-70000; pickle file nr 14
file range 70000-75000; pickle file nr 15
file range 75000-80000; pickle file nr 16
file range 80000-85000; pickle file nr 17
file range 85000-90000; pickle file nr 18
file range 90000-95000; pickle file nr 19
file range 95000-100000; pickle file nr 20


In [None]:
temp_rows = get_data(FILE_NAME)[25000:25000+STEP_SIZE]
for row in temp_rows:
    text = Text(row['text']).tag_layer('morph_analysis')
    text.meta['sentence_id'] = row['sentence_id']
    text.meta['document_id'] = row['document_id']
    text.meta['sentence_startend'] = tuple([row['sentence_start'], row['sentence_end']])
    text.meta['subcorpus'] = row['subcorpus']
    stanza_tagger.tag(text)
    temp_df = create_df(text)
temp_df.to_pickle(f"./processed_data_pickle/temp_6_noun_phrase_data_100000.pkl")

In [16]:
dfs = pd.read_pickle('processed_data_pickle/3_noun_phrase_data_100000.pkl')

In [17]:
dfs

Unnamed: 0,phrase,document_creation_time,sentence_id,document_id,sentence_startend,subcorpus,phrase_type,phrase_start_end,has_ner_netity,has_timex_entity
0,Text(text='minult'),2024-03-11T01:14,3182547,166110,"(2475, 2631)",aja_EPL,obl_phrase,"(12, 18)",0,0
1,Text(text='kõigi aegade'),2024-03-11T01:14,3182547,166110,"(2475, 2631)",aja_EPL,obl_phrase,"(65, 77)",0,0
2,Text(text='tal'),2024-03-11T01:14,3182547,166110,"(2475, 2631)",aja_EPL,obl_phrase,"(103, 106)",0,0
3,Text(text='keegi'),2024-03-11T01:14,3182547,166110,"(2475, 2631)",aja_EPL,nsubj_phrase,"(6, 11)",0,0
4,Text(text='Sebastien'),2024-03-11T01:14,3182547,166110,"(2475, 2631)",aja_EPL,nsubj_phrase,"(86, 95)",0,0
5,Text(text='Fordi boss Malcolm Wilson'),2024-03-11T01:14,3182547,166110,"(2475, 2631)",aja_EPL,nsubj_phrase,"(129, 154)",0,0
6,Text(text='Walter Röhrl'),2024-03-11T01:14,3182547,166110,"(2475, 2631)",aja_EPL,nsubj_cop_phrase,"(48, 60)",0,0
7,Text(text='Fordi'),2024-03-11T01:14,3182547,166110,"(2475, 2631)",aja_EPL,nmod_phrase,"(129, 134)",0,0
8,Text(text='Malcolm Wilson'),2024-03-11T01:14,3182547,166110,"(2475, 2631)",aja_EPL,appos_phrase,"(140, 154)",0,0


In [3]:
path = 'processed_data_pickle/'

for f in os.listdir(path):
    df = pd.read_pickle(os.path.join(path, f))
    print(len(df))
    print()

25225

24750

23926

23218

21792

19992

18582

19223

19270

11722

25954

9505

26273

27914

27124

26393

26893

24339

23237

24151



In [4]:
path = 'processed_data_pickle/'

for f in os.listdir(path):
    df = pd.read_pickle(os.path.join(path, f))
    
    all_pos = []
    for phrase in df['phrase']:
        all_pos.append(get_POS_sequence(phrase))
    df.insert(len(df.columns), 'pos_sequence', all_pos)
    
    all_graphs = []
    for phrase in df['phrase']:
        graph = create_graph(phrase)
        all_graphs.append(graph)
    df.insert(len(df.columns), 'graph', all_graphs)
    
    df['graph_code'] = [get_graph_code(graph) for graph in df['graph']]
    df['graph_code_pos'] = [get_graph_code(graph, ['pos']) for graph in df['graph']]
    df['graph_code_pos_ner_timex'] = [get_graph_code(graph, ['pos', 'ner_timex']) for graph in df['graph']]
    
    phrase_lengths = [len(df['phrase'][i].words) for i in range(len(df))]
    df.insert(1, 'phrase_length', phrase_lengths) 
    
    all_pos = []
    for phrase in df['phrase']:
        all_pos.append(get_POS_sequence_with_verb_info(phrase))
    df.insert(len(df.columns), 'pos_sequence_verb_info', all_pos)
                
    all_graphs = []
    for phrase in df['phrase']:
        graph = create_graph_with_verb_info(phrase)
        all_graphs.append(graph)
    df.insert(len(df.columns), 'graph_verb_info', all_graphs)
            
    df['graph_code_verb_info'] = [get_graph_code(graph) for graph in df['graph_verb_info']]
    df['graph_code_pos_verb_info'] = [get_graph_code(graph, ['pos']) for graph in df['graph_verb_info']]
    df['graph_code_pos_ner_timex_verb_info'] = [get_graph_code(graph, ['pos', 'ner_timex']) for graph in df['graph_verb_info']]
    
    df.to_pickle(os.path.join(path, f))

In [5]:
df = pd.read_pickle('processed_data_pickle/1_noun_phrase_data_100000.pkl')

In [8]:
df

Unnamed: 0,phrase,phrase_length,document_creation_time,sentence_id,document_id,sentence_startend,subcorpus,phrase_type,phrase_start_end,has_ner_netity,...,pos_sequence,graph,graph_code,graph_code_pos,graph_code_pos_ner_timex,pos_sequence_verb_info,graph_verb_info,graph_code_verb_info,graph_code_pos_verb_info,graph_code_pos_ner_timex_verb_info
0,Text(text='Järgmise aasta jooksul'),3,2024-03-11T22:27,135,1,"(820, 905)",aja_kr,obl_phrase,"(0, 22)",0,...,A-S-K,"(1, 2, 0, 3)","((0, 2, root),(2, 1, amod),(2, 3, case))","(S-A-K,(0, 2, root),(2, 1, amod),(2, 3, case))","(0-0-0,S-A-K,(0, 2, root),(2, 1, amod),(2, 3, case))",A-S-K,"(1, 2, 0, 3)","((0, 2, root),(2, 1, amod),(2, 3, case))","(S-A-K,(0, 2, root),(2, 1, amod),(2, 3, case))","(0-0-0,S-A-K,(0, 2, root),(2, 1, amod),(2, 3, case))"
1,Text(text='ehk paar korda'),3,2024-03-11T22:27,135,1,"(820, 905)",aja_kr,obl_phrase,"(48, 62)",0,...,D-N-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, advmod),(3, 2, nummod))","(S-D-N,(0, 3, root),(3, 1, advmod),(3, 2, nummod))","(0-0-0,S-D-N,(0, 3, root),(3, 1, advmod),(3, 2, nummod))",D-N-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, advmod),(3, 2, nummod))","(S-D-N,(0, 3, root),(3, 1, advmod),(3, 2, nummod))","(0-0-0,S-D-N,(0, 3, root),(3, 1, advmod),(3, 2, nummod))"
2,Text(text='- mingitel üritustel'),3,2024-03-11T22:27,135,1,"(820, 905)",aja_kr,obl_phrase,"(63, 83)",0,...,Z-P-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, punct),(3, 2, det))","(S-Z-P,(0, 3, root),(3, 1, punct),(3, 2, det))","(0-0-0,S-Z-P,(0, 3, root),(3, 1, punct),(3, 2, det))",Z-P-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, punct),(3, 2, det))","(S-Z-P,(0, 3, root),(3, 1, punct),(3, 2, det))","(0-0-0,S-Z-P,(0, 3, root),(3, 1, punct),(3, 2, det))"
3,Text(text='teineteist'),1,2024-03-11T22:27,135,1,"(820, 905)",aja_kr,obj_phrase,"(30, 40)",0,...,P,"(1, 0)","((0, 1, root))","(P,(0, 1, root))","(0,P,(0, 1, root))",P,"(1, 0)","((0, 1, root))","(P,(0, 1, root))","(0,P,(0, 1, root))"
4,Text(text='Aivar Mäed'),2,2024-03-11T22:27,312,2,"(4807, 4858)",aja_kr,nsubj_phrase,"(0, 10)",0,...,H-S,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-S,(0, 1, root),(1, 2, flat))","(0-0,H-S,(0, 1, root),(1, 2, flat))",H-S,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-S,(0, 1, root),(1, 2, flat))","(0-0,H-S,(0, 1, root),(1, 2, flat))"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25949,Text(text='mida'),1,2024-03-11T22:55,1054213,36662,"(198, 417)",aja_EPL,obj_phrase,"(83, 87)",0,...,P,"(1, 0)","((0, 1, root))","(P,(0, 1, root))","(0,P,(0, 1, root))",P,"(1, 0)","((0, 1, root))","(P,(0, 1, root))","(0,P,(0, 1, root))"
25950,Text(text='tulemust'),1,2024-03-11T22:55,1054213,36662,"(198, 417)",aja_EPL,obj_phrase,"(159, 167)",0,...,S,"(1, 0)","((0, 1, root))","(S,(0, 1, root))","(0,S,(0, 1, root))",S,"(1, 0)","((0, 1, root))","(S,(0, 1, root))","(0,S,(0, 1, root))"
25951,Text(text='oma'),1,2024-03-11T22:55,1054213,36662,"(198, 417)",aja_EPL,nmod_phrase,"(92, 95)",0,...,P,"(1, 0)","((0, 1, root))","(P,(0, 1, root))","(0,P,(0, 1, root))",P,"(1, 0)","((0, 1, root))","(P,(0, 1, root))","(0,P,(0, 1, root))"
25952,Text(text='Andrew Ferguson'),2,2024-03-11T22:55,1054213,36662,"(198, 417)",aja_EPL,appos_phrase,"(187, 202)",0,...,H-H,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-H,(0, 1, root),(1, 2, flat))","(0-0,H-H,(0, 1, root),(1, 2, flat))",H-H,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-H,(0, 1, root),(1, 2, flat))","(0-0,H-H,(0, 1, root),(1, 2, flat))"


### NER phrases

Välja valitud sõnaliigijärjestused on H-H, H-H-H, H-H-S, H-S, H-A-S.

In [7]:
pos_sequences = ['H-H', 'H-H-H', 'H-H-S', 'H-S', 'H-A-S']

In [14]:
df_HH = pd.DataFrame()
df_HHH = pd.DataFrame()
df_HHS = pd.DataFrame()
df_HS = pd.DataFrame()
df_HAS = pd.DataFrame()

path = 'processed_data_pickle/'

for f in os.listdir(path):
    df = pd.read_pickle(os.path.join(path, f))
    temp_HH = pd.DataFrame()
    temp_HHH = pd.DataFrame()
    temp_HHS = pd.DataFrame()
    temp_HS = pd.DataFrame()
    temp_HAS = pd.DataFrame()
    for idx, row in df.iterrows():
        if row['pos_sequence_verb_info'] == 'H-H':
            temp_data = {}
            temp_data.update(row)
            temp = pd.DataFrame.from_records([temp_data])
            temp_HH = pd.concat([temp_HH, temp], ignore_index=True)
        elif row['pos_sequence_verb_info'] == 'H-H-H':
            temp_data = {}
            temp_data.update(row)
            temp = pd.DataFrame.from_records([temp_data])
            temp_HHH = pd.concat([temp_HHH, temp], ignore_index=True)
        elif row['pos_sequence_verb_info'] == 'H-H-S':
            temp_data = {}
            temp_data.update(row)
            temp = pd.DataFrame.from_records([temp_data])
            temp_HHS = pd.concat([temp_HHS, temp], ignore_index=True)
        elif row['pos_sequence_verb_info'] == 'H-S':
            temp_data = {}
            temp_data.update(row)
            temp = pd.DataFrame.from_records([temp_data])
            temp_HS = pd.concat([temp_HS, temp], ignore_index=True)
        elif row['pos_sequence_verb_info'] == 'H-A-S':
            temp_data = {}
            temp_data.update(row)
            temp = pd.DataFrame.from_records([temp_data])
            temp_HAS = pd.concat([temp_HAS, temp], ignore_index=True)
    
    df_HH = pd.concat([df_HH, temp_HH], ignore_index=True)
    df_HHH = pd.concat([df_HHH, temp_HHH], ignore_index=True)
    df_HHS = pd.concat([df_HHS, temp_HHS], ignore_index=True)
    df_HS = pd.concat([df_HS, temp_HS], ignore_index=True)
    df_HAS = pd.concat([df_HAS, temp_HAS], ignore_index=True)
    
# -- target directory path
new_directory = 'ner_phrases_100000'
if not os.path.isdir(new_directory):
    os.mkdir(new_directory)

# -- target directory path
new_directory_path = 'ner_phrases_100000/'

df_HH.to_pickle(f'{new_directory_path}/phrases_HH.pkl')
df_HHH.to_pickle(f'{new_directory_path}/phrases_HHH.pkl')
df_HHS.to_pickle(f'{new_directory_path}/phrases_HHS.pkl')
df_HS.to_pickle(f'{new_directory_path}/phrases_HS.pkl')
df_HAS.to_pickle(f'{new_directory_path}/phrases_HAS.pkl')

In [15]:
df_HH

Unnamed: 0,phrase,phrase_length,document_creation_time,sentence_id,document_id,sentence_startend,subcorpus,phrase_type,phrase_start_end,has_ner_netity,...,pos_sequence,graph,graph_code,graph_code_pos,graph_code_pos_ner_timex,pos_sequence_verb_info,graph_verb_info,graph_code_verb_info,graph_code_pos_verb_info,graph_code_pos_ner_timex_verb_info
0,Text(text='Dream Team'),2,2024-03-12T02:33,9555576,433339,"(2142, 2281)",aja_pm,appos_phrase,"(80, 90)",0,...,H-H,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-H,(0, 1, root),(1, 2, flat))","(0-0,H-H,(0, 1, root),(1, 2, flat))",H-H,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-H,(0, 1, root),(1, 2, flat))","(0-0,H-H,(0, 1, root),(1, 2, flat))"
1,Text(text='Arvo Heinmaa'),2,2024-03-12T02:33,9555614,433340,"(1473, 1611)",aja_pm,appos_phrase,"(66, 78)",0,...,H-H,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-H,(0, 1, root),(1, 2, flat))","(0-0,H-H,(0, 1, root),(1, 2, flat))",H-H,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-H,(0, 1, root),(1, 2, flat))","(0-0,H-H,(0, 1, root),(1, 2, flat))"
2,Text(text='Tatjana Fomina'),2,2024-03-12T02:33,9558744,433454,"(561, 631)",aja_pm,nsubj_phrase,"(0, 14)",0,...,H-H,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-H,(0, 1, root),(1, 2, flat))","(0-0,H-H,(0, 1, root),(1, 2, flat))",H-H,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-H,(0, 1, root),(1, 2, flat))","(0-0,H-H,(0, 1, root),(1, 2, flat))"
3,Text(text='Hortus Musicus'),2,2024-03-12T02:34,9560688,433502,"(1420, 1492)",aja_pm,nsubj_phrase,"(0, 14)",0,...,H-H,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-H,(0, 1, root),(1, 2, flat))","(0-0,H-H,(0, 1, root),(1, 2, flat))",H-H,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-H,(0, 1, root),(1, 2, flat))","(0-0,H-H,(0, 1, root),(1, 2, flat))"
4,Text(text='Herbert Grabbi'),2,2024-03-12T02:34,9560955,433510,"(0, 59)",aja_pm,appos_phrase,"(24, 38)",0,...,H-H,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-H,(0, 1, root),(1, 2, flat))","(0-0,H-H,(0, 1, root),(1, 2, flat))",H-H,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-H,(0, 1, root),(1, 2, flat))","(0-0,H-H,(0, 1, root),(1, 2, flat))"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10341,Text(text='Riigikogu Isamaaliidu'),2,2024-03-12T02:33,9545961,433043,"(9462, 9502)",aja_pm,nmod_phrase,"(0, 21)",0,...,H-H,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(H-H,(0, 2, root),(2, 1, nmod))","(0-0,H-H,(0, 2, root),(2, 1, nmod))",H-H,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(H-H,(0, 2, root),(2, 1, nmod))","(0-0,H-H,(0, 2, root),(2, 1, nmod))"
10342,Text(text='Puerto Rico'),2,2024-03-12T02:33,9546471,433058,"(1242, 1340)",aja_pm,nmod_phrase,"(73, 84)",0,...,H-H,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-H,(0, 1, root),(1, 2, flat))","(0-0,H-H,(0, 1, root),(1, 2, flat))",H-H,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-H,(0, 1, root),(1, 2, flat))","(0-0,H-H,(0, 1, root),(1, 2, flat))"
10343,Text(text='Sven Kivisildniku'),2,2024-03-12T02:33,9548689,433138,"(13, 259)",aja_pm,nmod_phrase,"(0, 17)",0,...,H-H,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-H,(0, 1, root),(1, 2, flat))","(0-0,H-H,(0, 1, root),(1, 2, flat))",H-H,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-H,(0, 1, root),(1, 2, flat))","(0-0,H-H,(0, 1, root),(1, 2, flat))"
10344,Text(text='Mihkel Veske'),2,2024-03-12T02:33,9550194,433190,"(6003, 6020)",aja_pm,nmod_phrase,"(0, 12)",0,...,H-H,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-H,(0, 1, root),(1, 2, flat))","(0-0,H-H,(0, 1, root),(1, 2, flat))",H-H,"(1, 0, 2)","((0, 1, root),(1, 2, flat))","(H-H,(0, 1, root),(1, 2, flat))","(0-0,H-H,(0, 1, root),(1, 2, flat))"


In [16]:
df_HHH

Unnamed: 0,phrase,phrase_length,document_creation_time,sentence_id,document_id,sentence_startend,subcorpus,phrase_type,phrase_start_end,has_ner_netity,...,pos_sequence,graph,graph_code,graph_code_pos,graph_code_pos_ner_timex,pos_sequence_verb_info,graph_verb_info,graph_code_verb_info,graph_code_pos_verb_info,graph_code_pos_ner_timex_verb_info
0,Text(text='Rhône-Poulenc Rorer Inc'),3,2024-03-12T02:34,9560613,433500,"(373, 426)",aja_pm,obj_phrase,"(29, 52)",0,...,H-H-H,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(1, 3, flat))","(H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))","(0-0-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))",H-H-H,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(1, 3, flat))","(H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))","(0-0-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))"
1,Text(text='Coastal Baltica Holdingu'),3,2024-03-12T02:34,9572161,433833,"(236, 397)",aja_pm,nmod_phrase,"(49, 73)",0,...,H-H-H,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(1, 3, flat))","(H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))","(0-0-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))",H-H-H,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(1, 3, flat))","(H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))","(0-0-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))"
2,Text(text='Turkish Daily News'),3,2024-03-12T02:34,9580322,434145,"(16, 182)",aja_pm,appos_phrase,"(88, 106)",0,...,H-H-H,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(2, 3, flat))","(H-H-H,(0, 1, root),(1, 2, flat),(2, 3, flat))","(0-0-0,H-H-H,(0, 1, root),(1, 2, flat),(2, 3, flat))",H-H-H,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(2, 3, flat))","(H-H-H,(0, 1, root),(1, 2, flat),(2, 3, flat))","(0-0-0,H-H-H,(0, 1, root),(1, 2, flat),(2, 3, flat))"
3,Text(text='Toomas Hendrik Ilves'),3,2024-03-12T02:34,9584317,434299,"(0, 206)",aja_pm,appos_phrase,"(93, 113)",0,...,H-H-H,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(1, 3, flat))","(H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))","(0-0-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))",H-H-H,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(1, 3, flat))","(H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))","(0-0-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))"
4,Text(text='Kohtunik Juri Sakkarti'),3,2024-03-12T02:36,9672033,437152,"(196, 309)",aja_pm,nmod_phrase,"(0, 22)",0,...,H-H-H,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(1, 3, flat))","(H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))","(0-0-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))",H-H-H,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(1, 3, flat))","(H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))","(0-0-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520,Text(text='Tuva Aldan Madõri'),3,2024-03-12T02:26,9234827,422611,"(379, 563)",aja_pm,nmod_phrase,"(59, 76)",0,...,H-H-H,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(1, 3, flat))","(H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))","(0-0-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))",H-H-H,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(1, 3, flat))","(H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))","(0-0-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))"
521,Text(text='New York Timesi'),3,2024-03-12T02:26,9237001,422685,"(4124, 4260)",aja_ee,nmod_phrase,"(92, 107)",0,...,H-H-H,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(1, 3, flat))","(H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))","(0-0-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))",H-H-H,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(1, 3, flat))","(H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))","(0-0-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))"
522,Text(text='Salt Lake City'),3,2024-03-12T02:28,9333202,425918,"(2766, 2895)",aja_pm,appos_phrase,"(104, 118)",0,...,H-H-H,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(1, 3, flat))","(H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))","(0-0-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))",H-H-H,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(1, 3, flat))","(H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))","(0-0-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))"
523,Text(text='Pet Shop Boysi'),3,2024-03-12T02:29,9365961,427069,"(1198, 1247)",aja_pm,nmod_phrase,"(22, 36)",0,...,H-H-H,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(1, 3, flat))","(H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))","(0-0-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))",H-H-H,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(1, 3, flat))","(H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))","(0-0-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))"


In [17]:
df_HHS

Unnamed: 0,phrase,phrase_length,document_creation_time,sentence_id,document_id,sentence_startend,subcorpus,phrase_type,phrase_start_end,has_ner_netity,...,pos_sequence,graph,graph_code,graph_code_pos,graph_code_pos_ner_timex,pos_sequence_verb_info,graph_verb_info,graph_code_verb_info,graph_code_pos_verb_info,graph_code_pos_ner_timex_verb_info
0,Text(text='John Lennonile kuuli'),3,2024-03-12T02:34,9562230,433536,"(1958, 2249)",aja_pm,obj_phrase,"(230, 260)",1,...,H-H-S,"(1, 3, 2, 0)","((0, 3, root),(1, 2, flat),(3, 1, nmod))","(S-H-H,(0, 3, root),(1, 2, flat),(3, 1, nmod))","(0-ner_PER-ner_PER,S-H-H,(0, 3, root),(1, 2, flat),(3, 1, nmod))",H-H-S,"(1, 3, 2, 0)","((0, 3, root),(1, 2, flat),(3, 1, nmod))","(S-H-H,(0, 3, root),(1, 2, flat),(3, 1, nmod))","(0-ner_PER-ner_PER,S-H-H,(0, 3, root),(1, 2, flat),(3, 1, nmod))"
1,Text(text='New Yorgis biitlite'),3,2024-03-12T02:34,9562230,433536,"(1958, 2249)",aja_pm,nmod_phrase,"(108, 127)",0,...,H-H-S,"(1, 3, 2, 0)","((0, 3, root),(1, 2, flat),(3, 1, nmod))","(S-H-H,(0, 3, root),(1, 2, flat),(3, 1, nmod))","(0-0-0,S-H-H,(0, 3, root),(1, 2, flat),(3, 1, nmod))",H-H-S,"(1, 3, 2, 0)","((0, 3, root),(1, 2, flat),(3, 1, nmod))","(S-H-H,(0, 3, root),(1, 2, flat),(3, 1, nmod))","(0-0-0,S-H-H,(0, 3, root),(1, 2, flat),(3, 1, nmod))"
2,Text(text='Tartu Ülikooli kunstimuuseum'),3,2024-03-12T02:34,9582906,434239,"(5, 133)",aja_pm,nsubj_phrase,"(16, 44)",0,...,H-H-S,"(1, 2, 3, 0)","((0, 3, root),(2, 1, nmod),(3, 2, nmod))","(S-H-H,(0, 3, root),(2, 1, nmod),(3, 2, nmod))","(0-0-0,S-H-H,(0, 3, root),(2, 1, nmod),(3, 2, nmod))",H-H-S,"(1, 2, 3, 0)","((0, 3, root),(2, 1, nmod),(3, 2, nmod))","(S-H-H,(0, 3, root),(2, 1, nmod),(3, 2, nmod))","(0-0-0,S-H-H,(0, 3, root),(2, 1, nmod),(3, 2, nmod))"
3,Text(text='Euroopa Liidu eesistujaks'),3,2024-03-12T02:34,9584317,434299,"(0, 206)",aja_pm,xcomp_phrase,"(131, 156)",1,...,H-H-S,"(1, 2, 3, 0)","((0, 3, root),(2, 1, nmod),(3, 2, nmod))","(S-H-H,(0, 3, root),(2, 1, nmod),(3, 2, nmod))","(0-ner_LOC-ner_LOC,S-H-H,(0, 3, root),(2, 1, nmod),(3, 2, nmod))",H-H-S,"(1, 2, 3, 0)","((0, 3, root),(2, 1, nmod),(3, 2, nmod))","(S-H-H,(0, 3, root),(2, 1, nmod),(3, 2, nmod))","(0-ner_LOC-ner_LOC,S-H-H,(0, 3, root),(2, 1, nmod),(3, 2, nmod))"
4,Text(text='Tartu Ülikooli raamatukogu'),3,2024-03-12T02:34,9585715,434351,"(197, 346)",aja_pm,nmod_phrase,"(102, 128)",0,...,H-H-S,"(1, 2, 3, 0)","((0, 3, root),(2, 1, nmod),(3, 2, nmod))","(S-H-H,(0, 3, root),(2, 1, nmod),(3, 2, nmod))","(0-0-0,S-H-H,(0, 3, root),(2, 1, nmod),(3, 2, nmod))",H-H-S,"(1, 2, 3, 0)","((0, 3, root),(2, 1, nmod),(3, 2, nmod))","(S-H-H,(0, 3, root),(2, 1, nmod),(3, 2, nmod))","(0-0-0,S-H-H,(0, 3, root),(2, 1, nmod),(3, 2, nmod))"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,Text(text='Bill Clintoni aktsiad'),3,2024-03-12T02:33,9534862,432727,"(2968, 3043)",aja_pm,nsubj_phrase,"(36, 57)",1,...,H-H-S,"(1, 3, 2, 0)","((0, 3, root),(1, 2, flat),(3, 1, nmod))","(S-H-H,(0, 3, root),(1, 2, flat),(3, 1, nmod))","(0-ner_PER-ner_PER,S-H-H,(0, 3, root),(1, 2, flat),(3, 1, nmod))",H-H-S,"(1, 3, 2, 0)","((0, 3, root),(1, 2, flat),(3, 1, nmod))","(S-H-H,(0, 3, root),(1, 2, flat),(3, 1, nmod))","(0-ner_PER-ner_PER,S-H-H,(0, 3, root),(1, 2, flat),(3, 1, nmod))"
1196,Text(text='Riigikogu Isamaaliidu fraktsiooni'),3,2024-03-12T02:33,9545961,433043,"(9462, 9502)",aja_pm,nmod_phrase,"(0, 33)",0,...,H-H-S,"(1, 2, 3, 0)","((0, 3, root),(2, 1, nmod),(3, 2, nmod))","(S-H-H,(0, 3, root),(2, 1, nmod),(3, 2, nmod))","(0-0-0,S-H-H,(0, 3, root),(2, 1, nmod),(3, 2, nmod))",H-H-S,"(1, 2, 3, 0)","((0, 3, root),(2, 1, nmod),(3, 2, nmod))","(S-H-H,(0, 3, root),(2, 1, nmod),(3, 2, nmod))","(0-0-0,S-H-H,(0, 3, root),(2, 1, nmod),(3, 2, nmod))"
1197,Text(text='Puerto Rico parlamendis'),3,2024-03-12T02:33,9546471,433058,"(1242, 1340)",aja_pm,obl_phrase,"(73, 96)",0,...,H-H-S,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(1, 3, nmod))","(H-H-S,(0, 1, root),(1, 2, flat),(1, 3, nmod))","(0-0-0,H-H-S,(0, 1, root),(1, 2, flat),(1, 3, nmod))",H-H-S,"(1, 0, 2, 3)","((0, 1, root),(1, 2, flat),(1, 3, nmod))","(H-H-S,(0, 1, root),(1, 2, flat),(1, 3, nmod))","(0-0-0,H-H-S,(0, 1, root),(1, 2, flat),(1, 3, nmod))"
1198,Text(text='Sven Kivisildniku kodulehekülje'),3,2024-03-12T02:33,9548689,433138,"(13, 259)",aja_pm,nmod_phrase,"(0, 31)",1,...,H-H-S,"(1, 3, 2, 0)","((0, 3, root),(1, 2, flat),(3, 1, nmod))","(S-H-H,(0, 3, root),(1, 2, flat),(3, 1, nmod))","(0-ner_PER-ner_PER,S-H-H,(0, 3, root),(1, 2, flat),(3, 1, nmod))",H-H-S,"(1, 3, 2, 0)","((0, 3, root),(1, 2, flat),(3, 1, nmod))","(S-H-H,(0, 3, root),(1, 2, flat),(3, 1, nmod))","(0-ner_PER-ner_PER,S-H-H,(0, 3, root),(1, 2, flat),(3, 1, nmod))"


In [18]:
df_HS

Unnamed: 0,phrase,phrase_length,document_creation_time,sentence_id,document_id,sentence_startend,subcorpus,phrase_type,phrase_start_end,has_ner_netity,...,pos_sequence,graph,graph_code,graph_code_pos,graph_code_pos_ner_timex,pos_sequence_verb_info,graph_verb_info,graph_code_verb_info,graph_code_pos_verb_info,graph_code_pos_ner_timex_verb_info
0,Text(text='Tartu sildadest'),2,2024-03-12T02:33,9553255,433270,"(356, 439)",aja_pm,obl_phrase,"(44, 59)",1,...,H-S,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(S-H,(0, 2, root),(2, 1, nmod))","(0-ner_LOC,S-H,(0, 2, root),(2, 1, nmod))",H-S,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(S-H,(0, 2, root),(2, 1, nmod))","(0-ner_LOC,S-H,(0, 2, root),(2, 1, nmod))"
1,Text(text='Nürnbergi tribunal'),2,2024-03-12T02:33,9553952,433288,"(2825, 2996)",aja_pm,nsubj_phrase,"(101, 119)",0,...,H-S,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(S-H,(0, 2, root),(2, 1, nmod))","(0-0,S-H,(0, 2, root),(2, 1, nmod))",H-S,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(S-H,(0, 2, root),(2, 1, nmod))","(0-0,S-H,(0, 2, root),(2, 1, nmod))"
2,Text(text='Estiko otsusest'),2,2024-03-12T02:33,9555614,433340,"(1473, 1611)",aja_pm,obl_phrase,"(99, 114)",0,...,H-S,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(S-H,(0, 2, root),(2, 1, nmod))","(0-0,S-H,(0, 2, root),(2, 1, nmod))",H-S,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(S-H,(0, 2, root),(2, 1, nmod))","(0-0,S-H,(0, 2, root),(2, 1, nmod))"
3,Text(text='Miti sõnul'),2,2024-03-12T02:33,9556672,433378,"(820, 947)",aja_pm,obl_phrase,"(0, 10)",1,...,H-S,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(S-H,(0, 2, root),(2, 1, nmod))","(0-ner_PER,S-H,(0, 2, root),(2, 1, nmod))",H-S,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(S-H,(0, 2, root),(2, 1, nmod))","(0-ner_PER,S-H,(0, 2, root),(2, 1, nmod))"
4,Text(text='Eesti Üliõpilaste'),2,2024-03-12T02:33,9556791,433384,"(381, 454)",aja_pm,nmod_phrase,"(40, 57)",0,...,H-S,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(S-H,(0, 2, root),(2, 1, nmod))","(0-0,S-H,(0, 2, root),(2, 1, nmod))",H-S,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(S-H,(0, 2, root),(2, 1, nmod))","(0-0,S-H,(0, 2, root),(2, 1, nmod))"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10559,Text(text='Jänese kõrtsist'),2,2024-03-12T02:33,9537970,432819,"(1171, 1269)",aja_pm,obl_phrase,"(0, 15)",0,...,H-S,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(S-H,(0, 2, root),(2, 1, nmod))","(0-0,S-H,(0, 2, root),(2, 1, nmod))",H-S,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(S-H,(0, 2, root),(2, 1, nmod))","(0-0,S-H,(0, 2, root),(2, 1, nmod))"
10560,Text(text='Granada klubi'),2,2024-03-12T02:33,9540750,432912,"(3568, 3627)",aja_pm,obl_phrase,"(16, 29)",1,...,H-S,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(S-H,(0, 2, root),(2, 1, nmod))","(0-ner_LOC,S-H,(0, 2, root),(2, 1, nmod))",H-S,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(S-H,(0, 2, root),(2, 1, nmod))","(0-ner_LOC,S-H,(0, 2, root),(2, 1, nmod))"
10561,Text(text='Daiwa laenu'),2,2024-03-12T02:33,9547525,433099,"(4795, 4915)",aja_pm,nmod_phrase,"(78, 89)",0,...,H-S,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(S-H,(0, 2, root),(2, 1, nmod))","(0-0,S-H,(0, 2, root),(2, 1, nmod))",H-S,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(S-H,(0, 2, root),(2, 1, nmod))","(0-0,S-H,(0, 2, root),(2, 1, nmod))"
10562,"Text(text=""Apple'i aktsiate"")",2,2024-03-12T02:33,9549035,433148,"(3578, 3649)",aja_pm,nmod_phrase,"(0, 16)",0,...,H-S,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(S-H,(0, 2, root),(2, 1, nmod))","(0-0,S-H,(0, 2, root),(2, 1, nmod))",H-S,"(1, 2, 0)","((0, 2, root),(2, 1, nmod))","(S-H,(0, 2, root),(2, 1, nmod))","(0-0,S-H,(0, 2, root),(2, 1, nmod))"


In [19]:
df_HAS

Unnamed: 0,phrase,phrase_length,document_creation_time,sentence_id,document_id,sentence_startend,subcorpus,phrase_type,phrase_start_end,has_ner_netity,...,pos_sequence,graph,graph_code,graph_code_pos,graph_code_pos_ner_timex,pos_sequence_verb_info,graph_verb_info,graph_code_verb_info,graph_code_pos_verb_info,graph_code_pos_ner_timex_verb_info
0,Text(text='Eesti keskmise suurusega'),3,2024-03-12T02:34,9573263,433881,"(357, 487)",aja_pm,nmod_phrase,"(82, 112)",1,...,H-A-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, nmod),(3, 2, amod))","(S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))","(0-ner_LOC-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))",H-A-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, nmod),(3, 2, amod))","(S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))","(0-ner_LOC-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))"
1,Text(text='Moskva pankadevahelisel turul'),3,2024-03-12T02:34,9583505,434263,"(1766, 1900)",aja_pm,obl_phrase,"(88, 117)",1,...,H-A-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, nmod),(3, 2, amod))","(S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))","(0-ner_LOC-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))",H-A-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, nmod),(3, 2, amod))","(S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))","(0-ner_LOC-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))"
2,Text(text='Hollandi sihvakad atleedid'),3,2024-03-12T02:34,9584761,434314,"(2035, 2122)",aja_pm,nsubj_phrase,"(0, 26)",1,...,H-A-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, nmod),(3, 2, amod))","(S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))","(0-ner_LOC-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))",H-A-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, nmod),(3, 2, amod))","(S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))","(0-ner_LOC-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))"
3,Text(text='Eestis elavate inimeste'),3,2024-03-12T02:36,9638387,436043,"(4173, 4301)",aja_pm,nmod_phrase,"(31, 54)",1,...,H-A-S,"(1, 2, 3, 0)","((0, 3, root),(2, 1, obl),(3, 2, acl))","(S-H-A,(0, 3, root),(2, 1, obl),(3, 2, acl))","(0-ner_LOC-0,S-H-A,(0, 3, root),(2, 1, obl),(3, 2, acl))",H-A-S,"(1, 2, 3, 0)","((0, 3, root),(2, 1, obl),(3, 2, acl))","(S-H-A,(0, 3, root),(2, 1, obl),(3, 2, acl))","(0-ner_LOC-0,S-H-A,(0, 3, root),(2, 1, obl),(3, 2, acl))"
4,Text(text='Rodmani sensatsiooniline avaldus'),3,2024-03-12T02:36,9641285,436127,"(3237, 3344)",aja_pm,nsubj_phrase,"(0, 32)",0,...,H-A-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, nmod),(3, 2, amod))","(S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))","(0-0-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))",H-A-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, nmod),(3, 2, amod))","(S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))","(0-0-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
577,"Text(text=""Verne'i ulmelised seiklusjutud"")",3,2024-03-12T02:28,9352050,426569,"(290, 508)",aja_pm,nsubj_phrase,"(146, 176)",1,...,H-A-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, nmod),(3, 2, amod))","(S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))","(0-ner_PER-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))",H-A-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, nmod),(3, 2, amod))","(S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))","(0-ner_PER-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))"
578,Text(text='Keskerakonna paljulubava nime'),3,2024-03-12T02:30,9422645,428945,"(4976, 5213)",aja_pm,obl_phrase,"(0, 29)",1,...,H-A-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, nmod),(3, 2, acl))","(S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, acl))","(0-ner_ORG-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, acl))",H-A-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, nmod),(3, 2, acl))","(S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, acl))","(0-ner_ORG-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, acl))"
579,Text(text='Vähi eelmise valitsuse'),3,2024-03-12T02:30,9431491,429255,"(3813, 3950)",aja_pm,nmod_phrase,"(65, 87)",0,...,H-A-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, nmod),(3, 2, amod))","(S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))","(0-0-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))",H-A-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, nmod),(3, 2, amod))","(S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))","(0-0-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))"
580,Text(text='Tartu uue muusika'),3,2024-03-12T02:31,9439075,429510,"(1888, 1992)",aja_pm,nmod_phrase,"(14, 31)",1,...,H-A-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, nmod),(3, 2, amod))","(S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))","(0-ner_LOC-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))",H-A-S,"(1, 3, 2, 0)","((0, 3, root),(3, 1, nmod),(3, 2, amod))","(S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))","(0-ner_LOC-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))"


In [20]:
grouped_HH = df_HH.groupby('graph_code_pos_ner_timex_verb_info')
grouped_HHH = df_HHH.groupby('graph_code_pos_ner_timex_verb_info')
grouped_HHS = df_HHS.groupby('graph_code_pos_ner_timex_verb_info')
grouped_HS = df_HS.groupby('graph_code_pos_ner_timex_verb_info')
grouped_HAS = df_HAS.groupby('graph_code_pos_ner_timex_verb_info')

In [21]:
grouped_HH.ngroups

14

In [22]:
grouped_HHH.ngroups

34

In [23]:
grouped_HHS.ngroups

63

In [24]:
grouped_HS.ngroups

27

In [25]:
grouped_HAS.ngroups

29

In [35]:
ner_phrase = None
for name, group in grouped_HHH:
    #display(group)
    for phrase in group['phrase']:
        if len(phrase.ner)>0:
            ner_phrase = phrase
            break
ner_phrase

text
Jason Tunks Kanadast

0,1
document_creation_time,2024-03-12T01:00
document_id,307465
phrase_start_end,"(156, 176)"
phrase_type,appos_phrase
sentence_id,5765777
sentence_startend,"(808, 1043)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3


In [49]:
wrds = []
for word in ner_phrase.words:
    print(word.partofspeech[0])
    if len(ner_phrase.ner)>0:
        for n in ner_phrase.ner:
            for i in range(len(n)):
                if n[i]==word:
                    if i==0:
                        wrds.append('<'+word.text)
                    elif i == len(n)-1:
                        wrds.append(word.text+'>')
                    else:
                        wrds.append(word.text)
    else:
        wrds.append(word.text)

H
H
H


In [50]:
ner_phrase

text
Jason Tunks Kanadast

0,1
document_creation_time,2024-03-12T01:00
document_id,307465
phrase_start_end,"(156, 176)"
phrase_type,appos_phrase
sentence_id,5765777
sentence_startend,"(808, 1043)"
subcorpus,aja_EPL

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,3
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
timexes,"tid, type, value, temporal_function, anchor_time_id, mod, quant, freq, begin_point, end_point, part_of_interval",,,False,0
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_analysis,,False,3


In [82]:
dict_HHH = {'phrase': [], 'tree': []}
for name, group in grouped_HHH:
    for phrase in group['phrase']:
        wrds = []
        for word in phrase.words:
            is_first_ner = False
            is_last_ner = False
            if len(phrase.ner) > 0:
                for n in phrase.ner:
                    for i in range(len(n)):
                        if n[i]==word:
                            if len(n) == 1:
                                is_first_ner = True
                                is_last_ner = True
                            else:
                                if i==0:
                                    is_first_ner = True
                                elif i == len(n)-1:
                                    is_last_ner = True
                                    
            if is_first_ner and is_last_ner:
                wrds.append("<"+word.text+">")
            if is_first_ner:
                wrds.append("<"+word.text)
            elif is_last_ner:
                wrds.append(word.text+">")
            else:
                wrds.append(word.text)
                
        dict_HHH['phrase'].append(' '.join(wrds))
        dict_HHH['tree'].append(name)
        
data_HHH = pd.DataFrame(dict_HHH)

In [83]:
data_HHH

Unnamed: 0,phrase,tree
0,Time Warner Inc,"(0-0-0,H-H-H,(0, 1, root),(1, 2, appos),(2, 3, flat))"
1,Sopoti Prokom Trefl,"(0-0-0,H-H-H,(0, 1, root),(1, 2, appos),(2, 3, flat))"
2,Sotsiaalminister Eiki Nestori,"(0-0-0,H-H-H,(0, 1, root),(1, 2, appos),(2, 3, flat))"
3,Õhtulehest Priit Hõbemägi,"(0-0-0,H-H-H,(0, 1, root),(1, 2, appos),(2, 3, flat))"
4,Khalid Khan Achakzai,"(0-0-0,H-H-H,(0, 1, root),(1, 2, appos),(2, 3, flat))"
...,...,...
520,<Ander Paeorg> Estonial,"(ner_PER-ner_PER-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))"
521,<Jonathan Peled> Jeruusalemmas,"(ner_PER-ner_PER-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))"
522,<Jim Yonts> Reutersile,"(ner_PER-ner_PER-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, flat))"
523,<Jason Tunks> Kanadast,"(ner_PER-ner_PER-0,H-H-H,(0, 1, root),(1, 2, flat),(1, 3, nmod))"


In [76]:
dict_HH = {'phrase': [], 'tree': []}
for name, group in grouped_HH:
    for phrase in group['phrase']:
        wrds = []
        for word in phrase.words:
            is_first_ner = False
            is_last_ner = False
            if len(phrase.ner) > 0:
                for n in phrase.ner:
                    for i in range(len(n)):
                        if n[i]==word:
                            if len(n) == 1:
                                is_first_ner = True
                                is_last_ner = True
                            else:
                                if i==0:
                                    is_first_ner = True
                                elif i == len(n)-1:
                                    is_last_ner = True
                                    
            if is_first_ner and is_last_ner:
                wrds.append("<"+word.text+">")
            elif is_first_ner:
                wrds.append("<"+word.text)
            elif is_last_ner:
                wrds.append(word.text+">")
            else:
                wrds.append(word.text)
                
        dict_HH['phrase'].append(' '.join(wrds))
        dict_HH['tree'].append(name)
        
data_HH = pd.DataFrame(dict_HH)

In [77]:
data_HH

Unnamed: 0,phrase,tree
0,Ford Sierra,"(0-0,H-H,(0, 1, root),(1, 2, appos))"
1,Vallavanem Kastepõld,"(0-0,H-H,(0, 1, root),(1, 2, appos))"
2,Eestimaa Rahvaliit,"(0-0,H-H,(0, 1, root),(1, 2, appos))"
3,Eestimaa Rahvaliit,"(0-0,H-H,(0, 1, root),(1, 2, appos))"
4,Ford Sierra,"(0-0,H-H,(0, 1, root),(1, 2, appos))"
...,...,...
10341,<KEEGI> RAKVEREST,"(ner_PER-0,H-H,(0, 1, root),(1, 2, nmod))"
10342,<Lõunanaaber> Eestiga,"(ner_PER-0,H-H,(0, 1, root),(1, 2, nmod))"
10343,Seitsmeaastane Sander,"(timex-0,H-H,(0, 1, root),(1, 2, flat))"
10344,Jaanuar Sillamäe,"(timex-0,H-H,(0, 1, root),(1, 2, flat))"


In [88]:
dict_HHS = {'phrase': [], 'tree': []}
for name, group in grouped_HHS:
    for phrase in group['phrase']:
        wrds = []
        for word in phrase.words:
            is_first_ner = False
            is_last_ner = False
            if len(phrase.ner) > 0:
                for n in phrase.ner:
                    for i in range(len(n)):
                        if n[i]==word:
                            if len(n) == 1:
                                is_first_ner = True
                                is_last_ner = True
                            else:
                                if i==0:
                                    is_first_ner = True
                                elif i == len(n)-1:
                                    is_last_ner = True
                                    
            if is_first_ner and is_last_ner:
                wrds.append("<"+word.text+">")
            elif is_first_ner:
                wrds.append("<"+word.text)
            elif is_last_ner:
                wrds.append(word.text+">")
            else:
                wrds.append(word.text)
                
        dict_HHS['phrase'].append(' '.join(wrds))
        dict_HHS['tree'].append(name)
        
data_HHS = pd.DataFrame(dict_HHS)

In [89]:
data_HHS

Unnamed: 0,phrase,tree
0,Peanäitejuht Raivo Trassi,"(0-0-0,H-H-S,(0, 1, root),(1, 2, appos),(2, 3, flat))"
1,Fakto Mustamäe Auto,"(0-0-0,H-H-S,(0, 1, root),(1, 2, flat),(1, 3, appos))"
2,Suure-Jaani Puit allkirjaõigus,"(0-0-0,H-H-S,(0, 1, root),(1, 2, flat),(1, 3, appos))"
3,Tarvet Vaigu narkoäri,"(0-0-0,H-H-S,(0, 1, root),(1, 2, flat),(1, 3, appos))"
4,Terminal Server põhiste,"(0-0-0,H-H-S,(0, 1, root),(1, 2, flat),(1, 3, appos))"
...,...,...
1195,<Tiiu Seppmeister> neiuna,"(ner_PER-ner_PER-0,H-H-S,(0, 1, root),(1, 2, flat),(1, 3, nmod))"
1196,<Sirje Pihti> silmi,"(ner_PER-ner_PER-0,H-H-S,(0, 1, root),(1, 2, flat),(1, 3, obj))"
1197,Kevadisel Steinhude regatil,"(timex-0-0,H-S-H,(0, 1, root),(1, 3, nmod),(3, 2, nmod))"
1198,Older Grupi mais,"(timex-0-0,S-H-H,(0, 3, root),(1, 2, flat),(3, 1, nmod))"


In [90]:
dict_HS = {'phrase': [], 'tree': []}
for name, group in grouped_HS:
    for phrase in group['phrase']:
        wrds = []
        for word in phrase.words:
            is_first_ner = False
            is_last_ner = False
            if len(phrase.ner) > 0:
                for n in phrase.ner:
                    for i in range(len(n)):
                        if n[i]==word:
                            if len(n) == 1:
                                is_first_ner = True
                                is_last_ner = True
                            else:
                                if i==0:
                                    is_first_ner = True
                                elif i == len(n)-1:
                                    is_last_ner = True
                                    
            if is_first_ner and is_last_ner:
                wrds.append("<"+word.text+">")
            elif is_first_ner:
                wrds.append("<"+word.text)
            elif is_last_ner:
                wrds.append(word.text+">")
            else:
                wrds.append(word.text)
                
        dict_HS['phrase'].append(' '.join(wrds))
        dict_HS['tree'].append(name)
        
data_HS = pd.DataFrame(dict_HS)

In [91]:
data_HS

Unnamed: 0,phrase,tree
0,Rahu ajal,"(0-0,H-S,(0, 1, root),(1, 2, case))"
1,Thatcheri ajal,"(0-0,H-S,(0, 1, root),(1, 2, case))"
2,Julia käest,"(0-0,H-S,(0, 1, root),(1, 2, case))"
3,Ribbecki näol,"(0-0,H-S,(0, 1, root),(1, 2, case))"
4,Tamsalu-Vajangu kanti,"(0-0,H-S,(0, 1, root),(1, 2, case))"
...,...,...
10559,<Eesti> laupäeval,"(timex-ner_LOC,S-H,(0, 2, root),(2, 1, nmod))"
10560,<Kiviõlis> ööl,"(timex-ner_LOC,S-H,(0, 2, root),(2, 1, nmod))"
10561,<Eesti> märtsis,"(timex-ner_LOC,S-H,(0, 2, root),(2, 1, nmod))"
10562,<Eesti> lähiajaloost,"(timex-ner_LOC,S-H,(0, 2, root),(2, 1, nmod))"


In [92]:
dict_HAS = {'phrase': [], 'tree': []}
for name, group in grouped_HAS:
    for phrase in group['phrase']:
        wrds = []
        for word in phrase.words:
            is_first_ner = False
            is_last_ner = False
            if len(phrase.ner) > 0:
                for n in phrase.ner:
                    for i in range(len(n)):
                        if n[i]==word:
                            if len(n) == 1:
                                is_first_ner = True
                                is_last_ner = True
                            else:
                                if i==0:
                                    is_first_ner = True
                                elif i == len(n)-1:
                                    is_last_ner = True
                                    
            if is_first_ner and is_last_ner:
                wrds.append("<"+word.text+">")
            elif is_first_ner:
                wrds.append("<"+word.text)
            elif is_last_ner:
                wrds.append(word.text+">")
            else:
                wrds.append(word.text)
                
        dict_HAS['phrase'].append(' '.join(wrds))
        dict_HAS['tree'].append(name)
        
data_HAS = pd.DataFrame(dict_HAS)

In [93]:
data_HAS

Unnamed: 0,phrase,tree
0,Impreza vastavale numbrile,"(0-0-0,H-S-A,(0, 1, root),(1, 3, nmod),(3, 2, acl))"
1,Liviko eelmise aasta,"(0-0-0,H-S-A,(0, 1, root),(1, 3, nmod),(3, 2, amod))"
2,Putini edaspidisele majanduspoliitikale,"(0-0-0,H-S-A,(0, 1, root),(1, 3, nmod),(3, 2, amod))"
3,Aljandi eelmiste hooaegade,"(0-0-0,H-S-A,(0, 1, root),(1, 3, nmod),(3, 2, amod))"
4,Nõud pastöriseerimata piimaga,"(0-0-0,H-S-A,(0, 1, root),(1, 3, nmod),(3, 2, amod))"
...,...,...
577,Üleeilse ülevenemaalise protestimiitingute,"(0-timex-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))"
578,<Kesk-Ameerikas> möllava sajandi,"(timex-ner_LOC-0,S-H-A,(0, 3, root),(2, 1, obl),(3, 2, acl))"
579,<Eestimaa> puhkev suvi,"(timex-ner_LOC-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, acl))"
580,<Venemaa> karmi talve,"(timex-ner_LOC-0,S-H-A,(0, 3, root),(3, 1, nmod),(3, 2, amod))"


Tulemused salvestatakse Excelisse:

In [96]:
data_HHH.to_excel('ner_phrases_100000/data_HHH.xlsx')
data_HH.to_excel('ner_phrases_100000/data_HH.xlsx')
data_HHS.to_excel('ner_phrases_100000/data_HHS.xlsx')
data_HS.to_excel('ner_phrases_100000/data_HS.xlsx')
data_HAS.to_excel('ner_phrases_100000/data_HAS.xlsx')