# Extract random examples

## Make base table for all examples

1. Fetch the collocations table into a DataFrame
2. Group by verb, verb compound and case; concatenate the row ids of each group into one comma-separated string
3. Iterate over the grouped table, shuffle the ids and keep the ids of up to 10 examples with different lemmas

In [3]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import sqlite3
import random
import json
from datetime import datetime
date_time = datetime.now().strftime("%Y%m%d-%H%M%S")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Alternative database / collection pair (the "test_5000" variant), currently disabled:
#db_file_name = 'v20_koondkorpus_sentences_test_5000_sg_thread_verb_compound_obl_collocations_20230609-112139.db'
#collection_name = 'koondkorpus_sentences_test_5000_sg_thread'

# db_file_name: SQLite file opened with sqlite3.connect() in the next cell.
# collection_name: passed to DbReader when the sentences are fetched.
db_file_name = 'v20_koondkorpus_sentences_verb_compound_obl_collocations_20230609-115059.db'
collection_name = 'koondkorpus_sentences'

In [5]:
# Open the SQLite database. sqlite3.Row makes fetched rows addressable by
# column name, which the per-example lookup further down relies on.
connection = sqlite3.connect(db_file_name)
connection.row_factory = sqlite3.Row 
cursor = connection.cursor()

In [6]:
%%time 
# Query the database straight into a pandas DataFrame and display the result.
# Each result row is one (verb, verb_compound, obl_case) group: the ids of the
# grouped rows are concatenated into `row_ids` (comma separated) and their
# `count` values are summed into `total`.
grouped_collocations_sql = """
SELECT col.verb, col.verb_compound, col.obl_case, group_concat( col.id ) as row_ids, sum(`count`) as total
FROM verb_compound_obl AS col
GROUP BY col.verb, col.verb_compound, col.obl_case
ORDER BY verb, verb_compound, obl_case"""

df = pd.read_sql_query(grouped_collocations_sql, connection)

display(df.shape, df.head(3))


(150414, 5)

Unnamed: 0,verb,verb_compound,obl_case,row_ids,total
0,01algama,,<käändumatu>,3308457,1
1,01algama,,gen,3308456,1
2,01surema,,<käändumatu>,3677158,1


CPU times: user 9.04 s, sys: 7.16 s, total: 16.2 s
Wall time: 28.6 s


In [7]:
# Write the grouped table to a CSV prefixed with this run's timestamp
# (no index column); the next section reads it back from disk.
df.to_csv(f'{date_time}_base_for_example_queries.csv', index=None)


## Get example row_ids for at most 10 different lemmas

In [8]:
# Reload the grouped table from the CSV written above and peek at the first row.
# NOTE(review): date_time is set once per kernel session, so this only finds
# the file written earlier in the same session.
df2 = pd.read_csv(f'{date_time}_base_for_example_queries.csv')
df2.head(1)

Unnamed: 0,verb,verb_compound,obl_case,row_ids,total
0,01algama,,<käändumatu>,3308457,1


In [11]:
%%time 
# TODO! this should actually take up to 10
#
# Expand every group row of df2 into at most 10 rows, one per randomly chosen
# id from its comma separated `row_ids`; the chosen id is stored in the new
# `random_example` column and all other columns are copied unchanged.
# NOTE(review): `random` is not seeded, so the selection differs between runs.
dict_with_examples = []  # despite the name: a list of row dicts
#df['random_example'] = ''
for group_row in df2.to_dict('records'):
    # str(): read_csv may hand back a lone id as a number instead of text
    example_ids = str(group_row['row_ids']).split(',')
    random.shuffle(example_ids)
    for example_id in example_ids[:10]:
        dict_with_examples.append({**group_row, 'random_example': example_id})

df_examples = pd.DataFrame(dict_with_examples)
display(df_examples.head(3))
display(df_examples.shape)
df_examples.to_csv(f'{date_time}_base_for_example_queries2.csv', index=None)

Unnamed: 0,verb,verb_compound,obl_case,row_ids,total,random_example
0,01algama,,<käändumatu>,3308457,1,3308457
1,01algama,,gen,3308456,1,3308456
2,01surema,,<käändumatu>,3677158,1,3677158


(548607, 6)

CPU times: user 14.9 s, sys: 522 ms, total: 15.5 s
Wall time: 16.1 s


## Get random sentence ids of examples (runs ca 7 min for 550 000 rows)

In [12]:
# Reload the expanded table (one row per selected `random_example` id) from
# the CSV written in the previous section and show its first two rows.
df_sentences = pd.read_csv(f'{date_time}_base_for_example_queries2.csv')
display(df_sentences.head(2))

Unnamed: 0,verb,verb_compound,obl_case,row_ids,total,random_example
0,01algama,,<käändumatu>,3308457,1,3308457
1,01algama,,gen,3308456,1,3308456


In [13]:
%%time

# One query is made for each row of df_sentences (loaded from
# <date_time>_base_for_example_queries2.csv).

# Pick one random row of verb_compound_obl_examples for a given row_id.
# The row_id is bound through a `?` placeholder, so the SQL text is identical
# for every call instead of being re-formatted per row.
query_one_random = """SELECT verb_compound_obl_examples.*, random() as order_by
FROM verb_compound_obl_examples
WHERE row_id = ?
ORDER BY order_by
LIMIT 1
"""

keys = ('sentence_id', 'root_id', 'verb_id', 'compound_ids', 'obl_ids')

# Collect the fetched values per column and assign each column once at the
# end; this avoids one scalar .loc write per row and key.
fetched = {k: [] for k in keys}
for example_row_id in df_sentences['random_example']:
    example_row_id = int(example_row_id)  # bind a plain Python int
    cursor.execute(query_one_random, (example_row_id,))
    res = cursor.fetchone()
    if res is None:
        # Fail with the offending id rather than "'NoneType' is not subscriptable".
        raise LookupError(
            f'no row in verb_compound_obl_examples for row_id {example_row_id}')
    for k in keys:
        fetched[k].append(res[k])

for k in keys:
    df_sentences[k] = fetched[k]

df_sentences.to_csv(f'{date_time}_base_for_example_sentence_ids.csv', index=None)

CPU times: user 4min 40s, sys: 29.1 s, total: 5min 9s
Wall time: 6min 33s


## Fetch sentence data from postgres db and make df with examples

In [14]:
# Load the table with the selected sentence ids. The two id-list columns are
# forced to text so a cell holding a single id is not parsed as a number.
df_final = pd.read_csv(
    f'{date_time}_base_for_example_sentence_ids.csv',
    dtype={'compound_ids': str, 'obl_ids': str},
)
# fillna returns a new frame, so the result has to be assigned to take effect.
df_final = df_final.fillna('')

# Unique sentence ids as plain Python ints.
sentence_ids = [int(sentence_id) for sentence_id in df_final['sentence_id'].unique()]

# Total number of sentences to fetch and cache in sentences_dict.
len(sentence_ids)


466735

In [21]:
%%time
# Fetching sentences from database (ca 500 000)
# Every (collection_id, text) pair yielded by the reader is cached in
# sentences_dict under its collection_id.

sentences_dict = {}
from data_helpers.syntax_graph import SyntaxGraph

from data_helpers.db_reader import DbReader

reader_options = {
    'pgpass_file': '~/.pgpass',
    'schema': 'estonian_text_corpora',
    'role': 'estonian_text_corpora_read',
    'temporary': False,
    'collection_name': collection_name,
}
my_db_reader = DbReader(**reader_options)
my_db_reader.set_layers(['v172_stanza_syntax'])

fetched_collections = my_db_reader.get_collections(
    shuffle=False,
    progressbar='ascii',
    col_ids=sentence_ids,
)
for collection_id, text in fetched_collections:
    sentences_dict[collection_id] = text



INFO:storage.py:41: connecting to host: 'postgres.keeleressursid.ee', port: '5432', dbname: 'estonian-text-corpora', user: 'zummy'
INFO:storage.py:58: schema: 'estonian_text_corpora', temporary: False, role: 'estonian_text_corpora_read'


collection_id: 159732: 100%|########################################################################################################################################| 29010/29010 [04:27<00:00, 108.31doc/s]

CPU times: user 1min 43s, sys: 8.88 s, total: 1min 52s
Wall time: 4min 30s





In [None]:
# 
def _id_list_as_text(value):
    """Return an id-list cell as text, rendering a whole float (5.0) as '5'.

    A cell holding a single id may have been read from CSV as a number; the
    row loop below expects comma separated digit strings.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Replace missing values with '' first, then make both id-list columns text.
# (The previous astype call discarded its result and listed 'compound_ids'
# twice, so it changed nothing.)
df_final = df_final.fillna('')
for id_column in ('compound_ids', 'obl_ids'):
    df_final[id_column] = df_final[id_column].map(_id_list_as_text)

# Output columns filled in by the row loop below.
df_final['verb_span'] = ''
df_final['obl_span'] = ''
df_final['obl_lemma'] = ''
df_final['sentence'] = ''

def get_span(graph, nodes, label):
    """Return the labelled span(s) for `nodes`.

    Delegates to get_spans(). When exactly one span is produced, that span
    dict itself is returned; otherwise the whole list is returned (which is
    empty when `nodes` is empty).
    """
    found = get_spans(graph, nodes, label)
    return found[0] if len(found) == 1 else found

def get_spans(graph, nodes, label):
    """Build one labelled span dict per node id, in the order given.

    For each id in `nodes`, `graph.nodes[id]` is read for its 'start', 'end'
    and 'form' entries. The result dicts carry them as 'start', 'end' and
    'text', plus 'labels' set to a fresh one-element list holding `label`.
    """
    node_attrs = (graph.nodes[node_id] for node_id in nodes)
    return [
        {
            'start': attrs['start'],
            'end': attrs['end'],
            'text': attrs['form'],
            'labels': [label],
        }
        for attrs in node_attrs
    ]


# For every example row: look up the cached sentence, build its syntax graph
# and derive the obl lemma, the sentence text and the JSON-encoded spans.
# Values are collected in lists and written to df_final one column at a time
# after the loop, instead of six scalar .loc writes per row.
obl_lemmas = []
sentence_texts = []
verb_spans = []
obl_spans = []
compound_spans = []
oblp_spans = []

row_fields = zip(
    df_final['sentence_id'],
    df_final['verb_id'],
    df_final['root_id'],
    df_final['compound_ids'],
    df_final['obl_ids'],
)
for sentence_id, verb_id, root_id, compound_ids_text, obl_ids_text in row_fields:
    verb_id = int(verb_id)
    obl_root = int(root_id)

    # Id lists are comma separated; tokens that are not plain digits are skipped.
    compound_ids = [int(n) for n in compound_ids_text.split(',') if n.isdigit()]
    obl_ids = [int(n) for n in obl_ids_text.split(',') if n.isdigit()]

    sentence = sentences_dict[sentence_id]
    g = SyntaxGraph(sentence['v172_stanza_syntax'])
    # g.draw_graph(highlight=[verb_id])

    obl_lemmas.append(g.nodes[obl_root]['lemma'])
    sentence_texts.append(str(sentence.text))

    verb_spans.append(json.dumps(get_span(g, [verb_id], 'V'), ensure_ascii=False))
    obl_spans.append(json.dumps(get_span(g, [obl_root], 'OBL'), ensure_ascii=False))
    compound_spans.append(json.dumps(get_spans(g, compound_ids, 'COMPOUND'), ensure_ascii=False))
    oblp_spans.append(json.dumps(get_spans(g, obl_ids, 'OBLP'), ensure_ascii=False))

df_final['obl_lemma'] = obl_lemmas
df_final['sentence'] = sentence_texts
df_final['verb_span'] = verb_spans
df_final['obl_span'] = obl_spans
# These two columns are created here, after the existing ones.
df_final['compound_spans'] = compound_spans
df_final['oblp_spans'] = oblp_spans
    


In [None]:
# Export a copy of df_final without the helper id columns; df_final itself is
# left untouched. 'random_example' is kept in the output.
helper_columns = ['obl_ids', 'row_ids', 'compound_ids']
df_result = df_final.drop(columns=helper_columns)
df_result.to_csv(f'{date_time}_verb_obl_case_lemma_examples_random.csv', index=False)



In [15]:
df_result.head()

NameError: name 'df_result' is not defined