# v21 kohakäänetega laused

Skript leiab <code>v21.py</code> poolt korjatud andmete seast andmed, kus ühe verbi küljes on obliikvad kahes või enamas erinevas kohakäändes.

Salvestab leitud kohakäänete paarid csv tabelitesse. Iga näite kohta on tabelis olemas lausete ID-d.

Lause tekste lisab tabelisse skript <code>v21_fetch_examples.py</code>.

Tulemus salvestatakse kataloogi <code>result</code>.


In [1]:
%load_ext autoreload
%autoreload 2

import itertools
import json
import random
import re
import sqlite3
from datetime import datetime

import pandas as pd

%mkdir result

In [2]:
## Custom SQLite function that backs the SQL REGEXP operator
def regexp(expr, item):
    """Tell whether the regular expression ``expr`` matches anywhere in ``item``.

    Intended to be registered on a SQLite connection as the two-argument
    ``REGEXP`` function; SQLite evaluates ``X REGEXP Y`` as ``regexp(Y, X)``,
    so the pattern arrives first.

    Args:
        expr: Regular expression pattern, or ``None`` for a SQL NULL.
        item: Text to search, or ``None`` for a SQL NULL.

    Returns:
        ``True`` if the pattern is found, ``False`` if it is not, and ``None``
        when either operand is ``None``.
    """
    # A NULL operand would make re.search raise TypeError and abort the whole
    # query; propagate NULL instead, which is falsy in WHERE/HAVING clauses.
    if expr is None or item is None:
        return None
    return re.search(expr, item) is not None


# Alternative database file, currently disabled:
# db_file_name = 'v21_koondkorpus_sentences_verb_compound_obl_collocations_20230823-133719.db'
# Database file that is actually opened below.
db_file_name = 'v21_koondkorpus_sentences_test_5000_sg_thread_verb_compound_obl_collocations_20230823-124256.db'

# NOTE(review): collection_name is not referenced in the visible cells — confirm it is still needed.
collection_name = 'koondkorpus_sentences'
connection = sqlite3.connect(db_file_name)
# Register the Python function `regexp` as the two-argument SQL function REGEXP.
connection.create_function("REGEXP", 2, regexp)
# Rows fetched through this connection are returned as sqlite3.Row objects.
connection.row_factory = sqlite3.Row 
cursor = connection.cursor()

In [3]:
# Pairs of locative cases

# Case tags, sorted so that the generated pairs come out in alphabetical order.
cases = sorted(['adit', 'ill', 'in', 'el', 'all', 'ad', 'abl'])
# Every unordered pair of tags, including a tag paired with itself. Because
# `cases` is sorted and has no duplicates, the tuples are already internally
# ordered and listed in sorted order, so no extra sorting or de-duplication
# is needed.
pairs = list(itertools.combinations_with_replacement(cases, 2))
display(pairs)
len(pairs)

[('abl', 'abl'),
 ('abl', 'ad'),
 ('abl', 'adit'),
 ('abl', 'all'),
 ('abl', 'el'),
 ('abl', 'ill'),
 ('abl', 'in'),
 ('ad', 'ad'),
 ('ad', 'adit'),
 ('ad', 'all'),
 ('ad', 'el'),
 ('ad', 'ill'),
 ('ad', 'in'),
 ('adit', 'adit'),
 ('adit', 'all'),
 ('adit', 'el'),
 ('adit', 'ill'),
 ('adit', 'in'),
 ('all', 'all'),
 ('all', 'el'),
 ('all', 'ill'),
 ('all', 'in'),
 ('el', 'el'),
 ('el', 'ill'),
 ('el', 'in'),
 ('ill', 'ill'),
 ('ill', 'in'),
 ('in', 'in')]

28

In [4]:
%%time 
# Timestamp shared by every file written during this run.
date_time = datetime.now().strftime("%Y%m%d-%H%M%S")
print(date_time)
# NOTE(review): this template is not used anywhere in this cell; kept in case
# a later cell relies on it.
result_csv_template = f'{date_time}_pairs_%s_%s.csv'

# The query text is the same for every pair, so it is built once and the
# case tags / patterns are supplied as bound named parameters instead of
# being interpolated into the SQL string.
#
# One result row is produced per (sentence, verb) group that has more than one
# matching row and whose concatenated case list contains both requested cases.
query = """
    SELECT 
      vco_ex.sentence_id, 
      vco_ex.root_id,
      vco_ex.verb_id,
      vco_ex.compound_ids,
      vco_ex.clauses_count,
      GROUP_CONCAT(vco.id) as row_ids,
      vco.verb,
      vco.verb_compound,
      vco.count as col_total,
      
      GROUP_CONCAT(vco.obl_case) as cases_list,
      GROUP_CONCAT(vco_ex.root_id) as oblroot_list,
      GROUP_CONCAT(vco_ex.obl_ids, ':') as oblids_list,
      COUNT(vco.obl_case) as obl_nodes_count

    FROM verb_compound_obl_examples AS vco_ex
    INNER JOIN verb_compound_obl vco ON vco.id = vco_ex.row_id
    WHERE 
      vco.obl_case IN (:first_case, :second_case) -- kohakäänded
    GROUP BY vco_ex.sentence_id, vco_ex.verb_id
    
    HAVING 
        obl_nodes_count > 1
        AND cases_list REGEXP :first_pattern
        AND cases_list REGEXP :second_pattern
    ORDER BY vco.obl_case
    
    --LIMIT 100
    """

# Run the query once per case pair and write each result to its own CSV file.
for p in pairs:
    first_case, second_case = p
    result_filename = f'result/{date_time}_sentence_ids_for_{first_case}_{second_case}.csv'
    query_params = {
        'first_case': first_case,
        'second_case': second_case,
        # Match the tag only as a whole comma-separated element of cases_list.
        'first_pattern': f'(^|,){re.escape(first_case)}(,|$)',
        'second_pattern': f'(^|,){re.escape(second_case)}(,|$)',
    }
    # df_result is rebound on every iteration; after the loop it holds the
    # result for the last pair only.
    df_result = pd.read_sql_query(query, connection, params=query_params)
    df_result.to_csv(result_filename, index=False)

20230824-103951
CPU times: user 3min 40s, sys: 2min 21s, total: 6min 1s
Wall time: 6min 4s


In [5]:
# Preview the first rows of df_result, i.e. the frame produced for the last pair processed by the export loop.
df_result.head()

Unnamed: 0,sentence_id,root_id,verb_id,compound_ids,clauses_count,row_ids,verb,verb_compound,col_total,cases_list,oblroot_list,oblids_list,obl_nodes_count
0,160,3,4,,3,8889,külastama,,8,"in,in",35,3:5,2
1,597,5,3,,2,345347,kandideerima,,59,"in,in",510,"5:9,10",2
2,1320,16,18,,3,819820,olema,,1623,"in,in",1617,16:17,2
3,1345,7,8,,3,768839,käima,,1539,"in,in",76,7:6,2
4,1410,11,6,,2,879880,nägema,,118,"in,in",1114,"9,10,11:12,13,14",2
