In [1]:
!mkdir result

In [7]:
import sqlite3 as sqlite
import os
import pandas as pd

db_file_name = '../v31_koondkorpus_sentences_verb_pattern_obl_20240524-153036.db'


In [5]:
conn = None
if os.path.isfile(db_file_name):
    conn = sqlite.connect(db_file_name)
else:
    print(f'file {db_file_name} does not exist')



## Verbi alluvad korduvate deprelidega

Mitu korda sama deprel kordub sama lause all, sagedusloend.

In [23]:
query = """
SELECT 
sub.deprel, sub.total, COUNT(sub.head_id) AS occurences
FROM 
(
SELECT
tr.head_id,
tr.deprel,
COUNT(tr.id) as total
FROM `transaction` tr
GROUP BY tr.head_id, tr.deprel
) AS sub

GROUP BY sub.deprel, sub.total
"""

df = pd.read_sql(query, con=conn)
df.sort_values('occurences', inplace=True, ascending=False)
df.to_csv('result/deprel_total_occurrences.csv', index=None)
df.head()

Unnamed: 0,deprel,total,occurences
77,nsubj,1,13722953
94,obl,1,6618977
89,obj,1,6207811
9,advmod,1,4657895
22,aux,1,2730736


## Verbi deprel korduva käändega
 
Mitu korda esineb sama deprel samas käändes sama verbi all, sagedusloend.

In [28]:
def extract_case(feats):
    splitted = feats.split(',')

    for case in [
        "nom",  # nimetav
        "gen",  # omastav
        "part",  # osastav
        "adit",  # lyh sisse
        "ill",  # sisse
        "in",  # sees
        "el",  # seest
        "all",  # alale
        "ad",  # alal
        "abl",  # alalt
        "tr",  # saav
        "term",  # rajav
        "es",  # olev
        "abes",  # ilma#
        "kom",  # kaasa#
        ]:
        if case in splitted:
            return case
    return ''
conn.create_function("extract_case", 1, extract_case)

query = """
 SELECT 
sub.deprel, sub.ext_case, sub.total, COUNT(sub.head_id) AS occurences
FROM 
(
SELECT
tr.head_id,
tr.deprel,
extract_case(tr.feats) AS `ext_case`,
COUNT(tr.id) as total
FROM `transaction` tr
GROUP BY tr.head_id, tr.deprel, extract_case(tr.feats)
) AS sub

GROUP BY sub.deprel, sub.total, sub.ext_case
"""

df = pd.read_sql(query, con=conn)
df.sort_values('occurences', inplace=True, ascending=False)
df.to_csv('result/deprel_case_total_occurrences.csv', index=None)
df.head()

Unnamed: 0,deprel,ext_case,total,occurences
0,acl,,1,1737
1,acl,abes,1,1
2,acl,abl,1,4
3,acl,ad,1,32
4,acl,all,1,21


## Konkreetsed laused, kus on ühe verbi all mitu obl sama käändega ad

In [32]:

query = """
SELECT
th.sentence_id,
tr.head_id,
tr.deprel,
extract_case(tr.feats) AS `ext_case`,
COUNT(tr.id) as total
FROM `transaction` tr
INNER JOIN `transaction_head` th ON th.id = tr.head_id
WHERE tr.deprel = 'obl' and ext_case = 'ad'

GROUP BY tr.head_id, tr.deprel, extract_case(tr.feats)
HAVING total>0

"""

df = pd.read_sql(query, con=conn)
df.sort_values('total', inplace=True, ascending=False)
df.to_csv('result/obl_ad_dupl.csv', index=None)
df.head()

Unnamed: 0,sentence_id,head_id,deprel,ext_case,total
1372557,10739649,17209077,obl,ad,6
1663363,13122156,20996792,obl,ad,6
1969217,16781787,25824219,obl,ad,5
429479,3087995,4957174,obl,ad,5
453952,3258417,5236442,obl,ad,5


In [None]:
conn.close()