In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


# kala77 andmebaasi tegemine

Skripti töö tulemusena tekitatakse uus andmebaasifail **kala77**.
Baas sisaldab kõiki transaktsioone, mis pole kaetud olemasolevate mustritega.
Kaetud on **ainult mustribaasis olevad** verbid.

Sisendiks on:

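- verbi mustrite andmebaas (`PATH_PATTERNS_DB`): verb_patterns/vp_data2.db
- verbi transaktsioonide andmebaas (`PATH_TRANSACTIONS_DB`): verb_transactions/v33/v33_koondkorpus_sentences_verb_pattern_obl_20241002-130310.db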

## andmebaas kala77

### Tabelid

### TABEL1 transaction_head

| väli          | tüüp | kirjeldus                                                                         | näide      | märkus          |
| ------------- | ---- | --------------------------------------------------------------------------------- | ---------- | --------------- |
| id            | int  | rea <br/>unikaalne ID                                                             | _56_       |                 |
| sentence_id   | int  | lause id andmebaasis                                                              |            |                 |
| loc           | int  | verbi asukoht lauses                                                              |            |                 |
| verb          | text | verbi lemma                                                                       | _olema_    |                 |
| verb_compound | text | verbi afiksaaladverbid                                                            | alla,peale | eraldajaks koma |
| form          | text | verb sellises vormis, nagu see lauses esines                                      | _oli_      |                 |
| deprel        | text | verbi deprel                                                                      |            |                 |
| feats         | text | verbi morf kategooriad alfabeetilises järjekorras                                 | aux,ps3    |                 |
| phrase        | text | puhastatud fraas (ainult need alluvad, mis on transactions tabelisse salvestatud) |            |                 |

### TABEL2 transaction

| väli       | tüüp | kirjeldus                                                            | näide  | märkus |
| ---------- | ---- | -------------------------------------------------------------------- | ------ | ------ |
| id         | int  | rea <br/>unikaalne ID                                                | _56_   |        |
| head_id    | int  | rea transaction_head.id                                              |        |        |
| loc        | int  | sõna asukoht lauses                                                  |        |        |
| loc_rel    | int  | sõna asukoht verbi suhtes                                            |        |        |
| deprel     | text | sõna deprel                                                          |        |        |
| form       | text | sõna vorm                                                            |        |        |
| lemma      | text | sõna lemma                                                           |        |        |
| pos        | text | sõna sõnaliik                                                        |        |        |
| feats      | text | sõna morf kategooriad alfabeetilises järjekorras                     | add,sg |        |
| parent_loc | int  | vanema tipu loc, juhul kui tegemist on <code>obl</code> alluvaga case | 2      |        |

### TABEL3 verbs_table

| väli          | tüüp | kirjeldus                               | näide    | märkus          |
| ------------- | ---- | --------------------------------------- | -------- | --------------- |
| verb_id       | int  | verbi unikaalne id                      |          |                 |
| verb          | text | verbi lemma                             | _aasima_ |                 |
| verb_compound | text | verbi afiksaaladverbid                  |          | eraldajaks koma |
| pat_ids       | text | mustrite id-d verb_patterns andmebaasis |          | eraldajaks koma |

### TABEL4 verb_transactions

| väli    | tüüp | kirjeldus         | näide | märkus |
| ------- | ---- | ----------------- | ----- | ------ |
| verb_id | int  | verbi id          |       |        |
| head_id | int  | transaktsiooni id |       |        |


In [2]:
import os
from pathlib import Path
from kala77_helpers import *
import sqlite3
from sqlalchemy import create_engine, text


# Project root: two directory levels above the notebook's working directory.
ROOT = str(Path.cwd().parent.parent)

# Input databases (transactions and patterns), addressed relative to ROOT.
PATH_TRANSACTIONS_DB = f"{ROOT}/verb_transactions/v33/v33_koondkorpus_sentences_verb_pattern_obl_20241002-130310.db"
PATH_PATTERNS_DB = f"{ROOT}/verb_patterns/vp_data2.db"

# Output database, written next to the notebook (current working directory).
KALA77_DB = f"{os.getcwd()}/kala77.db"

In [3]:
# Fail fast when an input database is missing. SQLite's ATTACH opens a new,
# empty database when the file does not exist, so a wrong path would otherwise
# only surface later as a confusing "no such table" error -- and only after
# the kala77 tables had already been reset.
input_dbs = {"db_pat": PATH_PATTERNS_DB, "db_tr": PATH_TRANSACTIONS_DB}
for schema_name, db_path in input_dbs.items():
    if not Path(db_path).is_file():
        raise FileNotFoundError(
            f"input database {schema_name!r} not found: {db_path}"
        )

# Create the kala77 database file if it does not exist yet.
con = sqlite3.connect(KALA77_DB)
con.close()


DATABASE_PATH = f"sqlite:///{KALA77_DB}"
print(DATABASE_PATH)

engine = create_engine(DATABASE_PATH)
# presumably drops and recreates the kala77 tables -- confirm in kala77_helpers
reset_tables(engine)


# Use SQLAlchemy so that the tables are easier to create.
conn = engine.connect()

# Attach the input databases under the schema names db_pat and db_tr.
# The file path is passed as a bound parameter rather than spliced into the
# SQL text, so a path containing a single quote cannot break the statement.
# The schema names come from the fixed dict above, never from external input.
for schema_name, db_path in input_dbs.items():
    conn.execute(
        text(f"ATTACH DATABASE :db_path AS {schema_name}"),
        {"db_path": db_path},
    )

sqlite:////Users/rabauti/repos/tu/estnltk/syntax_experiments__verb_templates/verb_patterns/kala77/kala77.db


<sqlalchemy.engine.cursor.CursorResult at 0x1244a4d70>

In [4]:
%%time
# Fill the verbs table (timed with %%time).
fill_table_verbs(conn=conn)

CPU times: user 2.03 ms, sys: 977 μs, total: 3.01 ms
Wall time: 3.12 ms


In [5]:
%%time
verbs = conn.execute(select(verbs_table.c.verb_id, verbs_table.c.pat_ids, verbs_table.c.verb, verbs_table.c.verb_compound)).mappings().all()
print(f"andmebaasi lisati {len(verbs)} verbi")

andmebaasi lisati 1271 verbi
CPU times: user 1.57 ms, sys: 517 μs, total: 2.08 ms
Wall time: 1.64 ms


In [6]:
%%time
# täidame verb_transactions tabeli
from tqdm import tqdm
for v in tqdm(verbs):
    fill_table_verb_transactions(conn=conn, verb_id=v['verb_id'], pat_ids=v['pat_ids'].split(','))
conn.execute(text("SELECT COUNT(verb_id) FROM verb_transactions")).scalar()  

100%|██████████| 1271/1271 [00:19<00:00, 65.29it/s] 


CPU times: user 13.1 s, sys: 3.89 s, total: 16.9 s
Wall time: 20.2 s


10796950

In [7]:
%%time
# Fill the transaction_head table; this takes roughly twenty-odd minutes.
fill_table_transaction_head(conn=conn)


CPU times: user 37.1 s, sys: 1min 56s, total: 2min 33s
Wall time: 22min 31s


In [8]:
%%time
# Fill the per-word transaction rows via fill_table_transaction_row
# (presumably the `transaction` table -- confirm the table name in kala77_helpers).
# This is the slowest step: about half an hour of wall time in the recorded run.
fill_table_transaction_row(conn=conn)

CPU times: user 1min 58s, sys: 10min 36s, total: 12min 35s
Wall time: 33min 17s


In [9]:
%%time
# kontrollime 10 juhusliku verbi pealt, et numbrid jooksevad kokku
import random
random_i = random.sample(range(0, len(verbs)-1), 10)
# check verbs stat
# get counts of random transactions to check, that numbers align together
for v in [verbs[i] for i in random_i]:
    show_verb_trans_stat(conn=conn, verb=v)
    

verb {'verb_id': 1200, 'pat_ids': '2262,2263', 'verb': 'virgutama', 'verb_compound': ''}
db_pat total: 91
db_pat unmatched: 36
db_pat matched: 55
db_tr all: 91
kala77 all: 36
 
verb {'verb_id': 406, 'pat_ids': '2496', 'verb': 'kloppima', 'verb_compound': 'üles'}
db_pat total: 11
db_pat unmatched: 6
db_pat matched: 5
db_tr all: 11
kala77 all: 6
 
verb {'verb_id': 843, 'pat_ids': '1585', 'verb': 'reguleerima', 'verb_compound': ''}
db_pat total: 5350
db_pat unmatched: 800
db_pat matched: 4550
db_tr all: 5350
kala77 all: 800
 
verb {'verb_id': 307, 'pat_ids': '503', 'verb': 'kadestama', 'verb_compound': ''}
db_pat total: 810
db_pat unmatched: 212
db_pat matched: 598
db_tr all: 810
kala77 all: 212
 
verb {'verb_id': 236, 'pat_ids': '1049,1050', 'verb': 'jalutama', 'verb_compound': 'läbi'}
db_pat total: 117
db_pat unmatched: 71
db_pat matched: 46
db_tr all: 117
kala77 all: 71
 
verb {'verb_id': 869, 'pat_ids': '1623', 'verb': 'ründama', 'verb_compound': ''}
db_pat total: 10046
db_pat unmatch

In [10]:
# Release the SQLAlchemy connection used to build the database.
# NOTE(review): closing a Connection rolls back any transaction that is still
# open -- confirm that the fill_* helpers commit their inserts (TODO verify).
conn.close()