### Imports

In [33]:
import sqlite3
import numpy as np
import pandas as pd
import networkx as nx
from collections import defaultdict

### Analysing the data

In [2]:
# Open the collocation database read-only through a URI. A plain
# `sqlite3.connect(path)` silently creates an empty database file when the path
# is wrong, and the mistake then only surfaces later as a "no such table" error;
# with `mode=ro` a missing file raises here instead, and the notebook cannot
# modify the source data by accident.
conn = sqlite3.connect('file:verb_obj_collocations_20211112.db?mode=ro', uri=True)
cur = conn.cursor()

In [6]:
# Fetch every collocation row whose `total` exceeds 2. Each row is the tuple
# (lemma1, pos1, lemma2, pos2, total); later cells index it positionally.
query = "SELECT lemma1, pos1, lemma2, pos2, total FROM verb_obj_koondkorpus WHERE total > 2;"
entries = cur.execute(query).fetchall()

In [9]:
# Rows whose second word (pos2, index 3) is tagged neither 'P' nor 'S'.
non_noun = [row for row in entries if row[3] not in ('P', 'S')]

In [10]:
# Number of rows whose second word is not tagged 'P' or 'S'.
len(non_noun)

13650

In [11]:
# Distinct pos2 tags that occur among the rows collected in `non_noun`.
{row[3] for row in non_noun}

{'A', 'G', 'N', 'Y', 'Z'}

In [12]:
# Print one example row per pos2 tag: the first row encountered for each tag.
# `olnud` collects the tags that have already been printed.
olnud = []

for entry in non_noun:
    tag = entry[3]
    if tag in olnud:
        continue
    print(entry)
    olnud.append(tag)

('andma', 'V', 'parim', 'A', 1641)
('muutma', 'V', '1', 'N', 1018)
('tundma', 'V', 'end', 'Y', 543)
('tegema', 'V', '«', 'Z', 16)
('tundma', 'V', 'eesti', 'G', 11)


In [13]:
# Sanity check: print any row whose first word (pos1, index 1) is not tagged 'V'.
for entry in entries:
    if entry[1] == 'V':
        continue
    print(entry)

### Graph exploration

#### Finding nodes

In [14]:
# Keep only the rows whose second word is tagged 'S' or 'P'.
entries_to_keep = [row for row in entries if row[3] in ('S', 'P')]

In [15]:
# Split the kept rows into parallel lists: first lemma (verb) and second lemma
# (object). Both lists still contain repeats at this point.
verbs = [verb for verb, _, _, _, _ in entries_to_keep]
objects = [obj for _, _, obj, _, _ in entries_to_keep]

In [16]:
# Remove repeated lemmas while keeping first-occurrence order.
# dict keys are unique and preserve insertion order, so `dict.fromkeys` yields
# the same lists as an "append if not already in the list" loop, but in linear
# time rather than rescanning the growing result list for every row.
objects_non_dup = list(dict.fromkeys(objects))
verbs_non_dup = list(dict.fromkeys(verbs))

#### Creating the graph

In [21]:
# Empty undirected graph; the following cells add lemmas as nodes and
# verb–object pairs as edges.
graph = nx.Graph()

In [22]:
# Add every distinct lemma as a node, tagged with the role it was collected under.
# NOTE(review): nodes are keyed by the lemma string alone, so a lemma that occurs
# in both lists becomes a single node, and the second call overwrites its `pos`
# attribute with 'verb' — confirm this merging is intended.
graph.add_nodes_from(objects_non_dup, pos='object')
graph.add_nodes_from(verbs_non_dup, pos='verb')

In [24]:
# Connect each verb lemma to each object lemma it was observed with.
graph.add_edges_from((verb, obj) for verb, _, obj, _, _ in entries_to_keep)

In [25]:
# Materialise every connected component as its own independent graph copy.
subgraphs = []
for component in nx.connected_components(graph):
    subgraphs.append(graph.subgraph(component).copy())

In [26]:
# Pick the connected component with the most nodes and copy it out of `graph`.
biggest_component = max(nx.connected_components(graph), key=len)
largest_subgraph = graph.subgraph(biggest_component).copy()

In [27]:
# Report how fragmented the graph is and how large its main component is.
print("Number of connected components in the graph:", len(subgraphs))
word_total = largest_subgraph.number_of_nodes()
link_total = largest_subgraph.number_of_edges()
print(f"Largest connected component includes {word_total} words and {link_total} connections between those words")

Number of connected components in the graph: 38
Largest connected component includes 58681 words and 466000 connections between those words


In [28]:
# Count the components that consist of exactly two nodes.
pair_components = sum(1 for component in subgraphs if component.number_of_nodes() == 2)
print("Number of subgraphs with two words (one verb-object pair):", pair_components)

Number of subgraphs with two words (one verb-object pair): 31


Only words from the largest connected component are kept: words outside it are connected to very few other words (often just one), so they would not cluster well.

In [29]:
# Node view of the largest component; used below for `in` membership tests on lemmas.
connected_words = largest_subgraph.nodes()

In [30]:
# Keep only the rows whose verb and object both belong to the largest component.
connected_entries = []
for entry in entries_to_keep:
    if entry[0] in connected_words and entry[2] in connected_words:
        connected_entries.append(entry)

### Reducing the number of objects to a reasonable amount

Keeping the 15000 most frequent objects for the final dataset. An object's frequency is the sum of the corpus counts (`total`) of all verb–object pairs it appears in.

In [34]:
# Total corpus frequency per object lemma: sum `total` (index 4) over every row
# the object (index 2) appears in.
object_counts = defaultdict(int)

for row in connected_entries:
    obj, total = row[2], row[4]
    object_counts[obj] += total

In [35]:
# (object, count) pairs ordered from most to least frequent; ties keep the
# order in which the objects were first counted.
sorted_objects = list(object_counts.items())
sorted_objects.sort(key=lambda item: item[1], reverse=True)

In [37]:
# Size of the final object vocabulary (the most frequent objects are kept).
N_OBJECTS_TO_KEEP = 15000
objects_to_keep = sorted_objects[:N_OBJECTS_TO_KEEP]

In [38]:
# Drop the counts and keep just the object lemmas, still in frequency order.
final_objects = [pair[0] for pair in objects_to_keep]

In [50]:
# Keep only the rows whose object made it into the final object vocabulary.
# `final_objects` is a list, so testing membership against it rescans up to all
# of its elements for every row; a set gives the same answer with a hash lookup.
final_object_set = set(final_objects)
final_entries = [entry for entry in connected_entries if entry[2] in final_object_set]

In [52]:
# Distinct verbs that still occur in the filtered rows, in first-occurrence order.
# `dict.fromkeys` deduplicates while preserving insertion order, which avoids
# rescanning the growing result list for every row.
final_verbs = list(dict.fromkeys(entry[0] for entry in final_entries))

### Checking duplicate pairs

In [55]:
from collections import Counter

In [56]:
# Reduce each row to its (verb, object) pair, ignoring POS tags and counts.
final_pairs = [(verb, obj) for verb, _, obj, _, _ in final_entries]

In [57]:
# Pairs that appear in more than one row, in order of first appearance.
pair_counts = Counter(final_pairs)
duplicate_pairs = [pair for pair in pair_counts if pair_counts[pair] > 1]

In [58]:
# Number of (verb, object) pairs that occur in more than one row.
len(duplicate_pairs)

328

In [59]:
# Peek at the first few duplicated pairs.
duplicate_pairs[:5]

[('tegema', 'miski'),
 ('tegema', 'kõik'),
 ('ütlema', 'miski'),
 ('teadma', 'miski'),
 ('võtma', 'miski')]

In [62]:
# Which object lemmas are involved in the duplicated pairs?
objects_in_duplicates = [obj for _, obj in duplicate_pairs]
set(objects_in_duplicates)

{'iga', 'kõik', 'mina', 'miski', 'muu', 'oma', 'sina'}

In [63]:
# Show the rows for object 'mina' with three selected verbs, to see how the
# rows for one pair differ from each other.
for entry in final_entries:
    if entry[2] != 'mina':
        continue
    if entry[0] in ('olema', 'teadma', 'tahtma'):
        print(entry)

('teadma', 'V', 'mina', 'P', 549)
('tahtma', 'V', 'mina', 'P', 464)
('olema', 'V', 'mina', 'P', 50)
('teadma', 'V', 'mina', 'S', 34)
('tahtma', 'V', 'mina', 'S', 12)


In [65]:
# Show every row for object 'kõik' whose (verb, object) pair is duplicated.
for entry in final_entries:
    pair = (entry[0], entry[2])
    if pair[1] == 'kõik' and pair in duplicate_pairs:
        print(entry)

('tegema', 'V', 'kõik', 'P', 10552)
('teadma', 'V', 'kõik', 'P', 1721)
('panema', 'V', 'kõik', 'P', 1321)
('rääkima', 'V', 'kõik', 'P', 747)
('sööma', 'V', 'kõik', 'P', 426)
('arvestama', 'V', 'kõik', 'P', 160)
('näima', 'V', 'kõik', 'P', 71)
('teadma', 'V', 'kõik', 'S', 10)
('tegema', 'V', 'kõik', 'S', 8)
('arvestama', 'V', 'kõik', 'S', 6)
('rääkima', 'V', 'kõik', 'S', 3)
('panema', 'V', 'kõik', 'S', 3)
('näima', 'V', 'kõik', 'S', 3)
('sööma', 'V', 'kõik', 'S', 3)


In [67]:
# Show every row for object 'sina' whose (verb, object) pair is duplicated.
for entry in final_entries:
    pair = (entry[0], entry[2])
    if pair[1] == 'sina' and pair in duplicate_pairs:
        print(entry)

('tegema', 'V', 'sina', 'P', 615)
('ütlema', 'V', 'sina', 'P', 76)
('ütlema', 'V', 'sina', 'S', 16)
('tegema', 'V', 'sina', 'S', 3)


### Matrix creation

In [68]:
# Zero-filled count matrix: one row per kept object lemma, one column per kept
# verb lemma. The cells are filled in from `final_entries` further down.
data = pd.DataFrame(0, index=final_objects, columns=final_verbs)
data.head()

Unnamed: 0,tegema,saama,tundma,andma,sõlmima,teadma,maksma,pidama,võtma,pöörama,...,mountima,uvitama,nattima,sailitama,võrdlustama,nendevastuma,postima,väärindama,hapustama,inhibeerima
see,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mis,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tema,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ise,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
miski,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
# Convert the frame to a nested dict addressed as data_dict[verb][object], so
# individual counts can be incremented with plain dict lookups.
data_dict = data.to_dict()

In [71]:
# Confirm that the outer keys of the nested dict are the verb columns.
list(data_dict.keys())[:3]

['tegema', 'saama', 'tundma']

In [72]:
# Accumulate each row's count into its (verb, object) cell. Using `+=` means that
# several rows for the same pair (differing only in POS tag) are summed together.
for verb, _pos1, obj, _pos2, total in final_entries:
    data_dict[verb][obj] += total

In [73]:
# Rebuild a DataFrame from the filled nested dict: outer keys (verbs) become
# columns, inner keys (objects) become the row index.
data_final = pd.DataFrame(data_dict)

In [74]:
# Display the filled object-by-verb count matrix.
data_final

Unnamed: 0,tegema,saama,tundma,andma,sõlmima,teadma,maksma,pidama,võtma,pöörama,...,mountima,uvitama,nattima,sailitama,võrdlustama,nendevastuma,postima,väärindama,hapustama,inhibeerima
see,69534,9208,2789,6079,288,13465,2402,12953,12901,273,...,0,0,3,0,0,0,0,0,0,0
mis,46616,7582,3492,7881,834,5235,3286,8315,8511,116,...,0,0,0,0,0,0,0,0,0,0
tema,3881,2438,3037,1798,82,1208,285,4119,5879,98,...,0,0,0,0,3,0,0,0,0,0
ise,1552,374,20326,1257,5,216,279,6065,2645,430,...,0,0,0,0,0,0,0,0,0,0
miski,22786,4216,571,2191,16,9647,1595,315,6522,29,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tekstuur,4,6,0,7,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
lemmiklugu,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
alluvus,0,0,0,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Malle,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
# (number of objects, number of verbs)
data_final.shape

(15000, 3735)

In [76]:
# Largest count in each of the first five verb columns.
data_final.max()[:5]

tegema     69534
saama      62376
tundma     20326
andma      15911
sõlmima    15575
dtype: int64

In [79]:
# Spot check: the cell for object 'mina' and verb 'tegema' should equal the sum
# of the counts of all rows for that pair.
data_final.at['mina', 'tegema']

2523

In [78]:
# List the rows for the pair ('tegema', 'mina') to compare with the matrix cell.
for entry in final_entries:
    if (entry[0], entry[2]) == ('tegema', 'mina'):
        print(entry)

('tegema', 'V', 'mina', 'P', 2419)
('tegema', 'V', 'mina', 'S', 104)


In [80]:
# Persist the object-by-verb count matrix. `index` is left at its default, so the
# row labels (object lemmas) are written as the first, unnamed column.
data_final.to_csv('verb_object_df.csv')