### Imports

In [37]:
import sqlite3
import numpy as np
import pandas as pd
import networkx as nx
from collections import defaultdict

### Analysing the data

In [2]:
# Open the SQLite database that contains the subject-verb collocation table
# and create a cursor for running queries against it.
conn = sqlite3.connect('subj_verb_collocations_20211110.db')
cur = conn.cursor()

In [6]:
# Load every collocation row whose `total` is greater than 2.
# Each fetched row is a (lemma1, pos1, lemma2, pos2, total) tuple.
collocation_query = "SELECT lemma1, pos1, lemma2, pos2, total FROM subj_verb_koondkorpus WHERE total > 2;"
entries = cur.execute(collocation_query).fetchall()

In [9]:
# Entries whose first lemma carries a POS tag (pos1) other than 'P' or 'S'.
non_noun = [entry for entry in entries if entry[1] not in ('P', 'S')]

In [10]:
# How many entries have a first lemma that is not tagged 'P' or 'S'.
len(non_noun)

41013

In [11]:
# Distinct POS tags (pos1) that occur among the non-'P'/'S' entries.
{entry[1] for entry in non_noun}

{'A', 'D', 'G', 'J', 'N', 'V', 'X', 'Y', 'Z'}

In [12]:
# Print one example entry per POS tag: the first entry seen for each tag.
# `olnud` accumulates the tags that have already been printed.
olnud = []

for entry in non_noun:
    if entry[1] in olnud:
        continue
    print(entry)
    olnud.append(entry[1])

('tegema', 'V', 'olema', 'V', 31516)
('üks', 'N', 'olema', 'V', 6762)
('viimane', 'A', 'olema', 'V', 4726)
('sl', 'Y', 'kirjutama', 'V', 3868)
('«', 'Z', 'olema', 'V', 126)
('rohkem', 'D', 'tunduma', 'V', 105)
('eesti', 'G', 'olema', 'V', 56)
('aga', 'J', 'olema', 'V', 34)
('pärit', 'X', 'olema', 'V', 32)


In [13]:
# Sanity check: print any entry whose second word (pos2) is not tagged 'V'.
for entry in entries:
    if entry[3] == 'V':
        continue
    print(entry)

### Graph exploration

#### Finding nodes

In [14]:
# Keep only the entries whose first lemma is tagged 'S' or 'P'.
entries_to_keep = [entry for entry in entries if entry[1] in ('S', 'P')]

In [15]:
# Split the kept (lemma1, pos1, lemma2, pos2, total) rows into parallel lists
# of subject lemmas and verb lemmas (duplicates are still included here).
subjects = [subject for subject, _, _, _, _ in entries_to_keep]
verbs = [verb for _, _, verb, _, _ in entries_to_keep]

In [16]:
# De-duplicate the subject and verb lemmas while keeping first-occurrence order.
# dict.fromkeys() keeps insertion order and checks membership in constant time,
# so this is linear in the number of entries. The previous
# `if x not in some_list` loop rescanned the growing list for every entry.
subjects_non_dup = list(dict.fromkeys(subjects))
verbs_non_dup = list(dict.fromkeys(verbs))

#### Creating the graph

In [21]:
# Empty undirected graph; nodes will be lemmas and edges subject-verb pairs.
graph = nx.Graph()

In [22]:
# Add one node per unique lemma and tag it with its role via the `pos` attribute.
# NOTE(review): nodes are keyed by the lemma string alone, so a lemma that occurs
# in both lists would presumably become a single node whose `pos` ends up as
# 'verb' (the later call) - confirm against the networkx docs if that matters.
graph.add_nodes_from(subjects_non_dup, pos='subject')
graph.add_nodes_from(verbs_non_dup, pos='verb')

In [23]:
# Peek at the first kept entry to recall the tuple layout
# (lemma1, pos1, lemma2, pos2, total).
entries_to_keep[0]

('see', 'P', 'olema', 'V', 498627)

In [24]:
# Connect every subject lemma to the verb lemma it co-occurs with.
graph.add_edges_from(
    (subject, verb) for subject, _, verb, _, _ in entries_to_keep
)

In [25]:
# Materialise each connected component of the graph as an independent copy.
subgraphs = []
for component in nx.connected_components(graph):
    subgraphs.append(graph.subgraph(component).copy())

In [26]:
# Pick the connected component with the most nodes and copy it out as a graph.
largest_component = max(nx.connected_components(graph), key=len)
largest_subgraph = graph.subgraph(largest_component).copy()

In [27]:
# Report how fragmented the graph is and how big its largest component is.
component_count = len(subgraphs)
word_count = largest_subgraph.number_of_nodes()
connection_count = largest_subgraph.number_of_edges()

print("Number of connected components in the graph:", component_count)
print(f"Largest connected component includes {word_count} words and {connection_count} connections between those words")

Number of connected components in the graph: 29
Largest connected component includes 97765 words and 655702 connections between those words


In [28]:
# Count the components that consist of exactly two nodes.
pair_subgraph_count = sum(1 for subgraph in subgraphs if subgraph.number_of_nodes() == 2)
print("Number of subgraphs with two words (one subject-verb pair):", pair_subgraph_count)

Number of subgraphs with two words (one subject-verb pair): 23


Only words from the largest connected component are kept. Words in the other components are connected to very few other words (if any), so they would not cluster well.

In [33]:
# Node view of the largest component; used below for membership tests.
connected_words = largest_subgraph.nodes()

In [34]:
# Keep the entries whose subject and verb both belong to the largest component.
connected_entries = []
for entry in entries_to_keep:
    subject_is_connected = entry[0] in connected_words
    if subject_is_connected and entry[2] in connected_words:
        connected_entries.append(entry)

### Reducing the number of subjects to a reasonable amount

The 15000 most common subjects are kept for the final dataset. A subject's frequency is the sum of the corpus counts (`total`) of all subject-verb pairs in which it occurs.

In [38]:
# Total corpus frequency per subject lemma: the sum of `total` over every
# connected entry in which the lemma is the subject.
subject_counts = defaultdict(int)

for entry in connected_entries:
    subject_lemma, pair_total = entry[0], entry[4]
    subject_counts[subject_lemma] = subject_counts[subject_lemma] + pair_total

In [43]:
# (subject, count) pairs ordered from the most to the least frequent subject.
sorted_subjects = list(subject_counts.items())
sorted_subjects.sort(key=lambda subject_and_count: subject_and_count[1], reverse=True)

In [45]:
# Keep only the most frequent subjects; the cut-off is the top 15000.
max_subjects = 15000
subjects_to_keep = sorted_subjects[:max_subjects]

In [46]:
# Drop the counts and keep just the subject lemmas, still in frequency order.
final_subjects = [subject_and_count[0] for subject_and_count in subjects_to_keep]

In [47]:
# Keep only the entries whose subject is one of the selected subjects.
# Membership is tested against a set: `in` on the 15000-element list would be a
# linear scan for every single entry, while a set lookup is constant time.
# The resulting list and its order are unchanged.
final_subject_set = set(final_subjects)
final_entries = [entry for entry in connected_entries if entry[0] in final_subject_set]

In [52]:
# Unique verb lemmas of the final entries, in first-occurrence order.
# dict.fromkeys() preserves insertion order and de-duplicates in linear time,
# replacing the `if verb not in final_verbs` scan that was repeated per entry.
final_verbs = list(dict.fromkeys(entry[2] for entry in final_entries))

### Checking duplicate pairs

In [84]:
from collections import Counter

In [79]:
# Reduce every final entry to its (subject, verb) lemma pair.
final_pairs = [(subject, verb) for subject, _, verb, _, _ in final_entries]

In [85]:
# Pairs that occur more than once among the final entries.
pair_counts = Counter(final_pairs)
duplicate_pairs = [pair for pair in pair_counts if pair_counts[pair] > 1]

In [86]:
# Number of distinct (subject, verb) pairs that appear more than once.
len(duplicate_pairs)

555

In [87]:
# Show the first few duplicated pairs.
duplicate_pairs[:5]

[('mina', 'olema'),
 ('mina', 'teadma'),
 ('mina', 'tahtma'),
 ('kõik', 'olema'),
 ('mina', 'saama')]

In [88]:
# Which subject lemmas are involved in the duplicated pairs?
subjects_in_duplicates = [subject for subject, _ in duplicate_pairs]
set(subjects_in_duplicates)

{'iga', 'ise', 'kõik', 'mina', 'miski', 'muu', 'oma', 'setu', 'sina'}

In [90]:
# Inspect the raw entries behind three duplicated 'mina' pairs.
verbs_to_show = ('olema', 'teadma', 'tahtma')

for entry in final_entries:
    if entry[0] != 'mina' or entry[2] not in verbs_to_show:
        continue
    print(entry)

('mina', 'P', 'olema', 'V', 119511)
('mina', 'P', 'teadma', 'V', 47550)
('mina', 'P', 'tahtma', 'V', 40719)
('mina', 'S', 'olema', 'V', 3313)
('mina', 'S', 'teadma', 'V', 807)
('mina', 'S', 'tahtma', 'V', 583)


In [91]:
# Inspect the raw entries behind the duplicated pairs whose subject is 'kõik'.
for entry in final_entries:
    if entry[0] != 'kõik':
        continue
    if (entry[0], entry[2]) in duplicate_pairs:
        print(entry)

('kõik', 'P', 'olema', 'V', 38615)
('kõik', 'S', 'olema', 'V', 15)


In [92]:
# Inspect the raw entries behind the duplicated pairs whose subject is 'setu'.
for entry in final_entries:
    if entry[0] != 'setu':
        continue
    if (entry[0], entry[2]) in duplicate_pairs:
        print(entry)

('setu', 'S', 'olema', 'V', 98)
('setu', 'P', 'olema', 'V', 5)


### Matrix creation

In [54]:
# Zero-filled subject-by-verb matrix: one row per final subject, one column per
# final verb. The counts are filled in further below.
data = pd.DataFrame(0, index=final_subjects, columns=final_verbs)
data.head()

Unnamed: 0,olema,teadma,ütlema,tahtma,saama,tähendama,tegema,lisama,arvama,nägema,...,kiduma,klõbisema,runnima,viidsima,pritsuma,ketaalima,seiduma,ücima,müübima,juksima
tema,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mina,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
see,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mis,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
kes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# (number of subjects, number of verbs)
data.shape

(15000, 4156)

In [56]:
# Reminder of the entry layout: (lemma1, pos1, lemma2, pos2, total).
final_entries[:3]

[('see', 'P', 'olema', 'V', 498627),
 ('tema', 'P', 'olema', 'V', 210031),
 ('mis', 'P', 'olema', 'V', 159528)]

In [119]:
# Convert the zero matrix into nested dicts keyed by column (verb) first and
# row label (subject) second, i.e. data_dict[verb][subject].
data_dict = data.to_dict()

In [120]:
# Confirm that the outer keys of the dict are the verb columns.
list(data_dict.keys())[:3]

['olema', 'teadma', 'ütlema']

In [121]:
# Fill the nested dict with the pair counts. An earlier variant (kept for
# reference) assigned straight into the DataFrame:
#     for tup in entries_to_keep:
#         data.loc[tup[0], tup[2]] = tup[4]
#
# NOTE: this cell is not idempotent. Running it again without first re-creating
# `data_dict` adds every count a second time.
for entry in final_entries:
    subject_lemma, verb_lemma, pair_total = entry[0], entry[2], entry[4]
    # Accumulate rather than assign: some pairs occur twice in the entries,
    # once with the subject tagged P and once tagged S. Every cell starts at 0,
    # so a pair that occurs once simply ends up with its own count.
    data_dict[verb_lemma][subject_lemma] += pair_total

In [122]:
# Turn the filled {verb: {subject: count}} mapping back into a DataFrame.
data_final = pd.DataFrame(data_dict)

In [123]:
# Display the filled subject-by-verb count matrix.
data_final

Unnamed: 0,olema,teadma,ütlema,tahtma,saama,tähendama,tegema,lisama,arvama,nägema,...,kiduma,klõbisema,runnima,viidsima,pritsuma,ketaalima,seiduma,ücima,müübima,juksima
tema,210031,15303,43006,22397,37392,829,33115,31796,7440,11059,...,0,0,0,0,0,0,0,0,0,0
mina,122824,48357,22122,41302,39695,131,29568,659,30362,21060,...,0,0,3,3,0,0,0,3,0,0
see,498627,2305,1064,868,7488,38195,10684,820,201,1968,...,0,0,0,0,3,0,0,0,0,0
mis,159528,607,2033,1970,17968,12900,11499,564,739,3730,...,0,0,0,0,0,0,0,0,0,0
kes,81930,8050,4846,15278,16658,34,13727,303,2673,3735,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cathy,10,0,7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Mumme,0,0,9,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
Kosk,0,0,10,0,0,0,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0
tsiviilkolleegium,6,0,0,0,0,0,6,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [124]:
# The shape should match the zero matrix created earlier.
data_final.shape

(15000, 4156)

In [125]:
# Largest count in each of the first five verb columns.
data_final.max().head(5)

olema     498627
teadma     48357
ütlema     43006
tahtma     41302
saama      39695
dtype: int64

In [128]:
# Sanity check: print the raw entries behind the ('see', 'olema') cell.
for entry in final_entries:
    if (entry[0], entry[2]) != ('see', 'olema'):
        continue
    print(entry)

('see', 'P', 'olema', 'V', 498627)


In [126]:
# Sanity check: print the raw entries behind the ('mina', 'olema') cell. The
# pair occurs once per subject POS tag, and the matrix holds the sum of both.
for entry in final_entries:
    if (entry[0], entry[2]) != ('mina', 'olema'):
        continue
    print(entry)

('mina', 'P', 'olema', 'V', 119511)
('mina', 'S', 'olema', 'V', 3313)


In [129]:
# Save the subject-by-verb count matrix to a CSV file.
data_final.to_csv('subject_verb_df.csv')