### Imports

In [1]:
from collections import defaultdict
from data_preprocessing import fetch_entries, connected_entries, matrix_creation

### Analysing the data

In [2]:
# Load the raw collocation entries via the project helper (the two arguments
# are presumably the database file and the table name -- confirm in
# data_preprocessing). The rest of the notebook indexes each entry
# positionally: field 0 is collected as the verb, field 2 as the object,
# field 4 is summed as a count, and fields 1 and 3 are compared against
# single-letter tags.
entries = fetch_entries(
    'verb_obj_collocations_20211112.db',
    'verb_obj_koondkorpus',
)

#### Removing words that don't fit the criteria

In [3]:
# Collect every entry whose field-3 tag is neither 'P' nor 'S'.
non_noun = [entry for entry in entries if entry[3] not in ('P', 'S')]

In [4]:
# Distinct field-3 tags found among the entries collected in non_noun.
{entry[3] for entry in non_noun}

{'A', 'G', 'N', 'Y', 'Z'}

In [5]:
# Sanity check: print any entry whose field-1 tag is something other than 'V'.
for entry in entries:
    if entry[1] == 'V':
        continue
    print(entry)

In [6]:
# Keep only the entries whose field-3 tag is 'S' or 'P'.
entries_to_keep = [entry for entry in entries if entry[3] in ('S', 'P')]

In [7]:
# De-duplicate the objects (field 2) and verbs (field 0) of the kept entries.
# dict keys are unique and remember insertion order, so each list holds its
# values in order of first appearance.
objects_non_dup = list(dict.fromkeys(entry[2] for entry in entries_to_keep))
verbs_non_dup = list(dict.fromkeys(entry[0] for entry in entries_to_keep))

#### Removing pairs that are not connected to others

In [8]:
# Pass the kept entries and the de-duplicated object/verb lists to the project
# helper connected_entries.
# NOTE(review): going by this section's heading, the helper presumably drops
# verb-object pairs that are not connected to any other pair -- confirm in
# data_preprocessing.
connected = connected_entries(
    entries_to_keep,
    objects_non_dup,
    verbs_non_dup,
)

#### Reducing the number of objects to a reasonable amount

In [9]:
# For each object (field 2), accumulate the sum of field 4 over all connected
# entries. defaultdict(int) makes an object's running total start at 0 the
# first time it is seen.
object_counts = defaultdict(int)

for entry in connected:
    obj = entry[2]
    object_counts[obj] = object_counts[obj] + entry[4]

In [10]:
# Maximum number of objects carried forward; tune this to trade vocabulary
# coverage against the size of the matrix built later.
MAX_OBJECTS = 15000

# Rank (object, total) pairs by total, largest first, and keep the top
# MAX_OBJECTS. sorted() is stable even with reverse=True, so objects with
# equal totals keep their order from object_counts.
objects_to_keep = sorted(
    object_counts.items(),
    key=lambda item: item[1],
    reverse=True,
)[:MAX_OBJECTS]

In [11]:
# Object names in ranked order; kept as a list because later cells pass it on
# where the ordering is used.
final_objects = [obj for obj, _ in objects_to_keep]

# Membership is tested once per connected entry, so use a set: a list lookup
# would rescan every kept object for each entry. The objects are already
# used as dict keys in object_counts, so they are hashable.
final_object_set = set(final_objects)
final_entries = [entry for entry in connected if entry[2] in final_object_set]

In [12]:
# Unique verbs (field 0) of the final entries, in order of first appearance.
# dict.fromkeys de-duplicates in a single pass while preserving insertion
# order, which avoids rescanning a growing list for every entry.
final_verbs = list(dict.fromkeys(entry[0] for entry in final_entries))

### Creating the matrix used for LDA

In [14]:
# Build the matrix with the project helper and transpose it. In the preview
# rendered below, the transposed frame has one row per object and one column
# per verb.
df = matrix_creation(
    final_entries,
    final_verbs,
    final_objects,
).T

In [15]:
# Preview the first five rows of the matrix.
df.head(n=5)

Unnamed: 0,tegema,saama,tundma,andma,sõlmima,teadma,maksma,pidama,võtma,pöörama,...,mountima,uvitama,nattima,sailitama,võrdlustama,nendevastuma,postima,väärindama,hapustama,inhibeerima
see,69534,9208,2789,6079,288,13465,2402,12953,12901,273,...,0,0,3,0,0,0,0,0,0,0
mis,46616,7582,3492,7881,834,5235,3286,8315,8511,116,...,0,0,0,0,0,0,0,0,0,0
tema,3881,2438,3037,1798,82,1208,285,4119,5879,98,...,0,0,0,0,3,0,0,0,0,0
ise,1552,374,20326,1257,5,216,279,6065,2645,430,...,0,0,0,0,0,0,0,0,0,0
miski,22786,4216,571,2191,16,9647,1595,315,6522,29,...,0,0,0,0,0,0,0,0,0,0
