## Imports and fetching data

In [1]:
import sqlite3
import numpy as np
import pandas as pd
from collections import defaultdict, Counter

In [2]:
conn = sqlite3.connect('verb_compound_advmod_collocations_20220505.db')
cur = conn.cursor()

In [3]:
cur.execute('SELECT name FROM sqlite_master WHERE type="table";')
cur.fetchall()

[('collections_processed',), ('verb_advmod_extended_koondkorpus_base',)]

In [4]:
cur.execute("SELECT * FROM verb_advmod_extended_koondkorpus_base;")
entries = cur.fetchall()

In [5]:
entries[:3]

[('olema',
  'V',
  '',
  '',
  '',
  '',
  9449632,
  'devilgirl_unes : ulluke tead see zum-zum on nii paha = (',
  'zum_zum : aaaaaaaahh .... sina oledki see tama armuke',
  'zum_zum : oled ka titta vä'),
 ('toimuma',
  'V',
  '',
  '',
  '',
  '',
  105688,
  'sarah_14 : ok ja mis w2rk teil sin sis toimub ??',
  'mari : mis toimub',
  'Coolboy : mis toimub'),
 ('saama',
  'V',
  'pihta',
  'D',
  '',
  '',
  1307,
  'Krissu : heheeeee ....... ei saa pihta millest jutt = jätke mu kaablid rahule',
  'Sk8er-Girl : kuulge-eip saa poindile pihta millezt jutt ?',
  'rose_eml : said pihta asjale ?')]

In [6]:
cur.execute('PRAGMA table_info(verb_advmod_extended_koondkorpus_base)')
cur.fetchall()

[(0, 'verb_lemma', 'text', 0, None, 0),
 (1, 'verb_pos', 'text', 0, None, 0),
 (2, 'compound_lemma', 'text', 0, None, 0),
 (3, 'compound_pos', 'text', 0, None, 0),
 (4, 'advmod_lemma', 'text', 0, None, 0),
 (5, 'advmod_pos', 'text', 0, None, 0),
 (6, 'total', 'integer', 0, None, 0),
 (7, 'example1', 'text', 0, None, 0),
 (8, 'example2', 'text', 0, None, 0),
 (9, 'example3', 'text', 0, None, 0)]

## Word count

### Adverbs

In [7]:
entries[0]

('olema',
 'V',
 '',
 '',
 '',
 '',
 9449632,
 'devilgirl_unes : ulluke tead see zum-zum on nii paha = (',
 'zum_zum : aaaaaaaahh .... sina oledki see tama armuke',
 'zum_zum : oled ka titta vä')

In [8]:
entries[0][4]

''

In [9]:
adverbs = [entry[4] for entry in entries]

In [10]:
adverb_counts = Counter(adverbs)

In [11]:
adverb_counts.most_common(10)

[('', 64384),
 ('siis', 14220),
 ('ka', 8697),
 ('kas', 6656),
 ('nii', 6615),
 ('veel', 6317),
 ('juba', 6307),
 ('nüüd', 6130),
 ('küll', 5763),
 ('seal', 5476)]

In [12]:
len(adverbs), len(adverb_counts)

(1087318, 24021)

### Verbs

In [13]:
verbs = [entry[0] for entry in entries]

In [14]:
verb_counts = Counter(verbs)

In [15]:
verb_counts.most_common(10)

[('saama', 11161),
 ('minema', 11117),
 ('võtma', 10050),
 ('tulema', 10014),
 ('olema', 9556),
 ('jääma', 9068),
 ('tegema', 8855),
 ('panema', 7995),
 ('käima', 6388),
 ('andma', 6345)]

In [16]:
len(verbs), len(verb_counts)

(1087318, 27610)

## Verbs by adverb counts

In [17]:
entries[0]

('olema',
 'V',
 '',
 '',
 '',
 '',
 9449632,
 'devilgirl_unes : ulluke tead see zum-zum on nii paha = (',
 'zum_zum : aaaaaaaahh .... sina oledki see tama armuke',
 'zum_zum : oled ka titta vä')

In [18]:
verb_adverb = defaultdict(lambda: defaultdict(int))
verb_adverb_with_counts = defaultdict(lambda: defaultdict(int))

for entry in entries:
    verb, _, _, _, adverb, _, count, _, _, _ = entry
    verb_adverb[verb][adverb] += 1
    verb_adverb_with_counts[verb][adverb] += count

In [19]:
verb_counts.most_common(5)

[('saama', 11161),
 ('minema', 11117),
 ('võtma', 10050),
 ('tulema', 10014),
 ('olema', 9556)]

In [20]:
len(verb_adverb["saama"])

3876

In [21]:
len(verb_adverb["minema"])

3271

In [22]:
len(verb_adverb["võtma"])

3333

## Adverbs by verb counts

In [23]:
adverb_verb = defaultdict(lambda: defaultdict(int))
adverb_verb_with_counts = defaultdict(lambda: defaultdict(int))

for entry in entries:
    verb, _, _, _, adverb, _, count, _, _, _ = entry
    adverb_verb[adverb][verb] += 1
    adverb_verb_with_counts[adverb][verb] += count

In [24]:
adverb_counts.most_common(5)

[('', 64384), ('siis', 14220), ('ka', 8697), ('kas', 6656), ('nii', 6615)]

In [25]:
len(adverb_verb["siis"])

4847

In [26]:
len(adverb_verb[""])

23150

In [27]:
len(verb_counts)

27610

In [28]:
diff_verb_counts = []

for adv, verbs in adverb_verb.items():
    diff_verb_counts.append((adv, len(verbs)))

In [29]:
diff_verb_counts[:3]

[('', 23150), ('sageli', 1589), ('tihti', 1496)]

In [30]:
sorted(diff_verb_counts, key=lambda x: x[1], reverse=True)[:10]

[('', 23150),
 ('siis', 4847),
 ('ka', 4387),
 ('kas', 3667),
 ('nii', 3615),
 ('veel', 3287),
 ('juba', 3216),
 ('siin', 3186),
 ('nüüd', 3183),
 ('küll', 3171)]

## Kui palju verbid esinevad adverbidega

  pr(verb & som adverb)  / pr(verb) skoor peaks olema suur
  
pr(verb & adverb) -> verbiga on koos mingi adverb, liida kokku kõik, kus on midagi ja siis jaga (verb ja adverb)/(verb ja adverb + verb ja ei ole adverbi)

In [31]:
verb_scores = defaultdict(lambda: defaultdict(int))
verbs_counts = defaultdict(int)

for entry in entries:
    verb, _, _, _, adverb, _, count, _, _, _ = entry
    verbs_counts[verb] += count
    if adverb == "":
        verb_scores[verb]["no adverb"] += count
    else:
        verb_scores[verb]["adverb"] += count

In [32]:
all_verb_counts = sum(verbs_counts.values())

In [33]:
all_verb_counts

45179127

In [34]:
verb_probabilities = {}

for verb, value in verbs_counts.items():
    verb_probabilities[verb] = value / all_verb_counts

In [35]:
verb_probabilities["olema"]

0.21904057597217405

In [36]:
verb_scores["olema"]

defaultdict(int, {'no adverb': 9602317, 'adverb': 293745})

In [37]:
293745 / (293745 + 9602317)

0.029683019366693538

In [38]:
scores_verbs = []

for verb, value in verb_scores.items():
    adv = value["adverb"]
    no_adv = value["no adverb"]
    score = adv / (adv + no_adv)
    scores_verbs.append((verb, score))

In [39]:
sorted(scores_verbs, key=lambda x: x[1], reverse=True)[:10]

[('mõtlen-rääkima', 1.0),
 ('joostavõimlema', 1.0),
 ('müüa-kinkida-pantida-vahetama', 1.0),
 ('otsis-ostma', 1.0),
 ('lendad-lendama', 1.0),
 ('suhelda-lavastama', 1.0),
 ('ostnud-müünud-vahetama', 1.0),
 ('coverdattima', 1.0),
 ('rippudes-rahmeldama', 1.0),
 ('hoitud-kaitsma', 1.0)]

In [40]:
verb_scores["mõtlen-rääkima"]

defaultdict(int, {'adverb': 2, 'no adverb': 0})

## Cleaning the data

In [41]:
verbs_counts["olema"]

9896062

In [42]:
verbs_counts["mõtlen-rääkima"]

2

In [43]:
adverb_counts[""]

64384

In [44]:
adverbs_counts = defaultdict(int)

for entry in entries:
    _, _, _, _, adverb, _, count, _, _, _ = entry
    adverbs_counts[adverb] += count

In [45]:
adverbs_counts[""]

36025026

In [46]:
adverbs_counts["kümnetekaupa"]

1

Vaata mis märgend sõnadel on (mingist varasemast data_preprocessing failist saab vaadata, kuidas seda teha) ja eemalda kui ei ole verb, also eemalda kõik, mille verbs_counts väärtus on < 10 (?)

In [47]:
from estnltk.vabamorf.morf import Vabamorf

In [74]:
vm = Vabamorf()

In [75]:
len(list(verbs_counts.keys()))

27610

In [76]:
vm_analysis = vm.analyze(list(verbs_counts.keys()), guess=False)

In [77]:
non_verbs = []
non_verbs_analysis = []
disamb_non_verbs = []
disamb_non_verbs_analysis = []
disamb_verbs = []
disamb_verbs_analysis = []

for analysis in vm_analysis:
    if len(analysis["analysis"]) == 1:
        if analysis["analysis"][0]["partofspeech"] != "V":
            non_verbs.append(analysis["text"])
            non_verbs_analysis.append(analysis)
    else:
        word_postags = [an["partofspeech"] for an in analysis["analysis"]]
        if "V" in word_postags:
            disamb_verbs.append(analysis["text"])
            disamb_verbs_analysis.append(analysis)
        else:
            disamb_non_verbs.append(analysis["text"])
            disamb_non_verbs_analysis.append(analysis)

In [78]:
len(non_verbs), len(disamb_non_verbs), len(disamb_verbs)

(7632, 1, 621)

In [79]:
non_verbs[:10]

['julgema',
 'anuma',
 'ära',
 'looma',
 'kohmetuma',
 'pahvima',
 'vahetuma',
 'küpsema',
 'peetuma',
 'kärtsuma']

In [80]:
vm_analysis_adv = vm.analyze(list(adverb_verb.keys()), guess=False)

In [81]:
non_adverbs = []
non_adverbs_analysis = []
disamb_non_adverbs = []
disamb_non_adverbs_analysis = []
disamb_adverbs = []
disamb_adverbs_analysis = []

for analysis in vm_analysis_adv:
    if len(analysis["analysis"]) == 1:
        if analysis["analysis"][0]["partofspeech"] != "D":
            non_adverbs.append(analysis["text"])
            non_adverbs_analysis.append(analysis)
    else:
        word_postags = [an["partofspeech"] for an in analysis["analysis"]]
        if "D" in word_postags:
            disamb_adverbs.append(analysis["text"])
            disamb_adverbs_analysis.append(analysis)
        else:
            disamb_non_adverbs.append(analysis["text"])
            disamb_non_adverbs_analysis.append(analysis)

In [82]:
len(non_adverbs), len(disamb_non_adverbs), len(disamb_adverbs)

(918, 84, 7)

In [83]:
non_adverbs[:10]

['',
 'aga',
 'vahel',
 'ega',
 'parata',
 'kihla',
 'väärt',
 'ei',
 'mahti',
 'pärast']

In [84]:
print(entries[0])

('olema', 'V', '', '', '', '', 9449632, 'devilgirl_unes : ulluke tead see zum-zum on nii paha = (', 'zum_zum : aaaaaaaahh .... sina oledki see tama armuke', 'zum_zum : oled ka titta vä')


In [85]:
len(entries)

1087318

In [86]:
entries_to_keep = []

for entry in entries:
    verb = entry[0]
    adv = entry[4]
    if verbs_counts[verb] >= 20 and adverbs_counts[adv] >= 20:
        if verb not in non_verbs and (adv not in non_adverbs or adv == ""):
            entries_to_keep.append(entry)

In [87]:
len(entries_to_keep)

975137

## Kui palju verbid esinevad adverbidega

  pr(verb & som adverb)  / pr(verb) skoor peaks olema suur
  
pr(verb & adverb) -> verbiga on koos mingi adverb, liida kokku kõik, kus on midagi ja siis jaga (verb ja adverb)/(verb ja adverb + verb ja ei ole adverbi)

In [88]:
verb_scores = defaultdict(lambda: defaultdict(int))
verbs_counts = defaultdict(int)

for entry in entries_to_keep:
    verb, _, _, _, adverb, _, count, _, _, _ = entry
    verbs_counts[verb] += count
    if adverb == "":
        verb_scores[verb]["no adverb"] += count
    else:
        verb_scores[verb]["adverb"] += count

In [89]:
all_verb_counts = sum(verbs_counts.values())

In [90]:
all_verb_counts

44628989

In [91]:
verb_probabilities = {}

for verb, value in verbs_counts.items():
    verb_probabilities[verb] = value / all_verb_counts

In [92]:
verb_probabilities["olema"]

0.221474611490751

In [93]:
verb_scores["olema"]

defaultdict(int, {'no adverb': 9602317, 'adverb': 281871})

In [94]:
scores_verbs = []

for verb, value in verb_scores.items():
    adv = value["adverb"]
    no_adv = value["no adverb"]
    score = adv / (adv + no_adv)
    scores_verbs.append((verb, score))

In [95]:
sorted(scores_verbs, key=lambda x: x[1], reverse=True)[:25]

[('winnuma', 1.0),
 ('meelestama', 0.9341275402943238),
 ('vinnuma', 0.9122807017543859),
 ('räägitud-kirjutama', 0.9),
 ('nurmaalnema', 0.9),
 ('runema', 0.88),
 ('läbuma', 0.7910447761194029),
 ('joudnuma', 0.7741935483870968),
 ('möjuma', 0.7727272727272727),
 ('koteerima', 0.7682926829268293),
 ('joodima', 0.7307692307692307),
 ('j6udnuma', 0.725),
 ('talitama', 0.7248368258330471),
 ('k2ituma', 0.72),
 ('muundama', 0.7090719499478624),
 ('icitama', 0.7027027027027027),
 ('üttlema', 0.696969696969697),
 ('kumerduma', 0.6842105263157895),
 ('suunitlema', 0.6829268292682927),
 ('pangestuma', 0.68),
 ('tõestuma', 0.6756756756756757),
 ('töllerdama', 0.6737588652482269),
 ('suhtuma', 0.6732404090967937),
 ('käituma', 0.6707726202994342),
 ('ühestama', 0.6666666666666666)]

In [96]:
verb_scores["käituma"]

defaultdict(int, {'no adverb': 6575, 'adverb': 13396})

In [97]:
sorted(scores_verbs, key=lambda x: x[1], reverse=True)[-25:]

[('rudima', 0.029411764705882353),
 ('olema', 0.02851736531114139),
 ('põnnama', 0.02666666666666667),
 ('pipettima', 0.022727272727272728),
 ('surra-murdma', 0.02197802197802198),
 ('viirama', 0.011023622047244094),
 ('k6tu-valutama', 0.008264462809917356),
 ('tohtima', 0.008254193662909382),
 ('neimama', 0.005),
 ('võima', 0.004072675726143907),
 ('ei', 0.00015293677394282454),
 ('ibima', 0.0),
 ('teppima', 0.0),
 ('urama', 0.0),
 ('kangestama', 0.0),
 ('nihvama', 0.0),
 ('ohnema', 0.0),
 ('användama', 0.0),
 ('limitama', 0.0),
 ('ecuma', 0.0),
 ('devilgirls6bratama', 0.0),
 ('kiizuma', 0.0),
 ('krikskokkama', 0.0),
 ('hottama', 0.0),
 ('celakylmetama', 0.0)]

## Adverbs by number of verbs

In [98]:
adverb_verb = defaultdict(lambda: defaultdict(int))
adverb_verb_with_counts = defaultdict(lambda: defaultdict(int))
adverb_verb_set = defaultdict(set)

for entry in entries:
    verb, _, _, _, adverb, _, count, _, _, _ = entry
    adverb_verb[adverb][verb] += 1
    adverb_verb_with_counts[adverb][verb] += count
    adverb_verb_set[adverb].add(verb)

In [99]:
adverb_verb_counts = [(adv, len(v)) for adv, v in adverb_verb_set.items()]

In [100]:
sorted(adverb_verb_counts, key=lambda x: x[1], reverse=True)[:25]

[('', 23150),
 ('siis', 4847),
 ('ka', 4387),
 ('kas', 3667),
 ('nii', 3615),
 ('veel', 3287),
 ('juba', 3216),
 ('siin', 3186),
 ('nüüd', 3183),
 ('küll', 3171),
 ('seal', 3162),
 ('ju', 3061),
 ('enam', 2999),
 ('ikka', 2953),
 ('aga', 2661),
 ('rohkem', 2603),
 ('praegu', 2588),
 ('lihtsalt', 2586),
 ('mitte', 2435),
 ('samuti', 2342),
 ('kohe', 2317),
 ('siiski', 2294),
 ('hiljem', 2252),
 ('palju', 2162),
 ('jälle', 2159)]

In [101]:
for a, c in adverb_verb_counts:
    if a[-2:] == "ni":
        print(a, c)

siiani 1276
seni 1809
tänini 473
paremini 1145
tänaseni 648
siiamaani 739
kiiremini 929
sinnamaani 140
meelsamini 271
halvemini 180
kõvemini 176
sinnani 105
koguni 902
senini 594
segamini 201
kõrini 42
pärani 53
ilusamini 42
kergemini 378
kõrgemini 11
senimaani 207
üleni 249
viimseni 173
kindlamini 132
rutemini 21
jänni 11
lihtsamini 116
sagedamini 739
harvemini 190
tihemini 134
kehvemini 126
seniajani 297
tugevamini 224
tihedamini 269
selgemini 145
õigemini 529
hullemini 178
pilgeni 58
hõlpsamini 109
tasemini 18
valjemini 79
kenamini 9
valusamini 28
nõrgemini 43
üksipäini 141
nobedamini 33
täpsemini 203
pigemini 65
teravamini 23
julgemini 83
targemini 35
kinni 25
ühe-kuni 1
raskemini 70
viletsamini 20
hiljemini 8
pikemini 3
mugavamini 7
ennemini 79
kauemini 11
hambuni 19
kuhumaani 21
sirgemini 1
etemini 27
odavamini 17
mõnusamini 2
rängemini 22
aktiivsemini 3
rangemini 11
ägedamini 4
kainemini 1
läbini 24
täbaramini 2
hapramini 1
lahti-kinni 10
karmimini 1
pehmemini 2
heledamini 1
aeg

## Test

In [102]:
from estnltk import Text

In [103]:
adverbs[:10]

['', '', '', '', '', '', '', '', '', '']

In [104]:
singular_adverbs = list(set(adverbs))

In [105]:
t = Text(singular_adverbs[1])

In [106]:
t.tag_layer()

text
sooliselt

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,1
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,1
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,1


In [107]:
for entry in entries:
    if entry[5] not in ["", "D"]:
        print(entry)
        break

('hakkama', 'V', '', '', 'aga', 'J', 1791, 'Tydruk : ääää hirmuta juba tilk pyxis ..... ma ei kannata kui mingi libu minuga paugub ... nii et hakka aga tulema ... ja tgasi sa enam ei jõua', 'siberimees : hakka aga otsima :)', 'Lovemachine : njah , ma ka kohe hakkan aga teega')


In [108]:
new_entries = []

for entry in entries_to_keep:
    if entry[5] in ["", "D"]:
        new_entries.append(entry)

In [109]:
len(entries), len(entries_to_keep), len(new_entries)

(1087318, 975137, 970775)

#### Finding top adverbs

In [110]:
adverb_verb = defaultdict(lambda: defaultdict(int))
adverb_verb_with_counts = defaultdict(lambda: defaultdict(int))
adverb_verb_set = defaultdict(set)

for entry in new_entries:
    verb, _, _, _, adverb, _, count, _, _, _ = entry
    adverb_verb[adverb][verb] += 1
    adverb_verb_with_counts[adverb][verb] += count
    adverb_verb_set[adverb].add(verb)

In [111]:
adverb_verb_counts = [(adv, len(v)) for adv, v in adverb_verb_set.items()]

In [112]:
sorted(adverb_verb_counts, key=lambda x: x[1], reverse=True)[:25]

[('', 4907),
 ('siis', 3555),
 ('ka', 3541),
 ('nii', 3161),
 ('kas', 3028),
 ('veel', 2926),
 ('nüüd', 2911),
 ('küll', 2884),
 ('juba', 2870),
 ('seal', 2806),
 ('enam', 2721),
 ('siin', 2636),
 ('ikka', 2601),
 ('ju', 2600),
 ('praegu', 2422),
 ('rohkem', 2417),
 ('lihtsalt', 2383),
 ('samuti', 2243),
 ('mitte', 2242),
 ('siiski', 2193),
 ('hiljem', 2169),
 ('kohe', 2139),
 ('jälle', 2021),
 ('samas', 1995),
 ('palju', 1990)]

#### Seotud laiendite leidmine

In [113]:
verb_scores = defaultdict(lambda: defaultdict(int))
verbs_counts = defaultdict(int)

for entry in new_entries:
    verb, _, _, _, adverb, _, count, _, _, _ = entry
    verbs_counts[verb] += count
    if adverb == "":
        verb_scores[verb]["no adverb"] += count
    else:
        verb_scores[verb]["adverb"] += count

In [114]:
all_verb_counts = sum(verbs_counts.values())

In [115]:
verb_probabilities = {}

for verb, value in verbs_counts.items():
    verb_probabilities[verb] = value / all_verb_counts

In [116]:
scores_verbs = []

for verb, value in verb_scores.items():
    adv = value["adverb"]
    no_adv = value["no adverb"]
    score = adv / (adv + no_adv)
    scores_verbs.append((verb, score))

In [118]:
sorted(scores_verbs, key=lambda x: x[1], reverse=True)[:50]

[('winnuma', 1.0),
 ('meelestama', 0.9340350877192982),
 ('vinnuma', 0.9122807017543859),
 ('räägitud-kirjutama', 0.9),
 ('nurmaalnema', 0.9),
 ('runema', 0.88),
 ('läbuma', 0.7910447761194029),
 ('joudnuma', 0.7741935483870968),
 ('möjuma', 0.7727272727272727),
 ('koteerima', 0.7682926829268293),
 ('joodima', 0.7307692307692307),
 ('talitama', 0.7221644120707597),
 ('k2ituma', 0.72),
 ('j6udnuma', 0.717948717948718),
 ('muundama', 0.7081589958158996),
 ('icitama', 0.7027027027027027),
 ('üttlema', 0.6875),
 ('kumerduma', 0.6842105263157895),
 ('suunitlema', 0.6829268292682927),
 ('pangestuma', 0.68),
 ('suhtuma', 0.6720232165287824),
 ('töllerdama', 0.6714285714285714),
 ('käituma', 0.6688491563837824),
 ('tõestuma', 0.6666666666666666),
 ('ühestama', 0.6666666666666666),
 ('trippima', 0.6666666666666666),
 ('mõjuma', 0.6631308147730477),
 ('näkkama', 0.6631299734748011),
 ('zhestikuleerima', 0.6595744680851063),
 ('reguleeruma', 0.65625),
 ('tempereerima', 0.6538461538461539),
 ('suu

Seotud verb-adverb paarid:
1. talitama õigesti
2. suhtuma tõsiselt
3. käituma õigesti
4. mõjuma hästi
5. kohtlema võrdselt
6. jaotuma ühtlaselt

In [119]:
new_entries[2]

('saama',
 'V',
 'pihta',
 'D',
 '',
 '',
 1307,
 'Krissu : heheeeee ....... ei saa pihta millest jutt = jätke mu kaablid rahule',
 'Sk8er-Girl : kuulge-eip saa poindile pihta millezt jutt ?',
 'rose_eml : said pihta asjale ?')

In [120]:
talitama = 0
õigesti = 0
talitama_õigesti = 0
kokku = 0

for entry in new_entries:
    verb, _, _, _, adverb, _, count, _, _, _ = entry
    kokku += count
    if verb == "talitama":
        talitama += count
    if adverb == "õigesti":
        õigesti += count
    if verb == "talitama" and adverb == "õigesti":
        talitama_õigesti += count

1) armastama sagedus on 1/1000 lause kohta
2) leegitsevalt sagedus on 1/20000 lause kohta
3) leegitsevalt armastama oodatuds sagedus on 1/1000 * 1/20000 lause kohta
4) leegitsevalt armastama tegelik sagedus on 1/50000 lause kohta
5) lift on  1/50000 /  (1/1000 * 1/20000)  aga seda on parem vaadata kui võtta logaritm

In [121]:
np.log((talitama_õigesti / kokku) / ((talitama / kokku) * (õigesti / kokku)))

5.6262361215084615

In [122]:
suhtuma = 0
õigesti = 0
suhtuma_õigesti = 0
kokku = 0

for entry in new_entries:
    verb, _, _, _, adverb, _, count, _, _, _ = entry
    kokku += count
    if verb == "suhtuma":
        suhtuma += count
    if adverb == "õigesti":
        õigesti += count
    if verb == "suhtuma" and adverb == "õigesti":
        suhtuma_õigesti += count

In [123]:
np.log((suhtuma_õigesti / kokku) / ((suhtuma / kokku) * (õigesti / kokku)))

0.26238350560876705

In [124]:
talitama = 0
talitama_adv = 0
kokku = 0

for entry in new_entries:
    verb, _, _, _, adverb, _, count, _, _, _ = entry
    kokku += count
    if verb == "talitama":
        talitama += count
        if adverb != "":
            talitama_adv += count

In [125]:
talitama, talitama_adv

(2883, 2082)

 pr(verb & som adverb)  / pr(verb) 

In [126]:
(talitama_adv / kokku) / (talitama / kokku)

0.7221644120707597

Leia kõikide verbide jaoks see ülemine skoor: (verb_adv / all) / (verb / all)
Järjesta skoori järgi
Vaata top adverbe ja eemalda mingi hunnik "vabu" adverbe (mis tunduvad vabad)
---> potentsiaalselt wordneti abil, nt kui ülem on aeg või koht vms
Siis tee uuesti ülemine koodijupp pmst, ainult et 
if adverb != "":
    talitama_adv += count
    
asenda:
if adverb not in ["", vabad adverbid]:
    talitama_adv += count


In [127]:
kokku = 0

for entry in new_entries:
    verb, _, _, _, adverb, _, count, _, _, _ = entry
    kokku += count

In [128]:
verb_counts = defaultdict(lambda: defaultdict(int))

for entry in new_entries:
    verb, _, _, _, adverb, _, count, _, _, _ = entry
    verb_counts[verb]["total"] += count
    if adverb == "":
        verb_counts[verb]["no adverb"] += count
    else:
        verb_counts[verb]["adverb"] += count

In [129]:
verb_counts["olema"]

defaultdict(int, {'total': 9876915, 'no adverb': 9602317, 'adverb': 274598})

In [130]:
verb_scores = {}

for verb, value in verb_counts.items():
    score = (value["adverb"] / kokku) / (value["total"] / kokku)
    verb_scores[verb] = score

In [131]:
verb_scores["olema"]

0.02780200092842755

In [132]:
sorted(verb_scores.items(), key=lambda x: x[1], reverse=True)[:50]

[('winnuma', 1.0),
 ('meelestama', 0.9340350877192983),
 ('vinnuma', 0.912280701754386),
 ('räägitud-kirjutama', 0.9),
 ('nurmaalnema', 0.9),
 ('runema', 0.88),
 ('läbuma', 0.7910447761194028),
 ('joudnuma', 0.7741935483870968),
 ('möjuma', 0.7727272727272726),
 ('koteerima', 0.7682926829268293),
 ('joodima', 0.7307692307692307),
 ('talitama', 0.7221644120707597),
 ('k2ituma', 0.72),
 ('j6udnuma', 0.7179487179487178),
 ('muundama', 0.7081589958158996),
 ('icitama', 0.7027027027027027),
 ('üttlema', 0.6875),
 ('kumerduma', 0.6842105263157895),
 ('suunitlema', 0.6829268292682926),
 ('pangestuma', 0.6799999999999999),
 ('suhtuma', 0.6720232165287824),
 ('töllerdama', 0.6714285714285715),
 ('käituma', 0.6688491563837825),
 ('ühestama', 0.6666666666666667),
 ('trippima', 0.6666666666666667),
 ('tõestuma', 0.6666666666666666),
 ('mõjuma', 0.6631308147730477),
 ('näkkama', 0.6631299734748011),
 ('zhestikuleerima', 0.6595744680851063),
 ('reguleeruma', 0.65625),
 ('tempereerima', 0.65384615384

#### Seotud laienditega verbide leidmine

Ülemiste skooride arvutamine ning lisaks juurde ka mitmes lauses kokku esineb ning mitme erineva adverbiga koos.

In [134]:
verb_counts = defaultdict(lambda: defaultdict(int))

for entry in new_entries:
    verb, _, _, _, adverb, _, count, _, _, _ = entry
    verb_counts[verb]["total"] += count
    verb_counts[verb]["different adverbs"] += 1
    if adverb == "":
        verb_counts[verb]["no adverb"] += count
    else:
        verb_counts[verb]["adverb"] += count

In [135]:
verb_info = {}

for verb, value in verb_counts.items():
    score = (value["adverb"] / kokku) / (value["total"] / kokku)
    verb_info[verb] = {"score": score, "total": value["total"], "adverbs": value["different adverbs"]}

In [141]:
verb_info_df = pd.DataFrame(verb_info).T

In [139]:
verb_info_df.to_csv("verb_info.csv", encoding="utf-8-sig")

In [142]:
verb_info_df = verb_info_df[verb_info_df["total"] >= 100]

In [143]:
verb_info_df = verb_info_df[verb_info_df["adverbs"] >= 10]

In [144]:
verb_info_df.shape

(3309, 3)

In [145]:
cleaned_verb_scores = [(k, v) for k, v in verb_scores.items() if k in verb_info_df.index]

In [146]:
len(cleaned_verb_scores)

3309

In [147]:
cleaned_verb_scores[:5]

[('olema', 0.02780200092842755),
 ('toimuma', 0.293384658121354),
 ('saama', 0.21402081465214975),
 ('oskama', 0.3027908991522102),
 ('ei', 0.0001374235812215848)]

In [250]:
sorted(cleaned_verb_scores, key=lambda x: x[1], reverse=True)[:10]

[('meelestama', 0.9340350877192983),
 ('talitama', 0.7221644120707597),
 ('muundama', 0.7081589958158996),
 ('suhtuma', 0.6720232165287824),
 ('töllerdama', 0.6714285714285715),
 ('käituma', 0.6688491563837825),
 ('mõjuma', 0.6631308147730477),
 ('näkkama', 0.6631299734748011),
 ('sujuma', 0.6527415143603134),
 ('kohtlema', 0.6391129032258065)]

In [251]:
sorted(cleaned_verb_scores, key=lambda x: x[1], reverse=True)[-10:]

[('sõnama', 0.05269009330950963),
 ('partsima', 0.04137931034482759),
 ('helluma', 0.036432160804020106),
 ('tänama', 0.03597360155644003),
 ('sihvama', 0.03529411764705883),
 ('olema', 0.02780200092842755),
 ('viirama', 0.011023622047244093),
 ('tohtima', 0.008128431757947729),
 ('võima', 0.004029235126860931),
 ('ei', 0.0001374235812215848)]

#### Vabade adverbide leidmine Wordneti abil

In [183]:
from estnltk.wordnet import Wordnet

In [144]:
adverbs = list(set([entry[4] for entry in new_entries]))

In [142]:
wn = Wordnet()

In [148]:
"hommikuti" in adverbs

True

In [154]:
wn["hommikuti"][0].hypernyms

[]

In [155]:
len(adverbs)

4228

In [156]:
adverb_hypernyms = {}

for adv in adverbs:
    synsets = wn[adv]
    if len(synsets) > 0:
        adverb_hypernyms[adv] = synsets[0].hypernyms

In [157]:
len(adverb_hypernyms.keys())

2371

In [158]:
list(adverb_hypernyms.keys())[:5]

['heldinult', 'alati', 'pikali', 'religioosselt', 'momentselt']

In [168]:
 vm.analyze("heldinud")

[{'text': 'heldinud',
  'analysis': [{'root': 'heldi',
    'root_tokens': ['heldi'],
    'ending': 'nud',
    'clitic': '',
    'partofspeech': 'V',
    'form': 'nud',
    'lemma': 'heldima'},
   {'root': 'heldi=nud',
    'root_tokens': ['heldinud'],
    'ending': '0',
    'clitic': '',
    'partofspeech': 'A',
    'form': '',
    'lemma': 'heldinud'},
   {'root': 'heldi=nud',
    'root_tokens': ['heldinud'],
    'ending': '0',
    'clitic': '',
    'partofspeech': 'A',
    'form': 'sg n',
    'lemma': 'heldinud'},
   {'root': 'heldi=nud',
    'root_tokens': ['heldinud'],
    'ending': 'd',
    'clitic': '',
    'partofspeech': 'A',
    'form': 'pl n',
    'lemma': 'heldinud'}]}]

In [167]:
vm.analyze("hommikuti")

[{'text': 'hommikuti',
  'analysis': [{'root': 'hommikuti',
    'root_tokens': ['hommikuti'],
    'ending': '0',
    'clitic': '',
    'partofspeech': 'D',
    'form': '',
    'lemma': 'hommikuti'}]}]

In [159]:
adverb_hypernyms["alati"]

[]

In [160]:
non_empty_hypernyms = []

for adv, hypernyms in adverb_hypernyms.items():
    if len(hypernyms) > 0:
        non_empty_hypernyms.append(adv)

In [161]:
len(non_empty_hypernyms)

8

In [163]:
print(non_empty_hypernyms)

['koost', 'üll', 'miks', 'hange', 'looja', 'liigiti', 'kalli-kalli', 'enamjagu']


#### Adverbide eemaldamine sageduse järgi

In [133]:
sorted(adverb_verb_counts, key=lambda x: x[1], reverse=True)[:50]

[('', 4907),
 ('siis', 3555),
 ('ka', 3541),
 ('nii', 3161),
 ('kas', 3028),
 ('veel', 2926),
 ('nüüd', 2911),
 ('küll', 2884),
 ('juba', 2870),
 ('seal', 2806),
 ('enam', 2721),
 ('siin', 2636),
 ('ikka', 2601),
 ('ju', 2600),
 ('praegu', 2422),
 ('rohkem', 2417),
 ('lihtsalt', 2383),
 ('samuti', 2243),
 ('mitte', 2242),
 ('siiski', 2193),
 ('hiljem', 2169),
 ('kohe', 2139),
 ('jälle', 2021),
 ('samas', 1995),
 ('palju', 1990),
 ('nagu', 1967),
 ('taas', 1953),
 ('tegelikult', 1950),
 ('ainult', 1950),
 ('täna', 1942),
 ('varem', 1929),
 ('kunagi', 1908),
 ('ilmselt', 1904),
 ('isegi', 1900),
 ('eile', 1896),
 ('alati', 1823),
 ('ehk', 1807),
 ('kindlasti', 1796),
 ('muidugi', 1779),
 ('seni', 1761),
 ('seejärel', 1748),
 ('lõpuks', 1745),
 ('vaid', 1741),
 ('enne', 1721),
 ('pidevalt', 1708),
 ('muidu', 1699),
 ('üldse', 1673),
 ('seetõttu', 1633),
 ('seega', 1626),
 ('ikkagi', 1624)]

#### Adverbide leidmine lõpu järgi

In [150]:
adverb_verb_counts[:5]

[('', 4907),
 ('sageli', 1548),
 ('tihti', 1448),
 ('lihtsalt', 2383),
 ('kuidagi', 1478)]

In [151]:
advs = [adv for adv, c in adverb_verb_counts[1:]]

In [152]:
len(advs)

4227

In [154]:
advs[0], advs[0][-2:]

('sageli', 'li')

In [155]:
group_per_ending = defaultdict(list)

for adv in advs:
    group_per_ending[adv[-2:]].append(adv)

In [156]:
len(group_per_ending)

109

In [160]:
sorted([(end, len(l)) for end, l in group_per_ending.items()], key=lambda x: x[1], reverse=True)

[('lt', 2589),
 ('ti', 275),
 ('le', 110),
 ('si', 89),
 ('al', 86),
 ('di', 82),
 ('ni', 63),
 ('st', 55),
 ('ks', 45),
 ('gi', 41),
 ('is', 36),
 ('tu', 34),
 ('il', 33),
 ('as', 32),
 ('li', 30),
 ('gu', 29),
 ('ki', 28),
 ('es', 27),
 ('el', 27),
 ('ku', 22),
 ('da', 21),
 ('ga', 20),
 ('ta', 20),
 ('du', 19),
 ('se', 18),
 ('sa', 18),
 ('ma', 17),
 ('rd', 16),
 ('us', 15),
 ('pa', 15),
 ('em', 13),
 ('na', 13),
 ('lu', 13),
 ('va', 12),
 ('ju', 12),
 ('ja', 11),
 ('la', 10),
 ('ge', 9),
 ('ul', 9),
 ('ol', 9),
 ('te', 8),
 ('ui', 8),
 ('ra', 8),
 ('ka', 7),
 ('he', 7),
 ('ri', 7),
 ('mi', 7),
 ('äi', 7),
 ('ha', 7),
 ('su', 6),
 ('bi', 6),
 ('ah', 5),
 ('vu', 5),
 ('de', 5),
 ('me', 5),
 ('id', 5),
 ('rt', 5),
 ('ii', 4),
 ('ke', 4),
 ('mu', 4),
 ('ua', 4),
 ('nu', 4),
 ('aa', 4),
 ('nt', 4),
 ('vi', 4),
 ('ll', 3),
 ('pi', 3),
 ('hu', 3),
 ('tt', 3),
 ('os', 2),
 ('ba', 2),
 ('ut', 2),
 ('oh', 2),
 ('ik', 2),
 ('ea', 2),
 ('ne', 2),
 ('ia', 2),
 ('ap', 2),
 ('it', 2),
 ('ud', 2),

In [249]:
group_per_ending["rd"]

['ükskord',
 'seekord',
 'kord',
 'tookord',
 'mõnikord',
 'niivõrd',
 'teinekord',
 'karvavõrd',
 'sedavõrd',
 'kuigivõrd',
 'samavõrd',
 'taaskord',
 'veelkord',
 'mustaverd',
 'niivõrd-kuivõrd',
 '1kord']

In [246]:
group_per_ending["se"]

['üldse',
 'ise',
 'otse',
 'kahasse',
 'üleüldse',
 'sinnasamasse',
 'nõusse',
 'pahuksisse',
 'erakätesse',
 'ülesse',
 'jõusse',
 'sisse',
 'kägarasse',
 'siiasamasse',
 'kõverasse',
 'hambusse',
 'palgesse',
 'ummuksisse']

In [244]:
group_per_ending["ta"]

['ilmtingimata',
 'kahtlemata',
 'kõhklemata',
 'asjata',
 'kogemata',
 'tingimata',
 'tahes-tahtmata',
 'vahetpidamata',
 'takkapihta',
 'alalõpmata',
 'püksata',
 'ilmaasjata',
 'lõpmata',
 'tegevuseta',
 'pikemata',
 'muidugimõista',
 'luhta',
 'viivitamata',
 'viibimata',
 'tahestahtmata']

In [161]:
group_per_ending["lt"][:10]

['lihtsalt',
 'kaheksaselt',
 'pooljuhuslikult',
 'pidevalt',
 'rahaliselt',
 'kokkuhoidlikult',
 'kergelt',
 'tõsiselt',
 'tegelikult',
 'sealt']

In [237]:
group_per_ending["ki"][:10]

['siiski',
 'ühtäkki',
 'nüüdki',
 'äkki',
 'katki',
 'kaugeltki',
 'ealeski',
 'lõhki',
 'küllaltki',
 'hoopiski']

In [223]:
group_per_ending["il"][:20]

['kuskil',
 'kusagil',
 'pisarsilmil',
 'siinmail',
 'sealmail',
 'käsipõsakil',
 'õhevil',
 'võõrsil',
 'neljakäpakil',
 'kodumail',
 'pungil',
 'tulvil',
 'kukil',
 'hajevil',
 'põnevil',
 'neljakäpukil',
 'ärevil',
 'paokil',
 'teoksil',
 'vargil']

In [232]:
testtext = Text("Noor naine pöördub pisarsilmil oma mehe poole")

In [233]:
testtext.tag_layer()
testtext.morph_analysis

layer name,attributes,parent,enveloping,ambiguous,span count
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,7

text,normalized_text,lemma,root,root_tokens,ending,clitic,form,partofspeech
Noor,Noor,noor,noor,['noor'],0,,sg n,A
naine,naine,naine,naine,['naine'],0,,sg n,S
pöördub,pöördub,pöörduma,pöördu,['pöördu'],b,,b,V
pisarsilmil,pisarsilmil,pisarsilmil,pisar_silmil,"['pisar', 'silmil']",0,,,D
oma,oma,oma,oma,['oma'],0,,sg g,P
mehe,mehe,mees,mees,['mees'],0,,sg g,S
poole,poole,poole,poole,['poole'],0,,,K


In [168]:
group_per_ending["lt"][0][:-2]

'lihtsa'

In [169]:
lt_endings_removed = [word[:-2] for word in group_per_ending["lt"]]

In [170]:
lt_analysis = vm.analyze(lt_endings_removed, guess=False)

In [171]:
lt_analysis[0]

{'text': 'lihtsa',
 'analysis': [{'root': 'lihtne',
   'root_tokens': ['lihtne'],
   'ending': '0',
   'clitic': '',
   'partofspeech': 'A',
   'form': 'sg g',
   'lemma': 'lihtne'}]}

In [172]:
len(lt_analysis)

2589

In [182]:
2589 / 4227

0.6124911284599006

In [179]:
lt_lemmas = []

for an in lt_analysis:
    lt_lemmas.append(an["analysis"][0]["lemma"])

In [180]:
lt_lemmas[:5]

['lihtne', 'kaheksane', 'pooljuhuslik', 'pidev', 'rahaline']

In [252]:
lt_lemmas[:10]

['lihtne',
 'kaheksane',
 'pooljuhuslik',
 'pidev',
 'rahaline',
 'kokkuhoidlik',
 'kerge',
 'tõsine',
 'tegelik',
 'siga']

In [184]:
wn = Wordnet()

In [185]:
lt_adverb_hypernyms = {}

for adv in lt_lemmas:
    synsets = wn[adv]
    if len(synsets) > 0:
        lt_adverb_hypernyms[adv] = synsets[0].hypernyms

In [186]:
len(lt_lemmas), len(lt_adverb_hypernyms)

(2589, 1684)

In [201]:
list(lt_adverb_hypernyms.keys())[:5]

['lihtne', 'pidev', 'rahaline', 'kokkuhoidlik', 'kerge']

In [202]:
hypernyms = []

for k, v in lt_adverb_hypernyms.items():
    if len(v) > 0:
        hypernyms.append(k)

In [203]:
len(hypernyms)

76

In [208]:
lt_adverb_hypernyms["siga"][0].literal

'koduloom'

In [212]:
hypernym_literals = {}

for w in hypernyms:
    hypernym_literals[w] = lt_adverb_hypernyms[w][0].literal

In [214]:
len(hypernym_literals), len(set(hypernym_literals.values()))

(76, 52)

In [215]:
Counter(hypernym_literals.values()).most_common()

[('inimene', 12),
 ('kordne', 4),
 ('koduloom', 3),
 ('haige', 3),
 ('hulk', 2),
 ('ealine', 2),
 ('kindel', 2),
 ('endeline', 2),
 ('asend', 2),
 ('miski', 2),
 ('keerukus', 1),
 ('keeleüksus', 1),
 ('halb õnn', 1),
 ('sisu', 1),
 ('rohkus', 1),
 ('kuulmiselund', 1),
 ('võistlus', 1),
 ('materjal', 1),
 ('pühendaja', 1),
 ('kulgemine', 1),
 ('kannataja', 1),
 ('vorm', 1),
 ('sigaret', 1),
 ('võimelisus', 1),
 ('kogema', 1),
 ('omadus', 1),
 ('lahke', 1),
 ('kulgev', 1),
 ('rakmed', 1),
 ('keegi', 1),
 ('mõjujõud', 1),
 ('kiriklik', 1),
 ('näit', 1),
 ('olev', 1),
 ('valge', 1),
 ('mõõde', 1),
 ('aktiivne', 1),
 ('divisjon', 1),
 ('otsustamine', 1),
 ('hinne', 1),
 ('kehaline omadus', 1),
 ('suhtlus', 1),
 ('natsionaalne', 1),
 ('töötu', 1),
 ('varustus', 1),
 ('rikas', 1),
 ('pind', 1),
 ('värviline', 1),
 ('ese', 1),
 ('käituja', 1),
 ('liigutama', 1),
 ('osavõtja', 1)]