In [1]:
import pickle
import pandas as pd
from tqdm import tqdm
from collections import Counter

In [2]:
# Load the pre-sampled sentence objects from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling,
# so this file must come from a trusted source.
with open("1m_samples/sampled_1m_sentences_layers_1.pickle", "rb") as pickle_file:
    sentences = pickle.load(pickle_file)

In [3]:
# Sanity check: how many sentence objects were loaded from the pickle.
len(sentences)

50000

In [4]:
# Display the first sentence object to see its text, metadata and layers.
sentences[0]

text
Kui sageli tuli sul kooli juhtkonna ees aru anda ?

0,1
file,aja_kr_2003_02_18.xml
sent_end,789
sent_start,739
subcorpus,aja_kr
text_no,0
title,Eurovisiooni Don Quijote
type,artikkel

layer name,attributes,parent,enveloping,ambiguous,span count
words,normalized_form,,,True,10
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,10
morph_extended,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech, punctuation_type, pronoun_type, letter_case, fin, verb_extension_suffix, subcat",morph_analysis,,True,10
v172_stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc, parent_span, children",morph_extended,,False,10
v172_obl_phrases,"entity_type, free_entity, is_valid, root_id, root",,v172_stanza_syntax,False,2


In [15]:
# Lemma of the `root` span of the first `v172_obl_phrases` entry
# in the first sentence.
sentences[0].v172_obl_phrases[0]['root']['lemma']

'sina'

In [16]:
# Build one row per entry of the `v172_obl_phrases` layer:
#     [top head lemma, *compound lemmas, obl root lemma, obl root form]
# The row has 3 elements plus one per compound child found.
#
# NOTE(review): the "top head" is whatever word the head chain ends at
# (the word whose own head is 0). Nothing here checks its part of speech,
# so the first element is not guaranteed to be a verb - confirm this is
# the intended selection.
verb_obl_phrases = []

for text_object in sentences:
    # Read `head` and `lemma` from the syntax layer explicitly and only once
    # per sentence, instead of looking them up again on every step of the
    # head walk below.
    syntax = text_object.v172_stanza_syntax
    heads = syntax.head
    lemmas = syntax.lemma

    for ent in text_object.v172_obl_phrases:
        obl_root = ent['root']
        obl_lemma = obl_root['lemma']
        obl_form = obl_root.form[0]

        # Word ids are 1-based, hence the `- 1` when indexing; a head value
        # of 0 ends the walk. Climb from the phrase root to the top of the
        # head chain, remembering the last real word id seen.
        top_id = ent['root_id']
        parent_id = heads[top_id - 1]
        while parent_id != 0:
            top_id = parent_id
            parent_id = heads[top_id - 1]

        # Lemmas of the words attached directly to the top head with a
        # dependency relation containing 'compound', in sentence order.
        compound_lemmas = [
            word.lemma
            for word in syntax
            if word.head == top_id and 'compound' in word.deprel
        ]

        verb_obl_phrases.append(
            [lemmas[top_id - 1], *compound_lemmas, obl_lemma, obl_form]
        )

In [17]:
# Total number of extracted rows (one per `v172_obl_phrases` entry).
len(verb_obl_phrases)

70937

In [18]:
# Preview the first five rows: [head lemma, *compound lemmas, obl lemma, obl form].
verb_obl_phrases[:5]

[['tulema', 'sina', 'sg ad'],
 ['tulema', 'juhtkond', 'sg g'],
 ['toksima', 'sisse', 'pill', 'sg ad'],
 ['toksima', 'sisse', 'hommik', 'sg ad'],
 ['koos', 'hetk', 'sg el']]

In [19]:
# Row length for every extracted phrase, in the same order as the rows.
lengths = list(map(len, verb_obl_phrases))

In [20]:
# Distribution of row lengths; each row is 3 fixed fields plus one per
# compound lemma, so this shows how many compound columns are needed.
Counter(lengths)

Counter({3: 66205, 4: 4697, 5: 35})

In [25]:
def _pad_compounds(phrase, n_slots=2):
    """Return `phrase` widened to a fixed layout with `n_slots` compound fields.

    `phrase` is `[verb, *compounds, obl, obl_form]`. The result is
    `[verb, compound_1, ..., compound_n_slots, obl, obl_form]`, where unused
    compound slots are filled with None.

    Raises:
        ValueError: if `phrase` has fewer than three elements (unpacking
            fails) or more compounds than `n_slots`. Such rows used to be
            dropped without any notice, which silently shrank the dataset.
    """
    verb, *compounds, obl, obl_form = phrase
    if len(compounds) > n_slots:
        raise ValueError(
            f"phrase has {len(compounds)} compounds, "
            f"but only {n_slots} compound columns are available: {phrase!r}"
        )
    padding = [None] * (n_slots - len(compounds))
    return [verb, *compounds, *padding, obl, obl_form]


# One fixed-width (5 element) row per extracted phrase, in the same order.
final_phrases = [_pad_compounds(phrase) for phrase in verb_obl_phrases]

In [26]:
# Check the padded layout: every row should now have exactly five fields.
final_phrases[:3]

[['tulema', None, None, 'sina', 'sg ad'],
 ['tulema', None, None, 'juhtkond', 'sg g'],
 ['toksima', 'sisse', None, 'pill', 'sg ad']]

In [27]:
# Tabulate every padded row; column order follows the row layout.
full_phrases_df = pd.DataFrame(
    data=final_phrases,
    columns=["verb", "compound1", "compound2", "obl", "obl_form"],
)

In [30]:
# Show the first rows of the full table.
full_phrases_df.head()

Unnamed: 0,verb,compound1,compound2,obl,obl_form
0,tulema,,,sina,sg ad
1,tulema,,,juhtkond,sg g
2,toksima,sisse,,pill,sg ad
3,toksima,sisse,,hommik,sg ad
4,koos,,,hetk,sg el


In [31]:
# Keep only the rows whose last field (the obl form) is 'sg in' or 'pl in'.
final_phrases_in = [
    row
    for row in final_phrases
    if row[-1] == 'sg in' or row[-1] == 'pl in'
]

In [32]:
# Compare the number of all rows with the number kept by the form filter.
len(final_phrases), len(final_phrases_in)

(70937, 10698)

In [33]:
# Preview the first three filtered rows.
final_phrases_in[:3]

[['tõetera', None, None, 'see', 'sg in'],
 ['kandideerima', None, None, 'Tallinn', 'sg in'],
 ['kandideerima', None, None, 'nimekiri', 'sg in']]

In [34]:
# Tabulate the filtered rows with the same column layout as the full table.
full_phrases_in_df = pd.DataFrame(
    data=final_phrases_in,
    columns=["verb", "compound1", "compound2", "obl", "obl_form"],
)

In [36]:
# Show the first rows of the filtered table.
full_phrases_in_df.head()

Unnamed: 0,verb,compound1,compound2,obl,obl_form
0,tõetera,,,see,sg in
1,kandideerima,,,Tallinn,sg in
2,kandideerima,,,nimekiri,sg in
3,ütlema,,,kodu,sg in
4,ütlema,,,veebruar,sg in


In [37]:
# Write both tables to CSV in the working directory, without the row index.
full_phrases_df.to_csv(path_or_buf="full_phrases_df.csv", index=False)
full_phrases_in_df.to_csv(path_or_buf="full_phrases_in_df.csv", index=False)