In [1]:
#import sys
#!conda install --yes --prefix {sys.prefix} pygraphviz

# Alus - öeldis kollokatsioonid

## Vajalikud teegid

In [2]:
from __future__ import print_function

import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
from collections import defaultdict
import matplotlib.pyplot as plt
import random

import estnltk
from estnltk.storage.postgres import PostgresStorage

import networkx as nx
import sqlite3
import sys


## Abifunktsioonid

In [3]:
class graphFunctions:
    """Namespace of stateless helpers for querying and drawing dependency graphs.

    All helpers are static methods, so they can be called either on the class
    (``graphFunctions.intersection(a, b)``) or on an instance.
    """

    @staticmethod
    def intersection(a, b):
        """Return the elements present in both ``a`` and ``b``.

        The result is a list without duplicates; its order is not guaranteed
        because it is produced from a set.
        """
        return list(set(a).intersection(b))

    @staticmethod
    def get_nodes_by_attributes(G, attrname, attrvalue):
        """Return the nodes of ``G`` whose attribute ``attrname`` equals ``attrvalue``.

        Nodes that do not carry the attribute are ignored. An empty list is
        returned when nothing matches.
        """
        return [
            node
            for node, value in nx.get_node_attributes(G, attrname).items()
            if value == attrvalue
        ]

    @staticmethod
    def drawGraph(G):
        """Draw ``G`` with the Graphviz ``dot`` layout and show the figure.

        Nodes are labelled with their ``lemma`` attribute and edges with their
        ``deprel`` attribute.
        """
        pos = graphviz_layout(G, prog='dot')
        labels = nx.get_node_attributes(G, 'lemma')
        nx.draw(G, pos, cmap=plt.get_cmap('jet'), labels=labels, with_labels=True)
        edge_labels = nx.get_edge_attributes(G, 'deprel')
        nx.draw_networkx_edge_labels(G, pos, edge_labels)
        plt.show()


In [4]:
def eprint(*args, **kwargs):
    """Print to standard error.

    Accepts the same positional and keyword arguments as ``print`` except
    ``file``, which is always set to ``sys.stderr``.
    """
    target_stream = sys.stderr
    print(*args, file=target_stream, **kwargs)

### Baasi  (sqlite3)  ettevalmistamine täitmiseks

Luuakse tabelid:
* collections_processed - salvestatakse viimane salvestatud collection Id
* {TABLENAME} - tabel kollokatsioonidega
 * lemma1 text
 * pos1 text
 * lemma2 text
 * pos2 text
 * total integer
 * example1 text
 * example2 text
 * example3 text

Luuakse indeksid:
* collections_processed_uniq
* {TABLENAME}_col1_col2_unique

In [5]:
def prepCollDb():
    """Create the tables and unique indexes used for storing collocations.

    Works on the module-level ``cursor``/``conn`` and on the table named by
    the module-level ``TABLENAME``. Every statement uses ``IF NOT EXISTS``,
    so the function can be called repeatedly. Changes are committed once at
    the end.
    """
    INDEXNAME = f'{TABLENAME}_col1_col2_unique'

    ddl_statements = (
        # bookkeeping: last stored collection id per collocation table
        """CREATE TABLE IF NOT EXISTS collections_processed
                      (tablename text, lastcollection integer);
                      """,
        # at most one bookkeeping row per table name
        """CREATE UNIQUE INDEX IF NOT EXISTS collections_processed_uniq ON collections_processed(tablename);
    """,
        # the collocation table itself
        f"""CREATE TABLE IF NOT EXISTS {TABLENAME}
                      (lemma1 text, pos1 text, lemma2 text, pos2 text, total integer, example1 text, example2 text, example3 text);
                      """,
        # one row per (lemma1, pos1, lemma2, pos2) combination
        f"""CREATE UNIQUE INDEX IF NOT EXISTS {INDEXNAME}
        ON {TABLENAME}(lemma1, pos1, lemma2, pos2);
        """,
    )

    for statement in ddl_statements:
        cursor.execute(statement)

    conn.commit()



### Tulemuse salvestamine baasi (sqlite3)

In [6]:
def saveCollToDb(collocations, examples, lastcollection):
    """Upsert one batch of collocation counts and examples, then record progress.

    Args:
        collocations: dict mapping ``(lemma1, lemma2, pos1, pos2)`` keys to the
            number of occurrences counted in this batch.
        examples: dict mapping the same keys to lists of example sentences.
            Only the first three are stored; a key that is missing here is
            treated as having no examples.
        lastcollection: id of the last collection covered by this batch; it is
            written to ``collections_processed`` for the current table.

    Uses the module-level ``TABLENAME``, ``cursor`` and ``conn`` and commits
    before returning.
    """
    sqlColls = []

    for key, total in collocations.items():
        # Exactly three example slots, padded with None (stored as NULL).
        stored = (list(examples.get(key, ())[:3]) + [None, None, None])[:3]

        # Column order is lemma1, pos1, lemma2, pos2 while the key is
        # (lemma1, lemma2, pos1, pos2). The count and the examples are passed
        # twice: once for the INSERT and once for the DO UPDATE clause.
        sqlColls.append(
            (key[0], key[2], key[1], key[3], total, *stored, total, *stored)
        )

    # COALESCE keeps an example that is already stored when this batch has
    # nothing to put into that slot, instead of overwriting it with NULL.
    cursor.executemany(f"""
        INSERT INTO {TABLENAME} VALUES (?,?,?,?,?,?,?,?)
        ON CONFLICT(lemma1, pos1, lemma2, pos2)
        DO UPDATE SET total=total+?
            ,  example1= COALESCE(?, example1)
            ,  example2= COALESCE(?, example2)
            ,  example3= COALESCE(?, example3)
            ;""", sqlColls)

    cursor.execute("""
        INSERT INTO collections_processed VALUES (?,?)
        ON CONFLICT(tablename) DO UPDATE SET lastcollection=?;""", (TABLENAME, lastcollection, lastcollection,) )

    conn.commit()
    eprint(f'andmebaasi salvestatud kollokatsioonid kollektsioonidest: 0 - {lastcollection}' )
    
    
    

### Alus - öeldis sõltuvuste leidmine

Alus - öeldis seosed
1. Leitakse kõik sõnaliigiga V tipud
2. Leitakse kõik tipud deprel = **nsubj** ja deprel = **csubj**
3. Leitakse kõik tipud deprel = **nsubj:cop** ja deprel = **csubj:cop**

Alus-öeldis kollokaatideks loetakse:
* tipp1, mille deprel on **nsubj** või **csubj** ja tema vahetu vanem (parent), tipp2 sõnaliigiga pos = **V**
* tipp1, mille deprel on **nsubj:cop** või **csubj:cop** ja tipp2, millel on tipp1-ga ühine vanem ja mille sõnaliik pos = **V** ja deprel = **cop** 


In [7]:
def _record_collocation(collocations, examples, key, sentence, max_examples=3):
    """Count one occurrence of ``key`` and keep at most ``max_examples`` example sentences."""
    collocations[key] = collocations.get(key, 0) + 1
    stored = examples.setdefault(key, [])
    stored.append(sentence)
    if len(stored) > max_examples:
        # Drop one sentence chosen uniformly at random so the list stays capped.
        del stored[random.randrange(len(stored))]


def extract_verb_subject(G, collocations, examples, sentence):
    """Collect subject-verb collocations from one sentence graph.

    Two patterns are counted for every node whose ``pos`` is ``'V'``:

    * a node with ``deprel`` ``nsubj`` or ``csubj`` that is a direct child of
      the verb (there is an edge verb -> subject);
    * when the verb itself has ``deprel`` ``cop``: every node with ``deprel``
      ``nsubj:cop`` or ``csubj:cop`` that shares the verb's parent.

    Args:
        G: directed graph of the sentence with edges pointing from head to
            dependent; nodes carry ``lemma``, ``pos`` and ``deprel`` attributes.
        collocations: dict mapping
            ``(subject lemma, verb lemma, subject pos, verb pos)`` to a count;
            updated in place.
        examples: dict mapping the same keys to lists of at most three example
            sentences; updated in place.
        sentence: text of the sentence, stored as an example.

    Returns:
        The tuple ``(collocations, examples)`` (the same objects that were
        passed in).

    Raises:
        ValueError: if a copula verb has more than one parent, i.e. the graph
            is not a tree.
    """
    # select the nodes with the required attributes
    verbnodes = graphFunctions.get_nodes_by_attributes(G, attrname='pos', attrvalue='V')

    # these have to be attached directly to the verb
    subjnodes = (
        graphFunctions.get_nodes_by_attributes(G, attrname='deprel', attrvalue='nsubj')
        + graphFunctions.get_nodes_by_attributes(G, attrname='deprel', attrvalue='csubj')
    )

    # these are related to the verb through a shared parent
    subjcopnodes = (
        graphFunctions.get_nodes_by_attributes(G, attrname='deprel', attrvalue='nsubj:cop')
        + graphFunctions.get_nodes_by_attributes(G, attrname='deprel', attrvalue='csubj:cop')
    )

    for verb in verbnodes:
        for subj in subjnodes:
            # A shortest path of length 1 is simply an edge verb -> subj.
            if subj != verb and G.has_edge(verb, subj):
                key = (G.nodes[subj]['lemma'], G.nodes[verb]['lemma'], G.nodes[subj]['pos'], G.nodes[verb]['pos'],)
                _record_collocation(collocations, examples, key, sentence)

        # Subjects of copular clauses are siblings of the copula verb.
        if G.nodes[verb]['deprel'] != 'cop':
            continue

        parents = list(G.predecessors(verb))
        if len(parents) > 1:
            message = 'Mingi jama puu struktuuriga - tipul on rohkem kui üks vanem'
            eprint (message)
            graphFunctions.drawGraph(G)
            # Raise instead of calling exit() so the failure surfaces as an
            # ordinary exception and processing stops right here.
            raise ValueError(message)
        if not parents:
            # a verb without a parent has no siblings to inspect
            continue

        siblings = list(G.successors(parents[0]))
        for subjcop in graphFunctions.intersection(subjcopnodes, siblings):
            key = (G.nodes[subjcop]['lemma'], G.nodes[verb]['lemma'], G.nodes[subjcop]['pos'], G.nodes[verb]['pos'],)
            _record_collocation(collocations, examples, key, sentence)

    return (collocations, examples)


In [8]:
def getPlaceholder(tablename):
    """Return the last collection id stored for ``tablename``.

    Looks the value up in ``collections_processed`` through the module-level
    ``cursor``. Returns -1 when the table has no row for ``tablename``.
    """
    cursor.execute("""SELECT lastcollection FROM collections_processed WHERE tablename = ?""", (tablename,))
    row = cursor.fetchone()

    # A row exists when an earlier run was interrupted; collecting then
    # continues from the stored id. Otherwise start from the beginning.
    return row[0] if row else -1

## Muutujad

**Korpuse TSV fail**

Lähte TSV-faili ja DB tabeli nimi, kuhu tulemus salvestatakse (TSV genereerimise kood on failis **collect_texts_db_tsv.ipynb**)

In [9]:
# Name of the collection that is looked up in the storage; it also becomes part of the result table name.
collectionName = 'koondkorpus_base_subset_of_5000_v2' # 'koondkorpus_base_v2'

**Kollokatsiooni tüüp**

In [10]:
# Collocation type label; used as a prefix of the result table name and of the SQLite file name.
TYPE = 'subj_verb'

**Tabeli nimi andmebaasis**

In [11]:
# Result table name: one table per (collocation type, source collection) pair.
TABLENAME = f'{TYPE}_{collectionName}'

**Kollektsioonide arv**, mille kaupa salvestatakse vahepealne tulemus andmebaasi.

In [12]:
# Number of collections processed between two intermediate saves to the database.
BATCHSIZE = 1000

## Andmebaasi loomine ja ettevalmistamine

In [13]:
# Open (or create) the SQLite file for this collocation type; the path is relative to the working directory.
conn = sqlite3.connect(f"{TYPE}_collocations.db")
cursor = conn.cursor()
# Create the result and bookkeeping tables if they do not exist yet.
prepCollDb()

## Kollokatsioonide alla laadimine ja salvestamine

In [14]:
# Connect to the Postgres storage that holds the corpus collections.
# Credentials are taken from the pgpass file, so none are written in the notebook.
storage = PostgresStorage(pgpass_file='~/.pgpass',
                          schema='estonian_text_corpora',
                          role='estonian_text_corpora_read',
                          temporary=False)


INFO:storage.py:42: connecting to host: 'postgres.keeleressursid.ee', port: '5432', dbname: 'estonian-text-corpora', user: 'zummy'
INFO:storage.py:58: schema: 'estonian_text_corpora', temporary: False, role: 'estonian_text_corpora_read'


In [15]:
# Source collection and its size; the size is used below to work out how much is left to process.
collection = storage[collectionName]
TOTALROWS = len(collection)

Kontrollitakse, mitmenda kollektsiooni juurde skript eelmine kord jäi, juhul kui skripti töö katkes ootamatult. **Nullist alustamiseks tuleb kustutada skripti loodud *{TYPE}_collocations.db* fail.**


In [16]:
# Id of the last collection already stored for this table (-1 when nothing is stored yet).
lastcollection = getPlaceholder(TABLENAME)

# Number of collections that still have to be downloaded.
# NOTE(review): this assumes collection ids run consecutively from 0 - confirm.
unprocessed = TOTALROWS - lastcollection - 1

eprint (f'{unprocessed} out of {TOTALROWS} collections to download')

collocations = {}
examples = {}
iterations = 0
unsaved = 0

word_id = 0

for (colId, text) in collection.select (progressbar='notebook', layers=['v166_sentences', 'v168_stanza_ensemble_syntax'], return_index=True ).tail(unprocessed):
    # Sets make the two membership tests in the per-word loop constant-time.
    sentences_start = {span.start for span in text.v166_sentences}
    sentences_end = {span.end for span in text.v166_sentences}

    iterations += 1
    unsaved = 1

    for span in text.v168_stanza_ensemble_syntax:

        word_id += 1

        # Sentence start: begin a new word list and a new graph.
        # The extra increment leaves one unused id in front of the sentence; an edge
        # whose head is 0 then starts from that id (assuming span.id counts from 1
        # within the sentence - TODO confirm).
        if span.start in sentences_start:
            current_sentence = []
            word_id += 1
            G = nx.DiGraph()

        G.add_node(word_id, id=span.id, lemma=span.lemma, pos=span.upostag, deprel=span.deprel)
        # edge from the head word to this word
        G.add_edge(word_id - span.id + span.head, word_id, deprel = span.deprel)

        current_sentence.append(span)

        # Sentence end: extract the collocations of the finished sentence.
        if span.end in sentences_end:
            current_sentence_text = ' '.join([s.text for s in current_sentence])
            (collocations, examples) = extract_verb_subject(G, collocations, examples, current_sentence_text)

    # Flush the counts every BATCHSIZE collections. Only `collocations` is reset;
    # `examples` keeps the sentences collected so far.
    if not iterations%BATCHSIZE:
        saveCollToDb(collocations, examples, colId)
        collocations = {}
        unsaved = 0

# Store whatever was collected after the last full batch.
if unsaved: saveCollToDb(collocations, examples, colId)
eprint ("Done.")

##########


5001 out of 5000 collections to download


HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))

andmebaasi salvestatud kollokatsioonid kollektsioonidest: 0 - 999
andmebaasi salvestatud kollokatsioonid kollektsioonidest: 0 - 1999
andmebaasi salvestatud kollokatsioonid kollektsioonidest: 0 - 2999
andmebaasi salvestatud kollokatsioonid kollektsioonidest: 0 - 3999



Done.


andmebaasi salvestatud kollokatsioonid kollektsioonidest: 0 - 4999


### Andmebaasi indeksite lisamine


Tehakse kõige viimasena, et andmete sisestamine andmebaasi oleks kiirem.

In [17]:

# Secondary indexes for lookups by lemma, part of speech and frequency.
# Index names are prefixed with the table name (as in prepCollDb): with a fixed
# name such as "lemma1", IF NOT EXISTS would skip the statement as soon as any
# other table in the same database file already owns an index of that name.
indexesQ = [
    f'CREATE INDEX IF NOT EXISTS "{TABLENAME}_lemma1" ON "{TABLENAME}" ("lemma1");'
    , f'CREATE INDEX IF NOT EXISTS "{TABLENAME}_lemma2" ON "{TABLENAME}" ("lemma2");'
    , f'CREATE INDEX IF NOT EXISTS "{TABLENAME}_pos1" ON "{TABLENAME}" ("pos1");'
    , f'CREATE INDEX IF NOT EXISTS "{TABLENAME}_pos2" ON "{TABLENAME}" ("pos2");'
    , f'CREATE INDEX IF NOT EXISTS "{TABLENAME}_total" ON "{TABLENAME}" ("total" DESC);']

for q in indexesQ:
    cursor.execute(q)

In [18]:
# Sanity check: number of distinct collocation rows stored in the result table.
cursor.execute(f"SELECT count(*) FROM {TABLENAME}")
all_collocations = cursor.fetchall()

In [19]:
all_collocations

[(82701,)]