# Osalausete süntaksipuude statistika

* Statistika kogutakse osalausete kaupa. 
* Tabeldatakse eraldi korpuse kategooriate järgi.
* Osalause süntaksipuu struktuuri salvestamisel ignoreeritakse sõnade järjekorda lauses.


## Skripti tööks vajalikud andmed

Pythoni teek **syntaxCutter** : https://github.com/estnltk/syntax_experiments/tree/syntax_consistency/syntax_cutter_library

**TSV_CLAUSES_FILE** TSV fail korpuse sisuga : http://localhost:8888/notebooks/collocation_net/data_extraction/from_tsv/collect_texts_db_tsv_clauses.ipynb

**COLLECTIONS_METADATA_FILE** : https://github.com/estnltk/estnltk-model-training/blob/main/collocation_net/data_extraction/from_tsv/collect_metadata_db.ipynb

**TSV_SENTENCES_FILE** : TSV fail korpuse lausetega: skript 1_collect_sentences_text.ipynb


In [1]:
import networkx as nx
import sys
import json
import random
sys.path.append('../../syntax_experiments/syntax_cutter_library')
from syntaxCutter.readerTSVClause import reader

import sqlite3
import pandas as pd

## Andmete kogumine
###  Funktsioonid

In [2]:
import os, psutil # psutil may need to be installed
def usage():
    """Return the resident memory of the current process in mebibytes."""
    current = psutil.Process(os.getpid())
    # rss is the first field of psutil's memory_info() named tuple.
    rss_bytes = current.memory_info().rss
    return rss_bytes / float(2 ** 20)


#Metaandmete laadimine

def subcorpus(string):
    """Return the part of a subcorpus label that precedes the first underscore.

    Non-string values are returned unchanged.  This covers ``None`` as well as
    the float NaN that pandas uses for missing cells, which has no ``split``
    method and would otherwise raise ``AttributeError``.
    """
    if not isinstance(string, str):
        return string
    return string.split('_')[0]

def loadMetadata(filename):
    """Load collection metadata from a tab-separated file into global ``metaDF``.

    Only the ``colId`` column (used as the index) and the ``subcorpus`` column
    are read; each ``subcorpus`` value is then reduced with ``subcorpus()``.
    """
    global metaDF
    # Not every column is needed, only the corpus type.
    wanted_columns = ['colId', 'subcorpus']
    metaDF = pd.read_csv(
        filename,
        sep='\t',
        index_col='colId',
        usecols=wanted_columns,
    )
    shortened = metaDF['subcorpus'].apply(subcorpus)
    metaDF['subcorpus'] = shortened

# Andmebaasi tekitamine
def prepDb():
    """Create the result tables if needed and reset them for a fresh TSV run.

    Uses the module-level ``cursor``, ``conn`` and ``TABLENAME``.  The function
    creates ``collections_processed``, ``sentences`` and the ``TABLENAME``
    table when they do not exist, deletes all rows from ``sentences`` and from
    the ``TABLENAME`` table, sets ``lastcollection`` to 0 for ``TABLENAME`` and
    commits.
    """
    global  cursor, conn, TABLENAME
    
    
    # Bookkeeping table: last processed collection per result table.
   
    cursor.execute(f"""CREATE TABLE IF NOT EXISTS collections_processed
                      (tablename text, lastcollection integer);
                      """)
    
    # The unique index is what the ON CONFLICT(tablename) upserts rely on.
    cursor.execute(f"""CREATE UNIQUE INDEX IF NOT EXISTS collections_processed_uniq ON collections_processed(tablename);
    """)
    
    # Sentences table.
    
    cursor.execute(f"""CREATE TABLE IF NOT EXISTS sentences
                      (uid text UNIQUE, sentence text);
                      """)
    
    # When reading from the TSV file the table is always rebuilt from scratch.
    cursor.execute(f"""DELETE FROM sentences;""")
    
    # Clauses: reset the progress marker for this result table to 0.
    # When reading from the TSV file the table is always rebuilt from scratch.
    cursor.execute(f"""
        INSERT INTO collections_processed VALUES (?,?)
        ON CONFLICT(tablename) DO UPDATE SET lastcollection=?;""", (TABLENAME, 0, 0,) )
    
 
    cursor.execute(f"""CREATE TABLE IF NOT EXISTS {TABLENAME}
                      (structure text, texttype text, tokens integer, trees integer, total integer, example1 text, example2 text, example3 text);
                      """)

    # When reading from the TSV file the table is always rebuilt from scratch.
    cursor.execute(f"""DELETE FROM {TABLENAME};""")

    
    # One row per (structure, texttype); this index is the conflict target
    # used by the upsert in saveCollToDb.
    INDEXNAME = f'{TABLENAME}_col1_col2_unique'
    cursor.execute(f"""CREATE UNIQUE INDEX IF NOT EXISTS {INDEXNAME}
        ON {TABLENAME}(structure, texttype);
        """)
    
    conn.commit()
    
#### Andmete kogumiseks ja salvestamiseks

def eprint(*args, **kwargs):
    """Print to standard error; takes the same arguments as ``print``."""
    stream = sys.stderr
    print(*args, file=stream, **kwargs)

def tree(gClause, key):
    """Build a nested deprel structure for the subtree rooted at ``key``.

    A node without successors is represented by its ``deprel`` string.  Any
    other node becomes a one-entry dict mapping its ``deprel`` to the list of
    its children's structures.  The children are sorted by their string form
    (with quotes and opening braces removed), so the result does not depend on
    the order in which successors are returned.
    """
    subtrees = [tree(gClause, child) for child in gClause.successors(key)]
    label = gClause.nodes[key]['deprel']
    if not subtrees:
        return label

    # Drop "'" and "{" so leaves and nested dicts compare by deprel name.
    strip_table = str.maketrans('', '', "'{")

    def _sort_key(subtree):
        return str(subtree).translate(strip_table)

    subtrees.sort(key=_sort_key)
    return {label: subtrees}

def collect_clause_trees(uid, G, collocations, examples):
    """Count the clause structures of one sentence and record example uids.

    For every clause returned by ``G.get_clauses()`` a forest of deprel trees
    is built from the nodes that have no ancestors, visited in order of their
    ``deprel`` value.  The counting key is the tuple
    ``(JSON of the forest, subcorpus of the collection, token count, number of
    trees)``.  The collection id is the integer before the first underscore
    of ``uid``.

    Both dicts are updated in place and also returned as
    ``(collocations, examples)``.
    """
    global metaDF

    collection_id = int(uid.split('_')[0])

    for clause in G.get_clauses():
        token_count = clause.tokensTotal()
        deprel_by_node = dict(nx.get_node_attributes(clause, 'deprel'))
        by_deprel = sorted(deprel_by_node.items(), key=lambda item: item[1])
        # Nodes without ancestors are the roots of the clause's trees.
        forest = [
            tree(clause, node)
            for node, _deprel in by_deprel
            if not nx.ancestors(clause, node)
        ]
        signature = (
            json.dumps(forest),
            metaDF['subcorpus'][collection_id],
            token_count,
            len(forest),
        )
        examples.setdefault(signature, [])
        collocations.setdefault(signature, 0)
        examples[signature].append(uid)
        collocations[signature] += 1

    return (collocations, examples,)
        

def saveCollToDb(collocations, examples, lastcollection):
    """Upsert the collected counts into the result table and commit.

    For each key in ``collocations`` one row is written with the key's four
    components, its count and up to three example uids taken from
    ``examples[key]`` (missing examples are stored as NULL).  When more than
    three examples exist, the list is shuffled in place first.  On a
    ``(structure, texttype)`` conflict the count is added to the stored total
    and the three examples are overwritten.  ``lastcollection`` is stored in
    ``collections_processed`` for ``TABLENAME``.
    """
    global TABLENAME, cursor, conn
    rows = []

    for signature, total in collocations.items():
        uids = examples[signature]
        if len(uids) > 3:
            random.shuffle(uids)

        # Take the first three uids, padding with None when fewer exist.
        picked = [uids[i] if i < len(uids) else None for i in range(3)]

        insert_part = (signature[0], signature[1], signature[2], signature[3], total, *picked)
        # The same count and examples are bound again for the DO UPDATE part.
        update_part = (total, *picked)
        rows.append(insert_part + update_part)

    cursor.executemany(f"""
        INSERT INTO {TABLENAME} VALUES (?,?,?,?,?,?,?,?)
        ON CONFLICT(structure, texttype)
        DO UPDATE SET total=total+?
            ,  example1= ?
            ,  example2= ?
            ,  example3= ?
            ;""", rows)

    progress_params = (TABLENAME, lastcollection, lastcollection,)
    cursor.execute(f"""
        INSERT INTO collections_processed VALUES (?,?)
        ON CONFLICT(tablename) DO UPDATE SET lastcollection=?;""", progress_params)

    conn.commit()
    eprint(f'andmebaasi salvestatud osalaused kollektsioonidest: 0 - {lastcollection}' )
    
    #print ('save', collocations, examples, colId)
    



### Muutujad

In [3]:
# Choose the input corpus: the full koondkorpus or its 5000-collection subset.
#source = 'koondkorpus_base_subset_of_5000_v2'
source = 'koondkorpus_base_v2'

# Both configurations write to the same result table name.
TABLENAME = 'clauses'

if source != 'koondkorpus_base_v2':
    # 'koondkorpus_base_subset_of_5000_v2'
    TSV_CLAUSES_FILE = '/Users/rabauti/repos/estnltk-model-training/collocation_net/data_extraction/from_tsv/koondkorpus_base_subset_of_5000_v2_clauses.tsv'
    COLLECTIONS_METADATA_FILE = '~/repos/estnltk-model-training/collocation_net/data_extraction/from_tsv/metadata_koondkorpus_base_subset_of_5000_v2.tsv'
    RESULT_DB_FILE = "./clauses_trees_experiment_subset_of_5000_20221113.db"
    TSV_SENTENCES_FILE = "./sentences.tsv"
    BATCHSIZE = 1000
else:
    # 'koondkorpus_v2'
    TSV_CLAUSES_FILE = '/Volumes/Selena/Kollokatsioonid/koondkorpus/koondkorpus_base_v2_clauses_20221022.tsv'
    COLLECTIONS_METADATA_FILE = '~/repos/estnltk-model-training/collocation_net/data_extraction/from_tsv/metadata_koondkorpus_base_v2.tsv'
    RESULT_DB_FILE = "./clauses_trees_experiment_20221113.db"
    TSV_SENTENCES_FILE = "./koondkorpus_base_v2_sentences.tsv"
    BATCHSIZE = 5000


### Metaandmete laadimine

In [4]:
# Fill the global metaDF from the metadata TSV, then show its shape and the
# first two rows as a quick sanity check.
loadMetadata(COLLECTIONS_METADATA_FILE)
print(metaDF.shape)
metaDF.head(2)

(705356, 1)


Unnamed: 0_level_0,subcorpus
colId,Unnamed: 1_level_1
0,aja
1,aja


### Andmete kogumine TSV failist

In [5]:
# Open the result database and reset the result tables for a fresh run.
conn = sqlite3.connect(RESULT_DB_FILE)
cursor = conn.cursor()
prepDb()

# NOTE(review): examples is not cleared between batches, so it keeps every uid
# seen so far for each structure. Presumably this is intentional so that the
# stored examples are drawn from the whole run - confirm, since it grows with
# the corpus.
examples = {}

collocations = {}
# Nothing is pending until the first sentence has been read; starting from
# False (and binding colId) keeps an empty input from reaching the final save
# with an undefined colId.
unsaved = False
prevCol = None
colId = None
count = 0

oReader = reader.Reader(file=TSV_CLAUSES_FILE)
for uid, G in oReader.get_sentences_generator(mode='graph'):
    count += 1
    unsaved = True
    colId = int(uid.split('_')[0])

    (collocations, examples) = collect_clause_trees(uid, G, collocations, examples)

    # Flush once per batch: on the first sentence of a collection whose id is
    # a non-zero multiple of BATCHSIZE.
    if prevCol != colId and colId != 0 and colId % BATCHSIZE == 0:
        saveCollToDb(collocations, examples, colId)
        collocations = {}
        unsaved = False
    prevCol = colId

# Save whatever was collected after the last batch boundary.
if unsaved:
    saveCollToDb(collocations, examples, colId)
    collocations = {}
examples = {}


2022-11-13 23:50:44,227 - Reader - INFO - Reading sentences in progress.


HBox(children=(FloatProgress(value=0.0, description='TSV lines', max=279945679.0, style=ProgressStyle(descript…

andmebaasi salvestatud osalaused kollektsioonidest: 0 - 5000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 10000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 15000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 20000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 25000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 30000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 35000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 40000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 45000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 50000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 55000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 60000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 65000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 70000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 75000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 80000
andmebaas

andmebaasi salvestatud osalaused kollektsioonidest: 0 - 660000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 665000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 670000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 675000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 680000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 685000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 690000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 695000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 700000
andmebaasi salvestatud osalaused kollektsioonidest: 0 - 705000
2022-11-14 05:52:58,459 - Reader - INFO - Reading sentences done.





andmebaasi salvestatud osalaused kollektsioonidest: 0 - 705355


### Andmete indekseerimine 

In [6]:
# Secondary indexes for querying the result table by each column.
# The "trees" index previously pointed at the "tokens" column (copy-paste
# slip); it now indexes "trees".  Because of IF NOT EXISTS, a database that
# already contains the old "trees" index keeps it until it is dropped manually.
indexesQ = [
    f'CREATE INDEX IF NOT EXISTS "structure" ON "{TABLENAME}" ("structure");',
    f'CREATE INDEX IF NOT EXISTS "texttype" ON "{TABLENAME}" ("texttype");',
    f'CREATE INDEX IF NOT EXISTS "tokens" ON "{TABLENAME}" ("tokens");',
    f'CREATE INDEX IF NOT EXISTS "trees" ON "{TABLENAME}" ("trees");',
    f'CREATE INDEX IF NOT EXISTS "total" ON "{TABLENAME}" ("total" DESC);',
]

for q in indexesQ:
    cursor.execute(q)

# Commit explicitly so nothing is left pending when the connection closes.
conn.commit()
conn.close()

## Näidete lausete lisamine andmebaasi (ei kasuta)

In [7]:
# Disabled cell (guarded by `if 0`): copies the sentence text of every uid
# used as an example in the clauses table into the sentences table.
if 0:
    def loadSentences(filename):
        """Load the uid and sentence columns of a TSV file into global sentencesDf."""
        global sentencesDf
        # Not every column is needed, only the uid (index) and the sentence text.
        sentencesDf=pd.read_csv(filename, sep='\t', index_col='uid', usecols=['uid', 'sentence'])

    loadSentences(TSV_SENTENCES_FILE)
    # load sentences that are used in examples
    # Keep the uid as a regular column too, so it can be selected for the insert.
    sentencesDf['copy_uid'] = sentencesDf.index
    sentencesDf.head(2)
    
    conn = sqlite3.connect(RESULT_DB_FILE) 
    conn.row_factory = sqlite3.Row  
    cursor = conn.cursor()

    # All distinct uids appearing in any of the three example columns.
    Query = """SELECT DISTINCT example as uid  FROM
    (     
        SELECT example1 as example  FROM clauses 
        UNION ALL SELECT example2  as example FROM clauses 
        UNION ALL SELECT example3  as example FROM clauses 

    ) x ORDER BY example
    """

    dfExamples = pd.read_sql_query(Query, conn, index_col='uid')
    # Keep only the sentences whose uid is used as an example.
    sentencesDf = pd.merge(dfExamples,sentencesDf,on='uid') 
    
    # Rows that conflict with an existing row are skipped.
    Query = f'INSERT INTO sentences VALUES (?,?) ON conflict DO NOTHING;'
    cursor.executemany(Query, sentencesDf[['copy_uid', 'sentence']].values.tolist())
    conn.commit()
    conn.close()
