### Imports and database connection

In [6]:
import psycopg2
import estnltk
from estnltk.storage.postgres import PostgresStorage

In [8]:
import time
from datetime import timedelta, datetime

In [7]:
# Connect to the local database that will hold the extracted pairs.
# The connection and the cursor are created inside the same try block so that
# a failure of either call is reported and then re-raised; swallowing it would
# leave `con` or `cursor` undefined and make later cells fail with a NameError.
try:
    con = psycopg2.connect(user="postgres",
                           password="",
                           host="127.0.0.1",
                           port="5432",
                           database="adj_noun_pairs")
    cursor = con.cursor()
except (Exception, psycopg2.Error) as error:
    print("Error in Connection", error)
    raise

In [59]:
# Create the table that stores the adjective-noun pairs in the Postgres database.
# Columns: adjective lemma, noun lemma, id of the source text, type of the source text.
pair_table = "all_pairs"
create_table_query = (
    "CREATE TABLE " + pair_table + " \n"
    "              (adjective      TEXT    NOT NULL,\n"
    "               noun           TEXT    NOT NULL,\n"
    "               id             INT     NOT NULL,\n"
    "               type           TEXT    NOT NULL\n"
    "               ); "
)

try:
    cursor.execute(create_table_query)
    con.commit()
    print("Table ({}) created successfully in PostgreSQL ".format(pair_table))
except (Exception, psycopg2.Error) as error:
    # Undo the failed statement so the connection stays usable
    con.rollback()
    print("Error while creating the table: ", error)

Table (all_pairs) created successfully in PostgreSQL 


In [138]:
# Connect to the remote corpus database; fill in the credentials before running.
storage = PostgresStorage(
    host="postgres.keeleressursid.ee",
    port=5432,
    dbname="estonian-text-corpora",
    schema="estonian_text_corpora",
    user="",      # enter username
    password="",  # enter password
)

INFO:storage.py:42: connecting to host: 'postgres.keeleressursid.ee', port: 5432, dbname: 'estonian-text-corpora', user: 'sormusbi'
INFO:storage.py:58: schema: 'estonian_text_corpora', temporary: False, role: 'sormusbi'


In [139]:
# Look up the 'koondkorpus_base_v2' collection in the connected storage
collection = storage['koondkorpus_base_v2']

DEBUG:pg_operations.py:75: SELECT table_name, pg_size_pretty(pg_total_relation_size('estonian_text_corpora'||'.'||table_name)), obj_description(('estonian_text_corpora'||'.'||table_name)::regclass), S.n_live_tup FROM information_schema.tables LEFT JOIN pg_stat_user_tables S ON S.relname = table_name AND S.schemaname = table_schema WHERE table_schema='estonian_text_corpora' AND table_type='BASE TABLE';
DEBUG:pg_operations.py:75: SELECT table_name, pg_size_pretty(pg_total_relation_size('estonian_text_corpora'||'.'||table_name)), obj_description(('estonian_text_corpora'||'.'||table_name)::regclass), S.n_live_tup FROM information_schema.tables LEFT JOIN pg_stat_user_tables S ON S.relname = table_name AND S.schemaname = table_schema WHERE table_schema='estonian_text_corpora' AND table_type='BASE TABLE';


In [140]:
# Choose the layers that extract_pairs reads from each text.
# The value displayed below also lists 'v166_words' -- presumably added
# automatically as a dependency of the requested layers (TODO confirm).
collection.selected_layers = ['v166_morph_analysis', 'v166_sentences']
collection.selected_layers

['v166_words', 'v166_morph_analysis', 'v166_sentences']

### Pair extraction from the corpus

In [77]:
# A function to extract the pairs from a text
def extract_pairs(text, text_id):
    """Collect adjective-noun pairs from one text.

    For every word whose first morphological analysis is a noun ('S'), two
    patterns are checked inside the same sentence:

    * adjectives ('A') preceding the noun, looking back at most three words:
      ``A N``, ``A A N``, ``A A A N``, ``A , A N`` and ``A <conjunction> A N``;
    * the predicative pattern ``N olema A`` (noun, lemma 'olema', adjective).

    Only the first analysis (index 0) of each word is consulted.

    Args:
        text: object providing ``meta['type']`` and a ``v166_sentences`` layer
            whose words expose ``v166_morph_analysis`` with ``partofspeech``
            and ``lemma`` sequences.
        text_id: identifier copied into every returned tuple.

    Returns:
        list of ``(adjective_lemma, noun_lemma, text_id, text_type)`` tuples in
        the order they were found.

    Raises:
        KeyError: if ``text.meta`` has no ``'type'`` entry.
    """
    pairs = []
    text_type = text.meta['type']

    # The layer name is hard-coded because this corpus is known to use it
    for sentence in text.v166_sentences:
        for i, token in enumerate(sentence):
            analysis = token.v166_morph_analysis

            if analysis.partofspeech[0] != 'S':
                continue

            noun = analysis.lemma[0]

            if i >= 1:
                prev1 = sentence[i - 1].v166_morph_analysis

                if prev1.partofspeech[0] == 'A':
                    pairs.append((prev1.lemma[0], noun, text_id, text_type))

                    # Once an adjective-noun pair is found, more adjectives may
                    # precede it; for simplicity at most three preceding words
                    # are taken into account.
                    if i >= 2:
                        prev2 = sentence[i - 2].v166_morph_analysis
                        prev2_pos = prev2.partofspeech[0]

                        # Simplest case: two describing adjectives,
                        # e.g. "ilus sinine kleit"
                        if prev2_pos == 'A':
                            pairs.append((prev2.lemma[0], noun, text_id, text_type))

                        # The third word back counts in these cases:
                        #   adjective adjective adjective noun
                        #   adjective , adjective noun  (the comma may also
                        #       separate clauses, but that is presumably rare,
                        #       so the error is small and can be filtered later)
                        #   adjective conjunction adjective noun
                        if i >= 3 and prev2_pos in ['A', 'Z', 'J']:
                            prev3 = sentence[i - 3].v166_morph_analysis
                            if prev3.partofspeech[0] == 'A':
                                pairs.append((prev3.lemma[0], noun, text_id, text_type))

            # Predicative pattern: noun + 'olema' + adjective.
            # i + 2 only has to be a valid index; the previous bound
            # (len(sentence) - 1) skipped an adjective in sentence-final position.
            if i + 2 < len(sentence):
                after2 = sentence[i + 2].v166_morph_analysis
                if sentence[i + 1].v166_morph_analysis.lemma[0] == 'olema' and after2.partofspeech[0] == 'A':
                    pairs.append((after2.lemma[0], noun, text_id, text_type))

    return pairs

In [137]:
# Loop over all texts in the corpus, extract the pairs from each text and
# insert them into the all_pairs table.
start = datetime.now()
# Fallback so the final timing print still works when fewer than two texts
# are yielded (otherwise `first` would be undefined).
first = start

texts = collection.select(layers=collection.selected_layers,
                          return_index=False,
                          progressbar='notebook',
                          itersize=1)

for idx, text in enumerate(texts):
    if idx == 1:
        # Time elapsed until the second text arrives
        first = datetime.now()
        print(first - start)

    # NOTE(review): the enumeration index, not the collection's own text id,
    # is stored in the id column -- confirm this is intended.
    pairs = extract_pairs(text, idx)
    cursor.executemany("INSERT INTO all_pairs VALUES (%s, %s, %s, %s)", pairs)
    # Committed after every text
    con.commit()

end = datetime.now()
print(end - first)

DEBUG:subcollection.py:95: SELECT count(*) FROM (SELECT "estonian_text_corpora"."koondkorpus_base_v2"."id", "estonian_text_corpora"."koondkorpus_base_v2"."data", "estonian_text_corpora"."koondkorpus_base_v2__v166_words__layer"."data", "estonian_text_corpora"."koondkorpus_base_v2__v166_morph_analysis__layer"."data", "estonian_text_corpora"."koondkorpus_base_v2__v166_sentences__layer"."data" FROM "estonian_text_corpora"."koondkorpus_base_v2", "estonian_text_corpora"."koondkorpus_base_v2__v166_morph_analysis__layer", "estonian_text_corpora"."koondkorpus_base_v2__v166_sentences__layer", "estonian_text_corpora"."koondkorpus_base_v2__v166_words__layer" WHERE "estonian_text_corpora"."koondkorpus_base_v2"."id" = "estonian_text_corpora"."koondkorpus_base_v2__v166_morph_analysis__layer"."text_id" AND "estonian_text_corpora"."koondkorpus_base_v2"."id" = "estonian_text_corpora"."koondkorpus_base_v2__v166_sentences__layer"."text_id" AND "estonian_text_corpora"."koondkorpus_base_v2"."id" = "estonian

HBox(children=(FloatProgress(value=0.0, max=705356.0), HTML(value='')))

0:00:07.708527



OperationalError: server closed the connection unexpectedly
	This probably means the server terminated abnormally
	before or while processing the request.


In [153]:
# NOTE(review): all_pairs_counts is not created in any cell visible here --
# presumably an aggregated table built from all_pairs elsewhere; confirm and
# add the creating step so the notebook runs from a fresh kernel.
# fetchall() pulls the whole result set into memory.
cursor.execute("SELECT * FROM all_pairs_counts")
all_pairs = cursor.fetchall()

In [154]:
# Number of rows fetched from all_pairs_counts
len(all_pairs)

3070458