# Explore the corpora with SQLite databases

In [9]:
import sys
sys.path.append("../python/")
import pentoref.IO as IO
import sqlite3 as sqlite
from google_trans_new import google_translator

In [2]:
# One-off database build. For every sub-corpus the derived data is converted
# to data frames, which are then handed to IO.write_corpus_to_database together
# with the file name "<corpus>.db".
# Disabled by default: change the condition to True to (re)build the databases.
if False:
    for corpus_name in ("TAKE", "TAKECV", "PENTOCV"):
        source_dir = "../../../pentoref/{0}_PENTOREF".format(corpus_name)
        frames = IO.convert_subcorpus_raw_data_to_dataframes(source_dir)
        dfwords, dfutts, dfrefs, dfscenes, dfactions = frames
        target_db = "{0}.db".format(corpus_name)
        IO.write_corpus_to_database(target_db, corpus_name,
                                    dfwords, dfutts, dfrefs, dfscenes, dfactions)

In [3]:
# Open the SQLite file of the corpus selected by CORPUS.
CORPUS = "PENTOCV"
db = sqlite.connect("{0}.db".format(CORPUS))
cursor = db.cursor()

# List the column names of every table: field 1 of each row returned by
# PRAGMA table_info is the column name.
for table_name in ("utts", "words", "refs", "scenes", "actions"):
    table_info = cursor.execute("PRAGMA table_info({0})".format(table_name))
    print(table_name, [column[1] for column in table_info])

utts ['gameID', 'uttID', 'starttime', 'endtime', 'utt', 'utt_clean', 'role', 'speaker']
words ['gameID', 'uttID', 'position', 'word', 'lemma', 'tag']
refs ['refID', 'gameID', 'uttID', 'text', 'id', 'piece', 'location']
scenes ['timestampID', 'gameID', 'pieceID', 'position_global', 'position_x', 'position_y', 'shape', 'shape_distribution', 'shape_orientation', 'shape_skewness_horizontal', 'shape_skewness_vertical', 'shape_edges', 'colour', 'colour_distribution', 'colour_hsv', 'colour_rgb']
actions ['gameID', 'starttime', 'endtime', 'hand', 'action', 'piece']


## Get utterances from certain time periods in each experiment or for certain episodes

In [13]:
translator = google_translator()

# Utterances of a single game in chronological order.
# The game ID is bound as a parameter instead of being embedded as a
# double-quoted value: SQLite reads double quotes as identifiers and only
# falls back to a string literal through a legacy compatibility rule.
# To select by time window instead, filter on e.g.
#   "WHERE starttime >= 200 AND starttime <= 300"
GAME_ID = "r1_1_1_b"
utterance_query = (
    "SELECT gameID, starttime, speaker, utt_clean FROM utts"
    " WHERE gameID = ?"
    " ORDER BY gameID, starttime"
)

for row in db.execute(utterance_query, (GAME_ID,)):
    print(row)
    line = row[3]  # utt_clean
    print(line)
    if not line:
        # Nothing to translate for empty utterances.
        continue
    translate_text = translator.translate(line, lang_src='de', lang_tgt='en')
    print(translate_text)

('r1_1_1_b', 185.719, 'A', '')

('r1_1_1_b', 194.232, 'B', 'leg das orange L mit der kurzen Seite nach rechts zeigend die lange Seite nach unten zu dir zu deinem Bauch hin')
leg das orange L mit der kurzen Seite nach rechts zeigend die lange Seite nach unten zu dir zu deinem Bauch hin
Leg the Orange L with the short side pointing to the right the long side down to you to your belly 
('r1_1_1_b', 204.521, 'B', '')

('r1_1_1_b', 208.872, 'B', 'die kurze Seite nach rechts zeigend die lange Seite zu dir hin')
die kurze Seite nach rechts zeigend die lange Seite zu dir hin
The short side to the right pointing the long side to you 
('r1_1_1_b', 211.999, 'B', '')

('r1_1_1_b', 213.9, 'B', 'genau')
genau
I agree 
('r1_1_1_b', 214.356, 'B', '')

('r1_1_1_b', 215.105, 'B', 'das Kreuz nimmst du und setzt es oben auf die kurze Seite')
das Kreuz nimmst du und setzt es oben auf die kurze Seite
You take the cross and puts it up on the short side 
('r1_1_1_b', 219.022, 'B', '')

('r1_1_1_b', 219.251, '

## Get mutual information between words used in referring expressions and properties of the referent

In [5]:
from collections import Counter
from pentoref.IOutils import clean_utt

In [124]:
# Accumulators filled by the loop below.
piece_counter = Counter()       # piece id -> number of referring expressions
word_counter = Counter()        # word -> summed normalised count
word_piece_counter = Counter()  # "<piece>__<word>" -> summed normalised count
train_data = []                 # (feature_vector, piece) pairs used for training
test_data = []                  # (feature_vector, piece) pairs of the held-out pair

PAIR_NUM = "r10"  # gameID prefix (text before the first "_") of the held-out pair

# Alternative query for the TAKE corpora (scene pieces instead of referring expressions):
#   "SELECT shape, colour, orientation, gridPosition, gameID, pieceID FROM scenes ORDER by gameID"
refs_query = "SELECT id, gameID, text, uttID FROM refs ORDER by gameID"

for row in db.execute(refs_query):
    if False:
        # TAKE only: needs the scenes query above so that row[4] (gameID) and
        # row[5] (pieceID) exist. Skips pieces that no referring expression points to.
        isTarget = db.execute('SELECT refID FROM refs WHERE gameID = ? AND pieceID = ?',
                              (row[4], row[5]))
        if isTarget.fetchone() is None:
            continue
    print(row)
    #TAKE
    #shape, colour, orientation, gridPosition, gameID, pieceID = row
    #piece = colour  #+ "_" + shape #shape + "_" + colour

    #PENTOCV
    piece, gameID, text, uttID = row
    pair_num = gameID.split("_")[0]
    #if not pair_num == PAIR_NUM:
    #    continue

    piece_counter[piece] += 1

    if CORPUS in ["TAKECV", "TAKE"]:
        # Bag of words over every word of the game; values are bound as
        # parameters rather than concatenated into the SQL text.
        local_word_count = Counter()
        feature_vector = {}
        for f in db.execute('SELECT word from words WHERE gameID = ?', (str(gameID),)):
            for word in clean_utt(f[0].lower()).split():
                local_word_count[word] += 1
        # NOTE(review): this is the number of DISTINCT words, whereas the
        # PENTOCV branch divides by the token count - confirm this is intended.
        word_dem = len(local_word_count.values())
        for k, v in local_word_count.items():
            feature_vector[k] = v/word_dem
            word_counter[k] += (v/word_dem)
            word_piece_counter[piece+"__"+k] += (v/word_dem)
        # NOTE(review): feature_vector is never added to train_data/test_data
        # in this branch.
    elif CORPUS == "PENTOCV":
        # Tokenise the referring expression once; every token contributes
        # 1/len(tokens), so each non-empty expression adds a total weight of 1.
        tokens = clean_utt(text.lower()).split()
        word_dem = len(tokens)
        local_word_count = Counter(tokens)
        for word in tokens:
            word_counter[word] += (1/word_dem)
            word_piece_counter[piece+"__"+word] += (1/word_dem)
        feature_vector = {k: v/word_dem for k, v in local_word_count.items()}
        # Expressions of the held-out pair form the test set, all others the training set.
        if pair_num == PAIR_NUM:
            test_data.append((feature_vector, piece))
        else:
            train_data.append((feature_vector, piece))

('L', 'r1_1_1_b', 'das <v="orange">orangene</v> $L', 5887)
('X', 'r1_1_1_b', 'das Kreuz', 5893)
('X', 'r1_1_1_b', 'es', 5893)
('T', 'r1_1_1_b', 'es', 5899)
('T', 'r1_1_1_b', '(<p>d-</p> + das) grüne $T', 5901)
('T', 'r1_1_1_b', 'das $T', 5903)
('T', 'r1_1_1_b', 'das $T', 5907)
('T', 'r1_1_1_b', 'das $T', 5907)
('T', 'r1_1_1_b', '<v="es">\'s</v>', 5909)
('U', 'r1_1_1_b', 'das gelbe', 5919)
('U', 'r1_1_1_b', 'die Brücke', 5921)
('P', 'r1_1_1_b', 'das pinkfarbene Teil', 5924)
('I', 'r1_1_2_b', 'den blauen Balken', 5935)
('V', 'r1_1_2_b', 'das blaue $L', 5937)
('L', 'r1_1_2_b', 'das <v="orange">orangene</v> $L', 5939)
('L', 'r1_1_2_b', 'es', 5939)
('Z', 'r1_1_2_b', 'das $S', 5952)
('Z', 'r1_1_2_b', 'es', 5952)
('Z', 'r1_1_2_b', '<v="es">\'s</v> da', 5963)
('V', 'r1_1_2_b', 'den blauen', 5963)
('P', 'r1_1_2_b', 'das pinkfarbene Teil', 5969)
('P', 'r1_1_2_b', 'es', 5969)
('P', 'r1_1_3_b', 'das pinke Teil', 5982)
('P', 'r1_1_3_b', 'es', 5982)
('W', 'r1_1_3_b', 'grüne Treppe', 5984)
('W', 'r1_

('L', 'r4_2_13_b', 'den <v="orangen">orangenen</v> Stein', 10461)
('T', 'r4_2_13_b', 'des $Ts', 10461)
('I', 'r4_2_13_b', 'den langen blauen Stein', 10463)
('L', 'r4_2_13_b', 'den . <v="orangen">orangenen</v> Winkel', 10463)
('P', 'r4_2_13_b', 'den rosa Stein', 10468)
('X', 'r4_2_13_b', 'das rote Kreuz', 10472)
('X', 'r4_2_13_b', 'es', 10472)
('W', 'r4_2_13_b', 'das grüne $M', 10475)
('Y', 'r4_2_13_b', 'den braunen Stein', 10485)
('U', 'r4_2_13_b', 'den gelben Stein', 10500)
('U', 'r4_2_13_b', 'er', 10500)
('N', 'r4_2_13_s', 'den lilanen', 11260)
('U', 'r4_2_13_s', 'das gelbe $C', 11264)
('W', 'r4_2_13_s', 'das grüne $M', 11266)
('U', 'r4_2_7_b', 'gelbe Stein', 9905)
('U', 'r4_2_7_b', 'da drauf', 9911)
('U', 'r4_2_7_b', 'darein', 9911)
('X', 'r4_2_7_b', 'rote Kreuz', 9911)
('T', 'r4_2_7_b', 'das  . große grüne $T', 9913)
('X', 'r4_2_7_b', 'das Kreuz', 9913)
('T', 'r4_2_7_b', 'drauf', 9915)
('Z', 'r4_2_7_b', '(das $C {F äh} + das $Z)', 9919)
('N', 'r4_2_7_b', 'diesen <v="lila">lilanen</

('Y', 'r7_2_15_b', 'braun', 390)
('V', 'r7_2_15_b', 'den . blauen Winkel', 404)
('Y', 'r7_2_15_b', 'das Braune', 406)
('I', 'r7_2_15_b', 'das lange Blaue', 410)
('V', 'r7_2_15_b', 'Winkel', 410)
('N', 'r7_2_15_b', 'das <v="Lila">Lilane</v>', 415)
('T', 'r7_2_15_b', 'das .. grüne $T', 423)
('N', 'r7_2_15_b', 'das <v="Lila">Lilane</v>', 425)
('L', 'r7_2_15_b', 'das <v="Orange">Orangene</v>', 431)
('L', 'r7_2_15_b', 'das <v="Orange">Orangene</v>', 439)
('L', 'r7_2_15_b', 'das Gelbe', 439)
('U', 'r7_2_15_b', 'Gelbe', 439)
('X', 'r7_2_15_b', 'das rote Kreuz', 439)
('X', 'r7_2_15_b', 'das Rote', 456)
('W', 'r7_2_15_b', 'das', 466)
('Y', 'r7_2_15_s', 'Bra:un', 2217)
('L', 'r7_2_15_s', 'orange', 2217)
('X', 'r7_2_15_s', 'rot', 2217)
('I', 'r7_2_16_b', 'Den blauen Langen', 488)
('I', 'r7_2_16_b', 'das', 488)
('I', 'r7_2_16_b', 'darauf <ref id="V" piece="target"> den rechten Winkel', 496)
('V', 'r7_2_16_b', 'er', 496)
('Y', 'r7_2_16_b', 'das Braune', 502)
('Z', 'r7_2_16_b', 'Blaue', 511)
('P', '

In [125]:
# Size of the held-out set; it stays 0 when no referring expression belongs to pair PAIR_NUM.
len(test_data)

0

In [126]:
from nltk.classify import SklearnClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, classification_report
from random import shuffle
import numpy as np

def trainClassifier(trainData):
    """Train an NLTK-wrapped linear SVM and return the result of ``train``.

    ``trainData`` is passed unchanged to ``SklearnClassifier.train``; this
    notebook supplies a list of ``(feature_dict, label)`` pairs.

    Estimators that were tried in place of ``LinearSVC``:
    ``LogisticRegression()`` and ``MultinomialNB()``.
    """
    print("Training Classifier...")
    svm_wrapper = SklearnClassifier(LinearSVC(loss='squared_hinge'))
    return svm_wrapper.train(trainData)

def predictLabels(reviewSamples, classifier):
    """Return ``classifier.classify_many`` applied to the samples' feature sets.

    Element ``[0]`` of every item in ``reviewSamples`` is taken as its feature
    set; any further elements (e.g. the gold label) are ignored. The feature
    sets are handed over as a lazy iterator.
    """
    feature_sets = (sample[0] for sample in reviewSamples)
    return classifier.classify_many(feature_sets)

def predictLabel(text, classifier):
    """Classify a single raw referring-expression string.

    The text is converted into the same kind of feature vector that this
    notebook builds for the PENTOCV corpus: it is lower-cased, passed through
    ``clean_utt`` and split on whitespace, and every word's count is divided
    by the number of tokens. (The helpers ``toFeatureVector`` and
    ``preProcess`` are not defined anywhere in this notebook, so calling them
    would raise ``NameError``.)

    Args:
        text: the referring expression as a string.
        classifier: object with a ``classify(feature_dict)`` method, e.g. the
            value returned by ``trainClassifier``.

    Returns:
        Whatever ``classifier.classify`` returns for the feature vector.
    """
    tokens = clean_utt(text.lower()).split()
    token_total = len(tokens)
    # An empty token list yields an empty feature vector (no division happens).
    feature_vector = {word: count / token_total
                      for word, count in Counter(tokens).items()}
    return classifier.classify(feature_vector)

def crossValidate(dataset, folds, sample_fraction=1/8):
    """Run k-fold cross-validation on a random subsample of ``dataset``.

    Args:
        dataset: sequence of ``(feature_dict, label)`` pairs. It is copied
            before shuffling, so the caller's list keeps its order.
        folds: number of folds to evaluate.
        sample_fraction: fraction of the shuffled data that is kept
            (default 1/8). Pass 1 to cross-validate on all of the data.

    Returns:
        Tuple ``(precision, recall, f1)``: the weighted scores returned by
        ``precision_recall_fscore_support``, averaged over the folds.
    """
    # Work on a copy: shuffling in place would reorder the caller's list.
    dataset = list(dataset)
    shuffle(dataset)
    dataset = dataset[:int(len(dataset) * sample_fraction)]
    results = []

    # Spread the items as evenly as possible over the folds: the first `extra`
    # folds get one item more. This evaluates exactly `folds` folds whenever
    # there are at least `folds` items.
    base_size, extra = divmod(len(dataset), folds)
    start = 0
    for fold_index in range(folds):
        end = start + base_size + (1 if fold_index < extra else 0)
        if end == start:
            break  # fewer items than folds: nothing left to test on
        print("Fold start on items %d - %d" % (start, end))
        myTestData = dataset[start:end]
        myTrainData = dataset[:start] + dataset[end:]
        classifier = trainClassifier(myTrainData)
        y_true = [x[1] for x in myTestData]
        y_pred = predictLabels(myTestData, classifier)
        results.append(precision_recall_fscore_support(y_true, y_pred, average='weighted'))
        print(classification_report(y_true, y_pred))
        start = end

    # Average precision, recall and F1 separately across the folds.
    avgResults = (np.mean([x[0] for x in results]),
                  np.mean([x[1] for x in results]),
                  np.mean([x[2] for x in results]))
    return avgResults

# 10-fold cross-validation on the training data; prints the averaged
# (precision, recall, F1) tuple.
av = crossValidate(train_data, 10)
print(av)

# Disabled: train on all of train_data and evaluate on the held-out pair in
# test_data. Change the condition to True to run it.
if False:
    cl = trainClassifier(train_data)
    y_true = [gold for _, gold in test_data]
    y_pred = predictLabels(test_data, cl)
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    print(weighted_scores)
    print(classification_report(y_true, y_pred))


Fold start on items 0 - 35
Training Classifier...
              precision    recall  f1-score   support

           F       1.00      0.75      0.86         4
           I       0.67      1.00      0.80         2
           N       1.00      0.75      0.86         4
           P       1.00      0.50      0.67         2
           T       0.75      0.60      0.67         5
           U       1.00      1.00      1.00         1
           V       1.00      0.50      0.67         2
           W       0.43      1.00      0.60         3
           X       1.00      0.50      0.67         2
           Y       0.78      0.88      0.82         8
           Z       0.50      0.50      0.50         2

    accuracy                           0.74        35
   macro avg       0.83      0.72      0.74        35
weighted avg       0.82      0.74      0.75        35

Fold start on items 35 - 70
Training Classifier...
              precision    recall  f1-score   support

           F       0.75      0.

In [129]:
def _print_mean_scores(score_rows):
    """Print the mean of entries 0, 1 and 2 across ``score_rows``, one per line.

    The rows are presumably (precision, recall, F1[, support]) tuples as
    produced by the cross-validation above.
    """
    for column in range(3):
        print(np.mean([scores[column] for scores in score_rows]))


SEPARATOR = "*" * 30

# Results recorded from earlier runs.

# 10-fold cross-validation within each individual pair (smaller datasets)
ind_results = {
    "r1": (0.8224358974358974, 0.7846153846153846, 0.7778205128205128),
    "r2": (0.7678858784893269, 0.6637931034482759, 0.675242522256069),
    "r3": (0.8181915306915307, 0.7599067599067599, 0.758845497936407),
    "r4": (0.7992240687681375, 0.68470066518847, 0.6877442020358089),
    "r5": (0.7639380411255411, 0.6191666666666666, 0.6379182692307691),
    "r6": (0.9057487922705313, 0.8761528326745719, 0.8737389495294634),
    "r7": (0.8732662337662337, 0.8296739130434782, 0.8278420650668934),
    "r8": (0.7130039355485784, 0.5734432234432234, 0.5872288333290527),
}

# 10-fold cross-validation on a random 1/8th of the mixed data (four runs)
global_result_10f_of_random_eighth = [
    (0.7303828289936665, 0.6563546798029556, 0.6591810707081643),
    (0.7773714129329894, 0.7020689655172414, 0.705385763114827),
    (0.7340298765274135, 0.6626600985221675, 0.6634152186073369),
    (0.763595589960122, 0.6799014778325124, 0.6800262533435527),
]

# 8-fold and 10-fold cross-validation on the full mixed data
global_result_8 = (0.7844808924585374, 0.7293746248177686, 0.7410059116896628)
global_result_10 = (0.7905361390798162, 0.7292899089488714, 0.7417777879399795)

# Each pair tested on a model trained on the other seven pairs
other_results = {
    "r1": (0.7779836356055868, 0.6829268292682927, 0.690424865840207, None),
    "r2": (0.7219102027091829, 0.6401384083044983, 0.6505178871728421, None),
    "r3": (0.8292961148443981, 0.734375, 0.7530287500141206, None),
    "r4": (0.7893900944674533, 0.7048054919908466, 0.7258770742493045, None),
    "r5": (0.7215635412139649, 0.6059322033898306, 0.6263491634014793, None),
    "r6": (0.8328990107115107, 0.809375, 0.8035774233470034, None),
    "r7": (0.8375586804425542, 0.8149779735682819, 0.818528077822021, None),
    "r8": (0.7323545548007373, 0.5745098039215686, 0.5976834631543789, None),
}

# Report, in the same order as the data above.
print("Trained on same data 10f cross (average performance per pair) av 1/8 of data")
_print_mean_scores(ind_results.values())

print(SEPARATOR)
print("Trained on all data (mixed) 10-fold cross val (of random 1/8th of data)")
_print_mean_scores(global_result_10f_of_random_eighth)

print(SEPARATOR)
print("Trained on all data (mixed) 8-fold cross val (full_data)")
for score in global_result_8:
    print(score)

print(SEPARATOR)
print("Trained on other data 8-dialogue-fold cross-val")
_print_mean_scores(other_results.values())

Trained on same data 10f cross (average performance per pair) av 1/8 of data
0.8079617972619721
0.7239315686233538
0.728297606525622
******************************
Trained on all data (mixed) 10-fold cross val (of random 1/8th of data)
0.7513449271035478
0.6752463054187192
0.6770020764434702
******************************
Trained on all data (mixed) 8-fold cross val (full_data)
0.7844808924585374
0.7293746248177686
0.7410059116896628
******************************
Trained on other data 8-dialogue-fold cross-val
0.7803694793494236
0.6958800888054149
0.7082483381251696


In [None]:
# Piece identifiers that are expected; anything else seen in the data is reported.
good_pieces = ["X", "Y", "P", "N", "U", "F", "Z", "L", "T", "I", "W", "V", "UNK"]
unexpected_pieces = {
    piece_id: count
    for piece_id, count in piece_counter.items()
    if piece_id not in good_pieces
}
print("non standard pieces", unexpected_pieces)
piece_counter

In [102]:
# Vocabulary size, followed by the 20 words with the largest accumulated
# (normalised) counts.
print(len(word_counter), "words")
word_counter.most_common(20)

374 words


[('das', 94.59751915673368),
 ('ja', 74.96806652002296),
 ('rechts', 66.95496000139885),
 ('unten', 65.01167651587855),
 ('oben', 62.703650072015805),
 ('links', 62.102001006203004),
 ('richtig', 35.23788092527971),
 ('okay', 30.58326592447299),
 ('der', 26.705970444692667),
 ('rote', 21.798976854347682),
 ('gelbe', 21.03494271746558),
 ('graue', 16.95892332641557),
 ('blaue', 16.86355046913869),
 ('grüne', 16.14851689160511),
 ('kreuz', 15.95614052614051),
 ('einzige', 15.723457520740121),
 ('die', 15.714750797753208),
 ('form', 15.108645811756922),
 ('t', 14.205841142288502),
 ('ähm', 12.765858277653606)]

In [103]:
# Normalisers for the probability estimates computed from the counters.
# Note that word_total sums the joint word/piece counter (not word_counter);
# piece_total is the number of counted referring expressions.
word_total = sum(word_piece_counter.values())
piece_total= sum(piece_counter.values())

In [150]:

# For every piece, score each sufficiently frequent word by the ratio
#     p(word, piece) / (p(piece) * p(word))
# (the quantity whose logarithm is the pointwise mutual information) and
# print the TOP_N highest-scoring words.
TOP_N = 5           # number of words listed per piece
MIN_WORD_COUNT = 2  # words with a smaller accumulated count are skipped

for piece, piece_count in piece_counter.items():
    p_piece = piece_count/piece_total
    print("piece:", piece, piece_count, p_piece)
    rank = {}
    for word, w_count in word_counter.items():
        if w_count < MIN_WORD_COUNT:
            continue
        # Counter returns 0 for pairs that never co-occurred.
        joint_count = word_piece_counter[piece+"__"+word]
        p_word = w_count / word_total
        p_word_piece = joint_count / word_total
        # Sanity check of p(w|c)p(c) == p(c|w)p(w). The left side reduces to
        # joint_count/piece_total and the right side to joint_count/word_total,
        # so a MISMATCH line means the two totals differ enough to change the
        # fourth decimal place.
        bayes_check_1 = (joint_count/piece_count) * p_piece
        bayes_check_2 = (joint_count/w_count) * p_word
        if not round(bayes_check_1,4)== round(bayes_check_2,4):
            print("MISMATCH", word, bayes_check_1, bayes_check_2)
        mi = (p_word_piece/(p_piece * p_word))
        rank[word] = (mi, joint_count, p_word_piece, w_count, p_word)
    # Highest association score first; each line shows the word, its score and
    # the rounded [joint count, p(word, piece), word count, p(word)].
    for k, v in sorted(rank.items(), key=lambda x:x[1][0], reverse=True)[:TOP_N]:
        print(k,v[0], [round(x,4) for x in v[1:]])
    print("*" * 30)

piece: L 238 0.22995169082125605
MISMATCH das 0.056392914653784244 0.05660905815529811
MISMATCH l 0.02148148148148148 0.02156381598491366
orangene 4.348739495798319 [4.3333, 0.0042, 4.3333, 0.0042]
orange 4.217910991111246 [53.7333, 0.0521, 55.4, 0.0537]
orangen 3.937511174682639 [9.575, 0.0093, 10.575, 0.0103]
l-form 3.41686674669868 [1.8333, 0.0018, 2.3333, 0.0023]
l 2.9507723740564398 [22.2333, 0.0216, 32.7667, 0.0318]
******************************
piece: X 230 0.2222222222222222
kreuz 4.5 [53.0, 0.0514, 53.0, 0.0514]
roten 4.5 [5.6667, 0.0055, 5.6667, 0.0055]
x 4.5 [2.8333, 0.0027, 2.8333, 0.0027]
rot 4.5 [7.0, 0.0068, 7.0, 0.0068]
rote 4.4639999999999995 [31.0, 0.0301, 31.25, 0.0303]
******************************
piece: T 253 0.24444444444444444
MISMATCH das 0.06937198067632855 0.0696378705120001
MISMATCH grüne 0.022431561996779373 0.02251753798124791
dunkelgrüne 4.090909090909091 [11.25, 0.0109, 11.25, 0.0109]
t 3.8354691665949945 [66.0667, 0.0641, 70.4667, 0.0683]
ts 3.6818181

In [153]:
# Close the database connection; cells that query `db` need a fresh connection afterwards.
db.close()

In [106]:
"""
p(w, c) = p(w|c)p(c) = p(c|w)p(w)
p(w) is the likelihood of the word appearing in a referring expression
p(c) is the likelihood of a referring expression having this class
p(w|c) is the likelihood of a word appearing given we've got this class
p(c|w) is the likelihood of this class appearing given we've got this word
"""

SyntaxError: invalid syntax (<ipython-input-106-dc7248c79cbb>, line 1)