In [1]:
import pandas as pd
import numpy as np
from pycorenlp import StanfordCoreNLP 
import re, time, bisect, math
import TreeKernel as tk
import TreeBuild as tb

In [9]:
from gensim.models.keyedvectors import KeyedVectors
# Load word2vec-format vectors from a binary file under ./w2v_model (path is
# relative to the working directory). The resulting object is what
# _w2vSimilarity_ expects as its `modelPtr` argument.
# NOTE(review): judging by the filename this is the pretrained GoogleNews
# 300-dimensional model, which is large and slow to load — confirm before
# relying on a full "Restart & Run All".
word_vectors = KeyedVectors.load_word2vec_format('./w2v_model/GoogleNews-vectors-negative300.bin', binary=True)  # C binary format

In [2]:
# Module-level CoreNLP client used by _getNLPToks_; it targets a server at
# http://localhost:9000.
# NOTE(review): presumably the Stanford CoreNLP server must already be running
# on that port before any parsing cell is executed — confirm.
nlp = StanfordCoreNLP('http://localhost:9000')

In [4]:
def _getNLPToks_(rawSentence):
    """Return the CoreNLP parse of `rawSentence` as a list of lines.

    The text is sent to the module-level `nlp` client with the
    tokenize/ssplit/pos/depparse/parse annotators and JSON output. Only the
    first entry of the response's 'sentences' list is used, so if the input
    is split into several sentences the remaining ones are ignored.

    Returns:
        list of str: the 'parse' string of sentence 0, split on newlines.
    """
    request_props = {
        'annotators': 'tokenize,ssplit,pos,depparse,parse',
        'outputFormat': 'json',
    }
    annotation = nlp.annotate(rawSentence, properties=request_props)
    first_sentence = annotation['sentences'][0]
    return first_sentence['parse'].split("\n")

In [12]:
def _w2vSimilarity_(word1, word2, modelPtr):
    return modelPtr.similarity(word1, word2)

## Test: Tree Generation and Collins-Duffy Kernel

In [5]:
test_b1 = "Why do we enjoy parties in the scope of the universe"
test_b2 = "What are college parties like?"

# The clock starts here; every timing line printed below is cumulative from
# this point, not the cost of the individual stage.
start_time = time.time()

tree_b1, tree_b2 = tb.tree(), tb.tree()

# Stage 1: fetch the parse lines for both sentences from Stanford CoreNLP.
toks_b1, toks_b2 = _getNLPToks_(test_b1), _getNLPToks_(test_b2)
print("--- %s seconds ---" % (time.time() - start_time)) # tokenize

# Stage 2: populate each (initially empty) tree from its parse lines.
for parse_lines, parse_tree in ((toks_b1, tree_b1), (toks_b2, tree_b2)):
    tb._generateTree_(parse_lines, parse_tree)
print("--- %s seconds ---" % (time.time() - start_time)) # generate tree

# Stage 3: flip both trees. The return value is not used, so the trees are
# presumably modified in place.
for parse_tree in (tree_b1, tree_b2):
    tb._flipTree_(parse_tree)
print("--- %s seconds ---" % (time.time() - start_time)) # flip tree

# Stage 4: Collins-Duffy kernel, which returns a (raw, normalised) score pair.
# NOTE(review): 0.8 sits in the position where a later cell passes `lamb`;
# the meaning of the trailing 1 and 0 is not visible here — confirm in
# TreeKernel.
rawScore, normScore = tk._CollinsDuffy_(tree_b1, tree_b2, 0.8, 1, 0)
print("--- %s seconds ---" % (time.time() - start_time)) # tree kernel
print("Raw Score: %s" % (rawScore))
print("Norm Score: %s" % (normScore))

--- 0.07650089263916016 seconds ---
--- 0.07787394523620605 seconds ---
--- 0.07800984382629395 seconds ---
--- 0.08410000801086426 seconds ---
Raw Score: 0.8
Norm Score: 0.124034734589


## Test: Tree Generation and Moschitti PT Kernel

## Test: Tree Kernels on Training Data

In [7]:
from random import randint

def read_data(path_to_file):
    """Load a CSV file and return it without duplicate or incomplete rows.

    The shape is printed before and after cleaning. Cleaning removes exact
    duplicate rows first and then every row containing a missing value. The
    index is not reset, so the returned frame keeps the original row labels
    (with gaps where rows were removed).

    Args:
        path_to_file: path handed to `pd.read_csv`.

    Returns:
        pandas.DataFrame: the cleaned data.
    """
    raw_frame = pd.read_csv(path_to_file)
    print ("Shape of base training File = ", raw_frame.shape)
    cleaned_frame = raw_frame.drop_duplicates().dropna()
    print("Shape of base training data after cleaning = ", cleaned_frame.shape)
    return cleaned_frame

# Load and clean the training set. The path is relative to the working
# directory. read_data does not reset the index, so df_train keeps the
# original row labels with gaps where rows were dropped.
df_train = read_data("input/train.csv")

Shape of base training File =  (404290, 6)
Shape of base training data after cleaning =  (404288, 6)


In [8]:
## SET UP TEST INDICES TO SELECT RANDOM ROWS

TEST_SAMPLES = 100

# Draw random row *positions* and translate them into index *labels*.
#  - random.randint is inclusive on both ends, so the upper bound must be
#    n_rows - 1; the previous bound of n_rows could yield an out-of-range row.
#  - read_data drops rows without resetting the index, so the labels of
#    df_train have gaps. The later lookups are label-based, hence we store
#    labels taken from df_train.index rather than raw integers.
# Sampling is with replacement, so the same row may be picked more than once.
n_rows = df_train.shape[0]
indices = np.array([df_train.index[randint(0, n_rows - 1)] for _ in range(TEST_SAMPLES)])

In [9]:
# Run the Collins-Duffy kernel on every sampled question pair and report the
# total wall-clock time at the end.
start_time = time.time()

for ind in indices:

    # Label-based scalar lookup. `.at` replaces DataFrame.get_value, which was
    # deprecated in pandas 0.21 and removed in 1.0. Each question is fetched
    # once and reused for both parsing and printing.
    question_1 = df_train.at[ind, 'question1']
    question_2 = df_train.at[ind, 'question2']

    tree_1 = tb.tree()
    tree_2 = tb.tree()

    toks_1 = _getNLPToks_(question_1)
    toks_2 = _getNLPToks_(question_2)

    # Generate a tree structure
    tb._generateTree_(toks_1, tree_1)
    tb._generateTree_(toks_2, tree_2)

    # Flip the trees
    tb._flipTree_(tree_1)
    tb._flipTree_(tree_2)

    print("%s\n%s" % (question_1, question_2))

    # Single-element list kept so further lambda values can be added easily.
    for lamb in [1]:
        (scoreRaw_cd, scoreNorm_cd) = tk._CollinsDuffy_(tree_1, tree_2, lamb, 1, 1)
        print("Collins-Duffy | Norm: %f | Raw: %f | Lambda: %3f" % (scoreNorm_cd, scoreRaw_cd, lamb))

print("--- %s seconds ---" % (time.time() - start_time)) # tree kernel


What are the treatments available for knee pain?
What are the best treatments available for knee pain?
Collins-Duffy | Norm: 0.942809 | Raw: 8.000000 | Lambda: 1.000000
Is Shri Atal Bihari Vajpayee one of the best prime ministers of India? What has been his contribution to India? Does he really deserve the Bharat Ratna that was awarded to him, which, other prime ministers could or should have gotten, but didn't?
Why is Atal Bihari Vajpayee considered one of the best Prime Ministers of India?
Collins-Duffy | Norm: 0.710047 | Raw: 11.000000 | Lambda: 1.000000
How close is the US to eliminating cash entirely and replacing it with electronic currency?
What states in the US use currency other than the US dollar?
Collins-Duffy | Norm: 0.322749 | Raw: 5.000000 | Lambda: 1.000000
What should I do to get more answers of my questions from Quora?
What can I do to assertively make my Quora question interesting enough to answer?
Collins-Duffy | Norm: 0.322329 | Raw: 4.000000 | Lambda: 1.000000
How 

In [31]:
import pickle

# with open('filename.pickle', 'wb') as handle:
#     pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('filename.pickle', 'rb') as handle:
#     b = pickle.load(handle)

# print a == b

import json, os
# Build the flipped parse trees for the first MAX_PICKLED_ROWS question pairs
# and store them in 'testoutput' as a sequence of pickled {'q1', 'q2'} dicts.
MAX_PICKLED_ROWS = 100

# 'wb' instead of 'ab': the loop always starts from the first row, so append
# mode stored the same records again on every re-run of this cell.
with open('testoutput', 'wb') as fout:
    # head() bounds the iteration; no manual counter/break needed.
    for row in df_train.head(MAX_PICKLED_ROWS).iterrows():
        # iterrows yields (index_label, Series); the Series is row[1].
        question_1 = row[1]['question1']
        question_2 = row[1]['question2']

        tree_1 = tb.tree()
        tree_2 = tb.tree()

        print(row)

        print(question_1)
        print(question_2)

        toks_1 = _getNLPToks_(question_1)
        toks_2 = _getNLPToks_(question_2)

        # Generate a tree structure
        tb._generateTree_(toks_1, tree_1)
        tb._generateTree_(toks_2, tree_2)

        # Flip the trees
        tb._flipTree_(tree_1)
        tb._flipTree_(tree_2)

        tmp = {'q1':tree_1, 'q2':tree_2}

        pickle.dump(tmp, fout, protocol=pickle.HIGHEST_PROTOCOL)

        # The trees are already serialised at this point.
        # NOTE(review): clearing them presumably just releases memory early —
        # confirm against TreeBuild.
        tree_1.clear()
        tree_2.clear()

(0, id                                                              0
qid1                                                            1
qid2                                                            2
question1       What is the step by step guide to invest in sh...
question2       What is the step by step guide to invest in sh...
is_duplicate                                                    0
Name: 0, dtype: object)
What is the step by step guide to invest in share market in india?
What is the step by step guide to invest in share market?
(1, id                                                              1
qid1                                                            3
qid2                                                            4
question1       What is the story of Kohinoor (Koh-i-Noor) Dia...
question2       What would happen if the Indian government sto...
is_duplicate                                                    0
Name: 1, dtype: object)
What is the story of Kohino

In [24]:
# Score every pickled tree pair in 'testoutput' with the Collins-Duffy kernel.

# Defined here so the cell no longer depends on a loop variable leaked from
# another cell, and so the printed lambda is the value actually passed in.
lamb = 1.0

# SECURITY: pickle.load can execute arbitrary code; only read files that this
# notebook itself produced.
with open('testoutput', 'rb') as handle:
    while True:
        # Only the load is guarded: EOFError marks the end of the record
        # stream, and an error raised by the kernel must not be swallowed.
        try:
            tmp = pickle.load(handle)
        except EOFError:
            break
        (scoreRaw_cd, scoreNorm_cd) = tk._CollinsDuffy_(tmp['q1'], tmp['q2'], lamb, 1, 1)
        print("Collins-Duffy | Norm: %f | Raw: %f | Lambda: %3f" % (scoreNorm_cd, scoreRaw_cd, lamb))

Collins-Duffy | Norm: 0.944911 | Raw: 15.000000 | Lambda: 1.000000
Collins-Duffy | Norm: 0.404061 | Raw: 4.000000 | Lambda: 1.000000
Collins-Duffy | Norm: 0.286039 | Raw: 3.000000 | Lambda: 1.000000
Collins-Duffy | Norm: 0.000000 | Raw: 0.000000 | Lambda: 1.000000
Collins-Duffy | Norm: 0.419314 | Raw: 4.000000 | Lambda: 1.000000
Collins-Duffy | Norm: 0.500773 | Raw: 9.000000 | Lambda: 1.000000
Collins-Duffy | Norm: 0.000000 | Raw: 0.000000 | Lambda: 1.000000
Collins-Duffy | Norm: 0.503953 | Raw: 4.000000 | Lambda: 1.000000
Collins-Duffy | Norm: 0.612372 | Raw: 6.000000 | Lambda: 1.000000
Collins-Duffy | Norm: 0.447214 | Raw: 3.000000 | Lambda: 1.000000
Collins-Duffy | Norm: 0.139010 | Raw: 2.000000 | Lambda: 1.000000
Collins-Duffy | Norm: 0.365148 | Raw: 2.000000 | Lambda: 1.000000
Collins-Duffy | Norm: 0.668153 | Raw: 5.000000 | Lambda: 1.000000
Collins-Duffy | Norm: 0.816497 | Raw: 2.000000 | Lambda: 1.000000
Collins-Duffy | Norm: 0.901421 | Raw: 29.000000 | Lambda: 1.000000
Collins-

In [27]:
# Show the values of the second row. `df_train[1]` looks up a *column*
# labelled 1 and raised KeyError; positional row access needs `.iloc`.
df_train.iloc[1].values