In [1]:
import pandas as pd
import numpy as np
from pycorenlp import StanfordCoreNLP 
import re, time, bisect, math
import TreeKernel as tk
import TreeBuild as tb

In [4]:
from gensim.models.keyedvectors import KeyedVectors
# Load word2vec embeddings from the local GoogleNews binary file into memory.
# NOTE(review): no cell visible in this section passes `word_vectors` anywhere;
# presumably it is the `modelPtr` intended for _w2vSimilarity_ — confirm.
word_vectors = KeyedVectors.load_word2vec_format('./w2v_model/GoogleNews-vectors-negative300.bin', binary=True)  # C binary format

In [2]:
# Client handle for a Stanford CoreNLP server addressed at localhost:9000;
# _getNLPToks_ sends its annotate requests through this object.
nlp = StanfordCoreNLP('http://localhost:9000')

In [3]:
def _getNLPToks_(rawSentence):
    """Return the CoreNLP constituency parse of ``rawSentence`` as a list of lines.

    The text is sent through the module-level ``nlp`` client with the
    ``tokenize,ssplit,pos,parse`` annotators and JSON output. Only the parse
    of the first sentence reported by the server is used; any further
    sentences in the input are ignored.

    Parameters
    ----------
    rawSentence : str
        Text to parse.

    Returns
    -------
    list of str
        The first sentence's ``parse`` string split on newline characters.

    Raises
    ------
    RuntimeError
        If the reply is not decoded JSON. pycorenlp hands back the raw
        response text in that case (e.g. a timeout or server error message).
    ValueError
        If the reply contains no sentences (for example, blank input).
    """
    output = nlp.annotate(
        rawSentence,
        properties={
            'annotators': 'tokenize,ssplit,pos,parse',
            'outputFormat': 'json',
        },
    )

    # pycorenlp swallows JSON decoding failures and returns the response body
    # as a plain string; indexing it would only raise a cryptic TypeError.
    if not isinstance(output, dict):
        raise RuntimeError(
            "CoreNLP did not return JSON for %r: %s"
            % (rawSentence, str(output)[:200])
        )

    sentences = output.get('sentences') or []
    if not sentences:
        raise ValueError("CoreNLP returned no sentences for %r" % (rawSentence,))

    return sentences[0]['parse'].split("\n")

In [4]:
def _w2vSimilarity_(word1, word2, modelPtr):
    return modelPtr.similarity(word1, word2)

## Test: Tree Generation and Collins-Duffy Kernel

In [8]:
# Sentence pair for an end-to-end check of the Collins-Duffy kernel.
# Alternative short pair for quick checks: "I tom?" / "I, Claudius?"
test_b1 = "Why do we enjoy parties in the scope of the universe"
test_b2 = "What are college parties like?"

# Reference point: every timing line below is cumulative from here.
start_time = time.time()

# Empty trees, handed to tb._generateTree_ further down to be populated.
tree_b1, tree_b2 = tb.tree(), tb.tree()

# Stage 1: parse both sentences with Stanford CoreNLP (parse lines per sentence).
toks_b1, toks_b2 = _getNLPToks_(test_b1), _getNLPToks_(test_b2)
print("--- %s seconds ---" % (time.time() - start_time)) # tokenize

# Stage 2: build a tree structure from each list of parse lines.
tb._generateTree_(toks_b1, tree_b1)
tb._generateTree_(toks_b2, tree_b2)
print("--- %s seconds ---" % (time.time() - start_time)) # generate tree

# Stage 3: flip the trees.
tb._flipTree_(tree_b1)
tb._flipTree_(tree_b2)
print("--- %s seconds ---" % (time.time() - start_time)) # flip tree

# Stage 4: score the pair.
# NOTE(review): 0.8, 1, 0 are positional kernel settings defined in TreeKernel;
# their meaning is not visible here. The training-data loop passes 1 as the
# last argument instead of 0 — confirm which is intended.
rawScore, normScore = tk._CollinsDuffy_(tree_b1, tree_b2, 0.8, 1, 0)
print("--- %s seconds ---" % (time.time() - start_time)) # tree kernel

print("Raw Score: %s" % rawScore)
print("Norm Score: %s" % normScore)

--- 0.03782510757446289 seconds ---
--- 0.0381472110748291 seconds ---
--- 0.038294076919555664 seconds ---
--- 0.041108131408691406 seconds ---
Raw Score: 0.8
Norm Score: 0.124034734589


## Test: Tree Generation and Moschitti PT Kernel

In [6]:
# Sentence pair for an end-to-end check of the Moschitti PT kernel.
# Alternative short pair for quick checks: "I tom?" / "I, Claudius?"
test_b1 = "Why do we enjoy parties in the scope of the universe"
test_b2 = "What are college parties like?"

# Reference point: every timing line below is cumulative from here.
start_time = time.time()

# Empty trees, handed to tb._generateTree_ further down to be populated.
tree_b1, tree_b2 = tb.tree(), tb.tree()

# Stage 1: parse both sentences with Stanford CoreNLP (parse lines per sentence).
toks_b1, toks_b2 = _getNLPToks_(test_b1), _getNLPToks_(test_b2)
print("--- %s seconds ---" % (time.time() - start_time)) # tokenize

# Stage 2: build a tree structure from each list of parse lines.
tb._generateTree_(toks_b1, tree_b1)
tb._generateTree_(toks_b2, tree_b2)
print("--- %s seconds ---" % (time.time() - start_time)) # generate tree

# Stage 3: flip the trees.
tb._flipTree_(tree_b1)
tb._flipTree_(tree_b2)
print("--- %s seconds ---" % (time.time() - start_time)) # flip tree

# Stage 4: score the pair.
# NOTE(review): 0.8, 0.4, 1 are positional kernel settings defined in
# TreeKernel; their meaning is not visible here — confirm before tuning.
rawScore, normScore = tk._MoschittiPT_(tree_b1, tree_b2, 0.8, 0.4, 1)
print("--- %s seconds ---" % (time.time() - start_time)) # tree kernel

print("Raw Score: %s" % rawScore)
print("Norm Score: %s" % normScore)

--- 0.03903079032897949 seconds ---
--- 0.03934288024902344 seconds ---
--- 0.03950095176696777 seconds ---
--- 0.04249095916748047 seconds ---
Raw Score: 3.36
Norm Score: 0.326722528894


## Test: Tree Kernels on Training Data

In [9]:
from random import randint

def read_data(path_to_file):
    """Load a CSV file and return it without duplicate or incomplete rows.

    Prints the frame's shape before and after cleaning. Exact duplicate rows
    are removed first, then every row holding a missing value. Surviving rows
    keep their original index labels (the index is not renumbered).

    Parameters
    ----------
    path_to_file : str
        Path handed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    loaded = pd.read_csv(path_to_file)
    print("Shape of base training File = ", loaded.shape)

    # Chained, non-mutating form of the two clean-up steps.
    cleaned = loaded.drop_duplicates().dropna()
    print("Shape of base training data after cleaning = ", cleaned.shape)
    return cleaned

# Load and clean the training CSV; the path is relative to the working directory.
df_train = read_data("input/train.csv")

Shape of base training File =  (404290, 6)
Shape of base training data after cleaning =  (404288, 6)


In [10]:
## SET UP TEST INDICES TO SELECT RANDOM ROWS

TEST_SAMPLES = 100

# Draw row *labels* (with replacement) from the cleaned frame's own index.
# - random.randint includes both endpoints, so the upper bound is len - 1;
#   using the row count itself can yield a position one past the end.
# - read_data drops rows without renumbering, and the scoring loop looks rows
#   up by label, so an arbitrary integer in [0, n_rows] may not exist.
indices = np.array([df_train.index[randint(0, len(df_train) - 1)]
                    for _ in range(TEST_SAMPLES)])

In [13]:
# Score every sampled question pair with both tree kernels and print the results.
for ind in indices:

    # .at is the label-based scalar accessor that replaces DataFrame.get_value
    # (deprecated in pandas 0.21, removed in 1.0). Read each question once.
    question_1 = df_train.at[ind, 'question1']
    question_2 = df_train.at[ind, 'question2']

    tree_1 = tb.tree()
    tree_2 = tb.tree()

    toks_1 = _getNLPToks_(question_1)
    toks_2 = _getNLPToks_(question_2)

    # Generate a tree structure
    tb._generateTree_(toks_1, tree_1)
    tb._generateTree_(toks_2, tree_2)

    # Flip the trees
    tb._flipTree_(tree_1)
    tb._flipTree_(tree_2)

    # NOTE(review): the numeric arguments are positional kernel settings
    # defined in TreeKernel; their meaning is not visible here.
    (scoreRaw_cd, scoreNorm_cd) = tk._CollinsDuffy_(tree_1, tree_2, 0.8, 1, 1)
    (scoreRaw_pt, scoreNorm_pt) = tk._MoschittiPT_(tree_1, tree_2, 0.8, 0.4, 1)
    print("%s\n%s" % (question_1, question_2))
    print("Collins-Duffy | Norm: %f | Raw: %f" % (scoreNorm_cd, scoreRaw_cd))
    print("Moschitti | Norm: %f | Raw: %f\n" % (scoreNorm_pt, scoreRaw_pt))

What are the jobs for me after completing master's in economics?
What are the jobs after m.a economic?
Collins-Duffy | Norm: 0.545545 | Raw: 4.000000
Moschitti | Norm: 0.722947 | Raw: 11.168000

A couple who has 5 sons went for a picnic. Each son has 7 sisters. Each sister has 3 babies. In total, how many people went for the picnic?
Brain Teasers: Two women set out to the market to sell some oranges. Each  had 30 oranges. The first lady sold oranges at 2 for a rupee and second lady sold oranges at 3 for rupee. At last, the first and second lady made Rs 15 and Rs 10 respectively. So the total amount is Rs 25. The next day, when they sold their 30 oranges together to make business profitable, they pooled their sixty oranges and sold at rate of 5 for Rs 2. After they sold all oranges, they found they had only Rs 24. They could not understand where the one rupee went. Where did it go?
Collins-Duffy | Norm: 0.000000 | Raw: 0.000000
Moschitti | Norm: 0.451110 | Raw: 8.704000

How do I burn t