In [8]:
import pandas as pd
import numpy as np
from pycorenlp import StanfordCoreNLP 
import re, time, bisect, math
import TreeKernel as tk
import TreeBuild as tb

In [9]:
from gensim.models.keyedvectors import KeyedVectors
# Load word2vec-format vectors from the local GoogleNews binary file.
# NOTE(review): presumably a multi-GB load; no later cell visible here reads
# word_vectors directly (only _w2vSimilarity_ takes a model argument) — confirm it is still needed.
word_vectors = KeyedVectors.load_word2vec_format('./w2v_model/GoogleNews-vectors-negative300.bin', binary=True)  # C binary format

In [10]:
# Client for a Stanford CoreNLP server at http://localhost:9000; _getNLPToks_
# sends its annotate requests through this object.
# NOTE(review): presumably the server must already be running before any parse cell executes — confirm.
nlp = StanfordCoreNLP('http://localhost:9000')

In [16]:
def _getNLPToks_(rawSentence):
    """Return the CoreNLP constituency parse of a sentence as a list of lines.

    The text is sent to the module-level ``nlp`` client with the
    ``tokenize,ssplit,pos,parse`` annotators and JSON output. Only the parse
    of the FIRST sentence found by the sentence splitter is returned; any
    further sentences in ``rawSentence`` are ignored.

    Parameters
    ----------
    rawSentence : str
        Raw text to parse.

    Returns
    -------
    list of str
        The first sentence's ``parse`` string split on newlines.

    Raises
    ------
    RuntimeError
        If the client did not hand back a decoded JSON object.
    ValueError
        If the response contains no sentences (e.g. empty input).
    """
    output = nlp.annotate(rawSentence, properties={
      'annotators': 'tokenize,ssplit,pos,parse',
      'outputFormat': 'json'
      })

    # pycorenlp returns the raw response text instead of a dict when the body
    # is not valid JSON (e.g. a server-side error or timeout message); indexing
    # that string would otherwise fail with an unhelpful TypeError.
    if not isinstance(output, dict):
        raise RuntimeError(
            "CoreNLP did not return JSON for %r: %s" % (rawSentence, output))

    sentences = output.get('sentences') or []
    if not sentences:
        raise ValueError(
            "CoreNLP produced no sentences for %r" % (rawSentence,))

    return sentences[0]['parse'].split("\n")

In [12]:
def _w2vSimilarity_(word1, word2, modelPtr):
    """Return ``modelPtr.similarity(word1, word2)``.

    ``modelPtr`` is any object exposing a ``similarity(a, b)`` method; the
    two words are passed through unchanged and in the same order.
    """
    score = modelPtr.similarity(word1, word2)
    return score

## Test: Tree Generation and Collins-Duffy Kernel

In [15]:
# Collins-Duffy smoke test on one hand-picked question pair. After each stage
# the cumulative time since start_time is printed.
test_b1 = "Why do we enjoy parties in the scope of the universe"
test_b2 = "What are college parties like?"

# Shorter alternative inputs for quick experiments:
# test_b1 = "I tom?"
# test_b2 = "I, Claudius?"
start_time = time.time()

tree_b1, tree_b2 = tb.tree(), tb.tree()

# Parse lines for each sentence, obtained through Stanford CoreNLP
toks_b1, toks_b2 = _getNLPToks_(test_b1), _getNLPToks_(test_b2)
print("--- %s seconds ---" % (time.time() - start_time)) # tokenize

# Fill each tree from its parse lines
for parse_lines, target_tree in ((toks_b1, tree_b1), (toks_b2, tree_b2)):
    tb._generateTree_(parse_lines, target_tree)

print("--- %s seconds ---" % (time.time() - start_time)) # generate tree

# Flip both trees in place (see TreeBuild._flipTree_ for what flipping means)
for target_tree in (tree_b1, tree_b2):
    tb._flipTree_(target_tree)

print("--- %s seconds ---" % (time.time() - start_time)) # flip tree

# NOTE(review): the meaning of the three trailing arguments (0.8, 1, 0) is
# defined in TreeKernel._CollinsDuffy_ — confirm there before tuning.
rawScore, normScore = tk._CollinsDuffy_(tree_b1, tree_b2, 0.8, 1, 0)
print("--- %s seconds ---" % (time.time() - start_time)) # tree kernel

print("Raw Score: %s" % (rawScore))
print("Norm Score: %s" % (normScore))

{'sentences': [{'enhancedDependencies': [{'dep': 'ROOT', 'dependent': 4, 'governor': 0, 'dependentGloss': 'enjoy', 'governorGloss': 'ROOT'}, {'dep': 'advmod', 'dependent': 1, 'governor': 4, 'dependentGloss': 'Why', 'governorGloss': 'enjoy'}, {'dep': 'aux', 'dependent': 2, 'governor': 4, 'dependentGloss': 'do', 'governorGloss': 'enjoy'}, {'dep': 'nsubj', 'dependent': 3, 'governor': 4, 'dependentGloss': 'we', 'governorGloss': 'enjoy'}, {'dep': 'dobj', 'dependent': 5, 'governor': 4, 'dependentGloss': 'parties', 'governorGloss': 'enjoy'}, {'dep': 'case', 'dependent': 6, 'governor': 8, 'dependentGloss': 'in', 'governorGloss': 'scope'}, {'dep': 'det', 'dependent': 7, 'governor': 8, 'dependentGloss': 'the', 'governorGloss': 'scope'}, {'dep': 'nmod:in', 'dependent': 8, 'governor': 4, 'dependentGloss': 'scope', 'governorGloss': 'enjoy'}, {'dep': 'case', 'dependent': 9, 'governor': 11, 'dependentGloss': 'of', 'governorGloss': 'universe'}, {'dep': 'det', 'dependent': 10, 'governor': 11, 'dependen

## Test: Tree Generation and Moschitti PT Kernel

In [24]:
# Moschitti PT smoke test on the same hand-picked question pair. After each
# stage the cumulative time since start_time is printed.
test_b1 = "Why do we enjoy parties in the scope of the universe"
test_b2 = "What are college parties like?"

# Shorter alternative inputs for quick experiments:
# test_b1 = "I tom?"
# test_b2 = "I, Claudius?"
start_time = time.time()

tree_b1, tree_b2 = tb.tree(), tb.tree()

# Parse lines for each sentence, obtained through Stanford CoreNLP
toks_b1, toks_b2 = _getNLPToks_(test_b1), _getNLPToks_(test_b2)
print("--- %s seconds ---" % (time.time() - start_time)) # tokenize

# Fill each tree from its parse lines
for parse_lines, target_tree in ((toks_b1, tree_b1), (toks_b2, tree_b2)):
    tb._generateTree_(parse_lines, target_tree)

print("--- %s seconds ---" % (time.time() - start_time)) # generate tree

# Flip both trees in place (see TreeBuild._flipTree_ for what flipping means)
for target_tree in (tree_b1, tree_b2):
    tb._flipTree_(target_tree)

# Dump every (id, node) entry of the first tree for visual inspection
for node_entry in tree_b1.items():
    print(node_entry)

print("--- %s seconds ---" % (time.time() - start_time)) # flip tree

# NOTE(review): the meaning of the three trailing arguments (0.8, 0.4, 1) is
# defined in TreeKernel._MoschittiPT_ — confirm there before tuning.
rawScore, normScore = tk._MoschittiPT_(tree_b1, tree_b2, 0.8, 0.4, 1)
print("--- %s seconds ---" % (time.time() - start_time)) # tree kernel

print("Raw Score: %s" % (rawScore))
print("Norm Score: %s" % (normScore))

--- 0.04182696342468262 seconds ---
--- 0.042407989501953125 seconds ---
(0, {'indent': 0, 'curid': 0, 'childrenTok': 'SBARQ', 'parid': -1, 'posOrTok': 'ROOT', 'children': [1]})
(1, {'indent': 2, 'curid': 1, 'childrenTok': ['WHADVP', 'SQ'], 'parid': 0, 'posOrTok': 'SBARQ', 'children': [2, 5]})
(2, {'indent': 4, 'curid': 2, 'childrenTok': ['WRB'], 'parid': 1, 'posOrTok': 'WHADVP', 'children': [3]})
(3, {'indent': 6, 'curid': 3, 'childrenTok': ['Why'], 'parid': 2, 'posOrTok': 'WRB', 'children': [4]})
(4, {'indent': 6, 'curid': 4, 'childrenTok': [], 'parid': 3, 'posOrTok': 'Why', 'children': []})
(5, {'indent': 4, 'curid': 5, 'childrenTok': ['VBP', 'NP', 'VP'], 'parid': 1, 'posOrTok': 'SQ', 'children': [6, 8, 11]})
(6, {'indent': 6, 'curid': 6, 'childrenTok': ['do'], 'parid': 5, 'posOrTok': 'VBP', 'children': [7]})
(7, {'indent': 6, 'curid': 7, 'childrenTok': [], 'parid': 6, 'posOrTok': 'do', 'children': []})
(8, {'indent': 6, 'curid': 8, 'childrenTok': ['PRP'], 'parid': 5, 'posOrTok': 'N

## Test: Tree Kernels on Training Data

In [18]:
from random import randint

def read_data(path_to_file):
    """Read a CSV file and return it without duplicate or incomplete rows.

    The frame's shape is printed once right after loading and once after
    cleaning. Exact duplicate rows are dropped first, then every row that
    contains a missing value. Surviving rows keep their original index labels.
    """
    loaded = pd.read_csv(path_to_file)
    print ("Shape of base training File = ", loaded.shape)
    # Same order as before: de-duplicate, then discard rows with missing values
    cleaned = loaded.drop_duplicates().dropna()
    print("Shape of base training data after cleaning = ", cleaned.shape)
    return cleaned

# Load and clean the training CSV; later cells look up 'question1' and
# 'question2' in this frame by row label.
df_train = read_data("input/train.csv")

Shape of base training File =  (404290, 6)
Shape of base training data after cleaning =  (404288, 6)


In [19]:
## SET UP TEST INDICES TO SELECT RANDOM ROWS

TEST_SAMPLES = 100

# Sample row LABELS from the cleaned frame's own index (with replacement, as
# before). The previous randint(0, df_train.shape[0]) was inclusive at the top
# end and drew from positions, so it could return a label that is past the end
# or was removed during cleaning; the later label-based lookup would then
# raise KeyError.
# The draw is unseeded, so each run samples a different set of rows.
indices = np.random.choice(df_train.index.values, size=TEST_SAMPLES)

In [21]:
# Score every sampled question pair with both tree kernels and print the
# questions followed by the normalised and raw score of each kernel.
for ind in indices:

    # .at is the label-based scalar accessor. DataFrame.get_value was
    # deprecated in pandas 0.21 and removed in 1.0, so it fails on current
    # pandas. Each question is also read once and reused below.
    question_1 = df_train.at[ind, 'question1']
    question_2 = df_train.at[ind, 'question2']

    tree_1 = tb.tree()
    tree_2 = tb.tree()

    toks_1 = _getNLPToks_(question_1)
    toks_2 = _getNLPToks_(question_2)

    # Generate a tree structure
    tb._generateTree_(toks_1, tree_1)
    tb._generateTree_(toks_2, tree_2)

    # Flip the trees
    tb._flipTree_(tree_1)
    tb._flipTree_(tree_2)

    # NOTE(review): the trailing numeric arguments are kernel parameters whose
    # meaning is defined in TreeKernel — confirm there before tuning.
    (scoreRaw_cd, scoreNorm_cd) = tk._CollinsDuffy_(tree_1, tree_2, 0.8, 1, 1)
    (scoreRaw_pt, scoreNorm_pt) = tk._MoschittiPT_(tree_1, tree_2, 0.8, 0.4, 1)
    print("%s\n%s" % (question_1, question_2))
    print("Collins-Duffy | Norm: %f | Raw: %f" % (scoreNorm_cd, scoreRaw_cd))
    print("Moschitti | Norm: %f | Raw: %f\n" % (scoreNorm_pt, scoreRaw_pt))

Why does one bake a cake?
How do I bake a cake without an oven?
Collins-Duffy | Norm: 0.408248 | Raw: 2.400000
Moschitti | Norm: 0.588981 | Raw: 5.216000

What are some of the coolest "facts" about the Marvel/DC universe and characters?
What are some mind blowing facts about DC comics?
Collins-Duffy | Norm: 0.445435 | Raw: 4.000000
Moschitti | Norm: 0.704011 | Raw: 13.728000

Can a man lose his virginity by masturbating? If yes how?
How did you lose your virginity as a man?
Collins-Duffy | Norm: 0.462910 | Raw: 2.400000
Moschitti | Norm: 0.550585 | Raw: 4.192000

What is the best way to turn off my iPod touch?
How do you turn off an iPod?
Collins-Duffy | Norm: 0.267261 | Raw: 1.600000
Moschitti | Norm: 0.460889 | Raw: 4.160000

What are some mind blowing technology tools that most people don't know?
What are some mind-blowing inventions tools that most people don't know about?
Collins-Duffy | Norm: 0.789352 | Raw: 7.200000
Moschitti | Norm: 0.853602 | Raw: 12.576000

What are examples 