In [10]:
import pandas as pd
import numpy as np
from pycorenlp import StanfordCoreNLP 
import re, time, bisect, math
import TreeKernel as tk
import TreeBuild as tb

In [9]:
from gensim.models.keyedvectors import KeyedVectors
# Load the pre-trained GoogleNews word2vec embeddings from disk.
word_vectors = KeyedVectors.load_word2vec_format(
    './w2v_model/GoogleNews-vectors-negative300.bin',
    binary=True,  # the file is in the C binary format
)

In [11]:
# Client for the Stanford CoreNLP server; the URL points at port 9000 on the local machine.
nlp = StanfordCoreNLP('http://localhost:9000')

In [12]:
def _getNLPToks_(rawSentence):
    """Parse a sentence with CoreNLP and return its parse tree as a list of lines.

    The text is sent to the module-level ``nlp`` client with the
    ``tokenize,ssplit,pos,parse`` annotators and JSON output. The ``parse``
    string of the first returned sentence is split on newlines.

    Note:
        Only the first sentence of the response is used. If the server splits
        the input into several sentences, the remaining ones are ignored.

    Args:
        rawSentence: Text to annotate.

    Returns:
        list of str: The lines of the first sentence's ``parse`` string.

    Raises:
        ValueError: If the response is not a decoded JSON object or contains
            no sentences.
    """
    output = nlp.annotate(rawSentence, properties={
        'annotators': 'tokenize,ssplit,pos,parse',
        'outputFormat': 'json',
    })

    # The client can hand back the undecoded response text (e.g. a server
    # error or timeout message) instead of a dict; fail with a clear message
    # rather than an opaque "string indices must be integers" TypeError.
    if not isinstance(output, dict):
        raise ValueError(
            "CoreNLP did not return JSON for %r: %r" % (rawSentence, output)
        )

    sentences = output.get('sentences')
    if not sentences:
        raise ValueError("CoreNLP returned no sentences for %r" % (rawSentence,))

    return sentences[0]['parse'].split("\n")

In [12]:
def _w2vSimilarity_(word1, word2, modelPtr):
    return modelPtr.similarity(word1, word2)

## Test: Tree Generation and Collins-Duffy Kernel

In [13]:
# Sentence pair used for the Collins-Duffy kernel check.
# (Alternative short pair: "I tom?" / "I, Claudius?")
test_b1 = "Why do we enjoy parties in the scope of the universe"
test_b2 = "What are college parties like?"

# All timings printed below are cumulative from this point.
start_time = time.time()

# One empty tree container per sentence.
tree_b1, tree_b2 = tb.tree(), tb.tree()

# Step 1: raw parse lines from Stanford CoreNLP.
toks_b1 = _getNLPToks_(test_b1)
toks_b2 = _getNLPToks_(test_b2)
print("--- %s seconds ---" % (time.time() - start_time))  # tokenize

# Step 2: fill each tree from its parse lines.
for _toks, _tree in ((toks_b1, tree_b1), (toks_b2, tree_b2)):
    tb._generateTree_(_toks, _tree)
print("--- %s seconds ---" % (time.time() - start_time))  # generate tree

# Step 3: flip both trees in place.
for _tree in (tree_b1, tree_b2):
    tb._flipTree_(_tree)
print("--- %s seconds ---" % (time.time() - start_time))  # flip tree

# Step 4: score the pair with the Collins-Duffy kernel.
rawScore, normScore = tk._CollinsDuffy_(tree_b1, tree_b2, 0.8, 1, 0)
print("--- %s seconds ---" % (time.time() - start_time))  # tree kernel
print("Raw Score: %s" % (rawScore))
print("Norm Score: %s" % (normScore))

--- 0.09436917304992676 seconds ---
--- 0.09468698501586914 seconds ---
--- 0.0948190689086914 seconds ---
--- 0.0969841480255127 seconds ---
Raw Score: 0.8
Norm Score: 0.124034734589


## Test: Tree Generation and Moschitti PT Kernel

In [24]:
# Sentence pair used for the Moschitti PT kernel check.
# (Alternative short pair: "I tom?" / "I, Claudius?")
test_b1 = "Why do we enjoy parties in the scope of the universe"
test_b2 = "What are college parties like?"

# All timings printed below are cumulative from this point.
start_time = time.time()

# One empty tree container per sentence.
tree_b1, tree_b2 = tb.tree(), tb.tree()

# Step 1: raw parse lines from Stanford CoreNLP.
toks_b1 = _getNLPToks_(test_b1)
toks_b2 = _getNLPToks_(test_b2)
print("--- %s seconds ---" % (time.time() - start_time))  # tokenize

# Step 2: fill each tree from its parse lines.
for _toks, _tree in ((toks_b1, tree_b1), (toks_b2, tree_b2)):
    tb._generateTree_(_toks, _tree)
print("--- %s seconds ---" % (time.time() - start_time))  # generate tree

# Step 3: flip both trees in place.
for _tree in (tree_b1, tree_b2):
    tb._flipTree_(_tree)

# Dump every (node id, node record) pair of the first tree for inspection.
for node_entry in tree_b1.items():
    print(node_entry)

print("--- %s seconds ---" % (time.time() - start_time))  # flip tree

# Step 4: score the pair with the Moschitti partial-tree kernel.
rawScore, normScore = tk._MoschittiPT_(tree_b1, tree_b2, 0.8, 0.4, 1)
print("--- %s seconds ---" % (time.time() - start_time))  # tree kernel

print("Raw Score: %s" % (rawScore))
print("Norm Score: %s" % (normScore))

--- 0.04182696342468262 seconds ---
--- 0.042407989501953125 seconds ---
(0, {'indent': 0, 'curid': 0, 'childrenTok': 'SBARQ', 'parid': -1, 'posOrTok': 'ROOT', 'children': [1]})
(1, {'indent': 2, 'curid': 1, 'childrenTok': ['WHADVP', 'SQ'], 'parid': 0, 'posOrTok': 'SBARQ', 'children': [2, 5]})
(2, {'indent': 4, 'curid': 2, 'childrenTok': ['WRB'], 'parid': 1, 'posOrTok': 'WHADVP', 'children': [3]})
(3, {'indent': 6, 'curid': 3, 'childrenTok': ['Why'], 'parid': 2, 'posOrTok': 'WRB', 'children': [4]})
(4, {'indent': 6, 'curid': 4, 'childrenTok': [], 'parid': 3, 'posOrTok': 'Why', 'children': []})
(5, {'indent': 4, 'curid': 5, 'childrenTok': ['VBP', 'NP', 'VP'], 'parid': 1, 'posOrTok': 'SQ', 'children': [6, 8, 11]})
(6, {'indent': 6, 'curid': 6, 'childrenTok': ['do'], 'parid': 5, 'posOrTok': 'VBP', 'children': [7]})
(7, {'indent': 6, 'curid': 7, 'childrenTok': [], 'parid': 6, 'posOrTok': 'do', 'children': []})
(8, {'indent': 6, 'curid': 8, 'childrenTok': ['PRP'], 'parid': 5, 'posOrTok': 'N

## Test: Tree Kernels on Training Data

In [15]:
from random import randint

def read_data(path_to_file):
    """Read a CSV file and return it without duplicate or incomplete rows.

    The shape is printed before and after cleaning. Exact duplicate rows are
    removed first, then every row containing a missing value. The surviving
    rows keep their original index labels (the index is not reset).

    Args:
        path_to_file: Path of the CSV file to load.

    Returns:
        pandas.DataFrame: The cleaned data.
    """
    raw = pd.read_csv(path_to_file)
    print ("Shape of base training File = ", raw.shape)

    # Deduplicate, then discard rows with missing values.
    cleaned = raw.drop_duplicates().dropna()
    print("Shape of base training data after cleaning = ", cleaned.shape)
    return cleaned

# Load and clean the training set; the path is relative to the working directory.
df_train = read_data("input/train.csv")

Shape of base training File =  (404290, 6)
Shape of base training data after cleaning =  (404288, 6)


In [16]:
## SET UP TEST INDICES TO SELECT RANDOM ROWS

TEST_SAMPLES = 100

# randint() includes BOTH endpoints, so the upper bound must be the last valid
# position (row_count - 1), not row_count.
# The drawn positions are mapped through df_train.index because read_data()
# drops rows without resetting the index: the labels have gaps, so a raw
# integer is not guaranteed to be a valid label for label-based lookups.
row_count = df_train.shape[0]
indices = np.array(
    [df_train.index[randint(0, row_count - 1)] for _ in range(TEST_SAMPLES)]
)

In [24]:
# Score randomly selected question pairs with the Collins-Duffy kernel for
# several values of lambda.
start_time = time.time()

for ind in indices:

    # Look up both questions once per row. `.at` is the label-based scalar
    # accessor that replaces DataFrame.get_value (removed in pandas 1.0).
    question_1 = df_train.at[ind, 'question1']
    question_2 = df_train.at[ind, 'question2']

    tree_1 = tb.tree()
    tree_2 = tb.tree()

    toks_1 = _getNLPToks_(question_1)
    toks_2 = _getNLPToks_(question_2)

    # Generate a tree structure
    tb._generateTree_(toks_1, tree_1)
    tb._generateTree_(toks_2, tree_2)

    # Flip the trees
    tb._flipTree_(tree_1)
    tb._flipTree_(tree_2)

    print("%s\n%s" % (question_1, question_2))

    for lamb in [1, 0.8, 0.4, 0.2, 0.1]:
        (scoreRaw_cd, scoreNorm_cd) = tk._CollinsDuffy_(tree_1, tree_2, lamb, 1, 1)
        # "%.3f" (three decimals) instead of "%3f", which only set a minimum
        # width of 3 and therefore had no visible effect.
        print("Collins-Duffy | Norm: %f | Raw: %f | Lambda: %.3f" % (scoreNorm_cd, scoreRaw_cd, lamb))

print("--- %s seconds ---" % (time.time() - start_time)) # tree kernel


21 years old male.Gynecomastia patient.Smoke 3–4 ciggarettes a day.Have to go through a surgery.Safe in current circumstances?
Would I put a 1-3 year old cat under too much stress if I moved to a new house five or six months after adopting it?
Collins-Duffy | Norm: 0.219265 | Raw: 5.000000 | Lambda: 1.000000
Collins-Duffy | Norm: 0.219265 | Raw: 4.000000 | Lambda: 0.800000
Collins-Duffy | Norm: 0.219265 | Raw: 2.000000 | Lambda: 0.400000
Collins-Duffy | Norm: 0.219265 | Raw: 1.000000 | Lambda: 0.200000
Collins-Duffy | Norm: 0.219265 | Raw: 0.500000 | Lambda: 0.100000
Which companies hire the students who have less than 60%?
Does MDI Gurgaon have sectional cut offs?
Collins-Duffy | Norm: 0.119523 | Raw: 1.000000 | Lambda: 1.000000
Collins-Duffy | Norm: 0.119523 | Raw: 0.800000 | Lambda: 0.800000
Collins-Duffy | Norm: 0.119523 | Raw: 0.400000 | Lambda: 0.400000
Collins-Duffy | Norm: 0.119523 | Raw: 0.200000 | Lambda: 0.200000
Collins-Duffy | Norm: 0.119523 | Raw: 0.100000 | Lambda: 0.100

In [25]:
# Inspect the node dictionary currently bound to tree_1.
tree_1

defaultdict(<function TreeBuild.tree>,
            {0: {'children': [1],
              'childrenTok': 'SBARQ',
              'curid': 0,
              'indent': 0,
              'parid': -1,
              'posOrTok': 'ROOT'},
             1: {'children': [2, 5],
              'childrenTok': ['WHNP', 'SQ'],
              'curid': 1,
              'indent': 2,
              'parid': 0,
              'posOrTok': 'SBARQ'},
             2: {'children': [3],
              'childrenTok': ['WP'],
              'curid': 2,
              'indent': 4,
              'parid': 1,
              'posOrTok': 'WHNP'},
             3: {'children': [4],
              'childrenTok': ['What'],
              'curid': 3,
              'indent': 6,
              'parid': 2,
              'posOrTok': 'WP'},
             4: {'children': [],
              'childrenTok': [],
              'curid': 4,
              'indent': 6,
              'parid': 3,
              'posOrTok': 'What'},
             5: {'children'

In [77]:
import pickle

# with open('filename.pickle', 'wb') as handle:
#     pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('filename.pickle', 'rb') as handle:
#     b = pickle.load(handle)

# print a == b

import json, os
# Serialize the flipped parse trees of the first rows of df_train to
# 'testoutput', one pickle record ({'q1': tree, 'q2': tree}) per question pair.
N_PICKLE_ROWS = 100

# Open with 'wb' rather than 'ab': in append mode every re-run of this cell
# added another batch of records after the old ones, so anything reading the
# file from the start kept seeing stale data.
with open('testoutput', 'wb') as fout:
    for _, row in df_train.head(N_PICKLE_ROWS).iterrows():

        tree_1 = tb.tree()
        tree_2 = tb.tree()

        toks_1 = _getNLPToks_(row['question1'])
        toks_2 = _getNLPToks_(row['question2'])

        # Generate a tree structure
        tb._generateTree_(toks_1, tree_1)
        tb._generateTree_(toks_2, tree_2)

        # Flip the trees
        tb._flipTree_(tree_1)
        tb._flipTree_(tree_2)

        tree_pair = {'q1': tree_1, 'q2': tree_2}

        # Records are written back to back; read them with repeated pickle.load calls.
        pickle.dump(tree_pair, fout, protocol=pickle.HIGHEST_PROTOCOL)


In [78]:
# Read back the first two records stored in 'testoutput' and print node 10 of
# each record's 'q1' tree. Each pickle.load call consumes exactly one record.
with open('testoutput', 'rb') as fin:
    a = pickle.load(fin)
    print(a['q1'][10])

    b = pickle.load(fin)
    print(b['q1'][10])

{'children': [11], 'indent': 10, 'childrenTok': ['the'], 'posOrTok': 'DT', 'curid': 10, 'parid': 9}
{'children': [11], 'indent': 10, 'childrenTok': ['the'], 'posOrTok': 'DT', 'curid': 10, 'parid': 9}


In [79]:
# Show the whole first record that was read back from 'testoutput'.
print(a)

{'q1': defaultdict(<function tree at 0x10fa7a620>, {0: {'children': [1], 'indent': 0, 'childrenTok': 'SBARQ', 'posOrTok': 'ROOT', 'curid': 0, 'parid': -1}, 1: {'children': [2, 5], 'indent': 2, 'childrenTok': ['WHNP', 'SQ'], 'posOrTok': 'SBARQ', 'curid': 1, 'parid': 0}, 2: {'children': [3], 'indent': 4, 'childrenTok': ['WP'], 'posOrTok': 'WHNP', 'curid': 2, 'parid': 1}, 3: {'children': [4], 'indent': 6, 'childrenTok': ['What'], 'posOrTok': 'WP', 'curid': 3, 'parid': 2}, 4: {'children': [], 'indent': 6, 'childrenTok': [], 'posOrTok': 'What', 'curid': 4, 'parid': 3}, 5: {'children': [6, 8], 'indent': 4, 'childrenTok': ['VBZ', 'NP'], 'posOrTok': 'SQ', 'curid': 5, 'parid': 1}, 6: {'children': [7], 'indent': 6, 'childrenTok': ['is'], 'posOrTok': 'VBZ', 'curid': 6, 'parid': 5}, 7: {'children': [], 'indent': 6, 'childrenTok': [], 'posOrTok': 'is', 'curid': 7, 'parid': 6}, 8: {'children': [9, 14, 22], 'indent': 6, 'childrenTok': ['NP', 'PP', 'S'], 'posOrTok': 'NP', 'curid': 8, 'parid': 5}, 9: {

In [80]:
# Show the whole second record that was read back from 'testoutput'.
print(b)

{'q1': defaultdict(<function tree at 0x10fa7a620>, {0: {'children': [1], 'indent': 0, 'childrenTok': 'SBARQ', 'posOrTok': 'ROOT', 'curid': 0, 'parid': -1}, 1: {'children': [2, 5], 'indent': 2, 'childrenTok': ['WHNP', 'SQ'], 'posOrTok': 'SBARQ', 'curid': 1, 'parid': 0}, 2: {'children': [3], 'indent': 4, 'childrenTok': ['WP'], 'posOrTok': 'WHNP', 'curid': 2, 'parid': 1}, 3: {'children': [4], 'indent': 6, 'childrenTok': ['What'], 'posOrTok': 'WP', 'curid': 3, 'parid': 2}, 4: {'children': [], 'indent': 6, 'childrenTok': [], 'posOrTok': 'What', 'curid': 4, 'parid': 3}, 5: {'children': [6, 8], 'indent': 4, 'childrenTok': ['VBZ', 'NP'], 'posOrTok': 'SQ', 'curid': 5, 'parid': 1}, 6: {'children': [7], 'indent': 6, 'childrenTok': ['is'], 'posOrTok': 'VBZ', 'curid': 6, 'parid': 5}, 7: {'children': [], 'indent': 6, 'childrenTok': [], 'posOrTok': 'is', 'curid': 7, 'parid': 6}, 8: {'children': [9, 14], 'indent': 6, 'childrenTok': ['NP', 'PP'], 'posOrTok': 'NP', 'curid': 8, 'parid': 5}, 9: {'children