In [1]:
import pandas as pd
import numpy as np
from pycorenlp import StanfordCoreNLP 
import re, time, bisect, math
import json
import TreeKernel as tk
import TreeBuild as tb
import pickle

In [2]:
nlp = StanfordCoreNLP('http://localhost:9000')

In [3]:
def _getNLPToks_(rawSentence):
    try:
        output = nlp.annotate(rawSentence, properties={
            'annotators': 'tokenize,ssplit,pos,parse,depparse',
            'outputFormat': 'json'
        })
    except UnicodeDecodeError:
        print("HERE")
        sentence = unidecode(rawSentence)
        output = nlp.annotate(sentence, properties={
            'annotators': 'tokenize,ssplit,pos,parse,depparse',
            'outputFormat': 'json'
        })
    if (isinstance(output, str)):
        print(output)
        output = json.loads(output) # Convert str output to dict
        
    dependencies = output['sentences'][0]['basicDependencies']
    tokens = output['sentences'][0]['tokens']
    parse = output['sentences'][0]['parse'].split("\n")
    
    return {'deps':dependencies,
            'toks':tokens, 
            'parse':parse}


## Test: Tree Generation and Collins-Duffy Kernel

In [4]:
test_b1 = "Why do we enjoy parties in the scope of the universe"
test_b2 = "What are college parties like?"

# test_b1 = "I tom?"
# test_b2 = "I, Claudius?"
start_time = time.time()

tree_b1 = tb.tree()
tree_b2 = tb.tree()

# Generate raw tokens using Stanford Core NLP

toks_b1 = _getNLPToks_(test_b1)
toks_b2 = _getNLPToks_(test_b2)
print("--- %s seconds ---" % (time.time() - start_time)) # tokenize

# Generate a tree structure
tb._generateTree_(toks_b1, tree_b1)
tb._generateTree_(toks_b2, tree_b2)

print("--- %s seconds ---" % (time.time() - start_time)) # generate tree

# Flip the trees
tb._flipTree_(tree_b1)
tb._flipTree_(tree_b2)

print("--- %s seconds ---" % (time.time() - start_time)) # flip tree

(rawScore, normScore) = tk._CollinsDuffy_(tree_b1, tree_b2, 0.8, 1, 0)
print("--- %s seconds ---" % (time.time() - start_time)) # tree kerne
print("Raw Score: %s" % (rawScore))
print("Norm Score: %s" % (normScore))

--- 29.943864822387695 seconds ---


TypeError: unhashable type: 'slice'

## Test: Tree Kernels on Training Data

In [5]:
from random import randint

def read_data(path_to_file):
    df = pd.read_csv(path_to_file)
    print ("Shape of base training File = ", df.shape)
    # Remove missing values and duplicates from training data
    df.drop_duplicates(inplace=True)
    df.dropna(inplace=True)
    print("Shape of base training data after cleaning = ", df.shape)
    return df

df_train = read_data("input/train.csv")

Shape of base training File =  (404290, 6)
Shape of base training data after cleaning =  (404288, 6)


In [16]:
## SET UP TEST INDICES TO SELECT RANDOM ROWS

TEST_SAMPLES = 100
indices = np.arange(TEST_SAMPLES)
for i in range(len(indices)):
    indices[i] = randint(0,df_train.shape[0])

In [18]:
df_sample = df_train.ix[indices]

for row in df_sample.iterrows():

    tree_1 = tb.tree()
    tree_2 = tb.tree()

    all_1 = _getNLPToks_(row[1]['question1'])
    all_2 = _getNLPToks_(row[1]['question2'])

    toks_1 = all_1['parse']
    toks_2 = all_2['parse']
    
    # Generate a tree structure
    tb._generateTree_(toks_1, tree_1)
    tb._generateTree_(toks_2, tree_2)

    # Flip the trees
    tb._flipTree_(tree_1)
    tb._flipTree_(tree_2)

    # ST-Score
    (rs_st, ns_st) = tk._CollinsDuffy_(tree_1, tree_2, 0.8, 1, 0)
    # SST-Score
    (rs_sst, ns_sst) = tk._CollinsDuffy_(tree_1, tree_2, 0.8, 1, 1)

    print(row[1]['question1'])
    print(row[1]['question2'])
    print("ST Norm Score: %f" % ns_st)
    print("SST Norm Score: %f" % ns_sst)
    print("ST Raw Score: %f" % rs_st)
    print("SST Raw Score: %f" % rs_sst)

What is the expected cutoff for KVPY 2016 SA -stream 2016?
How was the KVPY SA 2016? What is the expected cutoff?
ST Norm Score: 0.566139
SST Norm Score: 0.566139
ST Raw Score: 4.000000
SST Raw Score: 4.000000
How do I publish YouTube video on wimp?
Is police corruption endemic in the UK?
ST Norm Score: 0.000000
SST Norm Score: 0.000000
ST Raw Score: 0.000000
SST Raw Score: 0.000000
How do I create a linkedin profile?
How can I improve my LinkedIn Profile?
ST Norm Score: 0.377964
SST Norm Score: 0.377964
ST Raw Score: 1.600000
SST Raw Score: 1.600000
Why do we get a runny nose when we eat something spicy?
Why do we get runny noses when we eat spicy food?
ST Norm Score: 0.741249
SST Norm Score: 0.741249
ST Raw Score: 8.000000
SST Raw Score: 8.000000
Why do my condoms always break?
Why do condoms break?
ST Norm Score: 0.750000
SST Norm Score: 0.750000
ST Raw Score: 2.400000
SST Raw Score: 2.400000
Why am I always the first to initiate a conversation?
How can I be the one to initiate the 

In [None]:
with open('input/stanfordData_train1.nlp', 'rb') as handle:
    try:
        while True:
            tmp = pickle.load(handle)
            if tmp == {}:
                print("Empty")
    except EOFError:
        pass

In [None]:
print(df_train.ix[212574]['question1'])
_getNLPToks_(df_train.ix[212574]['question1'])

In [8]:
df_train.ix[[1,5]]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
