In [3]:
import bisect
import gzip
import json
import math
import pickle
import re
import time
import unicodedata

import numpy as np
import pandas as pd
from pycorenlp import StanfordCoreNLP

import TreeBuild as tb
import TreeKernel as tk

In [4]:
# Client handle for the Stanford CoreNLP server expected at localhost:9000;
# _getNLPToks_ sends every sentence through this object's annotate() method.
nlp = StanfordCoreNLP('http://localhost:9000')

In [41]:
def _getNLPToks_(rawSentence):
    """Annotate a sentence with Stanford CoreNLP and return its first parse.

    Parameters
    ----------
    rawSentence : str
        Text sent to the module-level ``nlp`` client.

    Returns
    -------
    dict
        ``'deps'``  -- ``basicDependencies`` of the first sentence,
        ``'toks'``  -- ``tokens`` of the first sentence,
        ``'parse'`` -- the first sentence's ``parse`` string split on newlines.
        Only ``sentences[0]`` of the response is used; any further sentences
        found in the input are ignored.

    Raises
    ------
    ValueError
        If the response is still a plain string (i.e. not decoded JSON) and
        the sentence has no ``%`` left to strip, so retrying cannot help.
    """
    properties = {
        'annotators': 'tokenize,ssplit,pos,parse,ner,depparse',
        'outputFormat': 'json'
    }

    try:
        output = nlp.annotate(rawSentence, properties=properties)
    except UnicodeDecodeError:
        print("HERE")
        # Retry with an ASCII-only rendering of the text: decompose accented
        # characters (NFKD) and drop whatever cannot be encoded as ASCII.
        sentence = (unicodedata.normalize('NFKD', rawSentence)
                    .encode('ascii', 'ignore')
                    .decode('ascii'))
        output = nlp.annotate(sentence, properties=properties)

    if isinstance(output, str):
        # A string here means the response was not turned into a dict.
        # Stripping '%' is the only recovery attempted; without a '%' in the
        # sentence the retry would be identical and would recurse forever.
        if "%" not in rawSentence:
            raise ValueError(
                "CoreNLP returned a non-JSON response: %r" % output[:200])
        print("Error Processing Row. Attempt to replace %")
        return _getNLPToks_(rawSentence.replace("%", ""))

    first_sentence = output['sentences'][0]
    return {'deps': first_sentence['basicDependencies'],
            'toks': first_sentence['tokens'],
            'parse': first_sentence['parse'].split("\n")}


## Test: Tree Generation and Collins-Duffy Kernel

In [42]:
# Smoke test: build parse trees for two sample questions and score them with
# the Collins-Duffy tree kernel, printing the elapsed time after each stage.
test_b1 = "Why do we enjoy parties in the scope of the universe"
# NOTE(review): the text carries no embedded double-quote characters -- confirm
# that this matches the source row the question was copied from.
test_b2 = "Men don't respond to words; they respond to no contact. True or false?"

# All timings printed below are cumulative, measured from this point.
start_time = time.time()

tree_b1 = tb.tree()
tree_b2 = tb.tree()

# Generate raw tokens using Stanford Core NLP
toks_b1 = _getNLPToks_(test_b1)
toks_b2 = _getNLPToks_(test_b2)
print("--- %s seconds ---" % (time.time() - start_time)) # tokenize

# Generate a tree structure from the parse lines (fills tree_b1 / tree_b2)
tb._generateTree_(toks_b1['parse'], tree_b1)
tb._generateTree_(toks_b2['parse'], tree_b2)

print("--- %s seconds ---" % (time.time() - start_time)) # generate tree

# Flip the trees
tb._flipTree_(tree_b1)
tb._flipTree_(tree_b2)

print("--- %s seconds ---" % (time.time() - start_time)) # flip tree

# NOTE(review): 0.4 and 1 are passed straight to the kernel; their meaning is
# defined in TreeKernel -- confirm there. The trailing 0 matches the call that
# the sampling loop in this notebook labels "ST-Score".
(rawScore, normScore) = tk._CollinsDuffy_(tree_b1, tree_b2, 0.4, 1, 0)
print("--- %s seconds ---" % (time.time() - start_time)) # tree kernel
print("Raw Score: %s" % (rawScore))
print("Norm Score: %s" % (normScore))

--- 0.07348895072937012 seconds ---
--- 0.07387304306030273 seconds ---
--- 0.07401585578918457 seconds ---
--- 0.07808399200439453 seconds ---
Raw Score: 0.4
Norm Score: 0.0689390255423


## Test: Tree Kernels on Training Data

In [43]:
from random import randint

def read_data(path_to_file):
    """Load a CSV file and return it without duplicate or incomplete rows.

    The shape of the frame is printed once right after loading and once
    after cleaning.

    Parameters
    ----------
    path_to_file : path accepted by ``pandas.read_csv``

    Returns
    -------
    pandas.DataFrame
        The loaded rows with exact duplicates removed and every row that
        contains a missing value dropped. The original index labels are kept.
    """
    raw = pd.read_csv(path_to_file)
    print ("Shape of base training File = ", raw.shape)
    # Duplicates go first, then any row that still has a missing value
    cleaned = raw.drop_duplicates().dropna()
    print("Shape of base training data after cleaning = ", cleaned.shape)
    return cleaned

# NOTE(review): the frame is named df_train but is loaded from input/test.csv
# -- confirm which file is intended.
df_train = read_data("input/test.csv")

Shape of base training File =  (2345796, 3)
Shape of base training data after cleaning =  (2345790, 3)


In [44]:
## SET UP TEST INDICES TO SELECT RANDOM ROWS

TEST_SAMPLES = 100
# Draw TEST_SAMPLES row numbers in [0, number_of_rows). np.random.randint
# excludes the upper bound; random.randint(0, n) includes n, which is one past
# the last valid row.
indices = np.random.randint(0, df_train.shape[0], size=TEST_SAMPLES)

In [45]:
# `indices` holds row numbers, so rows are selected by position. The frame's
# index labels have gaps after drop_duplicates()/dropna(), so a label-based
# lookup could miss. (`.ix` no longer exists in current pandas.)
df_sample = df_train.iloc[indices]

# For every sampled question pair: parse both questions, build and flip their
# trees, then print the pair followed by its ST and SST kernel scores.
for row_label, row in df_sample.iterrows():

    tree_1 = tb.tree()
    tree_2 = tb.tree()

    all_1 = _getNLPToks_(row['question1'])
    all_2 = _getNLPToks_(row['question2'])

    toks_1 = all_1['parse']
    toks_2 = all_2['parse']

    # Generate a tree structure
    tb._generateTree_(toks_1, tree_1)
    tb._generateTree_(toks_2, tree_2)

    # Flip the trees
    tb._flipTree_(tree_1)
    tb._flipTree_(tree_2)

    # ST-Score
    (rs_st, ns_st) = tk._CollinsDuffy_(tree_1, tree_2, 0.8, 1, 0)
    # SST-Score
    (rs_sst, ns_sst) = tk._CollinsDuffy_(tree_1, tree_2, 0.8, 1, 1)

    print(row['question1'])
    print(row['question2'])
    print("ST Norm Score: %f" % ns_st)
    print("SST Norm Score: %f" % ns_sst)
    print("ST Raw Score: %f" % rs_st)
    print("SST Raw Score: %f" % rs_sst)

Why did Eratosthenes assume that the light rays were parallel and that the well was pointing to the centre of the Earth?
How can we measure the size of the earth through viewing a ship sailing off?
ST Norm Score: 0.292483
SST Norm Score: 0.000394
ST Raw Score: 7.200000
SST Raw Score: 23.354880
Where is the difference between TM (trademark), R (registered), and C (copyright)?
How do opinion trademark a name?
ST Norm Score: 0.082699
SST Norm Score: 0.010735
ST Raw Score: 0.800000
SST Raw Score: 5.792000
I'm 32 and I feel like a complete failure. I just work from home but my real passion make digital marketing & performing. What should I do?
What does it feel like to have home flooded?
ST Norm Score: 0.063918
SST Norm Score: 0.003139
ST Raw Score: 0.800000
SST Raw Score: 2.400000
GOI bans 500 and booking rupee currency notes, what will be the implications of it on Indian economy?
What will be the stories in the Indian economy of sudden 500 & 1000 rupee notes withdrawal of by the governmen

In [None]:
# Read every object pickled back-to-back in the gzip file and print "Empty"
# for each one that equals an empty dict.
# SECURITY: pickle.load can execute arbitrary code -- only open files that
# were produced by a trusted source.
with gzip.open('input/stanfordData_train1.nlp', 'rb') as handle:
    while True:
        try:
            tmp = pickle.load(handle)
        except EOFError:
            # End of stream: every pickled object has been read
            break
        if tmp == {}:
            print("Empty")

In [None]:
# Look at one specific row by its index label and run it through the parser.
# `.loc` is the label-based accessor; `.ix` no longer exists in current pandas.
debug_question = df_train.loc[212574, 'question1']
print(debug_question)
_getNLPToks_(debug_question)

In [8]:
# Display the rows whose index labels are 1 and 5 (`.loc` replaces the
# removed `.ix` accessor for label-based selection).
df_train.loc[[1, 5]]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1


In [31]:
# Scratch value: a string made only of '%' characters, the character that
# _getNLPToks_ strips before retrying.
a = "%%%%%"

In [32]:
# Removes every '%' from `a`; for the all-'%' string this leaves ''.
a.replace("%","")

''