In [1]:
import re
import string
import pandas as pd
import numpy as np
import itertools
import HTMLParser
html_parser = HTMLParser.HTMLParser()
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from scipy.stats import skew, kurtosis
import json
from joblib import Parallel, delayed
import multiprocessing

from gensim.models import word2vec
from nltk.stem.porter import *
stemmer = PorterStemmer()
main_words = re.compile(r"\w+",re.I)
from nltk.corpus import stopwords
from gensim import corpora
#nltk.data.path



In [2]:
train_data = pd.read_csv('train.csv',sep=',')
test_data = pd.read_csv('test.csv',sep=',')

In [None]:
# Row Ids with empty string
# Train : question1 () , question2 (105780,201841 )
# Test : question1 (1046690,1461432 ) , question2 (379205,817520, 943911 , 1270024)

In [3]:
def apos_fn(row_ini):
    apost_ =  {"do not" : " don’t",  "will not" : " won’t",  "it’s" : " it is",  "can’t" : " can not",  "I’ll" : " I will ",  "I’ve" : " I have",  "you're" : " you are", 
            "didn't" : " did not",  "she's" : " she is",  "they're" : " they are",  "we're" : " we are",  "you've" : " you have",  "aren’t" : " are not",  "she'd" : " she would", 
            "let's" : " let us",  "we’ve" : " we have",  "couldn’t" : " could not",  "who’s" : " who is", "what's":" what is"}
    words = row_ini.split()
    final_row = [apost_[word] if word in apost_ else word for word in words]
    final_row = " ".join(final_row)
    return final_row

In [4]:
def remove_stop(row_ini, stw):
    words = set(map(lambda word : word.lower().strip(), row_ini.split(" ") ))
#    stop_words = list(["ax","dir","na","i","you","edu","s","t","m","can","lines","re","what","there","all","we","one","the","of","or","in","for","by","on","but","is","in","-","ã","a","not","with","as","was","if","they","are","this","and","it","have","from","at","my","be","by","not","that","to","div","may","please","ok","ltr","n","don","na","\u0081","a","an",",","from","com","org","like","likes","so"])
    words = [word for word in words if word not in stw]
    row_fin = " ".join(words)
    return row_fin

In [5]:
def row_clean(input_col):
    #removing html characters
    input_col = input_col.apply(lambda row: html_parser.unescape(row))
    # decoding
    input_col = input_col.apply(lambda row: row.decode("utf8").encode('ascii','ignore'))
    #digits
    input_col = input_col.apply(lambda row: re.sub("\d+", " ", row))
    #punctuation
    input_col = input_col.apply(lambda row: row.translate(None, string.punctuation) )
    #links
    input_col = input_col.apply(lambda row: re.sub(r'^https?:\/\/.*[\r\n]*', '', row, flags=re.MULTILINE) )    
    # proper word standards
    input_col = input_col.apply(lambda row: ''.join(''.join(s)[:2] for _, s in itertools.groupby(row)) )
    # to lower and removing stopwords
    stw = stopwords.words('english')
    input_col = input_col.apply(lambda row: remove_stop(row, stw))
    # attached words 
#    input_col = input_col.apply(lambda row: " ".join(re.findall('[A-Z][^A-Z]*', row)) )
    # removing apostrophe to proper words
    input_col = input_col.apply(lambda row: apos_fn(row))
        #mutliple space
    input_col = input_col.apply(lambda row: re.sub('\s+', ' ', row).strip() )
    
    return input_col

In [6]:
def fill_empty_rows(input_df):
    input_df.question1.fillna('',inplace=True)
    input_df['que1_clean'] = row_clean(input_df.question1)    
    input_df.question2.fillna('',inplace=True)
    input_df['que2_clean'] = row_clean(input_df.question2)
    return input_df

In [7]:
train_data = fill_empty_rows(train_data)
test_data = fill_empty_rows(test_data)

In [8]:
print train_data.columns
print " --- "
print test_data.columns
print " --- "
print train_data.shape
print " --- "
print test_data.shape

Index([u'id', u'qid1', u'qid2', u'question1', u'question2', u'is_duplicate',
       u'que1_clean', u'que2_clean'],
      dtype='object')
 --- 
Index([u'test_id', u'question1', u'question2', u'que1_clean', u'que2_clean'], dtype='object')
 --- 
(404290, 8)
 --- 
(2345796, 5)


In [9]:
def que_tokens(input_col):
    que_tokened = []
    input_col.apply(lambda row: que_tokened.append([stemmer.stem(i.lower()) for i in main_words.findall(row) ]) )
    return que_tokened

# Train
train_data['que1_clean_tokens'] = que_tokens(train_data.que1_clean)
train_data['que2_clean_tokens'] = que_tokens(train_data.que2_clean)
# Test
test_data['que1_clean_tokens'] = que_tokens(test_data.que1_clean)
test_data['que2_clean_tokens'] = que_tokens(test_data.que2_clean)

In [10]:
def question_freq(input_df):
    # Features with frequency of questions in their respective columns
    input_df['q1_freq_in_q1'] = input_df.groupby('question1')['question1'].transform('count')
    input_df['q2_freq_in_q2'] = input_df.groupby('question2')['question2'].transform('count')
    
    # Features with frequency of each of the question1 and question2 in both the columns of the input_df
    qid_1_2 = pd.DataFrame({'qid_count':input_df[['question1','question2']].stack().value_counts()})
    qid_1_2['question']= qid_1_2.index
    qid_1_2.reset_index(drop=True, inplace=True)

    input_df = pd.merge(input_df, qid_1_2, left_on=['question1'],right_on=['question'],how='left', right_index=False)
    input_df.drop('question',inplace=True, axis=1)
    input_df.rename(columns={'qid_count': 'q1_freq_in_q1q2'}, inplace=True)
    input_df = pd.merge(input_df, qid_1_2, left_on=['question2'],right_on=['question'],how='left', right_index=False)
    input_df.drop('question',inplace=True, axis=1)
    input_df.rename(columns={'qid_count': 'q2_freq_in_q1q2'}, inplace=True)
    
    return input_df

In [11]:
train_data = question_freq(train_data)
test_data = question_freq(test_data)

In [37]:
sentence_list = []
sentence_list = list(train_data.que1_clean_tokens)+list(train_data.que2_clean_tokens) + list(test_data.que1_clean_tokens)+list(test_data.que2_clean_tokens)

In [39]:
# word2vec parameters
num_features = 300    # Word-vector dimensionality                      
min_word_count = 1    # Min word count                        
num_workers = 6       # Threads
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# To get the logs of the word2vec model

In [2]:
#test_data.to_csv("test_w2v.csv", index=False, sep=',')
#train_data.to_csv("train_w2v.csv", index=False, sep=',')
test_data = pd.read_csv("test_w2v.csv", sep=',')
train_data = pd.read_csv("train_w2v.csv", sep=',')

In [3]:
train_data.drop(['qid1','qid2','question1','question2','que1_clean','que2_clean'], inplace=True, axis=1)
print train_data.columns

Index([u'id', u'is_duplicate', u'que1_clean_tokens', u'que2_clean_tokens',
       u'q1_freq_in_q1', u'q2_freq_in_q2', u'q1_freq_in_q1q2',
       u'q2_freq_in_q1q2'],
      dtype='object')


In [4]:
test_data.drop(['question1','question2','que1_clean','que2_clean'], inplace=True, axis=1)
print test_data.columns

Index([u'test_id', u'que1_clean_tokens', u'que2_clean_tokens',
       u'q1_freq_in_q1', u'q2_freq_in_q2', u'q1_freq_in_q1q2',
       u'q2_freq_in_q1q2'],
      dtype='object')


In [41]:
model_300_train_test = word2vec.Word2Vec(sentence_list, workers=num_workers, size=num_features, min_count = min_word_count, 
                          window = context, sample = downsampling)

model_name = "model_300_1min_10c_train_test"
model_300_train_test.save(model_name)

2017-05-18 22:23:17,226 : INFO : collecting all words and their counts
2017-05-18 22:23:17,226 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-05-18 22:23:17,267 : INFO : PROGRESS: at sentence #10000, processed 52328 words, keeping 8747 word types
2017-05-18 22:23:17,316 : INFO : PROGRESS: at sentence #20000, processed 104865 words, keeping 12493 word types
2017-05-18 22:23:17,365 : INFO : PROGRESS: at sentence #30000, processed 157308 words, keeping 15363 word types
2017-05-18 22:23:17,392 : INFO : PROGRESS: at sentence #40000, processed 209243 words, keeping 17649 word types
2017-05-18 22:23:17,418 : INFO : PROGRESS: at sentence #50000, processed 261791 words, keeping 19818 word types
2017-05-18 22:23:17,463 : INFO : PROGRESS: at sentence #60000, processed 314077 words, keeping 21770 word types
2017-05-18 22:23:17,513 : INFO : PROGRESS: at sentence #70000, processed 366410 words, keeping 23497 word types
2017-05-18 22:23:17,565 : INFO : PROGRESS: at se

In [5]:
#model_300_train_test = word2vec.Word2Vec.load('model_300_1min_10c_train_test')

In [6]:
def sen2Vec(words, model, set_index2word ):
    featureVec = np.zeros((300,), dtype="float32")  # Number of features is 300
    nwords = 0
    for word in words:
        if word in set_index2word :
            nwords = nwords+1
            featureVec = np.add(featureVec, model[word])
    if(nwords>0):
        featureVec = np.divide(featureVec, nwords)
    return featureVec

In [7]:
def create_similarity(model, input_df):
# Vector for sentence 1 : apply this or json.dumps
#    senvec_1 = sen2Vec(str(" ".join(row[colNames[8]])).split(), model=model)
    # Vector for sentence 2
#    senvec_2 = sen2Vec(str(" ".join(row[colNames[9]])).split(), model=model)
    set_index2word= set(model.wv.index2word)
    
    # Cosine similarity
    input_df['cosine_val']= input_df.apply(
        lambda row: float("{0:.2f}".format(1- cosine( sen2Vec(  json.dumps(row['que1_clean_tokens']) , model, set_index2word),
                               sen2Vec( json.dumps(row['que2_clean_tokens']), model, set_index2word) ) )), axis=1 )
    input_df['cosine_val'].fillna(0, inplace=True)
    
    # Cityblock similarity
    input_df['cityblock_val']= input_df.apply(
        lambda row: float("{0:.2f}".format( cityblock(sen2Vec( json.dumps(row['que1_clean_tokens']) , model, set_index2word),
                               sen2Vec( json.dumps(row['que2_clean_tokens']), model, set_index2word) ) )), axis=1 )
    input_df['cityblock_val'].fillna(0, inplace=True)
    
    # Jaccard similarity
    input_df['jaccard_val']= input_df.apply(
        lambda row: float("{0:.2f}".format( jaccard(sen2Vec( json.dumps(row['que1_clean_tokens']) , model, set_index2word),
                               sen2Vec( json.dumps(row['que2_clean_tokens']), model, set_index2word) ) )), axis=1 )
    input_df['jaccard_val'].fillna(0, inplace=True)
    
    # Canberra similarity
    input_df['canberra_val']= input_df.apply(
        lambda row: float("{0:.2f}".format( canberra(sen2Vec( json.dumps(row['que1_clean_tokens']) , model, set_index2word),
                               sen2Vec( json.dumps(row['que2_clean_tokens']), model, set_index2word) ) )), axis=1 )
    input_df['canberra_val'].fillna(0, inplace=True)
    
    # Euclidean similarity
    input_df['euclidean_val']= input_df.apply(
        lambda row: float("{0:.2f}".format( euclidean(sen2Vec( json.dumps(row['que1_clean_tokens']) , model, set_index2word),
                               sen2Vec( json.dumps(row['que2_clean_tokens']), model, set_index2word) ) )), axis=1 )
    input_df['euclidean_val'].fillna(0, inplace=True)
    
    # Minkowski similarity
    input_df['minkowski_val']= input_df.apply(
        lambda row: float("{0:.2f}".format( minkowski(sen2Vec( json.dumps(row['que1_clean_tokens']) , model, set_index2word),
                               sen2Vec( json.dumps(row['que2_clean_tokens']), model, set_index2word), 3) )), axis=1 )
    input_df['minkowski_val'].fillna(0, inplace=True)
    
    # Braycurtis similarity
    input_df['braycurtis_val']= input_df.apply(
        lambda row: float("{0:.2f}".format( braycurtis(sen2Vec( json.dumps(row['que1_clean_tokens']) , model, set_index2word),
                               sen2Vec( json.dumps(row['que2_clean_tokens']), model, set_index2word) ),2 )), axis=1 )
    input_df['braycurtis_val'].fillna(0, inplace=True)
    
    # Skewness
    input_df['skew_que1']= input_df.apply(
        lambda row: float("{0:.2f}".format( skew(sen2Vec( json.dumps(row['que1_clean_tokens']) , model, set_index2word) ) )), 
                            axis=1 )
    input_df['skew_que1'].fillna(0, inplace=True)
        
    input_df['skew_que2']= input_df.apply(
        lambda row: float("{0:.2f}".format( skew(sen2Vec( json.dumps(row['que2_clean_tokens']) , model, set_index2word) ) )),
                            axis=1 )
    input_df['skew_que2'].fillna(0, inplace=True)
    
    # Kurtosis
    input_df['kurt_que1']= input_df.apply(
        lambda row: float("{0:.2f}".format( kurtosis(sen2Vec( json.dumps(row['que1_clean_tokens']) , model, set_index2word) ))),
                            axis=1 )
    input_df['kurt_que1'].fillna(0, inplace=True)
        
    input_df['kurt_que2']= input_df.apply(
        lambda row: float("{0:.2f}".format( kurtosis(sen2Vec( json.dumps(row['que2_clean_tokens']) , model, set_index2word) ))),
                            axis=1 )
    input_df['kurt_que2'].fillna(0, inplace=True)
    
    return input_df

In [110]:
# Not used
def applyParallel(dfGrouped, func):
    retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(group) for name, group in dfGrouped)
    return pd.concat(retLst)

In [8]:
#train_data = applyParallel(train_data.groupby(train_data.index), create_que_vec)
#test_data = applyParallel(test_data.groupby(train_data.index), create_que_vec)

#aa = train_data.iloc[1:10000]
#aa = applyParallel(aa.groupby(aa.index), create_similarity)
#aa = create_similarity( aa)

In [9]:
train_data = create_similarity(model_300_train_test, train_data)
train_data.drop(['que1_clean_tokens', 'que2_clean_tokens'], inplace=True, axis=1)
train_data.to_csv("train_w2v_feats.csv", index=False, sep=',')
del train_data

In [10]:
test_data = create_similarity(model_300_train_test, test_data)
test_data.drop(['que1_clean_tokens', 'que2_clean_tokens'], inplace=True, axis=1)
test_data.to_csv("test_w2v_feats.csv", index=False, sep=',')
del test_data

In [11]:
# Breaking into multiple chunks as size big to upload
test_data = pd.read_csv("test_w2v_feats.csv", sep=',')

In [14]:
for i in [1,2,3]:
    vars()['test_w2v_feats'+str(i)] = test_data.iloc[((test_data.shape[0]/3)*(i-1)):((test_data.shape[0]/3)*i)]
    vars()['test_w2v_feats'+str(i)].to_csv('test_w2v_feats'+str(i)+'.csv', sep=',', index=False)
    del vars()['test_w2v_feats'+str(i)]

In [None]:
# Add TF-IDF Features

In [None]:
# +++++++++++++++++++++++++++++++++++++++
# CODE BELOW THIS CELL HAS NOT BEEN USED!
# PLEASE DO NOT TRY THE CODE BELOW!!!!!
# +++++++++++++++++++++++++++++++++++++++

In [None]:
# For word similarity
#word1 = 'sun'
#word2 = 'moon'
#cosine_similarity = numpy.dot(model[word1], model[word2])/(numpy.linalg.norm(model[word2])* numpy.linalg.norm(model[word1]))

In [None]:
"""def quora_dict_creation(input_df):    
    all_tokens = input_df.que1_clean_tokens.tolist()+input_df.que2_clean_tokens.tolist()
    final_dict = corpora.Dictionary(all_tokens)
    final_dict.filter_extremes(no_below=1, no_above=0.9)
    final_dict.compactify()
    return final_dict
quora_dict_ini = quora_dict_creation(aa)

def sent2vec(input_col, quora_dict):
    input_col_vector = [quora_dict.doc2bow(text) for text in input_col.tolist()]
    input_col_matrix = gensim.matutils.corpus2csc(input_col_vector, num_terms=len(quora_dict.token2id))    
    return input_col_matrix.transpose()

que1_clean_matrix = sent2vec(aa.que1_clean_tokens, quora_dict_ini)
que2_clean_matrix = sent2vec(aa.que2_clean_tokens, quora_dict_ini)"""

In [None]:
"""from sklearn.feature_extraction.text import TfidfVectorizer
aa = train_data.head(10)
corpus= aa.que1_clean
vectorizer = TfidfVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus)
idf = vectorizer.idf_
print dict(zip(vectorizer.get_feature_names(), idf))"""