In [2]:

from __future__ import  division
get_ipython().magic('matplotlib inline')
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', 500)
import seaborn as sns
sns.set_style("dark")
plt.rcParams['figure.figsize'] = 16, 12
from tqdm import tqdm, tqdm_notebook
import itertools as it
import pickle
import glob
import os
import string

from scipy import sparse

import nltk
import spacy

from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, make_scorer
from sklearn.decomposition import TruncatedSVD

from scipy.optimize import minimize

# import eli5
from IPython.display import display

import xgboost as xgb


# In[5]:

# Load the train and test CSVs, forcing both question columns to be read as
# strings. The builtin `str` is used instead of the `np.str` alias, which was
# deprecated in NumPy 1.20 and removed in 1.24 (it was always the same object).
_question_dtypes = {
    'question1': str,
    'question2': str
}
df_train = pd.read_csv('train.csv', dtype=_question_dtypes)
df_train['test_id'] = -1
df_test = pd.read_csv('test.csv', dtype=_question_dtypes)
# Fill the columns that only exist in train.csv with the sentinel -1 so that
# both frames share the same columns before they are stacked.
df_test['id'] = -1
df_test['qid1'] = -1
df_test['qid2'] = -1
df_test['is_duplicate'] = -1

df = pd.concat([df_train, df_test])
# Missing questions become empty strings so the .str accessors below work.
df['question1'] = df['question1'].fillna('')
df['question2'] = df['question2'].fillna('')
# One running row number over train + test becomes the index ('uid').
df['uid'] = np.arange(df.shape[0])
df = df.set_index(['uid'])
print(df.dtypes)
del df_train, df_test


# In[6]:

# Positional row numbers of each subset. Because 'uid' is a 0..n-1 range,
# these positions are also valid labels for df.loc.
_row_ids = df['id'].values
_labels = df['is_duplicate'].values
ix_train = np.flatnonzero(_row_ids >= 0)    # rows that came from train.csv
ix_test = np.flatnonzero(_row_ids == -1)    # rows that came from test.csv
ix_is_dup = np.flatnonzero(_labels == 1)    # labelled duplicates
ix_not_dup = np.flatnonzero(_labels == 0)   # labelled non-duplicates


# In[7]:

# Class balance over the labelled rows only (test rows carry the sentinel -1).
labelled_targets = df.loc[df['is_duplicate'] >= 0, 'is_duplicate']
print(labelled_targets.value_counts(normalize=True))


# In[8]:

# Character length of each question (stored as float32) and the absolute gap
# between the two lengths.
for _text_col, _len_col in (('question1', 'len1'), ('question2', 'len2')):
    df[_len_col] = df[_text_col].str.len().astype(np.float32)
df['abs_diff_len1_len2'] = (df['len1'] - df['len2']).abs()


# In[11]:

# Inspect how large the length gap gets among duplicates vs. non-duplicates
# and derive the cap applied below: max among duplicates + 2 standard deviations.
# Rows and column are selected in a single .loc call so only the needed column
# is materialised.
_dup_abs_diff = df.loc[ix_is_dup, 'abs_diff_len1_len2']
max_in_dup = _dup_abs_diff.max()
print('Maximum among duplicates:       ', max_in_dup)
max_in_not_dups = df.loc[ix_not_dup, 'abs_diff_len1_len2'].max()
print('Maximum among non-duplicates:     ', max_in_not_dups)
# This value is a COUNT of training rows above the duplicate maximum; the
# original label wrongly repeated "Maximum among non-duplicates".
print('Number of lines greater than threshold: ', (df.loc[ix_train, 'abs_diff_len1_len2'] > max_in_dup).sum())
std_in_dups = _dup_abs_diff.std()
print('Standard deviation in duplicates:', std_in_dups)
replace_value = max_in_dup + 2*std_in_dups
print('New value:              ', replace_value)


# In[12]:

def _cap_abs_diff(value):
    """Return `value` unchanged when it is below `replace_value`, else `replace_value`."""
    if value < replace_value:
        return value
    return replace_value


# Cap the length gap at the threshold computed above.
df['abs_diff_len1_len2'] = df['abs_diff_len1_len2'].apply(_cap_abs_diff)


# In[13]:

# Log-scaled version of the capped gap; the +1 keeps a gap of 0 finite.
_capped_gap_plus_one = df['abs_diff_len1_len2'] + 1
df['log_abs_diff_len1_len2'] = np.log(_capped_gap_plus_one)


# In[14]:

def _length_or_one(length):
    """Return `length` when it is positive, otherwise 1.0 (avoids a 0 in the ratio)."""
    if length > 0.0:
        return length
    return 1.0


# Ratio of the two question lengths, with empty questions treated as length 1.
_len1_safe = df['len1'].apply(_length_or_one)
_len2_safe = df['len2'].apply(_length_or_one)
df['ratio_len1_len2'] = _len1_safe / _len2_safe


# In[15]:


# Same inspection as for the absolute length gap, now for the length ratio:
# derive the cap as max among duplicates + 2 standard deviations.
_dup_ratio = df.loc[ix_is_dup, 'ratio_len1_len2']
max_in_dup = _dup_ratio.max()
print('Maximum among duplicates:       ', max_in_dup)
max_in_not_dups = df.loc[ix_not_dup, 'ratio_len1_len2'].max()
print('Maximum among non-duplicates:      ', max_in_not_dups)
print('Number of lines greater than threshold: ', (df.loc[ix_train, 'ratio_len1_len2'] > max_in_dup).sum())
std_in_dups = _dup_ratio.std()
# This value is the standard deviation; the original label wrongly repeated
# "Number of lines greater than threshold".
print('Standard deviation in duplicates:', std_in_dups)
replace_value = max_in_dup + 2*std_in_dups
print('New value:               ', replace_value)


# In[16]:

def _cap_ratio(value):
    """Return `value` unchanged when it is below `replace_value`, else `replace_value`."""
    if value < replace_value:
        return value
    return replace_value


# Cap the length ratio at the threshold computed above.
df['ratio_len1_len2'] = df['ratio_len1_len2'].apply(_cap_ratio)


# In[17]:

# Log-scaled version of the capped ratio.
_capped_ratio_plus_one = df['ratio_len1_len2'] + 1
df['log_ratio_len1_len2'] = np.log(_capped_ratio_plus_one)


# In[18]:




id               int64
is_duplicate     int64
qid1             int64
qid2             int64
question1       object
question2       object
test_id          int64
dtype: object
0    0.630802
1    0.369198
Name: is_duplicate, dtype: float64
Maximum among duplicates:        196.0
Maximum among non-duplicates:      1080.0
Maximum among non-duplicates:  394
Standard deviation in duplicates: 14.3821
New value:               224.764198303
Maximum among duplicates:        6.66666666667
Maximum among non-duplicates:       117.0
Number of lines greater than threshold:  152
Number of lines greater than threshold:  0.376106045115
New value:                7.4188787569


In [2]:

%%time
# if os.path.isfile('./../tmp/cv_char.pkl') and os.path.isfile('./../tmp/ch_freq.pkl'):
# Load the pre-fitted character n-gram vectorizer and its per-n-gram frequency
# vector from the working directory (the commented-out code around this block
# shows how both were originally produced).
# NOTE(review): pickle.load can execute arbitrary code — only use pickle files
# you created yourself.
with open('cv_char.pkl', 'rb') as f:
    cv_char = pickle.load(f)
with open('ch_freq.pkl', 'rb') as f:
    ch_freq = pickle.load(f)
# else:
#     cv_char = CountVectorizer(ngram_range=(1, 3), analyzer='char')
#     ch_freq = np.array(cv_char.fit_transform(df['question1'].tolist() + df['question2'].tolist()).sum(axis=0))[0, :]
#     with open('cv_char.pkl', 'wb') as f:
#         pickle.dump(cv_char, f)
#     with open('ch_freq.pkl', 'wb') as f:
#         pickle.dump(ch_freq, f)

CPU times: user 2.17 s, sys: 17.6 ms, total: 2.18 s
Wall time: 2.22 s


In [3]:


# Split the fitted character vocabulary by n-gram length and keep, for each
# length, the sorted column indices of those n-grams in the count matrices.
# `list(...)` is required around `.values()`: on Python 3 it is a dict view,
# which np.sort cannot sort (on Python 2 it is already a list, so behaviour
# there is unchanged).
unigrams = {k: v for (k, v) in cv_char.vocabulary_.items() if len(k) == 1}
ix_unigrams = np.sort(list(unigrams.values()))
print('Unigrams:', len(unigrams))
bigrams = {k: v for (k, v) in cv_char.vocabulary_.items() if len(k) == 2}
ix_bigrams = np.sort(list(bigrams.values()))
print('Bigrams: ', len(bigrams))
trigrams = {k: v for (k, v) in cv_char.vocabulary_.items() if len(k) == 3}
ix_trigrams = np.sort(list(trigrams.values()))
print('Trigrams:', len(trigrams))


('Unigrams:', 1779)
('Bigrams: ', 10722)
('Trigrams:', 74806)


In [4]:

%%time
def save_sparse_csr(fname, sm):
    """Write the CSR components of `sm` to an uncompressed .npz archive.

    The archive holds four arrays named 'data', 'indices', 'indptr' and
    'shape', which is exactly what `load_sparse_csr` reads back.
    """
    csr_parts = dict(
        (part, getattr(sm, part))
        for part in ('data', 'indices', 'indptr', 'shape'))
    np.savez(fname, **csr_parts)

def load_sparse_csr(fname):
    """Rebuild a scipy CSR matrix from an archive written by `save_sparse_csr`.

    Parameters
    ----------
    fname : path of the .npz archive containing the arrays 'data', 'indices',
        'indptr' and 'shape'.

    Returns
    -------
    scipy.sparse.csr_matrix with the stored contents and shape.
    """
    # np.load returns an NpzFile that keeps the archive open; the context
    # manager closes it once the arrays have been read into memory.
    with np.load(fname) as loader:
        return sparse.csr_matrix((
            loader['data'],
            loader['indices'],
            loader['indptr']),
            shape=tuple(loader['shape']))

# if os.path.isfile('m_q1.npz') and os.path.isfile('m_q2.npz'):
# Character n-gram count matrices for question1 / question2, read from archives
# in the working directory (the commented-out branch below shows how they are
# built with cv_char.transform when no archive exists).
m_q1 = load_sparse_csr('m_q1.npz')
m_q2 = load_sparse_csr('m_q2.npz')
# else:
#     m_q1 = cv_char.transform(df['question1'].values)
#     m_q2 = cv_char.transform(df['question2'].values)
#     save_sparse_csr('m_q1.npz', m_q1)
#     save_sparse_csr('m_q2.npz', m_q2)

CPU times: user 22 s, sys: 14.3 s, total: 36.2 s
Wall time: 1min 35s


In [5]:
# Set-based Jaccard similarity on character unigrams:
# |chars in both questions| / |chars in either question|.
_q1_has = m_q1[:, ix_unigrams] > 0
_q2_has = m_q2[:, ix_unigrams] > 0
v_num = _q1_has.minimum(_q2_has).sum(axis=1)
v_den = _q1_has.maximum(_q2_has).sum(axis=1)
# Guard against 0/0 when neither question contains any unigram (e.g. both are
# empty); same convention as the trigram features, giving a score of 0.
v_den[np.where(v_den == 0)] = 1
v_score = np.array(v_num.flatten()).astype(np.float32)[0, :]/np.array(v_den.flatten())[0, :]

df['unigram_jaccard'] = v_score
# plot_real_feature('unigram_jaccard')

In [6]:

# Count-based overlap on character unigrams: every occurrence of a character
# counts, not just its presence. The denominator is the combined unigram count
# of both questions, so two identical questions score 0.5.
_q1_counts = m_q1[:, ix_unigrams]
_q2_counts = m_q2[:, ix_unigrams]
v_num = _q1_counts.minimum(_q2_counts).sum(axis=1)
v_den = _q1_counts.sum(axis=1) + _q2_counts.sum(axis=1)
# Guard against 0/0 when neither question contains any unigram; same
# convention as the trigram features, giving a score of 0.
v_den[np.where(v_den == 0)] = 1
v_score = np.array(v_num.flatten()).astype(np.float32)[0, :]/np.array(v_den.flatten())[0, :]

df['unigram_all_jaccard'] = v_score

In [7]:
# Count-based (multiset) Jaccard on character unigrams: every occurrence of a
# character counts, and the overlap is normalised by the element-wise maximum
# of the two count vectors rather than by their sum.
_q1_counts = m_q1[:, ix_unigrams]
_q2_counts = m_q2[:, ix_unigrams]
v_num = _q1_counts.minimum(_q2_counts).sum(axis=1)
v_den = _q1_counts.maximum(_q2_counts).sum(axis=1)
# Guard against 0/0 when neither question contains any unigram; same
# convention as the trigram features, giving a score of 0.
v_den[np.where(v_den == 0)] = 1
v_score = np.array(v_num.flatten()).astype(np.float32)[0, :]/np.array(v_den.flatten())[0, :]

df['unigram_all_jaccard_max'] = v_score

In [8]:
# Set-based Jaccard similarity on character bigrams:
# |bigrams in both questions| / |bigrams in either question|.
_q1_has = m_q1[:, ix_bigrams] > 0
_q2_has = m_q2[:, ix_bigrams] > 0
v_num = _q1_has.minimum(_q2_has).sum(axis=1)
v_den = _q1_has.maximum(_q2_has).sum(axis=1)
# Guard against 0/0 when neither question contains any bigram (e.g. empty or
# one-character questions); same convention as the trigram features.
v_den[np.where(v_den == 0)] = 1
v_score = np.array(v_num.flatten()).astype(np.float32)[0, :]/np.array(v_den.flatten())[0, :]

df['bigram_jaccard'] = v_score

In [9]:
# Count-based overlap on character bigrams: every occurrence of a bigram
# counts, not just its presence. The denominator is the combined bigram count
# of both questions, so two identical questions score 0.5.
_q1_counts = m_q1[:, ix_bigrams]
_q2_counts = m_q2[:, ix_bigrams]
v_num = _q1_counts.minimum(_q2_counts).sum(axis=1)
v_den = _q1_counts.sum(axis=1) + _q2_counts.sum(axis=1)
# Guard against 0/0 when neither question contains any bigram; same
# convention as the trigram features, giving a score of 0.
v_den[np.where(v_den == 0)] = 1
v_score = np.array(v_num.flatten()).astype(np.float32)[0, :]/np.array(v_den.flatten())[0, :]

df['bigram_all_jaccard'] = v_score

In [11]:
# Preview the first rows with the features built so far.
# NOTE(review): the recorded output already lists 'bigram_all_jaccard_max',
# which is only assigned in the next cell (execution count 10) — the cells were
# run out of order; re-run top to bottom to refresh this output.
df.head()

Unnamed: 0_level_0,id,is_duplicate,qid1,qid2,question1,question2,test_id,len1,len2,abs_diff_len1_len2,log_abs_diff_len1_len2,ratio_len1_len2,log_ratio_len1_len2,unigram_jaccard,unigram_all_jaccard,unigram_all_jaccard_max,bigram_jaccard,bigram_all_jaccard,bigram_all_jaccard_max
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,-1,66.0,57.0,9.0,2.302585,1.157895,0.769133,1.0,0.463415,0.863636,0.886364,0.454545,0.833333
1,1,0,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,-1,51.0,88.0,37.0,3.637586,0.579545,0.457137,0.703704,0.352518,0.544444,0.5,0.306569,0.442105
2,2,0,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,-1,73.0,59.0,14.0,2.70805,1.237288,0.805264,0.75,0.386364,0.62963,0.425,0.284615,0.397849
3,3,0,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,-1,50.0,65.0,15.0,2.772589,0.769231,0.570545,0.466667,0.286957,0.402439,0.081395,0.061947,0.066038
4,4,0,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,-1,76.0,39.0,37.0,3.637586,1.948718,1.08137,0.653846,0.313043,0.455696,0.289474,0.20354,0.255556


In [10]:
# Count-based (multiset) Jaccard on character bigrams: overlap normalised by
# the element-wise maximum of the two count vectors.
_q1_counts = m_q1[:, ix_bigrams]
_q2_counts = m_q2[:, ix_bigrams]
v_num = _q1_counts.minimum(_q2_counts).sum(axis=1)
v_den = _q1_counts.maximum(_q2_counts).sum(axis=1)
# Guard against 0/0 when neither question contains any bigram; same
# convention as the trigram features, giving a score of 0.
v_den[np.where(v_den == 0)] = 1
v_score = np.array(v_num.flatten()).astype(np.float32)[0, :]/np.array(v_den.flatten())[0, :]

df['bigram_all_jaccard_max'] = v_score

In [12]:
# From here on m_q1 / m_q2 hold only the trigram columns.
# NOTE: this rebinding is not idempotent — re-running the cell would slice the
# already-reduced matrices again.
m_q1 = m_q1[:, ix_trigrams]
m_q2 = m_q2[:, ix_trigrams]
# In [45]:
# Set-based Jaccard similarity on character trigrams.
_q1_has = m_q1 > 0
_q2_has = m_q2 > 0
v_num = _q1_has.minimum(_q2_has).sum(axis=1)
v_den = _q1_has.maximum(_q2_has).sum(axis=1)
# Rows without any trigram get denominator 1, i.e. a score of 0 instead of 0/0.
v_den[v_den == 0] = 1
v_score = np.asarray(v_num).ravel().astype(np.float32) / np.asarray(v_den).ravel()

df['trigram_jaccard'] = v_score

In [13]:
# Count-based trigram overlap, normalised by the combined trigram count of
# both questions.
_shared_counts = m_q1.minimum(m_q2)
v_num = _shared_counts.sum(axis=1)
v_den = m_q1.sum(axis=1) + m_q2.sum(axis=1)
# Rows without any trigram get denominator 1, i.e. a score of 0 instead of 0/0.
v_den[v_den == 0] = 1
v_score = np.asarray(v_num).ravel().astype(np.float32) / np.asarray(v_den).ravel()

df['trigram_all_jaccard'] = v_score

In [14]:
# Count-based (multiset) trigram Jaccard: overlap normalised by the
# element-wise maximum of the two count vectors.
_shared_counts = m_q1.minimum(m_q2)
_union_counts = m_q1.maximum(m_q2)
v_num = _shared_counts.sum(axis=1)
v_den = _union_counts.sum(axis=1)
# Rows without any trigram get denominator 1, i.e. a score of 0 instead of 0/0.
v_den[v_den == 0] = 1
v_score = np.asarray(v_num).ravel().astype(np.float32) / np.asarray(v_den).ravel()

df['trigram_all_jaccard_max'] = v_score

In [15]:
# Cosine distance between the L2-normalised TF-IDF trigram vectors of each pair.
tft = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
tft = tft.fit(sparse.vstack((m_q1, m_q2)))
m_q1_tf, m_q2_tf = tft.transform(m_q1), tft.transform(m_q2)

# Row-wise dot product and the product of the two row norms.
v_num = np.array(m_q1_tf.multiply(m_q2_tf).sum(axis=1))[:, 0]
_q1_norm = np.array(np.sqrt(m_q1_tf.multiply(m_q1_tf).sum(axis=1)))[:, 0]
_q2_norm = np.array(np.sqrt(m_q2_tf.multiply(m_q2_tf).sum(axis=1)))[:, 0]
v_den = _q1_norm * _q2_norm

# Where the norm product is zero, numerator and denominator are both set to 1,
# which yields a distance of 0 for that row.
_zero_norm = v_den == 0
v_num[_zero_norm] = 1
v_den[_zero_norm] = 1

v_score = 1 - v_num/v_den

df['trigram_tfidf_cosine'] = v_score

In [16]:

# Euclidean distance between the L2-normalised TF-IDF trigram vectors.
tft = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
tft = tft.fit(sparse.vstack((m_q1, m_q2)))
m_q1_tf = tft.transform(m_q1)
m_q2_tf = tft.transform(m_q2)

# Row-wise sqrt(sum((q1 - q2)^2)) computed on the sparse difference.
_tfidf_gap = m_q1_tf - m_q2_tf
_squared_gap = np.array(_tfidf_gap.multiply(_tfidf_gap).sum(axis=1))[:, 0]
v_score = np.sqrt(_squared_gap)

df['trigram_tfidf_l2_euclidean'] = v_score

In [17]:
# Euclidean distance between the L1-normalised TF-IDF trigram vectors.
tft = TfidfTransformer(norm='l1', use_idf=True, smooth_idf=True, sublinear_tf=False)
tft = tft.fit(sparse.vstack((m_q1, m_q2)))
m_q1_tf = tft.transform(m_q1)
m_q2_tf = tft.transform(m_q2)

# Row-wise sqrt(sum((q1 - q2)^2)) computed on the sparse difference.
_tfidf_gap = m_q1_tf - m_q2_tf
_squared_gap = np.array(_tfidf_gap.multiply(_tfidf_gap).sum(axis=1))[:, 0]
v_score = np.sqrt(_squared_gap)

df['trigram_tfidf_l1_euclidean'] = v_score

In [18]:
# Euclidean distance between the L2-normalised term-frequency trigram vectors
# (use_idf=False, so no IDF weighting is applied).
tft = TfidfTransformer(norm='l2', use_idf=False, smooth_idf=True, sublinear_tf=False)
tft = tft.fit(sparse.vstack((m_q1, m_q2)))
m_q1_tf = tft.transform(m_q1)
m_q2_tf = tft.transform(m_q2)

# Row-wise sqrt(sum((q1 - q2)^2)) computed on the sparse difference.
_tf_gap = m_q1_tf - m_q2_tf
_squared_gap = np.array(_tf_gap.multiply(_tf_gap).sum(axis=1))[:, 0]
v_score = np.sqrt(_squared_gap)

df['trigram_tf_l2_euclidean'] = v_score

In [3]:
# spacy is already imported in the first cell; this repeated import only makes
# the cell runnable on its own.
import spacy

# Load the small English spaCy pipeline used for lemmatisation below.
nlp = spacy.load('en_core_web_sm')
# Commented-out experiment (uses the Python 2 `unicode` builtin):
# df.head()['question1'].apply(lambda s: ' '.join([c.lemma_ for c in nlp(unicode(s)) if c.lemma_  != '?']))



    Only loading the 'en' tokenizer.



In [20]:
# Tokens dropped from the lemma lists: every ASCII punctuation character plus
# a few multi-character symbols.
SYMBOLS = set(' '.join(string.punctuation).split(' ') + ['...', '“', '”', '\'ve'])

# Cache lookup order. The original cell checked/read './../tmp/bow_lemma.pkl'
# but wrote 'bow_lemma.pkl' to the working directory, so a cache it had just
# written was never found on the next run. Both locations are now honoured
# (the './../tmp' copy still wins when present) and the write target is
# unchanged.
_bow_cache_candidates = ['./../tmp/bow_lemma.pkl', 'bow_lemma.pkl']
_bow_cache = next((p for p in _bow_cache_candidates if os.path.isfile(p)), None)

if _bow_cache is None:
    def _lemmatize(texts):
        """Return one list of lemmas per text, skipping lemmas found in SYMBOLS."""
        return [[c.lemma_ for c in doc if c.lemma_ not in SYMBOLS]
                for doc in nlp.pipe(texts, n_threads=16, batch_size=10000)]

    # NOTE(review): `.str.decode('utf-8')` presumably relies on the questions
    # being byte strings (Python 2) — confirm before running on Python 3.
    q1 = _lemmatize(df['question1'].str.decode('utf-8'))
    q2 = _lemmatize(df['question2'].str.decode('utf-8'))

    with open('bow_lemma.pkl', 'wb') as f:
        pickle.dump({
            'q1': q1,
            'q2': q2
        }, f)
else:
    # NOTE(review): pickle.load can execute arbitrary code — only load caches
    # written by this notebook.
    with open(_bow_cache, 'rb') as f:
        tmp = pickle.load(f)
        q1 = tmp['q1']
        q2 = tmp['q2']
        del(tmp)

In [21]:
%%time
# Word-level vectorizer over the lemmatised questions plus the per-word corpus
# frequency vector, with an on-disk cache.
# The original cell checked/read the cache under './../tmp' but wrote it to the
# working directory, so a freshly written cache was never found. Both
# directories are now searched ('./../tmp' first); the write target is unchanged.
_word_cache_dir = next(
    (d for d in ('./../tmp', '.')
     if os.path.isfile(os.path.join(d, 'cv_word_lemma.pkl'))
     and os.path.isfile(os.path.join(d, 'wl_freq.pkl'))),
    None)

if _word_cache_dir is not None:
    # NOTE(review): pickle.load can execute arbitrary code — only load caches
    # written by this notebook.
    with open(os.path.join(_word_cache_dir, 'cv_word_lemma.pkl'), 'rb') as f:
        cv_words = pickle.load(f)
    with open(os.path.join(_word_cache_dir, 'wl_freq.pkl'), 'rb') as f:
        w_freq = pickle.load(f)
else:
    cv_words = CountVectorizer(ngram_range=(1, 1), analyzer='word')
    # Fit on all lemmatised questions (q1 followed by q2) and sum the counts
    # per vocabulary entry.
    w_freq = np.array(cv_words.fit_transform(
        [' '.join(s) for s in q1] + [' '.join(s) for s in q2]).sum(axis=0))[0, :]
    with open('cv_word_lemma.pkl', 'wb') as f:
        pickle.dump(cv_words, f)
    with open('wl_freq.pkl', 'wb') as f:
        pickle.dump(w_freq, f)

CPU times: user 31.4 s, sys: 361 ms, total: 31.7 s
Wall time: 31.7 s


In [22]:
# Lemma count matrices for question1 / question2, with an on-disk cache.
# NOTE: this rebinds m_q1 / m_q2, which held the trigram matrices until here.
# The original cell checked/read the cache under './../tmp' but saved it to the
# working directory, so a freshly saved cache was never found. Both
# directories are now searched ('./../tmp' first); the save target is unchanged.
_wl_cache_dir = next(
    (d for d in ('./../tmp', '.')
     if os.path.isfile(os.path.join(d, 'm_q1_wl.npz'))
     and os.path.isfile(os.path.join(d, 'm_q2_wl.npz'))),
    None)

if _wl_cache_dir is not None:
    m_q1 = load_sparse_csr(os.path.join(_wl_cache_dir, 'm_q1_wl.npz'))
    m_q2 = load_sparse_csr(os.path.join(_wl_cache_dir, 'm_q2_wl.npz'))
else:
    m_q1 = cv_words.transform([' '.join(s) for s in q1])
    m_q2 = cv_words.transform([' '.join(s) for s in q2])
    save_sparse_csr('m_q1_wl.npz', m_q1)
    save_sparse_csr('m_q2_wl.npz', m_q2)

In [23]:
# Cosine distance between the L2-normalised TF-IDF lemma vectors of each pair.
tft = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
tft = tft.fit(sparse.vstack((m_q1, m_q2)))
m_q1_tf, m_q2_tf = tft.transform(m_q1), tft.transform(m_q2)

# Row-wise dot product and the product of the two row norms.
v_num = np.array(m_q1_tf.multiply(m_q2_tf).sum(axis=1))[:, 0]
_q1_norm = np.array(np.sqrt(m_q1_tf.multiply(m_q1_tf).sum(axis=1)))[:, 0]
_q2_norm = np.array(np.sqrt(m_q2_tf.multiply(m_q2_tf).sum(axis=1)))[:, 0]
v_den = _q1_norm * _q2_norm

# Where the norm product is zero, numerator and denominator are both set to 1,
# which yields a distance of 0 for that row.
_zero_norm = v_den == 0
v_num[_zero_norm] = 1
v_den[_zero_norm] = 1

v_score = 1 - v_num/v_den

df['1wl_tfidf_cosine'] = v_score

In [24]:
# Euclidean distance between the L2-normalised TF-IDF lemma vectors.
tft = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
tft = tft.fit(sparse.vstack((m_q1, m_q2)))
m_q1_tf = tft.transform(m_q1)
m_q2_tf = tft.transform(m_q2)

# Row-wise sqrt(sum((q1 - q2)^2)) computed on the sparse difference.
_tfidf_gap = m_q1_tf - m_q2_tf
_squared_gap = np.array(_tfidf_gap.multiply(_tfidf_gap).sum(axis=1))[:, 0]
v_score = np.sqrt(_squared_gap)

df['1wl_tfidf_l2_euclidean'] = v_score

In [25]:

# Euclidean distance between the L2-normalised term-frequency lemma vectors
# (use_idf=False, so no IDF weighting is applied).
tft = TfidfTransformer(norm='l2', use_idf=False, smooth_idf=True, sublinear_tf=False)
tft = tft.fit(sparse.vstack((m_q1, m_q2)))
m_q1_tf = tft.transform(m_q1)
m_q2_tf = tft.transform(m_q2)

# Row-wise sqrt(sum((q1 - q2)^2)) computed on the sparse difference.
_tf_gap = m_q1_tf - m_q2_tf
_squared_gap = np.array(_tf_gap.multiply(_tf_gap).sum(axis=1))[:, 0]
v_score = np.sqrt(_squared_gap)

df['1wl_tf_l2_euclidean'] = v_score

In [7]:
# Split the combined frame back into its test and train parts.
# Test rows carry the sentinel is_duplicate == -1; train rows carry the
# sentinel test_id == -1 (both assigned when the CSVs were loaded).
df_test = df[df.is_duplicate < 0 ]

# df_train must be rebuilt here: the original name was deleted right after the
# concat, and the export cell below uses it (it raised NameError while this
# line was commented out).
df_train = df[df.test_id == -1 ]

In [8]:
# Preview the first test rows.
# NOTE(review): the recorded output lists only the length-based feature
# columns — presumably captured before the n-gram / TF-IDF cells were run;
# re-run top to bottom to refresh it.
df_test.head()

Unnamed: 0_level_0,id,is_duplicate,qid1,qid2,question1,question2,test_id,len1,len2,abs_diff_len1_len2,log_abs_diff_len1_len2,ratio_len1_len2,log_ratio_len1_len2
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
404290,-1,-1,-1,-1,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...,0,57.0,68.0,11.0,2.484907,0.838235,0.608806
404291,-1,-1,-1,-1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?,1,66.0,43.0,23.0,3.178054,1.534884,0.930148
404292,-1,-1,-1,-1,What but is the best way to send money from Ch...,What you send money to China?,2,60.0,29.0,31.0,3.465736,2.068966,1.121341
404293,-1,-1,-1,-1,Which food not emulsifiers?,What foods fibre?,3,27.0,17.0,10.0,2.397895,1.588235,0.950976
404294,-1,-1,-1,-1,"How ""aberystwyth"" start reading?",How their can I start reading?,4,32.0,30.0,2.0,1.098612,1.066667,0.725937


In [37]:
# Export the engineered features without the index.
# `iloc[:, 9:]` keeps everything from the 10th column on; per the recorded
# df.head() output that is 'abs_diff_len1_len2' onwards (ids, label, question
# text, len1 and len2 are dropped).
# NOTE(review): this is positional and depends on the column order — confirm.
df_test.iloc[:,9:].to_csv('russ_test.csv', index = False)

df_train.iloc[:,9:].to_csv('russ_train.csv', index = False)

In [38]:
# Same test feature columns as a plain numeric array, written as a header-less
# CSV with 5 decimal places per value.
test = np.array(df_test.iloc[:,9:])

np.savetxt('russ_alt.csv', test, delimiter=",", fmt='%.5f')