In [36]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import Stemmer

import nltk, string
from nltk.corpus import stopwords
from nltk import word_tokenize
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV

from fuzzywuzzy import fuzz

from scipy.spatial.distance import cosine, euclidean

import gc

import seaborn as sns
sns.set_style("dark")
#plt.rcParams['figure.figsize'] = 16, 12
import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
import bcolz
def save_array(fname, arr): c=bcolz.carray(arr, rootdir=fname, mode='w'); c.flush()
def load_array(fname): return bcolz.open(fname)[:]

In [11]:
path = '/cinc/data/quora-question-pairs/'
save_path = '/cinc/data/quora-question-pairs/save_data_tf_idf/'
model_path = '/cinc/data/quora-question-pairs/save_data_tf_idf/model/'
#nltk.download()

In [12]:
stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

'''remove punctuation, lowercase, stem'''
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))


In [13]:
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

In [14]:
#def cosine_sim(text1, text2):
#    tfidf = vectorizer.fit_transform([text1, text2])
#    return ((tfidf * tfidf.T).A)[0,1]

In [15]:
# stem version cosine_sim
english_stemmer = Stemmer.Stemmer('en')

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))

stem_vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english', analyzer='word', ngram_range=(1,1))

def cosine_sim(text1, text2):
    tfidf = stem_vectorizer.fit_transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0,1]

In [16]:
print cosine_sim('a little bird', 'a little bird')
print cosine_sim('a little bird', 'a little bird chirps')
print cosine_sim('a little bird', 'a big dog barks')

1.0
0.709297266606
0.0


In [17]:
stops = set(stopwords.words("english"))
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
#    if stem_words:
#        text = text.split()
#        stemmer = SnowballStemmer('english')
#        stemmed_words = [stemmer.stem(word) for word in text]
#        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)


# read train text

In [10]:
#train_data = pd.read_csv(path + 'train/train-sample.csv')
train_data = pd.read_csv(path + 'train/train.csv')

In [11]:
nb_samples = train_data.shape[0]
train_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [10]:
def plot_real_feature(df, fname):
    fig = plt.figure()
    ax1 = plt.subplot2grid((3, 2), (0, 0), colspan=2)
    ax2 = plt.subplot2grid((3, 2), (1, 0), colspan=2)
    ax3 = plt.subplot2grid((3, 2), (2, 0))
    ax4 = plt.subplot2grid((3, 2), (2, 1))
    ax1.set_title('Distribution of %s' % fname, fontsize=20)
    sns.distplot(df.loc[ix_train][fname], 
                 bins=50, 
                 ax=ax1)    
    sns.distplot(df.loc[ix_is_dup][fname], 
                 bins=50, 
                 ax=ax2,
                 label='is dup')    
    sns.distplot(df.loc[ix_not_dup][fname], 
                 bins=50, 
                 ax=ax2,
                 label='not dup')
    ax2.legend(loc='upper right', prop={'size': 18})
    sns.boxplot(y=fname, 
                x='is_duplicate', 
                data=df.loc[ix_train], 
                ax=ax3)
    sns.violinplot(y=fname, 
                   x='is_duplicate', 
                   data=df.loc[ix_train], 
                   ax=ax4)
    plt.show()
    
#plot_real_feature(train_data, 'log_diff_len')

In [11]:
def ratio_len(text1, text2):
    r = 1.0 * (len(text1)+1) / (len(text2)+1)
    if r>1:
        r = 1/r
    return r

In [12]:
def process_data(df_data):
    print 'remove stop words (1)...'
    df_data['question1'] = df_data['question1'].apply(lambda x: text_to_wordlist(str(x)))
    print 'remove stop words (2)...'
    df_data['question2'] = df_data['question2'].apply(lambda x: text_to_wordlist(str(x)))

    print 'calculate length features...'
    df_data['diff_len'] = df_data.apply(lambda row: abs(len(row['question1']) - len(row['question2'])), axis=1)
    df_data['ratio_len'] = df_data.apply(lambda row: ratio_len(row['question1'], row['question2']), axis=1)
    df_data['log_diff_len'] = np.log(df_data['diff_len']+0.01)
    return df_data

def process_cosine_sims(df_data):
    cosine_sims = [None]*df_data.shape[0]
    for i in tqdm(range(df_data.shape[0])):
        try:
            cosine_sims[i] = cosine_sim(df_data['question1'][i], df_data['question2'][i])
        except ValueError:
            cosine_sims[i] = 0
    df_data['cos_sim'] = cosine_sims

    return df_data


In [6]:
def process_calculate_ngram_score(m_q1, m_q2, ix_ngrams, name):
    print ('calculating ngram score for ' + name)
    v_num = (m_q1[:, ix_ngrams] > 0).minimum((m_q2[:, ix_ngrams] > 0)).sum(axis=1)
    v_den = (m_q1[:, ix_ngrams] > 0).maximum((m_q2[:, ix_ngrams] > 0)).sum(axis=1)
    v_score_ngram_jaccard = np.array(v_num.flatten()).astype(np.float32)[0, :] / (np.array(v_den.flatten())[0, :] + 0.001)

    v_num = m_q1[:, ix_ngrams].minimum(m_q2[:, ix_ngrams]).sum(axis=1)
    v_den = m_q1[:, ix_ngrams].sum(axis=1) + m_q2[:, ix_ngrams].sum(axis=1)
    v_score_ngram_jaccard_all = np.array(v_num.flatten()).astype(np.float32)[0, :] / (np.array(v_den.flatten())[0, :] + 0.001)
    
    v_num = m_q1[:, ix_ngrams].minimum(m_q2[:, ix_ngrams]).sum(axis=1)
    v_den = m_q1[:, ix_ngrams].maximum(m_q2[:, ix_ngrams]).sum(axis=1)
    v_score_ngram_jaccard_max = np.array(v_num.flatten()).astype(np.float32)[0, :] / (np.array(v_den.flatten())[0, :] + 0.001)
    
    return (v_score_ngram_jaccard, v_score_ngram_jaccard_all, v_score_ngram_jaccard_max)



In [7]:
def process_calculate_ngram(df_data):
    print ('calculating ngrams ...')
    cv_char = CountVectorizer(ngram_range=(1, 3), analyzer='char')
    cv_char.fit_transform(df_data['question1'][0:10000].tolist() + df_data['question2'][0:10000].tolist()) \
    .sum(axis=0)[0, :].shape
    
    ch_freq = np.array(cv_char.fit_transform(df_data['question1'].values.astype('U').tolist()
                                         + df_data['question2'].values.astype('U').tolist()).sum(axis=0))[0, :]
    
    unigrams = dict([(k, v) for (k, v) in cv_char.vocabulary_.items() if len(k) == 1])
    ix_unigrams = np.sort(unigrams.values())
    print 'Unigrams:', len(unigrams)

    bigrams = dict([(k, v) for (k, v) in cv_char.vocabulary_.items() if len(k) == 2])
    ix_bigrams = np.sort(bigrams.values())
    print 'Bigrams: ', len(bigrams)

    trigrams = dict([(k, v) for (k, v) in cv_char.vocabulary_.items() if len(k) == 3])
    ix_trigrams = np.sort(trigrams.values())
    print 'Trigrams:', len(trigrams)
    
    print ('tranform q1 and q2...')
    m_q1 = cv_char.transform(df_data['question1'].values.astype('U'))
    m_q2 = cv_char.transform(df_data['question2'].values.astype('U'))
    
    print ('calculating  q1 and q2...')
    (jac, jac_all, jac_max) =  process_calculate_ngram_score(m_q1, m_q2, ix_unigrams, 'unigram')
    print (jac[0:5], jac_all[0:5], jac_max[0:5])
    df_data['unigram_jaccard'] = jac
    df_data['unigram_jaccard_all'] = jac_all
    df_data['unigram_jaccard_max'] = jac_max
    
    (jac, jac_all, jac_max) =  process_calculate_ngram_score(m_q1, m_q2, ix_bigrams, 'bigram')
    print (jac[0:5], jac_all[0:5], jac_max[0:5])
    df_data['bigram_jaccard'] = jac
    df_data['bigram_jaccard_all'] = jac_all
    df_data['bigram_jaccard_max'] = jac_max
    
    (jac, jac_all, jac_max) =  process_calculate_ngram_score(m_q1, m_q2, ix_trigrams, 'trigram')
    print (jac[0:5], jac_all[0:5], jac_max[0:5])
    df_data['trigram_jaccard'] = jac
    df_data['trigram_jaccard_all'] = jac_all
    df_data['trigram_jaccard_max'] = jac_max
    
    #return (ix_unigrams, ix_bigrams, ix_trigrams)
    return



In [14]:
#train_data['question1'] = train_data['question1'].apply(lambda x: text_to_wordlist(str(x)))
#train_data['question2'] = train_data['question2'].apply(lambda x: text_to_wordlist(str(x)))
#train_data.head()

#cosine_sims = [None]*train_data.shape[0]
#for i in range(train_data.shape[0]):
#    try:
#        cosine_sims[i] = cosine_sim(train_data['question1'][i], train_data['question2'][i])
#    except ValueError:
#        cosine_sims[i] = 0

#train_data['cos_sim'] = cosine_sims
#train_data['diff_len'] = train_data.apply(lambda row: abs(len(row['question1']) - len(row['question2'])), axis=1)
#train_data['ratio_len'] = train_data.apply(lambda row: ratio_len(row['question1'], row['question2']), axis=1)

In [14]:
train_data = process_data(train_data)

remove stop words (1)...
remove stop words (2)...
calculate length features...


In [17]:
train_data = process_cosine_sims(train_data)

100%|██████████| 404290/404290 [19:46<00:00, 340.71it/s]


In [None]:
process_calculate_ngram(train_data)

In [44]:
#train_data.head()

In [67]:
#train_data['question1'] = train_data['question1'].fillna('')
#train_data['question2'] = train_data['question2'].fillna('')

In [33]:
#train_data = pd.read_csv(save_path + 'train_data.csv')
save_array(save_path + 'train_data_cos_sim', list(train_data['cos_sim']))

In [24]:
# load back
cos_sims = load_array(save_path + 'train_data_cos_sim')
train_data['cos_sim'] = cos_sims

In [34]:
# save for later
#save_array(save_path + 'train_data', train_data)
train_data.to_csv(save_path + 'train_data.csv', index=False)

In [10]:
# load back
train_data = pd.read_csv(save_path + 'train_data.csv')
train_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,diff_len,ratio_len,cos_sim
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,9,0.865672,0.895532
1,1,3,4,what is the story of kohinoor koh - i - noor d...,what would happen if the indian government sto...,0,37,0.593407,0.474331
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0,14,0.810811,0.380873
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when math 23 ^ 24 math is d...,0,13,0.793651,0.0
4,4,9,10,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0,35,0.533333,0.206084


In [6]:
train_data.columns[6:]

Index([u'diff_len', u'ratio_len', u'cos_sim'], dtype='object')

# read test data

In [9]:
test_data = pd.read_csv(path + 'test/test.csv')
#test_data.index = test_data.index-1
#print test_data.shape
#test_data = test_data[0:1000]

print test_data.shape
test_data.head()

(2345796, 3)


Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [57]:
test_data = process_data(test_data)

remove stop words (1)...
remove stop words (2)...
calculate length features...


In [15]:
test_data = process_cosine_sims(test_data)

100%|██████████| 2345796/2345796 [1:23:06<00:00, 470.46it/s]


In [9]:
process_calculate_ngram(test_data)

calculating ngrams ...
Unigrams: 45
Bigrams:  1451
Trigrams: 20406
tranform q1 and q2...
calculating  q1 and q2...
calculating ngram score for unigram
(array([ 0.80948526,  0.77269215,  0.83328704,  0.61107716,  0.81244922]), array([ 0.39837074,  0.35848718,  0.32183538,  0.3333254 ,  0.39654489]), array([ 0.66215321,  0.55881531,  0.47456823,  0.49998214,  0.65712408]))
calculating ngram score for bigram
(array([ 0.30666258,  0.46550922,  0.4599908 ,  0.21874316,  0.44735665]), array([ 0.21487426,  0.27884347,  0.27058505,  0.17499563,  0.30356601]), array([ 0.27368133,  0.38666151,  0.37096176,  0.21211478,  0.43588626]))
calculating ngram score for trigram
(array([ 0.14285569,  0.35210772,  0.34482164,  0.08571184,  0.34999125]), array([ 0.11764607,  0.24509564,  0.24096095,  0.07894529,  0.25925446]), array([ 0.13333206,  0.32467111,  0.31745528,  0.08571184,  0.34999125]))


In [11]:
#save_array(save_path + 'test_data_cos_sim', list(test_data['cos_sim']))
cos_sims = load_array(save_path + 'test_data_cos_sim')
test_data['cos_sim'] = cos_sims

(1,)

In [11]:
# save for later
test_data.to_csv(save_path + 'test_data.csv', index=False)

In [12]:
# load back
test_data = pd.read_csv(save_path + 'test_data.csv')
print test_data.shape
test_data.head()

In [10]:
test_data.head()

Unnamed: 0,test_id,question1,question2,diff_len,ratio_len,log_diff_len,cos_sim,unigram_jaccard,unigram_jaccard_all,unigram_jaccard_max,bigram_jaccard,bigram_jaccard_all,bigram_jaccard_max,trigram_jaccard,trigram_jaccard_all,trigram_jaccard_max
0,0,how does the surface pro himself 4 compare wit...,why did microsoft choose core m3 and not core ...,11,0.838235,2.398804,0.194593,0.809485,0.398371,0.662153,0.306663,0.214874,0.273681,0.142856,0.117646,0.133332
1,1,should i have a hair transplant at age 24 how ...,how much cost does hair transplant require,22,0.661538,3.091497,0.431613,0.772692,0.358487,0.558815,0.465509,0.278843,0.386662,0.352108,0.245096,0.324671
2,2,what but is the best way to send money from ch...,what you send money to china,31,0.483333,3.43431,0.656973,0.833287,0.321835,0.474568,0.459991,0.270585,0.370962,0.344822,0.240961,0.317455
3,3,which food not emulsifiers,what foods fibre,10,0.62963,2.303585,0.336097,0.611077,0.333325,0.499982,0.218743,0.174996,0.212115,0.085712,0.078945,0.085712
4,4,how aberystwyth start reading,how their can i start reading,0,1.0,-4.60517,0.709297,0.812449,0.396545,0.657124,0.447357,0.303566,0.435886,0.349991,0.259254,0.349991


In [48]:
gc.collect()

7

# merge training and test data

In [28]:
df_train = train_data
df_train['test_id'] = -1

df_test = test_data
df_test['id'] = -1
df_test['qid1'] = -1
df_test['qid2'] = -1
df_test['is_duplicate'] = -1

In [29]:
df = pd.concat([df_train, df_test])
df['question1'] = df['question1'].fillna('')
df['question2'] = df['question2'].fillna('')
df['uid'] = np.arange(df.shape[0])
df = df.set_index(['uid'])
print df.shape


(2750086, 21)


In [21]:
df.columns

Index([u'bigram_jaccard', u'bigram_jaccard_all', u'bigram_jaccard_max',
       u'cos_sim', u'diff_len', u'id', u'is_duplicate', u'log_diff_len',
       u'qid1', u'qid2', u'question1', u'question2', u'ratio_len', u'test_id',
       u'trigram_jaccard', u'trigram_jaccard_all', u'trigram_jaccard_max',
       u'unigram_jaccard', u'unigram_jaccard_all', u'unigram_jaccard_max'],
      dtype='object')

In [24]:
columns = [u'id', u'test_id', u'is_duplicate', 
              u'qid1', u'qid2', u'question1', u'question2', 
              u'log_diff_len', u'ratio_len', u'cos_sim', u'diff_len']
df = df[columns]
df.head()

Unnamed: 0_level_0,id,test_id,is_duplicate,qid1,qid2,question1,question2,log_diff_len,ratio_len,cos_sim,diff_len
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,-1,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,2.198335,0.865672,0.895532,9
1,1,-1,0,3,4,what is the story of kohinoor koh - i - noor d...,what would happen if the indian government sto...,3.611188,0.593407,0.474331,37
2,2,-1,0,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,2.639771,0.810811,0.380873,14
3,3,-1,0,7,8,why am i mentally very lonely how can i solve it,find the remainder when math 23 ^ 24 math is d...,2.565718,0.793651,0.0,13
4,4,-1,0,9,10,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,3.555634,0.533333,0.206084,35


In [25]:
df.columns[7:]

Index([u'log_diff_len', u'ratio_len', u'cos_sim', u'diff_len'], dtype='object')

In [26]:
ix_train = np.where(df['id'] >= 0)[0]
ix_test = np.where(df['id'] == -1)[0]
ix_is_dup = np.where(df['is_duplicate'] == 1)[0]
ix_not_dup = np.where(df['is_duplicate'] == 0)[0]

In [20]:
#plot_real_feature(df, 'log_diff_len')

In [21]:
#plot_real_feature(train_data, 'ratio_len')

In [22]:
#plot_real_feature(train_data, 'cos_sim')

# edge feature

In [22]:
import networkx as nx

In [23]:
g = nx.Graph()

In [24]:
g.add_nodes_from(df.question1)
g.add_nodes_from(df.question2)

In [25]:
edges = list(df[['question1', 'question2']].to_records(index=False))

In [27]:
g.add_edges_from(edges)

In [39]:
def get_intersection_count(row):
    try:
        result = len(set(g.neighbors(row.question1)).intersection(set(g.neighbors(row.question2))))
    except nx.NetworkXError:
        result = 0
    return result

In [40]:
#train_data[0:1]
train_data['intersection_count'] = train_data.apply(lambda row: get_intersection_count(row), axis=1)

In [51]:
#train_data['intersection_count']
#train_data.head()

In [52]:
test_data['intersection_count'] = test_data.apply(lambda row: get_intersection_count(row), axis=1)

In [73]:
#test_data.head(10)

In [53]:
# save for later
train_data.to_csv(save_path + 'train_data.csv', index=False)
test_data.to_csv(save_path + 'test_data.csv', index=False)

# fuzzy features

In [26]:
print "calculating qratio..."
train_data['fuzz_qratio'] = train_data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)

print "calculating wratio..."
train_data['fuzz_wratio'] = train_data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)

print "calculating partial ratio..."
train_data['fuzz_partial_ratio'] = train_data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)

calculating qratio...
calculating wratio...
calculating partial ratio...


In [34]:
print "calculating partial token set ratio..."
train_data['fuzz_partial_token_set_ratio'] = train_data.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)

print "calculating partial token sort ratio..."
train_data['fuzz_partial_token_sort_ratio'] = train_data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

print "calculating token set ratio..."
train_data['fuzz_token_set_ratio'] = train_data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)

print "calculating token sort ratio..."
train_data['fuzz_token_sort_ratio'] = train_data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)


calculating partial token set ratio...
calculating partial token sort ratio...
calculating token set ratio...
calculating token sort ratio...


In [50]:
def process_fuzz(df_data):
    print "calculating qratio..."
    df_data['fuzz_qratio'] = df_data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)

    print "calculating wratio..."
    df_data['fuzz_wratio'] = df_data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)

    print "calculating partial ratio..."
    df_data['fuzz_partial_ratio'] = df_data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)

    print "calculating partial token set ratio..."
    df_data['fuzz_partial_token_set_ratio'] = df_data.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)

    print "calculating partial token sort ratio..."
    df_data['fuzz_partial_token_sort_ratio'] = df_data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

    print "calculating token set ratio..."
    df_data['fuzz_token_set_ratio'] = df_data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)

    print "calculating token sort ratio..."
    df_data['fuzz_token_sort_ratio'] = df_data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

    return


In [17]:
process_fuzz(train_data)

calculating wratio...
calculating partial ratio...
calculating partial token set ratio...
calculating partial token sort ratio...
calculating token set ratio...
calculating token sort ratio...


In [52]:
process_fuzz(test_data)

calculating qratio...
calculating wratio...
calculating partial ratio...
calculating partial token set ratio...
calculating partial token sort ratio...
calculating token set ratio...
calculating token sort ratio...


In [54]:
#test_data.head()

In [57]:
train_data.to_csv(save_path + 'train_data.csv', index=False)

In [56]:
test_data.to_csv(save_path + 'test_data.csv', index=False)

# distance features

In [8]:
embeddings_index = {}

glove_file = '/cinc/data/glove/glove.840B.300d.txt'
f = open(glove_file)

count = 0
for line in f:
    count = count+1
    if (count % 100000) == 0:
        print 'processing ', count
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

processing  100000
processing  200000
processing  300000
processing  400000
processing  500000
processing  600000
processing  700000
processing  800000
processing  900000
processing  1000000
processing  1100000
processing  1200000
processing  1300000
processing  1400000
processing  1500000
processing  1600000
processing  1700000
processing  1800000
processing  1900000
processing  2000000
processing  2100000


In [27]:
def pre_processing(s):
    words = word_tokenize(s)
    words = [w for w in words if w not in stopwords.words('english')]
    words = [w for w in words if w.isalpha()]
    return words

In [29]:
train_data['question1'][0], pre_processing(train_data['question1'][0])

('what is the step by step guide to invest in share market in india ',
 ['step', 'step', 'guide', 'invest', 'share', 'market', 'india'])

In [30]:
def sent2vec(s):
    embedding_matrix = []
    for w in s:
        embedding_vector = embeddings_index.get(w)
        if embedding_vector is not None:
            embedding_matrix.append(embedding_vector)
        #else:
            #print w
    # normalize
    embedding_norm = np.array(embedding_matrix).sum(axis=0)
    denom = np.sqrt((embedding_norm ** 2).sum())
    return embedding_norm / denom

In [42]:
v1 = sent2vec(pre_processing(train_data['question1'][0]))
v2 = sent2vec(pre_processing(train_data['question2'][0]))
cosine(v1, v2)

0.031762361526489258

In [53]:
def process_distance(df_data):
    q1_vec = np.zeros((df_data.shape[0], 300))
    for i in tqdm(range(df_data.shape[0])):
        q1_vec[i,:] = sent2vec(df_data['question1'][i])
        
    q2_vec = np.zeros((df_data.shape[0], 300))
    for i in tqdm(range(df_data.shape[0])):
        q2_vec[i,:] = sent2vec(df_data['question2'][i])

    df_data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(q1_vec, q2_vec)]
    df_data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(q1_vec, q2_vec)]

#train_data_10 = train_data.iloc[0:10]
process_distance(train_data)
#train_data_10

MemoryError: 

# load data

In [4]:
# load back
train_data = pd.read_csv(save_path + 'train_data.csv')
#train_data.head()

In [31]:
# load back
test_data = pd.read_csv(save_path + 'test_data.csv')
#test_data.head()

In [32]:
#test_data['intersection_count']

# plot data

In [15]:
#train_data.groupby('diff_len')['id'].count().plot.bar()

In [16]:
#plt.hist(train_data['log_diff_len'])

In [17]:
#plt.hist(train_data['ratio_len'])

# logistic regression

In [5]:
predictors = train_data.columns[6:]
predictors

Index([u'diff_len', u'ratio_len', u'cos_sim', u'fuzz_qratio', u'fuzz_wratio',
       u'fuzz_partial_ratio', u'fuzz_partial_token_set_ratio',
       u'fuzz_partial_token_sort_ratio', u'fuzz_token_set_ratio',
       u'fuzz_token_sort_ratio'],
      dtype='object')

In [6]:
#X_train = train_data[['cos_sim', 'diff_len', 'ratio_len']]
X_train = train_data[predictors]
y_train = train_data['is_duplicate']
model=LogisticRegression(penalty='l2').fit(X_train, y_train)

In [7]:
model.score(X_train, y_train)

0.66420638650473673

In [61]:
y_pred = model.predict_proba(X_train)

In [62]:
y_pred[0:10,1]  #, np.argmax(y_pred[0:10], axis=1)

array([  4.11177693e-01,   2.21360878e-01,   9.45436483e-02,
         1.54366628e-04,   4.92499783e-02,   2.99633996e-01,
         1.54035376e-04,   1.51472548e-01,   8.62901512e-01,
         9.26676989e-02])

# grid search

In [64]:
classifier = lambda: SGDClassifier(
    loss='log', 
    penalty='elasticnet', 
    fit_intercept=True, 
    n_iter=100, 
    shuffle=True, 
    n_jobs=-1,
    class_weight=None)

model = Pipeline(steps=[
    ('ss', StandardScaler()),
    ('en', classifier())
])

parameters = {
    'en__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.02, 0.1, 0.5, 0.9, 1],
    'en__l1_ratio': [0, 0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.75, 0.9, 1]
}

folder = StratifiedKFold(n_splits=5, shuffle=True)

In [65]:
grid_search = GridSearchCV(
    model, 
    parameters, 
    cv=folder, 
    n_jobs=-1, 
    verbose=1)

In [66]:
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 194 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done 444 tasks      | elapsed: 31.3min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed: 31.7min finished


In [67]:
print model.best_score_
print model.best_params_

0.845022137575
{'en__l1_ratio': 0.3, 'en__alpha': 1e-05}


In [68]:
with open(model_path + 'model_fuzz.pkl', 'wb') as f:
    pickle.dump(model, f)

# xgboost

In [69]:
import xgboost as xgb



In [70]:
msk = np.random.rand(y_train.shape[0]) < 0.8

In [71]:
dtrain = xgb.DMatrix(X_train[msk], y_train[msk], feature_names=predictors)
dvalid = xgb.DMatrix(X_train[~msk], y_train[~msk], feature_names=predictors)

In [72]:
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 1.0,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}


In [73]:
partial_model = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=[(dvalid, 'val')],
                       early_stopping_rounds=20, verbose_eval=20)

[0]	val-rmse:0.485236
Will train until val-rmse hasn't improved in 20 rounds.
[20]	val-rmse:0.368616
[40]	val-rmse:0.327322
[60]	val-rmse:0.314667
[80]	val-rmse:0.31111


In [30]:
#fig, ax = plt.subplots(1, 1, figsize=(8, 16))
#xgb.plot_importance(partial_model, height=0.5, ax=ax)

# read test data

In [74]:
X_test = test_data[predictors]
X_test.shape

(2345796, 22)

In [75]:
predictors

Index([u'diff_len', u'ratio_len', u'cos_sim', u'log_diff_len', u'test_id',
       u'unigram_jaccard', u'unigram_jaccard_all', u'unigram_jaccard_max',
       u'bigram_jaccard', u'bigram_jaccard_all', u'bigram_jaccard_max',
       u'trigram_jaccard', u'trigram_jaccard_all', u'trigram_jaccard_max',
       u'intersection_count', u'fuzz_qratio', u'fuzz_wratio',
       u'fuzz_partial_ratio', u'fuzz_partial_token_set_ratio',
       u'fuzz_partial_token_sort_ratio', u'fuzz_token_set_ratio',
       u'fuzz_token_sort_ratio'],
      dtype='object')

In [76]:
y_test_pred = model.predict_proba(X_test)

In [77]:
print y_test_pred.shape
y_test_pred[0:10,1]

(2345796, 2)


array([  2.80286874e-02,   1.81131941e-01,   3.75304679e-01,
         4.50084808e-04,   2.16294791e-01,   3.95692383e-02,
         1.00000000e+00,   4.55689053e-02,   4.11544304e-01,
         4.12248822e-02])

# prepare submission

In [78]:
df_submission = pd.read_csv(path + 'submission/sample_submission.csv')

In [79]:
print df_submission.shape
df_submission.head()

(2345796, 2)


Unnamed: 0,test_id,is_duplicate
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


In [80]:
#df_submission['is_duplicate'] = test_data['cos_sim']
df_submission['is_duplicate'] = y_test_pred[:,1]
df_submission['is_duplicate'] = df_submission['is_duplicate'].apply(lambda x: float("{:.4f}".format(x)))
df_submission.head()

Unnamed: 0,test_id,is_duplicate
0,0,0.028
1,1,0.1811
2,2,0.3753
3,3,0.0005
4,4,0.2163


In [81]:
df_submission.to_csv(path + 'submission/quora_feature_engineering_fuzz_20170603.csv', index=False)