In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk import word_tokenize, ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
eng_stopwords = set(stopwords.words('english'))


class Quora_class():
    def __init__(self):
        self.train_df = None

    def load_data(self,path):
        return pd.read_csv(path,delimiter='\t',encoding='utf-8')[:50000]
    
    def common_words(self,x):
        q1, q2 = x
        return len(set(str(q1).lower().split()) & set(str(q2).lower().split()))

    def words_count(self,question):
        return len(str(question).split())

    def length(self,question):
        return len(str(question))

    def vect(self,train_df,num):
        vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, max_df=0.5, ngram_range=(1, num))
        all_questions = pd.concat([train_df["question1"], train_df["question2"]], ignore_index=True)
        all_Q = vectorizer.fit_transform(all_questions.values)
        Q1 = all_Q[0:int(all_Q.shape[0]/2)]
        Q2 = all_Q[int(all_Q.shape[0]/2):]
        return pd.Series(np.array([np.dot(Q1[i,:], Q2[i,:].T).A[0,0] for i in range(Q1.shape[0])])).values
    
    def word_share(self,x):
        w1 = set(map(lambda word: word.lower().strip(), str(x['question1']).split(" ")))
        w2 = set(map(lambda word: word.lower().strip(), str(x['question2']).split(" ")))    
        return 1.0 * len(w1 & w2)/(len(w1) + len(w2))


    def feature_engineering(self,train_df):
        self.train_df = train_df
        train_df['q1_words_num'] = train_df['question1'].map(self.words_count)
        train_df['q2_words_num'] = train_df['question2'].map(self.words_count)
        train_df['q1_length'] = train_df['question1'].map(self.length)
        train_df['q2_length'] = train_df['question2'].map(self.length)
        train_df['common_words'] = train_df[['question1', 'question2']].apply(self.common_words, axis=1)
        train_df['tf_idf_dot_product'] = self.vect(train_df,1)
        train_df['tf_idf_2gram_dot_products'] = self.vect(train_df,2)
        train_df['word_share'] = self.word_share(train_df)
        return train_df

In [3]:
qc = Quora_class()
data = qc.load_data('quora_duplicate_questions.tsv')
train_df = qc.feature_engineering(data)
del train_df['id']

In [7]:
train_df.head(2)

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate,q1_words_num,q2_words_num,q1_length,q2_length,common_words,tf_idf_dot_product,tf_idf_2gram_dot_products,word_share
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,14,12,66,57,10,0.981873,0.940866,0.263914
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,8,13,51,88,4,0.774992,0.672048,0.263914


In [8]:

print(len(train_df[train_df['is_duplicate']==0]))
print(len(train_df[train_df['is_duplicate']==1]))

31351
18649


In [9]:
from sklearn.model_selection import train_test_split
train, val = train_test_split(train_df, test_size=0.2)
X_sample = train[['q1_words_num','q2_words_num','q1_length','q2_length','common_words','tf_idf_dot_product','tf_idf_2gram_dot_products','word_share']]
y_sample = train.is_duplicate
X_test = val[['q1_words_num','q2_words_num','q1_length','q2_length','common_words','tf_idf_dot_product','tf_idf_2gram_dot_products','word_share']]
y_test = val.is_duplicate

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

rf = RandomForestClassifier(max_depth=20,min_samples_split=2,n_estimators=200,random_state=9,max_features='sqrt',min_samples_leaf=4)
rf.fit(X_sample,y_sample)
accuracy_score = rf.score(X_test,y_test)
y_prob=rf.predict_proba(X_test)
print("Accuracy using RandomForest",accuracy_score)
print("Logloss using RandomForest",log_loss(y_test,y_prob))

Accuracy using RandomForest 0.7314
Logloss using RandomForest 0.4786834621252199


In [27]:
names = X_sample.columns
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), names), 
             reverse=True))

Features sorted by their score:
[(0.2343, 'tf_idf_dot_product'), (0.2244, 'tf_idf_2gram_dot_products'), (0.1947, 'common_words'), (0.1106, 'q1_length'), (0.1051, 'q2_length'), (0.0692, 'q2_words_num'), (0.0619, 'q1_words_num'), (0.0, 'word_share')]


In [111]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
xgclass=XGBClassifier(max_delta_step=2,base_score=0.5,max_depth=10,n_estimators=200,min_child_weight=2)
xgclass.fit(X_sample,y_sample)
score = xgclass.score(X_test,y_test)
y_predprob=xgclass.predict_proba(X_test)
print("Accuracy using XGBoost",score)
print("Logloss using XGBoost",log_loss(y_test,y_predprob))

Accuracy using XGBoost 0.7272
Logloss using XGBoost 0.4892863171164543


  if diff:


In [112]:
import xgboost as xgb

# Set our parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 10
params['n_estimators'] = 100
params['max_delta_step'] = 5
params['base_score'] = 0.5

d_train = xgb.DMatrix(X_sample, label=y_sample)
d_valid = xgb.DMatrix(X_test, label=y_test)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 400, watchlist, early_stopping_rounds=50, verbose_eval=10)


[0]	train-logloss:0.686267	valid-logloss:0.686971
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.629629	valid-logloss:0.636476
[20]	train-logloss:0.588346	valid-logloss:0.600405
[30]	train-logloss:0.557115	valid-logloss:0.573673
[40]	train-logloss:0.532893	valid-logloss:0.553569
[50]	train-logloss:0.513938	valid-logloss:0.538542
[60]	train-logloss:0.498571	valid-logloss:0.52694
[70]	train-logloss:0.485935	valid-logloss:0.517719
[80]	train-logloss:0.475472	valid-logloss:0.510693
[90]	train-logloss:0.466935	valid-logloss:0.505249
[100]	train-logloss:0.459906	valid-logloss:0.500949
[110]	train-logloss:0.453825	valid-logloss:0.49766
[120]	train-logloss:0.448631	valid-logloss:0.494983
[130]	train-logloss:0.444379	valid-logloss:0.492991
[140]	train-logloss:0.440869	valid-logloss:0.491453
[150]	train-logloss:0.437603	valid-logloss:0.490192
[160]	train-logloss:0.434374	v