In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Run `ls ../input` and print its output (decoded as UTF-8) to show which input files are present.
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
import re
from fuzzywuzzy import fuzz
from string import punctuation
# NLTK's English stopword list as a set; the word-match helpers test membership
# in it for every token, so a set keeps those lookups cheap.
stops = set(stopwords.words("english"))
%matplotlib inline

In [None]:
# Load the training and test sets. Later cells read the question1/question2
# columns from both frames, is_duplicate from df_train and test_id from df_test.
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')

In [None]:
def extract_features(df):
    """Add six fuzzywuzzy similarity columns for every question pair.

    Each scorer is applied row by row to ``str(question1)`` and
    ``str(question2)``, so non-string cells are scored through their string
    form. The new columns are written onto ``df`` itself (the caller's frame
    is modified in place) and the same object is returned.

    A step number (1 to 5) is printed after each of the first five columns as
    a progress marker; nothing is printed after the sixth.
    """
    # (output column, fuzzywuzzy scorer), in the order the columns are added.
    scorers = [
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
    ]
    final_step = len(scorers)
    for step, (column, scorer) in enumerate(scorers, start=1):
        df[column] = df.apply(
            lambda pair, scorer=scorer: scorer(str(pair['question1']), str(pair['question2'])),
            axis=1,
        )
        if step != final_step:
            print(step)
    return df

In [None]:
# Add the fuzzy-matching columns to both frames. extract_features runs six
# row-wise apply passes per frame and writes the columns onto the frame it is given.
df_train = extract_features(df_train)
df_test = extract_features(df_test)

In [None]:
# Every training question (all of question1 followed by all of question2) as one
# Series of strings; astype(str) also turns non-string cells into their string form.
train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)

In [None]:
# Same construction for the test set: question1 values followed by question2 values, as strings.
test_qs = pd.Series(df_test['question1'].tolist() + df_test['question2'].tolist()).astype(str)

In [None]:
def word_match_share(row):
    """Return the share of distinct non-stopword tokens the two questions have in common.

    Both questions are converted with ``str()``, lower-cased and split on
    whitespace; tokens found in the module-level ``stops`` set are dropped.
    The score is ``2 * |shared| / (|q1 tokens| + |q2 tokens|)``, counting
    distinct tokens only.

    Returns 0 when either question has no tokens left after stopword removal.
    """
    q1_tokens = {tok for tok in str(row['question1']).lower().split() if tok not in stops}
    q2_tokens = {tok for tok in str(row['question2']).lower().split() if tok not in stops}

    if not q1_tokens or not q2_tokens:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0

    # Each shared token is counted once from each side, hence the factor of 2.
    common = q1_tokens & q2_tokens
    return (2 * len(common)) / (len(q1_tokens) + len(q2_tokens))

In [None]:
from collections import Counter

# If a word appears only once, we ignore it completely (likely a typo)
# Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller
def get_weight(count, eps=10000, min_count=2):
    """Return the weight for a word that occurs ``count`` times.

    Words seen fewer than ``min_count`` times get weight 0. Every other word
    gets ``1 / (count + eps)``; the larger ``eps`` is, the smaller the gap
    between the weights of rare and common words.
    """
    return 0 if count < min_count else 1 / (count + eps)

# NOTE(review): this module-level eps is never passed to get_weight in the
# comprehension below, so every weight is computed with get_weight's own
# default (eps=10000). Confirm which value is intended before relying on it.
eps = 5000 
# Lower-cased, whitespace-split tokens of every training question (stopwords included).
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
# token -> weight; tokens seen fewer than min_count (default 2) times get weight 0.
weights = {word: get_weight(count) for word, count in counts.items()}

In [None]:
def tfidf_word_match_share(row):
    """Return the weighted share of non-stopword tokens the two questions have in common.

    Tokenisation matches ``word_match_share`` (``str()``, lower-case,
    whitespace split, tokens in ``stops`` dropped), but each distinct token
    contributes its value from the module-level ``weights`` mapping (0 when the
    token is missing from it) instead of counting as 1. The score is the
    summed weight of the shared tokens, taken once from each question, divided
    by the summed weight of all distinct tokens of both questions.

    Returns 0 when either question has no tokens left after stopword removal.
    When every token involved has weight 0 the ratio is 0/0 and the result is
    NaN.
    """
    # Dicts keep the tokens in first-seen order, so the weights are always
    # summed in the same order.
    q1_tokens = {tok: 1 for tok in str(row['question1']).lower().split() if tok not in stops}
    q2_tokens = {tok: 1 for tok in str(row['question2']).lower().split() if tok not in stops}

    if not (q1_tokens and q2_tokens):
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0

    def weight_of(token):
        return weights.get(token, 0)

    shared_weights = [weight_of(tok) for tok in q1_tokens if tok in q2_tokens]
    shared_weights += [weight_of(tok) for tok in q2_tokens if tok in q1_tokens]
    total_weights = [weight_of(tok) for tok in q1_tokens] + [weight_of(tok) for tok in q2_tokens]

    return np.sum(shared_weights) / np.sum(total_weights)

In [None]:
# Compute both word-overlap scores for every question pair in train and test.
# Rows must reach the helpers as Series (raw=False, the default): both helpers
# look columns up by label (row['question1']), which the plain ndarray rows
# produced by raw=True do not support.
train_word_match = df_train.apply(word_match_share, axis=1)
tfidf_train_word_match = df_train.apply(tfidf_word_match_share, axis=1)
test_word_match = df_test.apply(word_match_share, axis=1)
tfidf_test_word_match = df_test.apply(tfidf_word_match_share, axis=1)

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC of each overlap score on its own against the is_duplicate label.
print('Original AUC:', roc_auc_score(df_train['is_duplicate'], train_word_match))
# fillna(0) replaces any NaN scores with 0 before they are passed to roc_auc_score.
print('   TFIDF AUC:', roc_auc_score(df_train['is_duplicate'], tfidf_train_word_match.fillna(0)))

In [None]:
# First we create our training and testing data
x_train = pd.DataFrame()

# Word-overlap scores computed in the earlier cells.
x_train['word_match'] = train_word_match
x_train['tfidf_word_match'] = tfidf_train_word_match

# Character length of each question; non-string cells count as 0.
x_train['len_q1'] = df_train["question1"].apply(lambda q: len(q) if type(q) == str else 0)
x_train['len_q2'] = df_train["question2"].apply(lambda q: len(q) if type(q) == str else 0)

# Whitespace-separated token count of each question; non-string cells count as 0.
x_train['nb_q1_words'] = df_train["question1"].apply(lambda q: len(q.split()) if type(q) == str else 0)
x_train['nb_q2_words'] = df_train["question2"].apply(lambda q: len(q.split()) if type(q) == str else 0)

# len_q*_words hold the same values as nb_q*_words; both names are kept so the
# feature set (column names and order) is unchanged.
x_train['len_q1_words'] = x_train['nb_q1_words']
x_train['len_q2_words'] = x_train['nb_q2_words']

# Absolute length differences, in characters and in tokens.
x_train['len_diff'] = (x_train["len_q1"] - x_train["len_q2"]).abs()
x_train['len_diff_words'] = (x_train["len_q1_words"] - x_train["len_q2_words"]).abs()

# Fuzzy-matching scores that extract_features stored on df_train.
for _fuzz_col in (
    'fuzz_qratio',
    'fuzz_partial_ratio',
    'fuzz_partial_token_set_ratio',
    'fuzz_partial_token_sort_ratio',
    'fuzz_token_set_ratio',
    'fuzz_token_sort_ratio',
):
    x_train[_fuzz_col] = df_train[_fuzz_col]

# Labels as a NumPy array, in the same row order as x_train.
y_train = df_train['is_duplicate'].values

In [None]:
# Next we build the test feature matrix with the same columns as x_train
x_test = pd.DataFrame()

# Word-overlap scores computed in the earlier cells.
x_test['word_match'] = test_word_match
x_test['tfidf_word_match'] = tfidf_test_word_match

# Character length of each question; non-string cells count as 0.
x_test['len_q1'] = df_test["question1"].apply(lambda q: len(q) if type(q) == str else 0)
x_test['len_q2'] = df_test["question2"].apply(lambda q: len(q) if type(q) == str else 0)

# Whitespace-separated token count of each question; non-string cells count as 0.
x_test['nb_q1_words'] = df_test["question1"].apply(lambda q: len(q.split()) if type(q) == str else 0)
x_test['nb_q2_words'] = df_test["question2"].apply(lambda q: len(q.split()) if type(q) == str else 0)

# len_q*_words hold the same values as nb_q*_words; both names are kept so the
# feature set (column names and order) is unchanged.
x_test['len_q1_words'] = x_test['nb_q1_words']
x_test['len_q2_words'] = x_test['nb_q2_words']

# Absolute length differences, in characters and in tokens.
x_test['len_diff'] = (x_test["len_q1"] - x_test["len_q2"]).abs()
x_test['len_diff_words'] = (x_test["len_q1_words"] - x_test["len_q2_words"]).abs()

# Fuzzy-matching scores that extract_features stored on df_test.
for _fuzz_col in (
    'fuzz_qratio',
    'fuzz_partial_ratio',
    'fuzz_partial_token_set_ratio',
    'fuzz_partial_token_sort_ratio',
    'fuzz_token_set_ratio',
    'fuzz_token_sort_ratio',
):
    x_test[_fuzz_col] = df_test[_fuzz_col]

In [None]:
# Split the feature rows by label (y_train is still the NumPy label array here).
pos_train = x_train[y_train == 1]
neg_train = x_train[y_train == 0]

# Now we oversample the negative class
# There is likely a much more elegant way to do this...
# p is the positive-class share the resampling is driven by
# (presumably the share expected in the test set; TODO confirm).
p = 0.165
# scale = (current positive share / p) - 1: how many extra copies of the
# negatives to append, as a possibly fractional multiple.
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
# Each whole unit of scale doubles neg_train.
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
# The fractional remainder appends the first int(scale * len(neg_train)) rows once more.
# NOTE(review): len(neg_train) here is the length after any doubling above.
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
# Positive share actually achieved; nothing in this cell forces it to equal p exactly.
print(len(pos_train) / (len(pos_train) + len(neg_train)))

# Rebuild the training set as all positives followed by the oversampled negatives;
# the labels become a plain list of 1.0s then 0.0s in that same order.
x_train = pd.concat([pos_train, neg_train])
y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
del pos_train, neg_train

In [None]:
# Finally, we split some of the data off for validation
# train_test_split lives in sklearn.model_selection; the sklearn.cross_validation
# module it used to be imported from was removed in scikit-learn 0.20. The
# fallback keeps the cell working on releases that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=4242)

In [None]:
import xgboost as xgb

# Set our parameters for xgboost
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.02,
    'max_depth': 4,
}

# Wrap the train and validation splits, with their labels, for xgboost.
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

# Both sets are evaluated during training and reported under these names.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# At most 400 boosting rounds, with early stopping (patience 50) and
# evaluation output every 10 rounds.
bst = xgb.train(
    params,
    d_train,
    num_boost_round=400,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10,
)

In [None]:

# log_loss has to be imported before it is used; no earlier cell brings it into scope.
from sklearn.metrics import log_loss

# Score the held-out validation rows and report their log loss.
d_valid = xgb.DMatrix(x_valid)
p_valid = bst.predict(d_valid)
print('Predicted score:', log_loss(y_valid, p_valid))

In [None]:
# Predict on the test feature matrix with the trained booster.
d_test = xgb.DMatrix(x_test)
p_test = bst.predict(d_test)

# Two-column submission file: test_id from df_test, then the prediction for that row.
sub = pd.DataFrame()
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = p_test
# index=False keeps the DataFrame index out of the CSV.
sub.to_csv('simple_xgb.csv', index=False)

In [None]:
# Per-feature importance scores from the trained booster (a mapping; .items() is used on it in the next cell).
importance = bst.get_fscore()

In [None]:
import operator
# Rebind importance to a list of (feature, score) pairs sorted by ascending score.
# NOTE(review): after this line importance is a list, so re-running this cell
# without re-running the previous one fails on .items().
importance = sorted(importance.items(), key=operator.itemgetter(1))
ft = pd.DataFrame(importance, columns=['feature', 'fscore'])
# Horizontal bar chart with one bar per feature.
ft.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(10, 25))