In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Directory that holds the CSV files. Later cells build file paths by plain
# string concatenation (input + "train.csv"), so the trailing slash matters.
# NOTE(review): this name shadows Python's builtin input() for the rest of the notebook.
input = '../input/'

In [None]:
# Load the training data from the input directory.
train_csv_path = input + "train.csv"
df_train = pd.read_csv(train_csv_path)

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Number of rows in the training frame (spelling of the printed label fixed).
print("length of train data: " + str(len(df_train)))

In [None]:
# Sum of the is_duplicate column, i.e. how many rows are labelled 1.
n_duplicate_pairs = df_train['is_duplicate'].sum()
print("length of duplicate pairs: " + str(n_duplicate_pairs))

In [None]:
# Stack both question-id columns into a single Series so that an id is counted
# wherever it appears, on either side of a pair.
qids = pd.concat([df_train['qid1'], df_train['qid2']], ignore_index=True)

In [None]:
# Count the distinct question ids across both columns.
n_unique_qids = len(np.unique(qids))
print("How many unique qids: " + str(n_unique_qids))

In [None]:
# Count the question ids that occur more than once.
appearance_counts = qids.value_counts()
n_repeated_questions = np.sum(appearance_counts > 1)
print('Number of questions that appear multiple times: {}'.format(n_repeated_questions))

In [None]:
# Show how many times each question id occurs, most frequent first.
qids.value_counts()

In [None]:
# Histogram of how often each question id appears, with a log-scaled count axis.
plt.figure(figsize=(12, 5))
# `log` expects a boolean; the string 'True' only worked because any non-empty
# string is truthy (the string 'False' would have enabled log scaling as well).
plt.hist(qids.value_counts(), bins=50, log=True)
plt.title('Log-Histogram of question appearance counts')
plt.xlabel('Number of occurrences of question')
plt.ylabel('Number of questions')

In [None]:
# Same display as the earlier qids.value_counts() cell: occurrences per question id.
qids.value_counts()

In [None]:
# Load the test data from the input directory.
test_csv_path = input + "test.csv"
df_test = pd.read_csv(test_csv_path)

In [None]:
# One flat Series of question texts per data set: all question1 values followed
# by all question2 values, cast to str so every entry can be measured and split.
train_qs = pd.concat(
    [df_train["question1"], df_train['question2']], ignore_index=True
).astype(str)
test_qs = pd.concat(
    [df_test["question1"], df_test['question2']], ignore_index=True
).astype(str)

In [None]:
# Seaborn's current colour palette; the histogram cells below pick colours from it by index.
pal = sns.color_palette()

In [None]:
# Character length of every question string.
dist_train = train_qs.apply(len)
dist_test = test_qs.apply(len)

plt.figure(figsize=(15, 10))
# `density=True` replaces `normed=True`, which matplotlib removed from hist() in 3.1.
plt.hist(dist_train, bins=200, range=[0, 200], color=pal[2], density=True, label='train')
plt.hist(dist_test, bins=200, range=[0, 200], color=pal[1], density=True, alpha=0.5, label='test')
plt.title('Normalised histogram of character count in questions', fontsize=15)
plt.legend()
plt.xlabel('Number of characters', fontsize=15)
plt.ylabel('Probability', fontsize=15)

If the steep drop at 150 characters reflects a length limit, how can some questions still be longer than 150 characters?

In [None]:
import re

# Tokeniser used throughout the notebook: split on a space, comma, full stop,
# question mark or pipe character.
r = re.compile("[ ,.?|]")


def _piece_count(question):
    """Return the number of pieces the tokeniser `r` splits `question` into."""
    return len(r.split(question))


# Pieces per question; empty pieces between adjacent delimiters are counted too.
dist_train = train_qs.apply(_piece_count)
dist_test = test_qs.apply(_piece_count)

In [None]:
# Overlaid, normalised histograms of the per-question word counts.
plt.figure(figsize=(15, 10))
# `density=True` replaces `normed=True`, which matplotlib removed from hist() in 3.1.
plt.hist(dist_train, bins=100, range=[0, 100], color=pal[2], density=True, label='train')
plt.hist(dist_test, bins=100, range=[0, 100], color=pal[1], density=True, alpha=0.5, label='test')
plt.title('Normalised histogram of word count in questions', fontsize=15)
plt.legend()
plt.xlabel('Number of words', fontsize=15)
plt.ylabel('Probability', fontsize=15)

Is the maximum word count perhaps around 70? The two distributions are very similar.

Using a feature such as the number of words two questions share, we can try the following idea.

In [None]:
from nltk.corpus import stopwords

# English stop words held in a set, so the `word not in stops` checks in later cells are cheap.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be available locally — confirm.
stops = set(stopwords.words('english'))

In [None]:
def word_match(row):
    """Return the share of distinct content words that two questions have in common.

    Both questions are lower-cased, split with the module-level tokeniser `r`,
    and reduced to their distinct tokens that are not in `stops`.

    Parameters
    ----------
    row : mapping with "question1" and "question2" entries
        Values are passed through str() before tokenising.

    Returns
    -------
    int or float
        0 when either question has no content words; otherwise
        2 * |shared| / (|q1 words| + |q2 words|), a value between 0 and 1.
    """
    def _content_words(text):
        # re.split yields '' between adjacent delimiters and after a trailing
        # '?'. Those empty tokens are not words: counting them made almost
        # every pair "share" a word and kept the emptiness guard from firing.
        return {w for w in r.split(str(text).lower()) if w and w not in stops}

    q1_word = _content_words(row["question1"])
    q2_word = _content_words(row["question2"])
    if not q1_word or not q2_word:
        return 0
    # A shared word is counted once for each question, matching the denominator.
    shared = q1_word & q2_word
    return 2 * len(shared) / (len(q1_word) + len(q2_word))

plt.figure(figsize=(15,5))
# word_match looks the questions up by column name, so each row must arrive as
# a Series (the default raw=False). With raw=True current pandas passes a bare
# ndarray, which cannot be indexed by label.
train_word_related = df_train.apply(word_match, axis=1)
# `density=True` replaces `normed=True`, which matplotlib removed from hist() in 3.1.
plt.hist(train_word_related[df_train['is_duplicate'] == 0].fillna(0), bins=20, density=True, label='Not Duplicate')
plt.hist(train_word_related[df_train['is_duplicate'] == 1].fillna(0), bins=20, density=True, alpha=0.7, label='Duplicate')
plt.legend()
plt.title('Label distribution over word_match_share', fontsize=15)
plt.xlabel('word_match_share', fontsize=15)

Stop words are filtered out first, and the relatedness of question 1 and question 2 is then computed from the words that remain.

In [None]:
# Preview the first rows of the training frame again.
df_train.head()

In [None]:
from collections import Counter

def get_weight(count, eps=10000, min_count=2):
    """Return the inverse-frequency weight for a word seen `count` times.

    Words seen fewer than `min_count` times get weight 0: with the default of
    2, a word that appears only once is ignored completely (likely a typo).
    Otherwise the weight is 1 / (count + eps), where `eps` is a smoothing
    constant that makes the effect of extremely rare words smaller.
    """
    return 0 if count < min_count else 1 / (count + eps)

# Smoothing constant for the word weights. It has to be passed to get_weight
# explicitly: assigning it here alone had no effect, and get_weight silently
# fell back to its own default of 10000.
eps = 5000
# Every token of the training questions, lower-cased and split with `r`.
words = r.split((" ".join(train_qs)).lower())
counts = Counter(words)
# The empty strings that re.split produces are left out of the weight table.
weights = {word: get_weight(count, eps=eps) for word, count in counts.items() if word != ""}

Because words that appear only once get a weight of 0, the sort in the next cell has to treat zero weights specially.
Also, `re.split()` produces many empty strings, so those are removed from `weights` as well.

In [None]:
def _ascending_weight_key(item):
    """Sort key on the weight that pushes zero-weight words behind all weighted ones."""
    weight = item[1]
    return weight if weight > 0 else 9999


weighted_words = weights.items()

# Lowest positive weight first, i.e. the highest counts.
print("Most commonly used words are: \n")
print(sorted(weighted_words, key=_ascending_weight_key)[:10])
# Highest weight first, i.e. the lowest counts that still received a weight.
print("\nLeast commonly used words are: \n")
print(sorted(weighted_words, key=lambda item: item[1], reverse=True)[:10])

In [None]:
def tfidf_word_match_share(row):
    """Return the weighted share of content words that two questions have in common.

    Both questions are lower-cased, split with the module-level tokeniser `r`,
    and reduced to their distinct tokens that are not in `stops`. Each word
    contributes its entry in the module-level `weights` table (0 if absent).

    Parameters
    ----------
    row : mapping with 'question1' and 'question2' entries
        Values are passed through str() before tokenising.

    Returns
    -------
    int or float
        0 when either question has no content words; NaN when none of the
        words carries any weight (callers in this notebook apply fillna(0)
        where they need a number); otherwise the summed weight of the shared
        words, counted once per question, divided by the summed weight of all
        words of both questions.
    """
    def _content_words(text):
        # Drop the '' tokens re.split produces; otherwise a question made up
        # only of stop words and punctuation still looks non-empty and the
        # guard below never fires. A dict keeps first-seen order, so the sums
        # are evaluated in the same order on every run.
        tokens = r.split(str(text).lower())
        return dict.fromkeys(w for w in tokens if w and w not in stops)

    q1words = _content_words(row['question1'])
    q2words = _content_words(row['question2'])
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0

    shared_weights = (
        [weights.get(w, 0) for w in q1words if w in q2words]
        + [weights.get(w, 0) for w in q2words if w in q1words]
    )
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]

    total = np.sum(total_weights)
    if total == 0:
        # Same value the 0/0 division used to yield, without the RuntimeWarning.
        return np.nan
    return np.sum(shared_weights) / total

In [None]:
plt.figure(figsize=(15, 5))
# tfidf_word_match_share looks the questions up by column name, so each row
# must arrive as a Series (the default raw=False). With raw=True current pandas
# passes a bare ndarray, which cannot be indexed by label.
tfidf_train_word_share = df_train.apply(tfidf_word_match_share, axis=1)
# `density=True` replaces `normed=True`, which matplotlib removed from hist() in 3.1.
plt.hist(tfidf_train_word_share[df_train['is_duplicate']==1].fillna(0), bins=20, density=True, label='Duplicate')
plt.hist(tfidf_train_word_share[df_train['is_duplicate']==0].fillna(0), bins=20, density=True, label='Not Duplicate', alpha=0.7)
plt.legend()
plt.title('Label distribution over tfidf train data')
plt.xlabel('tfidf train word share')

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC of each feature on its own against the duplicate label; missing
# tfidf shares are scored as 0.
duplicate_labels = df_train['is_duplicate']
word_match_auc = roc_auc_score(duplicate_labels, train_word_related)
tfidf_auc = roc_auc_score(duplicate_labels, tfidf_train_word_share.fillna(0))
print('Original AUC:', word_match_auc)
print('   TFIDF AUC:', tfidf_auc)

Rebalancing data

In [None]:
# First we create our training and testing data: one row per question pair and
# one column per hand-made feature.
x_train = pd.DataFrame()
x_test = pd.DataFrame()
x_train['word_match'] = train_word_related
x_train['tfidf_word_match'] = tfidf_train_word_share
# Both feature functions look the questions up by column name, so each row must
# arrive as a Series (the default raw=False). With raw=True current pandas
# passes a bare ndarray, which cannot be indexed by label.
x_test['word_match'] = df_test.apply(word_match, axis=1)
x_test['tfidf_word_match'] = df_test.apply(tfidf_word_match_share, axis=1)

y_train = df_train['is_duplicate'].values

In [None]:
pos_train = x_train[y_train == 1]
neg_train = x_train[y_train == 0]

# Oversample the negative class until positives make up a share `p` of the rows.
p = 0.165
# Solve pos / (pos + neg_target) = p for neg_target. The earlier formula
# (share / p - 1) did not solve this equation, and its loop doubled a frame
# that had already been enlarged, so the printed share did not come out at p.
target_neg = int(round(len(pos_train) * (1 - p) / p))
if 0 < len(neg_train) < target_neg:
    # Whole copies of the negatives plus a leading slice for the remainder.
    full_copies, remainder = divmod(target_neg, len(neg_train))
    neg_train = pd.concat([neg_train] * full_copies + [neg_train[:remainder]])
# Achieved positive share after oversampling.
print(len(pos_train) / (len(pos_train) + len(neg_train)))

x_train = pd.concat([pos_train, neg_train])
# Labels in the same order as the rows: all positives first, then all negatives.
y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
del pos_train, neg_train

In [None]:
# sklearn.cross_validation was removed from scikit-learn; train_test_split now
# lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation, with a fixed seed so the split is repeatable.
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=4242)

In [None]:
# Display the training feature frame left after the split.
x_train

In [None]:
import xgboost as xgb

# Set our parameters for xgboost
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.02,
    'max_depth': 4,
}

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

# Both sets are evaluated during training; 'valid' is listed last.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# num_boost_round and evals are passed by keyword: recent xgboost releases no
# longer accept them positionally, and the keywords work on older releases too.
bst = xgb.train(
    params,
    d_train,
    num_boost_round=400,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10,
)

In [None]:
# Score the test features with the trained booster.
d_test = xgb.DMatrix(x_test)
p_test = bst.predict(d_test)

# Pair every test_id with its predicted value, in that column order.
sub = pd.DataFrame(
    {'test_id': df_test['test_id'], 'is_duplicate': p_test},
    columns=['test_id', 'is_duplicate'],
)

In [None]:
# Preview the first rows of the prediction frame.
sub.head()

In [None]:

# index=False keeps the row index out of the file, so it contains only the
# test_id and is_duplicate columns rather than an extra unnamed first column.
sub.to_csv('prediction.csv', index=False)

In [None]:
# Read the saved predictions back from disk.
nsub = pd.read_csv('prediction.csv')

In [None]:
# Display the predictions as re-loaded from the CSV file.
nsub