In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import nltk
from collections import Counter
from nltk.corpus import stopwords

from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Show which files are available in the input directory.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the training and test tables from the input directory.
df_train, df_test = map(pd.read_csv, ('../input/train.csv', '../input/test.csv'))

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after each summary.
df_train.info()
df_test.info()

In [None]:
from nltk.corpus import stopwords

# English stop words, kept as a set for the set arithmetic in the feature functions.
english_stopwords = stopwords.words("english")
stops = set(english_stopwords)

def WordMatch(row):
    """Share of non-stop-word tokens that both questions of a row have in common.

    The questions are read from ``row['question1']`` and ``row['question2']``,
    converted to ``str`` and split on whitespace. Stop words (module-level
    ``stops``) are removed from both the union and the intersection.

    Returns:
        ``len(intersection) / len(union)`` as a float, or ``0`` when the
        union contains nothing but stop words (or is empty).
    """
    words_q1 = set(str(row['question1']).split())
    words_q2 = set(str(row['question2']).split())

    n_union = len((words_q1 | words_q2) - stops)
    if n_union == 0:
        return 0

    n_shared = len((words_q1 & words_q2) - stops)
    return float(n_shared) / n_union

In [None]:
# Cleaning.
# Lower-case the question text in BOTH frames: the same feature functions are
# applied to train and test, and the stop-word list is lowercase, so
# normalising only the training data would make the test features inconsistent
# with what the model was fitted on. str() turns missing values into 'nan'.
for frame in (df_train, df_test):
    for column in ('question1', 'question2'):
        frame[column] = frame[column].map(lambda text: str(text).lower())

# Rows without a label are treated as non-duplicates.
df_train['is_duplicate'] = df_train['is_duplicate'].fillna(0)

In [None]:
# Pass each row as a labelled Series (the default, raw=False): WordMatch looks
# the questions up by column name, which a raw ndarray row does not support.
df_train['WordMatch'] = df_train.apply(WordMatch, axis=1)

In [None]:
# Overlay the WordMatch histograms of the two classes. Each series is labelled
# and a legend is drawn so the classes can be told apart in the figure.
sns.distplot(df_train[df_train['is_duplicate'] == 0].WordMatch, kde=False, label='not duplicate')
sns.distplot(df_train[df_train['is_duplicate'] == 1].WordMatch, kde=False, label='duplicate')
plt.legend()

In [None]:
# One flat Series of question strings per data set: all question1 entries
# followed by all question2 entries, every value converted to str.
train_qs = pd.Series(
    [*df_train['question1'].tolist(), *df_train['question2'].tolist()]
).astype(str)
test_qs = pd.Series(
    [*df_test['question1'].tolist(), *df_test['question2'].tolist()]
).astype(str)

In [None]:
# Count how often each word occurs and turn the counts into per-word weights

def get_weight(count, eps=10000, min_count=2):
    """Turn a word's occurrence count into a weight.

    Args:
        count: number of times the word occurs.
        eps: constant added to the count in the denominator.
        min_count: counts below this value get a weight of 0.

    Returns:
        ``0`` if ``count < min_count``, otherwise ``1 / (count + eps)``.
    """
    return 0 if count < min_count else 1 / (count + eps)
    
# Flatten all training questions into one lower-cased token list, count each
# token, and map every distinct token to its weight.
corpus_text = " ".join(train_qs).lower()
words = corpus_text.split()
counts = Counter(words)
weights = dict((word, get_weight(count)) for word, count in counts.items())

In [None]:
def tfidf(row):
    """Weighted word-share score between the two questions of a row.

    Both questions (``row['question1']``, ``row['question2']``) are converted
    to ``str``, split on whitespace and stripped of stop words (module-level
    ``stops``). Each remaining word contributes its entry from the
    module-level ``weights`` dict (0 for unknown words).

    Returns:
        ``0`` if either question has no words left after stop-word removal;
        otherwise twice the summed weight of the shared words divided by
        (summed weight of both word sets + 0.01).
    """
    words_q1 = set(str(row['question1']).split()) - stops
    words_q2 = set(str(row['question2']).split()) - stops

    if not words_q1 or not words_q2:
        return 0

    # Shared words count twice: once for each question they appear in.
    shared_weights = [2 * weights.get(w, 0) for w in words_q1 & words_q2]
    total_weights = [weights.get(w, 0) for w in words_q1] + [weights.get(w, 0) for w in words_q2]

    # The 0.01 in the denominator keeps the ratio defined when every weight is 0.
    return np.sum(shared_weights) / (np.sum(total_weights) + 0.01)

In [None]:
# Pass each row as a labelled Series (the default, raw=False): tfidf looks the
# questions up by column name, which a raw ndarray row does not support.
df_train['tfidf'] = df_train.apply(tfidf, axis=1)

In [None]:
# Overlay the tfidf histograms of the two classes. Each series is labelled and
# a legend is drawn so the classes can be told apart in the figure.
sns.distplot(df_train[df_train['is_duplicate'] == 0].tfidf, kde=False, label='not duplicate')
sns.distplot(df_train[df_train['is_duplicate'] == 1].tfidf, kde=False, label='duplicate')
plt.legend()

In [None]:
import xgboost as xgb
# Booster parameters: binary logistic objective evaluated with log loss,
# learning rate (eta) 0.02, trees limited to depth 4.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.02,
    'max_depth': 4,
}

In [None]:
# Pass each row as a labelled Series (the default, raw=False): tfidf looks the
# questions up by column name, which a raw ndarray row does not support.
df_test['tfidf'] = df_test.apply(tfidf, axis=1)

In [None]:
# Pass each row as a labelled Series (the default, raw=False): WordMatch looks
# the questions up by column name, which a raw ndarray row does not support.
df_test['WordMatch'] = df_test.apply(WordMatch, axis=1)

In [None]:
# Feature columns are whatever remains of df_train after removing the ids, the
# raw question text and the label.
train_features = df_train.drop(['question1', 'question2', 'qid1', 'qid2', 'is_duplicate', 'id'], axis=1)
x_train = train_features.values
y_train = df_train.is_duplicate.values

# Select the test features by name, in the training column order. `.values`
# discards column labels, and the feature columns were added to df_test in a
# different order than to df_train, so dropping the non-feature columns from
# df_test would hand the model its features swapped.
x_test = df_test[train_features.columns].values

In [None]:
# Wrap the training features and labels for xgboost; the same matrix is the
# only entry on the evaluation watchlist, reported under the name 'train'.
d_train = xgb.DMatrix(data=x_train, label=y_train)
watchlist = [
    (d_train, 'train'),
]

In [None]:
# Train for up to 400 boosting rounds, logging the metric every 10 rounds.
# NOTE(review): the watchlist holds only the training matrix, so the
# 50-round early-stopping criterion is evaluated on training loss rather than
# on held-out data — consider adding a validation split.
bst = xgb.train(
    params=params,
    dtrain=d_train,
    num_boost_round=400,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10,
)

In [None]:
# Score the test features with the trained booster.
d_test = xgb.DMatrix(data=x_test)
y_test = bst.predict(d_test)

# Build the submission table (test_id, is_duplicate) and write it to sub.csv
# without the index column.
sub = pd.DataFrame({'test_id': df_test['test_id'], 'is_duplicate': y_test})
sub.to_csv('sub.csv', index=False)