In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

# Seaborn's current default color palette; no later visible cell references `color`.
color = sns.color_palette()

## Read Data ##

In [None]:
# Load the labelled question pairs; ids are stored as strings.
train_csv_path = '../input/train.csv'
df_train = pd.read_csv(train_csv_path, encoding='utf-8')
df_train['id'] = df_train['id'].map(str)

In [None]:
# Remove exact duplicate rows first, then every row that still has a missing value.
df_train = df_train.drop_duplicates().dropna()

In [None]:
# Load the unlabelled question pairs; test ids are stored as strings.
test_csv_path = '../input/test.csv'
df_test = pd.read_csv(test_csv_path, encoding='utf-8')
df_test['test_id'] = df_test['test_id'].map(str)

In [None]:
# Stack train and test rows into one frame so the vocabulary can be built over
# every question. Train rows carry `id`, test rows carry `test_id`.
df_all = pd.concat((df_train, df_test))

# Replace missing questions with empty strings. Assign the result back instead
# of calling `fillna(..., inplace=True)` on a column selection: that chained
# in-place form operates on a temporary under pandas copy-on-write and leaves
# `df_all` unchanged, so NaN would later reach `s.lower()` and raise.
question_cols = ['question1', 'question2']
df_all[question_cols] = df_all[question_cols].fillna('')

In [None]:
# Class balance: number of training pairs per `is_duplicate` label.
duplicate_counts = df_train.groupby("is_duplicate")['id'].count()
duplicate_counts.plot.bar()

In [None]:
import nltk

from gensim.models import word2vec
# Stop-word list from the NLTK stopwords corpus; no later visible cell uses it.
# NOTE(review): no language is passed to `words()` — presumably this returns the
# stop words of every bundled language; confirm whether 'english' was intended.
STOP_WORDS = nltk.corpus.stopwords.words()

In [None]:
# Build the Word2Vec training corpus: one list of space-separated tokens per
# question, all of `question1` first, then all of `question2`.
# Iterating the Series directly yields its values; the previous
# `Series.iteritems()` call no longer exists in pandas >= 2.0.
corpus = [
    sentence.split(" ")
    for col in ['question1', 'question2']
    for sentence in df_train[col]
]

In [None]:
# Peek at the first two tokenised questions.
corpus[:2]

In [None]:
# Train word embeddings on the tokenised training questions.
# NOTE(review): `size=` is the gensim 3.x keyword (presumably `vector_size=` in
# gensim >= 4) — confirm against the installed version.
model = word2vec.Word2Vec(
    corpus,
    size=100,       # embedding dimensionality
    window=20,
    min_count=200,
    workers=4,
)
# Show the learned vector for one word.
# NOTE(review): the corpus is not lower-cased, so this lookup presumably fails
# with KeyError if the lowercase token did not survive `min_count` — verify.
model.wv['india']

In [None]:
from sklearn.manifold import TSNE
def dataplot(data):
    """Plot a 2-D t-SNE projection of every word vector in a Word2Vec model.

    Parameters
    ----------
    data : trained Word2Vec model
        The model whose vocabulary (``data.wv.vocab``) and vectors
        (``data.wv[word]``) are projected and drawn.

    Each word becomes one scatter point annotated with the word itself.
    The figure is shown with ``plt.show()``; nothing is returned.
    """
    labels = []
    tokens = []
    # Read from the model that was passed in rather than a global, and go
    # through `.wv` for the vector lookup (indexing the model is deprecated).
    for word in data.wv.vocab:
        tokens.append(data.wv[word])
        labels.append(word)

    model_tsne = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    new_tokens = model_tsne.fit_transform(np.array(tokens))

    # Use the projected coordinates; previously the coordinate lists were left
    # empty, so the plotting loop never executed and the figure was blank.
    for label, (x_coord, y_coord) in zip(labels, new_tokens):
        plt.scatter(x_coord, y_coord)
        plt.annotate(label,
                     xy=(x_coord, y_coord),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

In [None]:
# Draw the t-SNE word map for the Word2Vec model trained above.
dataplot(model)

In [None]:
# Nearest neighbours of a word in embedding space. Query through `model.wv`
# (as the vector lookup above does): calling `most_similar` on the model
# itself is deprecated and was removed in gensim 4.
model.wv.most_similar('trump')

## Create Vocab ##

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import itertools

In [None]:
# Fit the vocabulary on every question of both columns (train and test rows).
all_questions = itertools.chain(df_all['question1'], df_all['question2'])
counts_vectorizer = CountVectorizer(max_features=10000-1)
counts_vectorizer.fit(all_questions)

# Id reserved for out-of-vocabulary words: one past the last vocabulary id.
other_index = len(counts_vectorizer.vocabulary_)

## Prep Data ##

In [None]:
import re
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
# Compile the vectorizer's own token regex so sequences are tokenised with the
# same pattern the vocabulary was configured with.
words_tokenizer = re.compile(counts_vectorizer.token_pattern)

In [None]:
def create_padded_seqs(texts, max_len=10):
    """Encode each text as a sequence of vocabulary ids of length ``max_len``.

    Parameters
    ----------
    texts : pandas Series of str
        Questions to encode.
    max_len : int, default 10
        Forwarded to ``pad_sequences`` as ``maxlen``.

    Returns
    -------
    The result of ``pad_sequences`` applied to the per-text id lists.

    Each text is lower-cased and tokenised with ``words_tokenizer``; tokens
    missing from the fitted vocabulary map to ``other_index``.
    """
    # NOTE(review): pad_sequences presumably pads with 0, which is also a valid
    # vocabulary id here — confirm whether that collision is acceptable.
    vocabulary = counts_vectorizer.vocabulary_

    def encode(text):
        return [vocabulary.get(token, other_index)
                for token in words_tokenizer.findall(text.lower())]

    encoded = texts.apply(encode)
    return pad_sequences(encoded, maxlen=max_len)

In [None]:
# Optional down-sampling for quick debugging runs.
# `df_all` holds both train and test rows, so while this is active the model is
# trained on, and the submission is written for, only the sampled rows.
# Set DEBUG_SAMPLE_SIZE to None to run on the full data.
DEBUG_SAMPLE_SIZE = 1000
if DEBUG_SAMPLE_SIZE is not None:
    # Seeded so reruns pick the same rows; capped so small frames do not raise.
    df_all = df_all.sample(min(DEBUG_SAMPLE_SIZE, len(df_all)), random_state=1989)

In [None]:
# Rows that came from the training file are the ones with a non-null `id`.
df_labeled = df_all[df_all['id'].notnull()]
y_all = df_labeled['is_duplicate'].values

# 70/30 split, stratified on the label, applied jointly to both question
# sequences and the labels so the three stay aligned.
X1_train, X1_val, X2_train, X2_val, y_train, y_val = train_test_split(
    create_padded_seqs(df_labeled['question1']),
    create_padded_seqs(df_labeled['question2']),
    y_all,
    stratify=y_all,
    test_size=0.3,
    random_state=1989,
)

## Training ##

In [None]:
import keras.layers as lyr
from keras.models import Model

In [None]:
# One input per question; each is a fixed-length sequence of word ids.
input1_tensor = lyr.Input(X1_train.shape[1:])
input2_tensor = lyr.Input(X2_train.shape[1:])

# The embedding must cover every id `create_padded_seqs` can emit: vocabulary
# ids 0..other_index-1 plus `other_index` for unknown words. Sizing it from
# `X1_train.max()` could leave it too small for ids that only occur in the
# second question, the validation split or the test set.
words_embedding_layer = lyr.Embedding(other_index + 1, 100)
seq_embedding_layer = lyr.LSTM(256, activation='tanh')


def seq_embedding(tensor):
    """Embed a word-id sequence and encode it with the shared LSTM."""
    return seq_embedding_layer(words_embedding_layer(tensor))


# Both questions go through the same embedding + LSTM (shared weights); their
# encodings are combined by element-wise multiplication.
merge_layer = lyr.multiply([seq_embedding(input1_tensor), seq_embedding(input2_tensor)])

dense1_layer = lyr.Dense(16, activation='sigmoid')(merge_layer)
output_layer = lyr.Dense(1, activation='sigmoid')(dense1_layer)

model = Model([input1_tensor, input2_tensor], output_layer)

model.compile(loss='binary_crossentropy', optimizer='adam')
model.summary()

In [None]:
# Train the network on the question pairs, reporting loss on the held-out split.
model.fit(
    x=[X1_train, X2_train],
    y=y_train,
    validation_data=([X1_val, X2_val], y_val),
    batch_size=128,
    epochs=6,
    verbose=2,
)

## Extract Features From Model ##

In [None]:
# Second model over the same inputs whose output is the merged (multiplied)
# sequence encoding, i.e. the trained network cut off before the dense layers.
features_model = Model(inputs=[input1_tensor, input2_tensor], outputs=merge_layer)
features_model.compile(optimizer='adam', loss='mse')

In [None]:
# Merged-encoding features for the training and validation pairs.
F_train, F_val = (
    features_model.predict(question_pair, batch_size=128)
    for question_pair in ([X1_train, X2_train], [X1_val, X2_val])
)

## Train XGBoost ##

In [None]:
import xgboost as xgb

In [None]:
# Wrap the extracted features and their labels for xgboost.
dTrain = xgb.DMatrix(data=F_train, label=y_train)
dVal = xgb.DMatrix(data=F_val, label=y_val)

In [None]:
import xgboost as xgb

# Booster configuration: binary classifier evaluated with log-loss.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.02,
    'max_depth': 4,
}

# Data sets whose metric is reported during training; the last entry ('valid')
# is listed after 'train'.
watchlist = [(dTrain, 'train'), (dVal, 'valid')]

# Up to 400 boosting rounds, with early stopping after 50 rounds and a
# progress line every 10 rounds.
bst = xgb.train(
    params,
    dTrain,
    num_boost_round=400,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10,
)

## Predict Test ##

In [None]:
# Rows that came from the test file are the ones with a non-null `test_id`.
df_all_test = df_all[df_all['test_id'].notnull()]
X1_test = create_padded_seqs(df_all_test['question1'])
X2_test = create_padded_seqs(df_all_test['question2'])

In [None]:
# Merged-encoding features for the test pairs.
F_test = features_model.predict(x=[X1_test, X2_test], batch_size=128)

In [None]:
# Unlabelled feature matrix for prediction.
dTest = xgb.DMatrix(data=F_test)

In [None]:
# Assemble the submission: one predicted duplicate probability per test id.
test_ids = df_all.loc[df_all['test_id'].notnull(), 'test_id'].values
# NOTE(review): `ntree_limit` / `best_ntree_limit` are presumably unavailable in
# recent xgboost releases — confirm against the installed version.
test_predictions = bst.predict(dTest, ntree_limit=bst.best_ntree_limit)

df_sub = pd.DataFrame({
    'test_id': test_ids,
    'is_duplicate': test_predictions,
})
df_sub = df_sub.set_index('test_id')

In [None]:
# Quick look at the first rows of the submission frame.
df_sub.head()

In [None]:
# Distribution of the predicted duplicate probabilities.
# (A stray leading quote previously made this cell a SyntaxError.)
df_sub['is_duplicate'].hist(bins=100)

In [None]:
# Write the submission file; `test_id` is the index, so it becomes the first column.
df_sub.to_csv('final.csv')