## Import Libraries

In [15]:
import codecs
import spacy
import nltk
import numpy as np
# import torch
import matplotlib.pyplot as plt
import gensim, logging
import json
import h5py
# from wordcloud import WordCloud
import pandas as panda
from nltk.corpus import stopwords
import re
import tensorflow as tf
from string import punctuation
from nltk.stem.snowball import SnowballStemmer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, BatchNormalization, Bidirectional
from keras.layers.merge import concatenate
from keras.models import Model, Sequential
from keras.callbacks import Callback, ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.layers import TimeDistributed
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.utils import class_weight as cw 

nlp = spacy.load("en")

In [75]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

## Read Training and Test Data

In [2]:
training_data = panda.read_csv("D:/Chirag B/AI/data/train.csv")

test_data = panda.read_csv("D:/Chirag B/AI/data/test.csv")

In [3]:
#checking for empty data values. If there are null values, a boolean array is returned. 
#We then sum all the boolean values(1 if null) to count the number of null values.
print(training_data.isnull().sum())
print(test_data.isnull().sum())


#replacing the null values with 'empty'
training_data = training_data.fillna(value='empty', axis=0)
test_data = test_data.fillna(value='empty', axis=0)

print ("########")

#verifying whether the dataframe still has null rows.
print(training_data.isnull().sum())
print(test_data.isnull().sum())

training_data

id              0
qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64
test_id      0
question1    2
question2    4
dtype: int64
########
id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64
test_id      0
question1    0
question2    0
dtype: int64


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


## Clean Strings

In [4]:
def cleaner(question):

    
    #regex substitutions to clean question strings. This function is based on a discussion board in
    #Kaggle on 'The Importance of Cleaning Text'
    #https://www.kaggle.com/currie32/the-importance-of-cleaning-text/notebook
    

    
#     question = re.sub(r"[^A-Za-z0-9]", " ", question, flags=re.IGNORECASE)
    question = re.sub(r"'ve", " have ", question, flags=re.IGNORECASE)
    question = re.sub(r"what's", " what is ", question, flags=re.IGNORECASE)
    question = re.sub(r"whats", " what is ", question, flags=re.IGNORECASE)
    question = re.sub(r"can't", " cannot ", question, flags=re.IGNORECASE)
    question = re.sub(r"shan't", " shall not ", question, flags=re.IGNORECASE)
    question = re.sub(r"wouldn't", " would not ", question, flags=re.IGNORECASE)
    question = re.sub(r"hasn't", " has not ", question, flags=re.IGNORECASE)
    question = re.sub(r"couldn't", " could not ", question, flags=re.IGNORECASE)
    question = re.sub(r"couldnt", " could not ", question, flags=re.IGNORECASE)
    question = re.sub(r"didn't", " did not ", question, flags=re.IGNORECASE)
    question = re.sub(r"n't", " not", question, flags=re.IGNORECASE)
#     question = re.sub(r"I'm", "I am", question, flags=re.IGNORECASE)
    question = re.sub(r"\s{2,}", " ", question, flags=re.IGNORECASE)
#     question = re.sub(r",", " ", question, flags=re.IGNORECASE)
#     question = re.sub(r"\.", " ", question, flags=re.IGNORECASE)
    question = re.sub(r"\/", " / ", question, flags=re.IGNORECASE)
    question = re.sub(r"\+", " + ", question, flags=re.IGNORECASE)
    question = re.sub(r"\-", " - ", question, flags=re.IGNORECASE)
    question = re.sub(r"\*", " * ", question, flags=re.IGNORECASE)
    question = re.sub(r"\bu s\b", "American", question, flags=re.IGNORECASE)
    question = re.sub(r"\:", " : ", question, flags=re.IGNORECASE)
    question = re.sub(r"\?", "", question, flags=re.IGNORECASE)
    question = re.sub(r"e - mail", "email", question, flags=re.IGNORECASE)
    question = re.sub(r"(\d+)(k)\b", "\g<1>000", question, flags=re.IGNORECASE)
    question = re.sub(r"\0s", "0", question, flags=re.IGNORECASE)
    question = re.sub(r"\^", " ^ ", question, flags=re.IGNORECASE)
    question = re.sub(r"\=", " = ", question, flags=re.IGNORECASE)
    question = re.sub(r"\'ll", " will", question, flags=re.IGNORECASE)
    question = re.sub(r"\'d", " would ", question, flags=re.IGNORECASE)
    question = re.sub(r"\'re", " are", question, flags=re.IGNORECASE)
    question = re.sub(r" b g ", " bg ", question, flags=re.IGNORECASE)
    question = re.sub(r" e g ", " eg ", question, flags=re.IGNORECASE)
    question = re.sub(r" US ", " America ", question)
    question = re.sub(r" U.S.A ", " America ", question, flags=re.IGNORECASE)
    question = re.sub(r" U.S. ", " America ", question, flags=re.IGNORECASE)
    
    

    
    return question



In [5]:
#code to clean text
#making a call to "cleaner" for each row of both the train and test data
training_data['question1'] = training_data.apply((lambda x: cleaner(x['question1'])), axis=1)
training_data['question2'] = training_data.apply((lambda x: cleaner(x['question2'])), axis=1)

test_data['question1'] = test_data.apply((lambda x: cleaner(x['question1'])), axis=1)
test_data['question2'] = test_data.apply((lambda x: cleaner(x['question2'])), axis=1)


In [6]:
train = training_data
test = test_data
# train

print("TEST:", len(test))
print("TRAIN:", len(train))

TEST: 2345796
TRAIN: 404290


## Tokenization using Keras

In [7]:

#The tokenizer function is a Keras text preprocessing function. It is useful for creating sequences out of text sentences.
#The "tokenizr" object is made to "fit" on all strings/questions in the train and test dataset
tokenizr = Tokenizer(num_words = 200000)
tokenizer_train = [q1 for q1 in train["question1"]] + [q2 for q2 in train["question2"]] + [q3 for q3 in test["question1"]] + [q4 for q4 in test["question2"]]
tokenizr.fit_on_texts(tokenizer_train)
print("Done")

Done


## Computing GloVe embeddings for words

In [8]:
##GLOVE EMBEDDINGS

embed_dim = 300 #embedding dimensions
embed_file = codecs.open("glove.840B.300d/glove.840B.300d.txt", 'r', 'utf-8') #using GloVe common crawl embeddings

#index dictionary
#This dictionary stores a word as the key and its corresponding GloVe word vector as a value.
#This is useful for embedding text sequences.
vector_dict = {}  
for aline in embed_file:
    line = aline.split()
    
    try:
        word = line[0]
        word_vector = np.asarray(line[1:], dtype='float32')
    except:
        print("ISSUE --> ", line)
    vector_dict[word] = word_vector
embed_file.close()

print("Done --> ",len(vector_dict))


ISSUE -->  ['.', '.', '.', '-0.1573', '-0.29517', '0.30453', '-0.54773', '0.098293', '-0.1776', '0.21662', '0.19261', '-0.21101', '0.53788', '-0.047755', '0.40675', '0.023592', '-0.32814', '0.046858', '0.19367', '0.25565', '-0.021019', '-0.15957', '-0.1023', '0.20303', '-0.043333', '0.11618', '-0.18486', '0.0011948', '-0.052301', '0.34587', '0.052335', '0.16774', '-0.21384', '0.055947', '0.24934', '-0.12179', '0.16749', '0.28922', '-0.033739', '0.3015', '-0.13241', '0.092635', '0.37155', '-0.2884', '-0.0052731', '-0.001005', '-0.51153', '-0.28476', '-0.20139', '0.11837', '-0.0055891', '0.43604', '0.16796', '-0.2701', '0.063957', '-0.093253', '-0.22079', '0.36501', '0.06545', '0.23941', '-0.19292', '0.098293', '0.12172', '-0.1168', '-0.027436', '0.20507', '-0.39139', '-0.23111', '0.46239', '0.22888', '-0.028415', '-0.1798', '0.23817', '0.28093', '-0.47935', '0.23177', '-0.35587', '0.14246', '0.11861', '0.011018', '0.091986', '0.0054809', '-0.39955', '-0.40183', '-0.10629', '-0.30851', '

ISSUE -->  ['.', '.', '0.035974', '-0.024421', '0.71402', '-0.61127', '0.012771', '-0.11201', '0.16847', '-0.14069', '-0.053491', '-0.87539', '-0.13959', '0.29731', '0.072308', '-0.084514', '-0.1879', '0.12358', '0.37639', '-0.39238', '-0.01111', '-0.04924', '0.63649', '0.058814', '0.19076', '-0.20828', '-0.11036', '0.14934', '0.24667', '-0.39438', '0.22853', '-0.11201', '0.33539', '-0.32929', '-0.049727', '-0.090764', '0.29095', '0.27504', '0.22802', '-0.15616', '0.37302', '0.3752', '-0.3677', '0.1518', '-0.27551', '-0.63281', '-0.31298', '-0.22441', '-0.15435', '-0.64802', '0.28404', '0.12356', '0.0034255', '0.03094', '0.35345', '-0.46781', '0.59203', '-0.17966', '0.27702', '-0.46738', '0.19438', '0.21939', '-0.36743', '-0.084781', '0.03253', '-0.51323', '-0.55466', '0.49585', '0.066985', '0.47906', '-0.25118', '0.011123', '0.15605', '-1.0761', '0.60875', '-0.15764', '0.066122', '0.12779', '-0.089209', '0.4311', '0.045732', '-0.29364', '-0.19994', '-0.065952', '0.26236', '0.34039', '

ISSUE -->  ['contact', 'name@domain.com', '0.016426', '0.13728', '0.18781', '0.75784', '0.44012', '0.096794', '0.060987', '0.31293', '-0.15884', '-1.2367', '0.43769', '0.10465', '0.048858', '-0.23182', '0.71125', '0.022376', '0.63524', '-1.4974', '0.12243', '-0.07386', '-0.021514', '-0.37652', '0.17503', '-0.011225', '-0.12668', '-0.0090601', '0.38418', '0.11132', '0.15851', '-0.47498', '0.33619', '-0.48833', '0.23423', '0.13258', '0.29362', '0.13526', '-0.05115', '-0.0055236', '0.27734', '-0.23565', '0.19571', '-0.29095', '0.062419', '-0.47502', '-0.71402', '-0.36384', '0.53562', '0.40136', '0.30963', '0.16238', '-0.11662', '-0.16201', '0.30672', '0.21663', '0.086839', '-0.38895', '-0.19644', '-0.52311', '-0.33153', '0.27012', '-0.89654', '-0.15193', '0.12447', '-0.19112', '-0.494', '-0.011873', '-0.41412', '0.52585', '0.27316', '-0.047525', '-0.1178', '-0.3371', '0.61151', '-0.012169', '0.36935', '0.32679', '-0.098269', '0.038729', '0.003551', '-0.51871', '-0.48189', '-0.079238', '-0

ISSUE -->  ['by', 'name@domain.com', '0.6882', '-0.36436', '0.62079', '1.1482', '-0.055475', '-0.37936', '0.0064471', '-0.33046', '-0.43406', '-1.3468', '0.70312', '-0.41314', '-0.65868', '0.64324', '0.13018', '0.65846', '0.86269', '-0.93108', '0.3476', '0.73912', '-0.51405', '-0.15113', '0.27331', '0.51396', '-0.74688', '0.87989', '-0.11887', '0.3641', '0.37838', '0.36177', '-0.45182', '0.16173', '-0.36353', '-0.55643', '-1.1186', '0.70117', '-0.48075', '0.074095', '0.43022', '0.4625', '0.011133', '0.030287', '-0.73342', '-0.772', '0.31058', '0.022106', '-0.16845', '-0.70695', '-0.16243', '-0.15454', '-0.12034', '0.018702', '0.51626', '-0.17255', '0.37335', '-0.059377', '0.013126', '-0.30727', '0.1581', '0.74527', '-0.7927', '-0.34603', '-0.01438', '-1.055', '-0.95074', '-0.81794', '0.27925', '-0.35405', '-0.26783', '-0.30391', '0.16093', '-0.064806', '0.69283', '-1.1955', '0.18414', '-0.71183', '0.062622', '-0.62435', '-0.16458', '-0.74362', '-0.19251', '-0.1841', '0.99035', '-0.2055

ISSUE -->  ['at', 'name@domain.com', '0.44321', '-0.40005', '-0.20065', '1.1209', '0.34041', '0.086082', '-0.067128', '0.0022702', '-0.94649', '-1.4669', '0.61248', '0.34827', '-0.20983', '-0.61434', '0.41102', '0.57759', '0.69071', '-1.9301', '0.75265', '-0.13238', '0.22003', '0.28856', '0.35234', '0.45989', '-0.21944', '0.1931', '-0.11664', '0.14996', '0.70354', '-0.039238', '0.55298', '-0.53503', '-0.3221', '-0.28595', '-0.1246', '0.054544', '-0.45937', '0.1447', '0.8203', '-0.33182', '0.10864', '-0.56552', '0.39898', '-0.65012', '-0.20285', '0.11557', '0.35711', '-0.23958', '-0.30281', '0.51593', '0.71883', '-0.30403', '0.59458', '-0.3217', '-0.23967', '-0.2576', '-0.50224', '-0.36055', '-0.71763', '0.4981', '-0.69945', '-0.0072578', '0.37327', '-0.029839', '-0.42705', '0.93128', '-0.046928', '0.045162', '-0.44879', '0.16579', '-0.26272', '-0.35286', '0.17395', '-0.24436', '-0.1439', '-0.39857', '0.25342', '-0.44737', '0.37618', '-0.80252', '-0.87776', '-0.19282', '-0.48746', '0.06

## Create Sequences

In [2]:
#Creating text sequences for both the train and test questions.

train_sequence_q1 = train['question1'].tolist()
train_sequence_q2 = train['question2'].tolist()
test_sequence_q1 = test['question1'].tolist()
test_sequence_q2 = test['question2'].tolist()


#get test ID and validation labels.
train_labels = train['is_duplicate'].tolist()
test_id = test['test_id'].tolist()


#Using the text_to_sequences function as part of the Keras library(tokenizer class) for computing sequences.
# trained "tokeinzr" object on test and train dataset.
_train_sequence_q1 = tokenizr.texts_to_sequences(train_sequence_q1)
_train_sequence_q2 = tokenizr.texts_to_sequences(train_sequence_q2)
_test_sequence_q1 = tokenizr.texts_to_sequences(test_sequence_q1)
_test_sequence_q2 = tokenizr.texts_to_sequences(test_sequence_q2)


#padding sequences to be of same length 30
train_q1 = pad_sequences(_train_sequence_q1, maxlen = 30)
train_q2 = pad_sequences(_train_sequence_q2, maxlen = 30)
test_q1 = pad_sequences(_test_sequence_q1, maxlen = 30)
test_q2 = pad_sequences(_test_sequence_q2, maxlen = 30)

#creating a numpy array of labels for input to the LSTM neural network.
train_labels = np.array(train_labels)

#creating numpy array of test id to be used for the submission files in the end.
test_id = np.array(test_id)


# print (len(val_q1))
# X_train = 
# X_val =

## GloVe Matrix for Embedding Layer

In [41]:
#creating a GloVe embedding matrix for the embedding layer of the LSTM.
embed_mat = np.zeros((len(tokenizr.word_index)+1, embed_dim))
for key,value in tokenizr.word_index.items():
    embed_vector = vector_dict.get(key)
    if embed_vector is not None:
        embed_mat[value] = embed_vector


print("\nDone creating GloVe matrix\n")


Done creating GloVe matrix



## Implementation

In [4]:
#Parameters for the LSTM.
input_dim = len(tokenizr.word_index) + 1
output_dim = 300
input_len = 60
lstm_units = 250
drop = 0.4

In [42]:
#Read all features from the feature csv files.
feature_train = panda.read_csv("C:/dat/features_train.csv")
feature_test = panda.read_csv("C:/dat/features_test.csv")
feature_kcore_train = panda.read_csv("C:/dat/kcore_features_train.csv")
feature_kcore_test = panda.read_csv("C:/dat/kcore_features_test.csv")

In [43]:
#Get all training labels for input to LSTM
list_of_train_labels = feature_train['is_duplicate']

#Get labels for input to XGBoost
y_train = feature_train['is_duplicate'].values

#Drop 'id' and 'is_duplicate' fields from the input data
feature_train.drop(["id","is_duplicate"], axis=1, inplace=True)

#create a dataframe for the input.
result_1 = panda.concat([feature_train, feature_kcore_train], axis=1)
result_2 = panda.concat([feature_test, feature_kcore_test], axis=1)

#list of all training input data
list_of_all_training_features = result_1.values.tolist()
list_of_all_test_features = result_2.values.tolist()

#numpy array of input data. Used as input to the LSTM.
list_of_all_training_features = np.array(list_of_all_training_features)
list_of_all_test_features = np.array(list_of_all_test_features)

In [39]:
#numpy array of training labels
list_of_train_labels = np.array(list_of_train_labels)

# print(type(list_of_all_training_features))
test_id = test_data['test_id'].tolist()
test_id = np.array(test_id)

In [44]:
#Handling unbalanced classes using class weights from scikit-learn
y_labels = np.asarray(list_of_train_labels)

imbalance_count = np.count_nonzero(y_labels)

classes = np.unique(y_labels)

class_weights = cw.compute_class_weight('balanced', classes, y_labels)

print(len(y_labels))
print(imbalance_count)
print(classes)
print(class_weights)

# [ 0.79264156  1.3542874 ]
c_w = {0:1.3542874, 1:0.79264156 }


404290
149263
[0 1]
[ 0.79264156  1.3542874 ]


### Simple LSTM Experiment 1 - First ever neural network

In [12]:
#First neural network implementation
#Made use of tutorial on Machine Learning Mastery
#https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

#model definition
model = Sequential()
model.add(Embedding(input_dim, output_dim, weights=[embed_mat], input_length=input_len))
model.add(LSTM(250, dropout=0.4))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='nadam',
              metrics=['accuracy'])

fit = model.fit(X_train, train_labels, epochs=200, batch_size=2000, shuffle=True)


scores = model.evaluate(X_val, val_labels, verbose=0)
# print("Accuracy: %.2f%%" % (scores[1]*100))
print (type(scores))
print ("Loss : ", scores[0])
print ("Accuracy : ", scores[1]*100)

Epoch 1/150
Epoch 2/150
Epoch 3/150


Epoch 4/150
Epoch 5/150


Epoch 6/150
Epoch 7/150


Epoch 8/150
Epoch 9/150


Epoch 10/150
Epoch 11/150


Epoch 12/150
Epoch 13/150


Epoch 14/150
Epoch 15/150


Epoch 16/150
Epoch 17/150


Epoch 18/150
Epoch 19/150


Epoch 20/150
Epoch 21/150


Epoch 22/150
Epoch 23/150


Epoch 24/150
Epoch 25/150


Epoch 26/150
Epoch 27/150


Epoch 28/150
Epoch 29/150


Epoch 30/150
Epoch 31/150


Epoch 32/150
Epoch 33/150


Epoch 34/150
Epoch 35/150


Epoch 36/150
Epoch 37/150


Epoch 38/150
Epoch 39/150


Epoch 40/150
Epoch 41/150


Epoch 42/150
Epoch 43/150


Epoch 44/150
Epoch 45/150


Epoch 46/150
Epoch 47/150


Epoch 48/150
Epoch 49/150


Epoch 50/150
Epoch 51/150


Epoch 52/150
Epoch 53/150


Epoch 54/150
Epoch 55/150


Epoch 56/150
Epoch 57/150


Epoch 58/150
Epoch 59/150


Epoch 60/150
Epoch 61/150


Epoch 62/150
Epoch 63/150


Epoch 64/150
Epoch 65/150


Epoch 66/150
Epoch 67/150


Epoch 68/150
Epoch 69/150


Epoch 70/150
Epoch 71/150


Epoch 72/150
Epoch 73/150


Epoch 74/150
Epoch 75/150


Epoch 76/150
Epoch 77/150


Epoch 78/150
Epoch 79/150


Epoch 80/150
Epoch 81/150


Epoch 82/150
Epoch 83/150


Epoch 84/150
Epoch 85/150


Epoch 86/150
Epoch 87/150


Epoch 88/150
Epoch 89/150


Epoch 90/150
Epoch 91/150


Epoch 92/150
Epoch 93/150


Epoch 94/150
Epoch 95/150


Epoch 96/150
Epoch 97/150


Epoch 98/150
Epoch 99/150


Epoch 100/150
Epoch 101/150


Epoch 102/150
Epoch 103/150


Epoch 104/150
Epoch 105/150


Epoch 106/150
Epoch 107/150


Epoch 108/150
Epoch 109/150


Epoch 110/150
Epoch 111/150


Epoch 112/150
Epoch 113/150


Epoch 114/150
Epoch 115/150


Epoch 116/150
Epoch 117/150


Epoch 118/150
Epoch 119/150


Epoch 120/150
Epoch 121/150


Epoch 122/150
Epoch 123/150


Epoch 124/150
Epoch 125/150


Epoch 126/150
Epoch 127/150


Epoch 128/150
Epoch 129/150


Epoch 130/150
Epoch 131/150


Epoch 132/150
Epoch 133/150


Epoch 134/150
Epoch 135/150


Epoch 136/150
Epoch 137/150


Epoch 138/150
Epoch 139/150


Epoch 140/150
Epoch 141/150


Epoch 142/150
Epoch 143/150


Epoch 144/150
Epoch 145/150


Epoch 146/150
Epoch 147/150


Epoch 148/150
Epoch 149/150


Epoch 150/150
<class 'list'>
Loss :  1.34034659535
Accuracy :  80.6784733734


### Siamese LSTM Experiment 1 (Just GloVe vectors)

In [14]:
#For this Siamese experiment we use only 300 dimension GloVe embedding vectors for sentences. No other features.

#siamese parameters
in_dim = len(tokenizr.word_index) + 1
out_dim = 300
_len = 30
_units = 250
_drop = 0.1

In [9]:
#embedding layer which uses previously copmuted GloVe embedding matrix
embed_layer = Embedding(in_dim, out_dim, weights=[embed_mat], input_length=_len)

#LSTM layers. Outputs tensors.
lstm = LSTM(_units, dropout=_drop, recurrent_dropout=_drop)

#input sequences
question_1 = Input(shape=(30,), dtype="int32")
question_2 = Input(shape=(30,), dtype="int32")



#embedding the two input sequences
emb_question_1 = embed_layer(question_1)
emb_question_2 = embed_layer(question_2)


#create representations
_question_1 = lstm(emb_question_1)
_question_2 = lstm(emb_question_2)


#merge the two representations
_merge = concatenate([_question_1, _question_2])
_merge = Dense(150, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge) #dropout layer to prevent overfitting
_merge = BatchNormalization()(_merge)
_merge = Dense(150, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge) #dropout layer to prevent overfitting
_merge = BatchNormalization()(_merge)


is_duplicate = Dense(1, activation='sigmoid')(_merge)



model = Model(inputs=[question_1, question_2], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


# stop = EarlyStopping(monitor='val_loss', patience=2)
# checkpoint = ModelCheckpoint('model1.h5', monitor='val_acc', save_best_only=True, save_weights_only=False)

fit = model.fit([train_q1, train_q2], train_labels, epochs=50, batch_size=2000, shuffle=True)


score = model.evaluate([val_q1, val_q2], val_labels, verbose=0)

print("\n\n")
print("LOSS : ", score[0])
print("Accuracy : ", score[1]*100)

Epoch 1/25
Epoch 2/25
Epoch 3/25


Epoch 4/25
Epoch 5/25


Epoch 6/25
Epoch 7/25


Epoch 8/25
Epoch 9/25


Epoch 10/25
Epoch 11/25


Epoch 12/25
Epoch 13/25


Epoch 14/25
Epoch 15/25


Epoch 16/25
Epoch 17/25


Epoch 18/25
Epoch 19/25


Epoch 20/25
Epoch 21/25


Epoch 22/25
Epoch 23/25


Epoch 24/25
Epoch 25/25





LOSS :  0.721456297381
Accuracy :  83.8618318528


### Siamese LTSM Experiment 2 (Just GloVe vectors)

#### Note : This program is almost similar to the one above but with some validation.

In [11]:
#siamese parameters
in_dim = len(tokenizr.word_index) + 1
out_dim = 300
_len = 30
_units = 250
_drop = 0.1

In [5]:
embed_layer = Embedding(in_dim, out_dim, weights=[embed_mat], input_length=_len)
lstm = LSTM(_units, dropout=_drop, recurrent_dropout=_drop)

#input sequences
question_1 = Input(shape=(30,), dtype="int32")
question_2 = Input(shape=(30,), dtype="int32")


#embedding the two input sequences
emb_question_1 = embed_layer(question_1)
emb_question_2 = embed_layer(question_2)


#create representations
_question_1 = lstm(emb_question_1)
_question_2 = lstm(emb_question_2)


_merge = concatenate([_question_1, _question_2])
_merge = Dense(150, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(150, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge)
_merge = BatchNormalization()(_merge)
is_duplicate = Dense(1, activation='sigmoid')(_merge)


model = Model(inputs=[question_1, question_2, features], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


fit = model.fit([train_q1, train_q2, list_of_all_training_features], train_labels, epochs=50, validation_split=0.1, batch_size=2000, shuffle=True)

# model.save('quora_model_1.h5')
# print("\n\n")
# print("LOSS : ", score[0])
# print("Accuracy : ", score[1]*100)
# print("\n\n")


predictions = model.predict([test_q1, test_q2, list_of_all_test_features], batch_size=8192, verbose=1)

submission = panda.DataFrame({'test_id':test_id, 'is_duplicate':predictions.ravel()})
submission.to_csv('result3.csv', index=False)

### Simple LSTM Experiment 2 - using just 20 features

In [12]:
#This LSTM was run using 20 features as input computed during the feature engineering phase.
#By implementing this code it was possible to save weights and stop training when there was no improvement 
#in the loss.

features = Input(shape=(20,), dtype='float32', name='main_input')
x = Embedding(output_dim=300, input_dim=10000, input_length=20)(features)
lstm_out = LSTM(lstm_units)(x)

_merge = Dense(300, activation='relu')(lstm_out)
_merge = Dropout(drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(300, activation='relu')(_merge)
_merge = Dropout(drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(300, activation='relu')(_merge)
_merge = Dropout(drop)(_merge)
_merge = BatchNormalization()(_merge)
is_duplicate = Dense(1, activation='sigmoid')(_merge)


model = Model(inputs=[features], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

_stop =EarlyStopping(monitor='val_loss', patience=5)

_checkpoint = ModelCheckpoint('mymodel_250_1125_041217.h5', save_best_only=True, save_weights_only=True)

fit = model.fit([list_of_all_training_features], list_of_train_labels, epochs=100, verbose=1, validation_split=0.1, batch_size=384, shuffle=True, callbacks=[_stop, _checkpoint])


model.load_weights('mymodel_250_1125_041217.h5')


predictions = model.predict([list_of_all_test_features], batch_size=8192, verbose=1)

submission = panda.DataFrame({'test_id':test_id, 'is_duplicate':predictions.ravel()})
submission.to_csv('result6.csv', index=False)

Train on 363861 samples, validate on 40429 samples
Epoch 1/100
Epoch 2/100




Epoch 3/100
Epoch 4/100




Epoch 5/100
Epoch 6/100




Epoch 7/100
Epoch 8/100




Epoch 9/100
Epoch 10/100




Epoch 11/100
Epoch 12/100




Epoch 13/100
Epoch 14/100






### Simple LSTM Experiment 3 - Denser Network

In [24]:
#A similar experiment as above. But with a more denser fully connected layers.

features = Input(shape=(20,), dtype='float32', name='main_input')
x = Embedding(output_dim=300, input_dim=10000, input_length=20)(features)
lstm_out = LSTM(lstm_units)(x)

#added more dense layers
_merge = Dense(200, activation='relu')(lstm_out)
_merge = Dropout(drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(200, activation='relu')(_merge)
_merge = Dropout(drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(200, activation='relu')(_merge)
_merge = Dropout(drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(200, activation='relu')(_merge)
_merge = Dropout(drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(200, activation='relu')(_merge)
_merge = Dropout(drop)(_merge)
_merge = BatchNormalization()(_merge)
is_duplicate = Dense(1, activation='sigmoid')(_merge)


model = Model(inputs=[features], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

_stop =EarlyStopping(monitor='val_loss', patience=6)

_checkpoint = ModelCheckpoint('mymodel_dense_250_1151_041217.h5', save_best_only=True, save_weights_only=True)

fit = model.fit([list_of_all_training_features], list_of_train_labels, epochs=200, verbose=1, validation_split=0.1, batch_size=384, shuffle=True, callbacks=[_stop, _checkpoint])


model.load_weights('mymodel_dense_250_1151_041217.h5')


predictions = model.predict([list_of_all_test_features], batch_size=8192, verbose=1)

submission = panda.DataFrame({'test_id':test_id, 'is_duplicate':predictions.ravel()})
submission.to_csv('result8.csv', index=False)

Train on 363861 samples, validate on 40429 samples
Epoch 1/200


KeyboardInterrupt: 

### Back to Siamese - Experiment 3 - A much denser Siamese Network

In [20]:
in_dim = len(tokenizr.word_index) + 1
out_dim = 300
_len = 30
_units = 250
_drop = 0.4

In [6]:
#Used more dense layers to test.

embed_layer = Embedding(in_dim, out_dim, weights=[embed_mat], input_length=_len)
lstm = LSTM(_units, dropout=_drop, recurrent_dropout=_drop)

#input sequences
question_1 = Input(shape=(30,), dtype="int32")
question_2 = Input(shape=(30,), dtype="int32")


#embedding the two input sequences
emb_question_1 = embed_layer(question_1)
emb_question_2 = embed_layer(question_2)


#create representations
_question_1 = lstm(emb_question_1)
_question_2 = lstm(emb_question_2)


_merge = concatenate([_question_1, _question_2])
_merge = Dense(300, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(300, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(300, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(300, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(300, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge)
_merge = BatchNormalization()(_merge)
is_duplicate = Dense(1, activation='sigmoid')(_merge)


model = Model(inputs=[question_1, question_2], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

_stop =EarlyStopping(monitor='val_loss', patience=6)
_checkpoint = ModelCheckpoint('dense_siamese_lstm_250_342_051217.h5', save_best_only=True, save_weights_only=True)
fit = model.fit([train_q1, train_q2], train_labels, epochs=200, validation_split=0.1, batch_size=2000, shuffle=True, callbacks=[_stop, _checkpoint])

# model.save('quora_model_1.h5')
# print("\n\n")
# print("LOSS : ", score[0])
# print("Accuracy : ", score[1]*100)
# print("\n\n")


model.load_weights('dense_siamese_lstm_250_342_051217.h5')
predictions = model.predict([test_q1, test_q2], batch_size=8192, verbose=1)

submission = panda.DataFrame({'test_id':test_id, 'is_duplicate':predictions.ravel()})
submission.to_csv('result9.csv', index=False)

NameError: name 'Embedding' is not defined

###         

### A Slightly different architecture using the Siamese LSTM


#### After many experiments, finally found that this architecture gives a good log loss score on Kaggle
#### This gave around 0.181 score for the log loss.

In [45]:
input_dim = len(tokenizr.word_index) + 1
output_dim = 300
input_len = 60
lstm_units = 250
drop = 0.4
in_dim = len(tokenizr.word_index) + 1
out_dim = 300
_len = 30
_units = 250
_drop = 0.4

In [8]:
#embedding layer for sentence vectors
embed_layer = Embedding(in_dim, out_dim, weights=[embed_mat], input_length=_len)
lstm = LSTM(_units, dropout=_drop, recurrent_dropout=_drop)


#getting the features
features = Input(shape=(20,), dtype='float32', name='main_input')
x = Embedding(output_dim=300, input_dim=10000, input_length=20)(features)
lstm_out = LSTM(lstm_units)(x)


#input sequences
question_1 = Input(shape=(30,), dtype="int32")
question_2 = Input(shape=(30,), dtype="int32")


#embedding the two input sequences
emb_question_1 = embed_layer(question_1)
emb_question_2 = embed_layer(question_2)


#create representations
_question_1 = lstm(emb_question_1)
_question_2 = lstm(emb_question_2)


#combining features and sentence vectors
_merge = concatenate([_question_1, _question_2, lstm_out])
_merge = Dense(300, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(300, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(300, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(300, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(300, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge)
_merge = BatchNormalization()(_merge)
is_duplicate = Dense(1, activation='sigmoid')(_merge)


model = Model(inputs=[question_1, question_2, features], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

_stop =EarlyStopping(monitor='val_loss', patience=6)
_checkpoint = ModelCheckpoint('feature_siamese_lstm_250_543_071217.h5', save_best_only=True, save_weights_only=True)
fit = model.fit([train_q1, train_q2, list_of_all_training_features], train_labels, epochs=200, validation_split=0.1, batch_size=2000, shuffle=True, callbacks=[_stop, _checkpoint])

# model.save('quora_model_1.h5')
# print("\n\n")
# print("LOSS : ", score[0])
# print("Accuracy : ", score[1]*100)
# print("\n\n")


model.load_weights('feature_siamese_lstm_250_543_071217.h5')
predictions = model.predict([test_q1, test_q2, list_of_all_test_features], batch_size=8192, verbose=1)

submission = panda.DataFrame({'test_id':test_id, 'is_duplicate':predictions.ravel()})
submission.to_csv('result13.csv', index=False)

NameError: name 'Embedding' is not defined

###       

### Bidirectional Siamese LSTM
#### An experiment to see how this network performs
#### Got a log loss of 0.22 on Kaggle. Not a good result.

In [29]:
input_dim = len(tokenizr.word_index) + 1
output_dim = 300
input_len = 60
lstm_units = 250
drop = 0.4
in_dim = len(tokenizr.word_index) + 1
out_dim = 300
_len = 30
_units = 250
_drop = 0.4

In [30]:
embed_layer = Embedding(in_dim, out_dim, weights=[embed_mat], input_length=_len)
lstm = Bidirectional(LSTM(_units, dropout=_drop, recurrent_dropout=_drop))

features = Input(shape=(20,), dtype='float32', name='main_input')
x = Embedding(output_dim=300, input_dim=10000, input_length=20)(features)
lstm_out = Bidirectional(LSTM(lstm_units))(x)

#input sequences
question_1 = Input(shape=(30,), dtype="int32")
question_2 = Input(shape=(30,), dtype="int32")


#embedding the two input sequences
emb_question_1 = embed_layer(question_1)
emb_question_2 = embed_layer(question_2)


#create representations
_question_1 = lstm(emb_question_1)
_question_2 = lstm(emb_question_2)


_merge = concatenate([_question_1, _question_2, lstm_out])
_merge = Dense(300, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(300, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(300, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(300, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge)
_merge = BatchNormalization()(_merge)
_merge = Dense(300, activation='relu')(_merge)
_merge = Dropout(_drop)(_merge)
_merge = BatchNormalization()(_merge)
is_duplicate = Dense(1, activation='sigmoid')(_merge)


model = Model(inputs=[question_1, question_2, features], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

_stop =EarlyStopping(monitor='val_loss', patience=6)
_checkpoint = ModelCheckpoint('feature_siamese_bilstm_250_219_071217.h5', save_best_only=True, save_weights_only=True)
fit = model.fit([train_q1, train_q2, list_of_all_training_features], train_labels, epochs=200, validation_split=0.1, batch_size=2000, shuffle=True, callbacks=[_stop, _checkpoint])

# model.save('quora_model_1.h5')
# print("\n\n")
# print("LOSS : ", score[0])
# print("Accuracy : ", score[1]*100)
# print("\n\n")


model.load_weights('feature_siamese_bilstm_250_219_071217.h5')
predictions = model.predict([test_q1, test_q2, list_of_all_test_features], batch_size=8192, verbose=1)

submission = panda.DataFrame({'test_id':test_id, 'is_duplicate':predictions.ravel()})
submission.to_csv('result11.csv', index=False)

Train on 363861 samples, validate on 40429 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200


Epoch 4/200
Epoch 5/200


Epoch 6/200
Epoch 7/200


Epoch 8/200
Epoch 9/200


Epoch 10/200




###   

### XGBoost
#### This alone gave a log loss of 0.17+ on Kaggle

In [31]:
pos_train = result_1[y_train == 1]
neg_train = result_1[y_train == 0]

# Now we oversample the negative class
# There is likely a much more elegant way to do this...
p = 0.165
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = panda.concat([neg_train, neg_train])
    scale -=1
neg_train = panda.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
print(len(pos_train) / (len(pos_train) + len(neg_train)))

x_train = panda.concat([pos_train, neg_train])
y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
del pos_train, neg_train


# Finally, we split some of the data off for validation
# from sklearn.cross_validation import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=4242)



# import xgboost as xgb

# Set our parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.01
params['max_depth'] = 4
params['min_child_weight'] = 6
params['subsample'] = 0.8
params['colsample_bytree'] = 0.8
params['alpha'] = 0.005 #default 0

params['seed'] = 27
# n_estimators=5000,

#params['scale_pos_weight'] = 1


d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 50000, watchlist, early_stopping_rounds=100, verbose_eval=10)





d_test = xgb.DMatrix(result_2)
p_test = bst.predict(d_test)

sub = panda.DataFrame()
sub['test_id'] = test_data['test_id']
sub['is_duplicate'] = p_test
sub.to_csv('20features_xgb_50k.csv', index=False)

0.19124366100096607
[0]	train-logloss:0.686128	valid-logloss:0.686151
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[10]	train-logloss:0.62278	valid-logloss:0.623065
[20]	train-logloss:0.570144	valid-logloss:0.570651
[30]	train-logloss:0.525754	valid-logloss:0.526485
[40]	train-logloss:0.488858	valid-logloss:0.489792
[50]	train-logloss:0.456377	valid-logloss:0.457491
[60]	train-logloss:0.428695	valid-logloss:0.42998
[70]	train-logloss:0.404315	valid-logloss:0.405764
[80]	train-logloss:0.383566	valid-logloss:0.385159
[90]	train-logloss:0.365265	valid-logloss:0.366986
[100]	train-logloss:0.348919	valid-logloss:0.350773
[110]	train-logloss:0.334549	valid-logloss:0.33653
[120]	train-logloss:0.321685	valid-logloss:0.323798
[130]	train-logloss:0.310313	valid-logloss:0.312546
[140]	train-logloss:0.300273	valid-logloss:0.302608
[150]	train-logloss:0.291349	valid-logloss:0.293782
[160]	trai

[1540]	train-logloss:0.197161	valid-logloss:0.202066
[1550]	train-logloss:0.197114	valid-logloss:0.202026
[1560]	train-logloss:0.197067	valid-logloss:0.20199
[1570]	train-logloss:0.197035	valid-logloss:0.201965
[1580]	train-logloss:0.196997	valid-logloss:0.201936
[1590]	train-logloss:0.196961	valid-logloss:0.201906
[1600]	train-logloss:0.196919	valid-logloss:0.201874
[1610]	train-logloss:0.196876	valid-logloss:0.201838
[1620]	train-logloss:0.196831	valid-logloss:0.2018
[1630]	train-logloss:0.196792	valid-logloss:0.201766
[1640]	train-logloss:0.196747	valid-logloss:0.201732
[1650]	train-logloss:0.196711	valid-logloss:0.201701
[1660]	train-logloss:0.196663	valid-logloss:0.201658
[1670]	train-logloss:0.196624	valid-logloss:0.201625
[1680]	train-logloss:0.196592	valid-logloss:0.2016
[1690]	train-logloss:0.196549	valid-logloss:0.201565
[1700]	train-logloss:0.1965	valid-logloss:0.201523
[1710]	train-logloss:0.196449	valid-logloss:0.20148
[1720]	train-logloss:0.196406	valid-logloss:0.201446
[

[3100]	train-logloss:0.192631	valid-logloss:0.198647
[3110]	train-logloss:0.192606	valid-logloss:0.198629
[3120]	train-logloss:0.192583	valid-logloss:0.198613
[3130]	train-logloss:0.192558	valid-logloss:0.198595
[3140]	train-logloss:0.192542	valid-logloss:0.198584
[3150]	train-logloss:0.192522	valid-logloss:0.198573
[3160]	train-logloss:0.192502	valid-logloss:0.198561
[3170]	train-logloss:0.19248	valid-logloss:0.198545
[3180]	train-logloss:0.192458	valid-logloss:0.198533
[3190]	train-logloss:0.192444	valid-logloss:0.198526
[3200]	train-logloss:0.192422	valid-logloss:0.19851
[3210]	train-logloss:0.192405	valid-logloss:0.198501
[3220]	train-logloss:0.192386	valid-logloss:0.198486
[3230]	train-logloss:0.192368	valid-logloss:0.198474
[3240]	train-logloss:0.192348	valid-logloss:0.198462
[3250]	train-logloss:0.192326	valid-logloss:0.198446
[3260]	train-logloss:0.192308	valid-logloss:0.198435
[3270]	train-logloss:0.192293	valid-logloss:0.198425
[3280]	train-logloss:0.192277	valid-logloss:0.19

[4660]	train-logloss:0.190074	valid-logloss:0.197061
[4670]	train-logloss:0.19006	valid-logloss:0.197054
[4680]	train-logloss:0.190047	valid-logloss:0.197047
[4690]	train-logloss:0.190034	valid-logloss:0.197041
[4700]	train-logloss:0.19002	valid-logloss:0.197035
[4710]	train-logloss:0.190005	valid-logloss:0.197027
[4720]	train-logloss:0.189992	valid-logloss:0.197019
[4730]	train-logloss:0.189978	valid-logloss:0.197009
[4740]	train-logloss:0.189964	valid-logloss:0.197002
[4750]	train-logloss:0.189946	valid-logloss:0.196992
[4760]	train-logloss:0.189936	valid-logloss:0.196987
[4770]	train-logloss:0.189923	valid-logloss:0.196979
[4780]	train-logloss:0.189914	valid-logloss:0.196975
[4790]	train-logloss:0.1899	valid-logloss:0.196967
[4800]	train-logloss:0.189884	valid-logloss:0.196957
[4810]	train-logloss:0.18987	valid-logloss:0.196949
[4820]	train-logloss:0.189854	valid-logloss:0.196937
[4830]	train-logloss:0.189839	valid-logloss:0.196928
[4840]	train-logloss:0.18983	valid-logloss:0.196923

[6220]	train-logloss:0.188121	valid-logloss:0.196019
[6230]	train-logloss:0.188111	valid-logloss:0.196017
[6240]	train-logloss:0.188099	valid-logloss:0.19601
[6250]	train-logloss:0.188088	valid-logloss:0.196006
[6260]	train-logloss:0.188076	valid-logloss:0.196
[6270]	train-logloss:0.188066	valid-logloss:0.195996
[6280]	train-logloss:0.188056	valid-logloss:0.195992
[6290]	train-logloss:0.188045	valid-logloss:0.195987
[6300]	train-logloss:0.188033	valid-logloss:0.195981
[6310]	train-logloss:0.188018	valid-logloss:0.195972
[6320]	train-logloss:0.188007	valid-logloss:0.195966
[6330]	train-logloss:0.187996	valid-logloss:0.195961
[6340]	train-logloss:0.187988	valid-logloss:0.195959
[6350]	train-logloss:0.187978	valid-logloss:0.195954
[6360]	train-logloss:0.187966	valid-logloss:0.195946
[6370]	train-logloss:0.187953	valid-logloss:0.19594
[6380]	train-logloss:0.18794	valid-logloss:0.195932
[6390]	train-logloss:0.187926	valid-logloss:0.195925
[6400]	train-logloss:0.187914	valid-logloss:0.195919

[7780]	train-logloss:0.186458	valid-logloss:0.195207
[7790]	train-logloss:0.186446	valid-logloss:0.1952
[7800]	train-logloss:0.186436	valid-logloss:0.195196
[7810]	train-logloss:0.186426	valid-logloss:0.195193
[7820]	train-logloss:0.186415	valid-logloss:0.195188
[7830]	train-logloss:0.186403	valid-logloss:0.195183
[7840]	train-logloss:0.186394	valid-logloss:0.195179
[7850]	train-logloss:0.186385	valid-logloss:0.195176
[7860]	train-logloss:0.186374	valid-logloss:0.195169
[7870]	train-logloss:0.186367	valid-logloss:0.195167
[7880]	train-logloss:0.186354	valid-logloss:0.195159
[7890]	train-logloss:0.186344	valid-logloss:0.195154
[7900]	train-logloss:0.186335	valid-logloss:0.195149
[7910]	train-logloss:0.186323	valid-logloss:0.195145
[7920]	train-logloss:0.18631	valid-logloss:0.195139
[7930]	train-logloss:0.186301	valid-logloss:0.195135
[7940]	train-logloss:0.186293	valid-logloss:0.195131
[7950]	train-logloss:0.186282	valid-logloss:0.195126
[7960]	train-logloss:0.186272	valid-logloss:0.195

[9340]	train-logloss:0.184954	valid-logloss:0.194509
[9350]	train-logloss:0.184944	valid-logloss:0.194504
[9360]	train-logloss:0.184934	valid-logloss:0.194499
[9370]	train-logloss:0.184924	valid-logloss:0.194495
[9380]	train-logloss:0.184917	valid-logloss:0.194492
[9390]	train-logloss:0.184906	valid-logloss:0.194485
[9400]	train-logloss:0.184899	valid-logloss:0.194482
[9410]	train-logloss:0.18489	valid-logloss:0.194478
[9420]	train-logloss:0.18488	valid-logloss:0.194471
[9430]	train-logloss:0.184868	valid-logloss:0.194466
[9440]	train-logloss:0.184858	valid-logloss:0.194461
[9450]	train-logloss:0.18485	valid-logloss:0.194457
[9460]	train-logloss:0.184841	valid-logloss:0.194454
[9470]	train-logloss:0.184832	valid-logloss:0.194449
[9480]	train-logloss:0.184822	valid-logloss:0.194444
[9490]	train-logloss:0.184814	valid-logloss:0.194441
[9500]	train-logloss:0.184805	valid-logloss:0.19444
[9510]	train-logloss:0.184795	valid-logloss:0.194437
[9520]	train-logloss:0.184785	valid-logloss:0.1944

[10880]	train-logloss:0.183611	valid-logloss:0.193944
[10890]	train-logloss:0.183601	valid-logloss:0.19394
[10900]	train-logloss:0.183593	valid-logloss:0.193936
[10910]	train-logloss:0.183583	valid-logloss:0.19393
[10920]	train-logloss:0.183574	valid-logloss:0.193927
[10930]	train-logloss:0.183566	valid-logloss:0.193924
[10940]	train-logloss:0.183559	valid-logloss:0.193921
[10950]	train-logloss:0.183549	valid-logloss:0.193919
[10960]	train-logloss:0.183538	valid-logloss:0.193911
[10970]	train-logloss:0.18353	valid-logloss:0.193907
[10980]	train-logloss:0.183523	valid-logloss:0.193905
[10990]	train-logloss:0.183514	valid-logloss:0.193901
[11000]	train-logloss:0.183507	valid-logloss:0.193898
[11010]	train-logloss:0.183499	valid-logloss:0.193896
[11020]	train-logloss:0.183491	valid-logloss:0.193893
[11030]	train-logloss:0.183484	valid-logloss:0.19389
[11040]	train-logloss:0.183474	valid-logloss:0.193886
[11050]	train-logloss:0.183465	valid-logloss:0.193882
[11060]	train-logloss:0.183456	v

[12410]	train-logloss:0.182366	valid-logloss:0.193425
[12420]	train-logloss:0.182358	valid-logloss:0.193422
[12430]	train-logloss:0.18235	valid-logloss:0.193418
[12440]	train-logloss:0.182341	valid-logloss:0.193414
[12450]	train-logloss:0.182334	valid-logloss:0.19341
[12460]	train-logloss:0.182327	valid-logloss:0.193407
[12470]	train-logloss:0.182317	valid-logloss:0.193403
[12480]	train-logloss:0.182308	valid-logloss:0.1934
[12490]	train-logloss:0.182301	valid-logloss:0.193399
[12500]	train-logloss:0.182292	valid-logloss:0.193395
[12510]	train-logloss:0.182284	valid-logloss:0.193391
[12520]	train-logloss:0.182278	valid-logloss:0.193388
[12530]	train-logloss:0.182269	valid-logloss:0.193385
[12540]	train-logloss:0.18226	valid-logloss:0.193383
[12550]	train-logloss:0.182252	valid-logloss:0.19338
[12560]	train-logloss:0.182243	valid-logloss:0.193377
[12570]	train-logloss:0.182235	valid-logloss:0.193374
[12580]	train-logloss:0.182227	valid-logloss:0.19337
[12590]	train-logloss:0.182221	vali

[13940]	train-logloss:0.181182	valid-logloss:0.192943
[13950]	train-logloss:0.181177	valid-logloss:0.192943
[13960]	train-logloss:0.18117	valid-logloss:0.192941
[13970]	train-logloss:0.181164	valid-logloss:0.192938
[13980]	train-logloss:0.181156	valid-logloss:0.192935
[13990]	train-logloss:0.181149	valid-logloss:0.192931
[14000]	train-logloss:0.181144	valid-logloss:0.192928
[14010]	train-logloss:0.181134	valid-logloss:0.192924
[14020]	train-logloss:0.181127	valid-logloss:0.192922
[14030]	train-logloss:0.181119	valid-logloss:0.192919
[14040]	train-logloss:0.181111	valid-logloss:0.192917
[14050]	train-logloss:0.181104	valid-logloss:0.192914
[14060]	train-logloss:0.181095	valid-logloss:0.192911
[14070]	train-logloss:0.181088	valid-logloss:0.19291
[14080]	train-logloss:0.18108	valid-logloss:0.192907
[14090]	train-logloss:0.181073	valid-logloss:0.192906
[14100]	train-logloss:0.181067	valid-logloss:0.192904
[14110]	train-logloss:0.181059	valid-logloss:0.192901
[14120]	train-logloss:0.181053	

[15470]	train-logloss:0.180073	valid-logloss:0.192525
[15480]	train-logloss:0.180065	valid-logloss:0.192521
[15490]	train-logloss:0.180059	valid-logloss:0.192518
[15500]	train-logloss:0.180052	valid-logloss:0.192517
[15510]	train-logloss:0.180044	valid-logloss:0.192515
[15520]	train-logloss:0.180037	valid-logloss:0.192511
[15530]	train-logloss:0.18003	valid-logloss:0.192509
[15540]	train-logloss:0.180023	valid-logloss:0.192507
[15550]	train-logloss:0.180016	valid-logloss:0.192504
[15560]	train-logloss:0.180007	valid-logloss:0.1925
[15570]	train-logloss:0.180001	valid-logloss:0.192497
[15580]	train-logloss:0.179993	valid-logloss:0.192493
[15590]	train-logloss:0.179985	valid-logloss:0.192491
[15600]	train-logloss:0.179978	valid-logloss:0.192488
[15610]	train-logloss:0.179971	valid-logloss:0.192486
[15620]	train-logloss:0.179964	valid-logloss:0.192483
[15630]	train-logloss:0.179958	valid-logloss:0.19248
[15640]	train-logloss:0.179949	valid-logloss:0.192478
[15650]	train-logloss:0.179941	v

[17000]	train-logloss:0.17899	valid-logloss:0.192118
[17010]	train-logloss:0.178983	valid-logloss:0.192117
[17020]	train-logloss:0.178976	valid-logloss:0.192116
[17030]	train-logloss:0.178967	valid-logloss:0.192111
[17040]	train-logloss:0.17896	valid-logloss:0.19211
[17050]	train-logloss:0.178953	valid-logloss:0.192108
[17060]	train-logloss:0.178945	valid-logloss:0.192106
[17070]	train-logloss:0.178939	valid-logloss:0.192103
[17080]	train-logloss:0.178934	valid-logloss:0.192102
[17090]	train-logloss:0.178928	valid-logloss:0.192099
[17100]	train-logloss:0.178921	valid-logloss:0.192097
[17110]	train-logloss:0.178914	valid-logloss:0.192095
[17120]	train-logloss:0.178907	valid-logloss:0.192091
[17130]	train-logloss:0.1789	valid-logloss:0.192088
[17140]	train-logloss:0.178892	valid-logloss:0.192086
[17150]	train-logloss:0.178886	valid-logloss:0.192084
[17160]	train-logloss:0.178879	valid-logloss:0.192081
[17170]	train-logloss:0.178872	valid-logloss:0.192078
[17180]	train-logloss:0.178866	va

[18530]	train-logloss:0.177965	valid-logloss:0.191731
[18540]	train-logloss:0.177958	valid-logloss:0.191729
[18550]	train-logloss:0.177951	valid-logloss:0.191727
[18560]	train-logloss:0.177944	valid-logloss:0.191726
[18570]	train-logloss:0.177937	valid-logloss:0.191724
[18580]	train-logloss:0.177931	valid-logloss:0.191722
[18590]	train-logloss:0.177926	valid-logloss:0.191721
[18600]	train-logloss:0.17792	valid-logloss:0.191719
[18610]	train-logloss:0.177913	valid-logloss:0.191718
[18620]	train-logloss:0.177907	valid-logloss:0.191716
[18630]	train-logloss:0.177901	valid-logloss:0.191715
[18640]	train-logloss:0.177893	valid-logloss:0.191712
[18650]	train-logloss:0.177886	valid-logloss:0.191709
[18660]	train-logloss:0.177881	valid-logloss:0.191707
[18670]	train-logloss:0.177874	valid-logloss:0.191705
[18680]	train-logloss:0.177867	valid-logloss:0.191701
[18690]	train-logloss:0.177862	valid-logloss:0.1917
[18700]	train-logloss:0.177855	valid-logloss:0.191696
[18710]	train-logloss:0.177849	

[20060]	train-logloss:0.176964	valid-logloss:0.19138
[20070]	train-logloss:0.176958	valid-logloss:0.191377
[20080]	train-logloss:0.176952	valid-logloss:0.191375
[20090]	train-logloss:0.176946	valid-logloss:0.191374
[20100]	train-logloss:0.17694	valid-logloss:0.191371
[20110]	train-logloss:0.176935	valid-logloss:0.191369
[20120]	train-logloss:0.176927	valid-logloss:0.191366
[20130]	train-logloss:0.176921	valid-logloss:0.191363
[20140]	train-logloss:0.176914	valid-logloss:0.191361
[20150]	train-logloss:0.176907	valid-logloss:0.191359
[20160]	train-logloss:0.176902	valid-logloss:0.191356
[20170]	train-logloss:0.176895	valid-logloss:0.191354
[20180]	train-logloss:0.17689	valid-logloss:0.191351
[20190]	train-logloss:0.176884	valid-logloss:0.191352
[20200]	train-logloss:0.176877	valid-logloss:0.191348
[20210]	train-logloss:0.176871	valid-logloss:0.191348
[20220]	train-logloss:0.176865	valid-logloss:0.191346
[20230]	train-logloss:0.176858	valid-logloss:0.191343
[20240]	train-logloss:0.176852	

[21590]	train-logloss:0.176007	valid-logloss:0.191055
[21600]	train-logloss:0.176	valid-logloss:0.191053
[21610]	train-logloss:0.175994	valid-logloss:0.19105
[21620]	train-logloss:0.175987	valid-logloss:0.191046
[21630]	train-logloss:0.17598	valid-logloss:0.191044
[21640]	train-logloss:0.175973	valid-logloss:0.191043
[21650]	train-logloss:0.175969	valid-logloss:0.19104
[21660]	train-logloss:0.175962	valid-logloss:0.191037
[21670]	train-logloss:0.175956	valid-logloss:0.191035
[21680]	train-logloss:0.17595	valid-logloss:0.191032
[21690]	train-logloss:0.175944	valid-logloss:0.191029
[21700]	train-logloss:0.175937	valid-logloss:0.191028
[21710]	train-logloss:0.175932	valid-logloss:0.191026
[21720]	train-logloss:0.175925	valid-logloss:0.191022
[21730]	train-logloss:0.175919	valid-logloss:0.19102
[21740]	train-logloss:0.175912	valid-logloss:0.191018
[21750]	train-logloss:0.175907	valid-logloss:0.191016
[21760]	train-logloss:0.1759	valid-logloss:0.191013
[21770]	train-logloss:0.175893	valid-l

[23120]	train-logloss:0.175073	valid-logloss:0.19073
[23130]	train-logloss:0.175068	valid-logloss:0.190727
[23140]	train-logloss:0.175063	valid-logloss:0.190724
[23150]	train-logloss:0.175058	valid-logloss:0.190723
[23160]	train-logloss:0.175052	valid-logloss:0.190721
[23170]	train-logloss:0.175045	valid-logloss:0.190719
[23180]	train-logloss:0.17504	valid-logloss:0.190717
[23190]	train-logloss:0.175034	valid-logloss:0.190714
[23200]	train-logloss:0.175026	valid-logloss:0.190714
[23210]	train-logloss:0.17502	valid-logloss:0.190711
[23220]	train-logloss:0.175015	valid-logloss:0.190709
[23230]	train-logloss:0.17501	valid-logloss:0.190707
[23240]	train-logloss:0.175003	valid-logloss:0.190705
[23250]	train-logloss:0.174998	valid-logloss:0.190704
[23260]	train-logloss:0.174992	valid-logloss:0.190703
[23270]	train-logloss:0.174986	valid-logloss:0.190702
[23280]	train-logloss:0.174981	valid-logloss:0.190701
[23290]	train-logloss:0.174974	valid-logloss:0.1907
[23300]	train-logloss:0.17497	vali

[24650]	train-logloss:0.174168	valid-logloss:0.190417
[24660]	train-logloss:0.174163	valid-logloss:0.190416
[24670]	train-logloss:0.174158	valid-logloss:0.190415
[24680]	train-logloss:0.174151	valid-logloss:0.190411
[24690]	train-logloss:0.174146	valid-logloss:0.190411
[24700]	train-logloss:0.174141	valid-logloss:0.19041
[24710]	train-logloss:0.174135	valid-logloss:0.190407
[24720]	train-logloss:0.17413	valid-logloss:0.190407
[24730]	train-logloss:0.174124	valid-logloss:0.190406
[24740]	train-logloss:0.174117	valid-logloss:0.190404
[24750]	train-logloss:0.174112	valid-logloss:0.190402
[24760]	train-logloss:0.174106	valid-logloss:0.1904
[24770]	train-logloss:0.174101	valid-logloss:0.190401
[24780]	train-logloss:0.174095	valid-logloss:0.190398
[24790]	train-logloss:0.174089	valid-logloss:0.190396
[24800]	train-logloss:0.174083	valid-logloss:0.190396
[24810]	train-logloss:0.174076	valid-logloss:0.190395
[24820]	train-logloss:0.174071	valid-logloss:0.190393
[24830]	train-logloss:0.174064	v

[26180]	train-logloss:0.173299	valid-logloss:0.190155
[26190]	train-logloss:0.173294	valid-logloss:0.190154
[26200]	train-logloss:0.173289	valid-logloss:0.190152
[26210]	train-logloss:0.173282	valid-logloss:0.190148
[26220]	train-logloss:0.173277	valid-logloss:0.190147
[26230]	train-logloss:0.173272	valid-logloss:0.190144
[26240]	train-logloss:0.173266	valid-logloss:0.190144
[26250]	train-logloss:0.17326	valid-logloss:0.190142
[26260]	train-logloss:0.173253	valid-logloss:0.190139
[26270]	train-logloss:0.173246	valid-logloss:0.190135
[26280]	train-logloss:0.173241	valid-logloss:0.190133
[26290]	train-logloss:0.173235	valid-logloss:0.190133
[26300]	train-logloss:0.173229	valid-logloss:0.190132
[26310]	train-logloss:0.173223	valid-logloss:0.190131
[26320]	train-logloss:0.173217	valid-logloss:0.19013
[26330]	train-logloss:0.17321	valid-logloss:0.190127
[26340]	train-logloss:0.173204	valid-logloss:0.190125
[26350]	train-logloss:0.1732	valid-logloss:0.190124
[26360]	train-logloss:0.173193	va

[27710]	train-logloss:0.172445	valid-logloss:0.189888
[27720]	train-logloss:0.17244	valid-logloss:0.189885
[27730]	train-logloss:0.172434	valid-logloss:0.189883
[27740]	train-logloss:0.172429	valid-logloss:0.189883
[27750]	train-logloss:0.172423	valid-logloss:0.18988
[27760]	train-logloss:0.172418	valid-logloss:0.189879
[27770]	train-logloss:0.172412	valid-logloss:0.189876
[27780]	train-logloss:0.172407	valid-logloss:0.189875
[27790]	train-logloss:0.172402	valid-logloss:0.189873
[27800]	train-logloss:0.172397	valid-logloss:0.189873
[27810]	train-logloss:0.172392	valid-logloss:0.189871
[27820]	train-logloss:0.172386	valid-logloss:0.18987
[27830]	train-logloss:0.172382	valid-logloss:0.189868
[27840]	train-logloss:0.172376	valid-logloss:0.189866
[27850]	train-logloss:0.172371	valid-logloss:0.189863
[27860]	train-logloss:0.172364	valid-logloss:0.189861
[27870]	train-logloss:0.172358	valid-logloss:0.18986
[27880]	train-logloss:0.172352	valid-logloss:0.189857
[27890]	train-logloss:0.172346	v

[29240]	train-logloss:0.171612	valid-logloss:0.189631
[29250]	train-logloss:0.171607	valid-logloss:0.189629
[29260]	train-logloss:0.1716	valid-logloss:0.189626
[29270]	train-logloss:0.171595	valid-logloss:0.189625
[29280]	train-logloss:0.171588	valid-logloss:0.189623
[29290]	train-logloss:0.171583	valid-logloss:0.189622
[29300]	train-logloss:0.171577	valid-logloss:0.18962
[29310]	train-logloss:0.171572	valid-logloss:0.18962
[29320]	train-logloss:0.171567	valid-logloss:0.189615
[29330]	train-logloss:0.171562	valid-logloss:0.189613
[29340]	train-logloss:0.171557	valid-logloss:0.189611
[29350]	train-logloss:0.171552	valid-logloss:0.18961
[29360]	train-logloss:0.171547	valid-logloss:0.189607
[29370]	train-logloss:0.171542	valid-logloss:0.189605
[29380]	train-logloss:0.171537	valid-logloss:0.189603
[29390]	train-logloss:0.171531	valid-logloss:0.189602
[29400]	train-logloss:0.171527	valid-logloss:0.189601
[29410]	train-logloss:0.17152	valid-logloss:0.189598
[29420]	train-logloss:0.171514	val

[30770]	train-logloss:0.170799	valid-logloss:0.189382
[30780]	train-logloss:0.170794	valid-logloss:0.189381
[30790]	train-logloss:0.170789	valid-logloss:0.189379
[30800]	train-logloss:0.170783	valid-logloss:0.189378
[30810]	train-logloss:0.170778	valid-logloss:0.189376
[30820]	train-logloss:0.170773	valid-logloss:0.189374
[30830]	train-logloss:0.170766	valid-logloss:0.189371
[30840]	train-logloss:0.170761	valid-logloss:0.189369
[30850]	train-logloss:0.170755	valid-logloss:0.189367
[30860]	train-logloss:0.170751	valid-logloss:0.189367
[30870]	train-logloss:0.170746	valid-logloss:0.189365
[30880]	train-logloss:0.170739	valid-logloss:0.189363
[30890]	train-logloss:0.170735	valid-logloss:0.189362
[30900]	train-logloss:0.170728	valid-logloss:0.189361
[30910]	train-logloss:0.170723	valid-logloss:0.189359
[30920]	train-logloss:0.170718	valid-logloss:0.189359
[30930]	train-logloss:0.170714	valid-logloss:0.189357
[30940]	train-logloss:0.170708	valid-logloss:0.189356
[30950]	train-logloss:0.1707

[32300]	train-logloss:0.170001	valid-logloss:0.189141
[32310]	train-logloss:0.169997	valid-logloss:0.18914
[32320]	train-logloss:0.169992	valid-logloss:0.189139
[32330]	train-logloss:0.169986	valid-logloss:0.189137
[32340]	train-logloss:0.16998	valid-logloss:0.189134
[32350]	train-logloss:0.169976	valid-logloss:0.189133
[32360]	train-logloss:0.169971	valid-logloss:0.18913
[32370]	train-logloss:0.169966	valid-logloss:0.189129
[32380]	train-logloss:0.169961	valid-logloss:0.189127
[32390]	train-logloss:0.169955	valid-logloss:0.189124
[32400]	train-logloss:0.16995	valid-logloss:0.189121
[32410]	train-logloss:0.169945	valid-logloss:0.189119
[32420]	train-logloss:0.169939	valid-logloss:0.189116
[32430]	train-logloss:0.169934	valid-logloss:0.189114
[32440]	train-logloss:0.16993	valid-logloss:0.189113
[32450]	train-logloss:0.169925	valid-logloss:0.189112
[32460]	train-logloss:0.169921	valid-logloss:0.189111
[32470]	train-logloss:0.169915	valid-logloss:0.18911
[32480]	train-logloss:0.169909	val

[33830]	train-logloss:0.169219	valid-logloss:0.188891
[33840]	train-logloss:0.169214	valid-logloss:0.188891
[33850]	train-logloss:0.169209	valid-logloss:0.18889
[33860]	train-logloss:0.169204	valid-logloss:0.188888
[33870]	train-logloss:0.169199	valid-logloss:0.188886
[33880]	train-logloss:0.169194	valid-logloss:0.188885
[33890]	train-logloss:0.169188	valid-logloss:0.188883
[33900]	train-logloss:0.169183	valid-logloss:0.188882
[33910]	train-logloss:0.169178	valid-logloss:0.188882
[33920]	train-logloss:0.169173	valid-logloss:0.18888
[33930]	train-logloss:0.169169	valid-logloss:0.188879
[33940]	train-logloss:0.169163	valid-logloss:0.188878
[33950]	train-logloss:0.169158	valid-logloss:0.188875
[33960]	train-logloss:0.169153	valid-logloss:0.188874
[33970]	train-logloss:0.169149	valid-logloss:0.188873
[33980]	train-logloss:0.169145	valid-logloss:0.188871
[33990]	train-logloss:0.16914	valid-logloss:0.188869
[34000]	train-logloss:0.169134	valid-logloss:0.188868
[34010]	train-logloss:0.16913	v

[35360]	train-logloss:0.168464	valid-logloss:0.188689
[35370]	train-logloss:0.168459	valid-logloss:0.18869
[35380]	train-logloss:0.168453	valid-logloss:0.188688
[35390]	train-logloss:0.168448	valid-logloss:0.188685
[35400]	train-logloss:0.168444	valid-logloss:0.188683
[35410]	train-logloss:0.168439	valid-logloss:0.18868
[35420]	train-logloss:0.168434	valid-logloss:0.188678
[35430]	train-logloss:0.168429	valid-logloss:0.188676
[35440]	train-logloss:0.168423	valid-logloss:0.188675
[35450]	train-logloss:0.168418	valid-logloss:0.188674
[35460]	train-logloss:0.168413	valid-logloss:0.188674
[35470]	train-logloss:0.168408	valid-logloss:0.188672
[35480]	train-logloss:0.168403	valid-logloss:0.188671
[35490]	train-logloss:0.168398	valid-logloss:0.18867
[35500]	train-logloss:0.168393	valid-logloss:0.188669
[35510]	train-logloss:0.168389	valid-logloss:0.188668
[35520]	train-logloss:0.168384	valid-logloss:0.188667
[35530]	train-logloss:0.168379	valid-logloss:0.188666
[35540]	train-logloss:0.168374	

[36890]	train-logloss:0.167715	valid-logloss:0.18848
[36900]	train-logloss:0.167712	valid-logloss:0.188481
[36910]	train-logloss:0.167707	valid-logloss:0.188479
[36920]	train-logloss:0.167701	valid-logloss:0.188476
[36930]	train-logloss:0.167697	valid-logloss:0.188475
[36940]	train-logloss:0.167692	valid-logloss:0.188474
[36950]	train-logloss:0.167687	valid-logloss:0.188472
[36960]	train-logloss:0.167681	valid-logloss:0.188472
[36970]	train-logloss:0.167677	valid-logloss:0.188471
[36980]	train-logloss:0.167672	valid-logloss:0.18847
[36990]	train-logloss:0.167667	valid-logloss:0.188467
[37000]	train-logloss:0.167662	valid-logloss:0.188465
[37010]	train-logloss:0.167658	valid-logloss:0.188463
[37020]	train-logloss:0.167654	valid-logloss:0.188462
[37030]	train-logloss:0.167649	valid-logloss:0.18846
[37040]	train-logloss:0.167644	valid-logloss:0.18846
[37050]	train-logloss:0.16764	valid-logloss:0.188459
[37060]	train-logloss:0.167635	valid-logloss:0.188456
[37070]	train-logloss:0.167629	va

[38420]	train-logloss:0.166992	valid-logloss:0.188276
[38430]	train-logloss:0.166988	valid-logloss:0.188275
[38440]	train-logloss:0.166982	valid-logloss:0.188272
[38450]	train-logloss:0.166978	valid-logloss:0.188271
[38460]	train-logloss:0.166973	valid-logloss:0.18827
[38470]	train-logloss:0.166968	valid-logloss:0.188267
[38480]	train-logloss:0.166963	valid-logloss:0.188267
[38490]	train-logloss:0.166958	valid-logloss:0.188265
[38500]	train-logloss:0.166953	valid-logloss:0.188263
[38510]	train-logloss:0.166947	valid-logloss:0.188262
[38520]	train-logloss:0.166943	valid-logloss:0.188261
[38530]	train-logloss:0.166938	valid-logloss:0.18826
[38540]	train-logloss:0.166933	valid-logloss:0.188257
[38550]	train-logloss:0.166928	valid-logloss:0.188256
[38560]	train-logloss:0.166924	valid-logloss:0.188255
[38570]	train-logloss:0.166919	valid-logloss:0.188253
[38580]	train-logloss:0.166914	valid-logloss:0.188253
[38590]	train-logloss:0.166909	valid-logloss:0.188251
[38600]	train-logloss:0.166904

[39950]	train-logloss:0.166276	valid-logloss:0.188078
[39960]	train-logloss:0.166272	valid-logloss:0.188078
[39970]	train-logloss:0.166268	valid-logloss:0.188077
[39980]	train-logloss:0.166263	valid-logloss:0.188076
[39990]	train-logloss:0.166257	valid-logloss:0.188073
[40000]	train-logloss:0.166252	valid-logloss:0.188071
[40010]	train-logloss:0.166249	valid-logloss:0.18807
[40020]	train-logloss:0.166244	valid-logloss:0.188069
[40030]	train-logloss:0.166239	valid-logloss:0.188067
[40040]	train-logloss:0.166234	valid-logloss:0.188065
[40050]	train-logloss:0.16623	valid-logloss:0.188066
[40060]	train-logloss:0.166225	valid-logloss:0.188064
[40070]	train-logloss:0.166221	valid-logloss:0.188063
[40080]	train-logloss:0.166216	valid-logloss:0.188062
[40090]	train-logloss:0.166211	valid-logloss:0.188061
[40100]	train-logloss:0.166206	valid-logloss:0.188062
[40110]	train-logloss:0.166201	valid-logloss:0.18806
[40120]	train-logloss:0.166197	valid-logloss:0.188059
[40130]	train-logloss:0.166193	

[41480]	train-logloss:0.165573	valid-logloss:0.187887
[41490]	train-logloss:0.165568	valid-logloss:0.187886
[41500]	train-logloss:0.165563	valid-logloss:0.187884
[41510]	train-logloss:0.165559	valid-logloss:0.187884
[41520]	train-logloss:0.165555	valid-logloss:0.187883
[41530]	train-logloss:0.165552	valid-logloss:0.187882
[41540]	train-logloss:0.165548	valid-logloss:0.18788
[41550]	train-logloss:0.165543	valid-logloss:0.187881
[41560]	train-logloss:0.165538	valid-logloss:0.18788
[41570]	train-logloss:0.165534	valid-logloss:0.18788
[41580]	train-logloss:0.165529	valid-logloss:0.187879
[41590]	train-logloss:0.165525	valid-logloss:0.187876
[41600]	train-logloss:0.16552	valid-logloss:0.187874
[41610]	train-logloss:0.165516	valid-logloss:0.187872
[41620]	train-logloss:0.16551	valid-logloss:0.18787
[41630]	train-logloss:0.165506	valid-logloss:0.18787
[41640]	train-logloss:0.165503	valid-logloss:0.187868
[41650]	train-logloss:0.165497	valid-logloss:0.187867
[41660]	train-logloss:0.165492	vali

[43010]	train-logloss:0.164876	valid-logloss:0.187695
[43020]	train-logloss:0.164871	valid-logloss:0.187692
[43030]	train-logloss:0.164867	valid-logloss:0.187692
[43040]	train-logloss:0.164863	valid-logloss:0.187691
[43050]	train-logloss:0.164859	valid-logloss:0.187689
[43060]	train-logloss:0.164855	valid-logloss:0.187687
[43070]	train-logloss:0.164851	valid-logloss:0.187685
[43080]	train-logloss:0.164846	valid-logloss:0.187683
[43090]	train-logloss:0.164842	valid-logloss:0.187682
[43100]	train-logloss:0.164837	valid-logloss:0.18768
[43110]	train-logloss:0.164833	valid-logloss:0.187679
[43120]	train-logloss:0.164828	valid-logloss:0.187677
[43130]	train-logloss:0.164823	valid-logloss:0.187676
[43140]	train-logloss:0.164818	valid-logloss:0.187675
[43150]	train-logloss:0.164814	valid-logloss:0.187676
[43160]	train-logloss:0.164809	valid-logloss:0.187675
[43170]	train-logloss:0.164804	valid-logloss:0.187675
[43180]	train-logloss:0.164801	valid-logloss:0.187673
[43190]	train-logloss:0.16479

[44540]	train-logloss:0.164192	valid-logloss:0.187506
[44550]	train-logloss:0.164187	valid-logloss:0.187504
[44560]	train-logloss:0.164184	valid-logloss:0.187503
[44570]	train-logloss:0.16418	valid-logloss:0.187502
[44580]	train-logloss:0.164175	valid-logloss:0.187501
[44590]	train-logloss:0.16417	valid-logloss:0.1875
[44600]	train-logloss:0.164165	valid-logloss:0.187499
[44610]	train-logloss:0.164161	valid-logloss:0.187496
[44620]	train-logloss:0.164156	valid-logloss:0.187494
[44630]	train-logloss:0.164151	valid-logloss:0.187492
[44640]	train-logloss:0.164147	valid-logloss:0.187492
[44650]	train-logloss:0.164142	valid-logloss:0.187492
[44660]	train-logloss:0.164137	valid-logloss:0.18749
[44670]	train-logloss:0.164134	valid-logloss:0.187489
[44680]	train-logloss:0.16413	valid-logloss:0.187488
[44690]	train-logloss:0.164126	valid-logloss:0.187487
[44700]	train-logloss:0.164121	valid-logloss:0.187487
[44710]	train-logloss:0.164116	valid-logloss:0.187486
[44720]	train-logloss:0.164112	val

[46070]	train-logloss:0.163521	valid-logloss:0.187328
[46080]	train-logloss:0.163516	valid-logloss:0.187326
[46090]	train-logloss:0.163513	valid-logloss:0.187324
[46100]	train-logloss:0.163508	valid-logloss:0.187323
[46110]	train-logloss:0.163503	valid-logloss:0.187321
[46120]	train-logloss:0.1635	valid-logloss:0.18732
[46130]	train-logloss:0.163495	valid-logloss:0.18732
[46140]	train-logloss:0.163491	valid-logloss:0.18732
[46150]	train-logloss:0.163487	valid-logloss:0.187319
[46160]	train-logloss:0.163483	valid-logloss:0.187316
[46170]	train-logloss:0.163479	valid-logloss:0.187314
[46180]	train-logloss:0.163474	valid-logloss:0.187312
[46190]	train-logloss:0.163471	valid-logloss:0.187311
[46200]	train-logloss:0.163467	valid-logloss:0.187311
[46210]	train-logloss:0.163462	valid-logloss:0.18731
[46220]	train-logloss:0.163458	valid-logloss:0.187311
[46230]	train-logloss:0.163454	valid-logloss:0.187308
[46240]	train-logloss:0.16345	valid-logloss:0.187308
[46250]	train-logloss:0.163445	vali

[47600]	train-logloss:0.162866	valid-logloss:0.187163
[47610]	train-logloss:0.162861	valid-logloss:0.187161
[47620]	train-logloss:0.162857	valid-logloss:0.18716
[47630]	train-logloss:0.162853	valid-logloss:0.187158
[47640]	train-logloss:0.162849	valid-logloss:0.187157
[47650]	train-logloss:0.162844	valid-logloss:0.187156
[47660]	train-logloss:0.16284	valid-logloss:0.187156
[47670]	train-logloss:0.162836	valid-logloss:0.187156
[47680]	train-logloss:0.162831	valid-logloss:0.187153
[47690]	train-logloss:0.162827	valid-logloss:0.187152
[47700]	train-logloss:0.162822	valid-logloss:0.187152
[47710]	train-logloss:0.162817	valid-logloss:0.187152
[47720]	train-logloss:0.162813	valid-logloss:0.187151
[47730]	train-logloss:0.162809	valid-logloss:0.187149
[47740]	train-logloss:0.162805	valid-logloss:0.187148
[47750]	train-logloss:0.162801	valid-logloss:0.187147
[47760]	train-logloss:0.162796	valid-logloss:0.187145
[47770]	train-logloss:0.162791	valid-logloss:0.187143
[47780]	train-logloss:0.162788

[49130]	train-logloss:0.162223	valid-logloss:0.186989
[49140]	train-logloss:0.162218	valid-logloss:0.186987
[49150]	train-logloss:0.162215	valid-logloss:0.186987
[49160]	train-logloss:0.162211	valid-logloss:0.186986
[49170]	train-logloss:0.162206	valid-logloss:0.186985
[49180]	train-logloss:0.162201	valid-logloss:0.186984
[49190]	train-logloss:0.162197	valid-logloss:0.186983
[49200]	train-logloss:0.162193	valid-logloss:0.186982
[49210]	train-logloss:0.162188	valid-logloss:0.18698
[49220]	train-logloss:0.162185	valid-logloss:0.186979
[49230]	train-logloss:0.162181	valid-logloss:0.186978
[49240]	train-logloss:0.162178	valid-logloss:0.186979
[49250]	train-logloss:0.162174	valid-logloss:0.186978
[49260]	train-logloss:0.162169	valid-logloss:0.186977
[49270]	train-logloss:0.162164	valid-logloss:0.186976
[49280]	train-logloss:0.16216	valid-logloss:0.186976
[49290]	train-logloss:0.162156	valid-logloss:0.186975
[49300]	train-logloss:0.162151	valid-logloss:0.186974
[49310]	train-logloss:0.162146

###    

### Ensemble Siamese LSTM + XGBOOST

#### This ensemble got a log loss of 0.16+ on Kaggle 
#### It is a simple averaging ensemble. An average of the predictions from the XGBoost and the Siamese LSTM Network.

In [10]:
#Average ensemble

#read predictions
final_result1 = panda.read_csv('result10.csv')
final_result2 = panda.read_csv('20features_xgb_50k.csv')

#create dataframe and average the predictions
df = panda.DataFrame()
df['test_id'] = final_result1['test_id']
df['is_duplicate'] = (final_result1['is_duplicate'] + final_result2['is_duplicate'])/2


df.to_csv('ensemble_submission.csv', index=False)