## XGB Models
Initially, a single XGBoost model was trained. For the final submission, 7 LSTM models (built with randomly chosen dense layers and dropout rates) were averaged with 6 XGBoost models trained with randomly sampled hyperparameters.

## Importing Packages

In [1]:
import argparse
import functools
from collections import defaultdict
from collections import Counter

import numpy as np
import pandas as pd
import xgboost as xgb

from nltk.corpus import stopwords
from sklearn.metrics import log_loss
try:
    # Current home of train_test_split (scikit-learn >= 0.18).
    from sklearn.model_selection import train_test_split
except ImportError:
    # Legacy location; this module was removed in scikit-learn 0.20.
    from sklearn.cross_validation import train_test_split

from xgboost import XGBClassifier



## Loading Features
The features were created in `feature_engineering.ipynb` and `page_rank.ipynb`.

In [5]:
# Raw training table; the next cell reads its 'is_duplicate' column as the target.
df_train = pd.read_csv('train.csv')

In [6]:
# Target labels as a plain NumPy array (.values drops the pandas index).
y_train = df_train['is_duplicate'].values

In [7]:
# Read the precomputed train/test feature tables in one pass over their paths.
X_train, X_test = (pd.read_csv(csv_path) for csv_path in ('X_train.csv', 'X_test.csv'))

# Last expression of the cell: show both shapes as a tuple.
(X_train.shape, X_test.shape)

((404290, 79), (2345796, 79))

In [8]:
# Hold out 10% of the rows for validation; random_state pins the shuffle.
# NOTE(review): this cell rebinds X_train / y_train, so re-running it without
# reloading the data splits the already-split training set again.
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=4242)

## UpDown Sampling

In [9]:
# Rebalance the training split: every negative row is included twice and only
# the first 80% of the positive rows (in their current order) are kept.
neg_rows = X_train[y_train == 0]
pos_rows = X_train[y_train == 1]
pos_kept = pos_rows.iloc[:int(0.8*len(pos_rows))]
n_neg, n_pos = neg_rows.shape[0], pos_kept.shape[0]

# Labels follow the same block layout as the rows: negatives, positives, negatives.
X_train = pd.concat((neg_rows, pos_kept, neg_rows))
y_train = np.repeat([0, 1, 0], [n_neg, n_pos, n_neg])

# Resulting share of positive labels.
print(np.mean(y_train))
del neg_rows, pos_rows, pos_kept, n_neg, n_pos

0.189752932122


In [10]:
# Apply the same rebalancing to the validation split: negatives are included
# twice, and only the first 80% of the positives are kept.
valid_neg = X_valid[y_valid == 0]
valid_pos = X_valid[y_valid == 1]
valid_pos_kept = valid_pos.iloc[:int(0.8 * len(valid_pos))]
neg_count, pos_count = valid_neg.shape[0], valid_pos_kept.shape[0]

# Rows and labels share the layout: negatives, kept positives, negatives.
X_valid = pd.concat((valid_neg, valid_pos_kept, valid_neg))
y_valid = np.repeat([0, 1, 0], [neg_count, pos_count, neg_count])

# Resulting share of positive labels.
print(np.mean(y_valid))
del valid_neg, valid_pos, valid_pos_kept, neg_count, pos_count

0.189234677675


## Multiple XGBoost Models

In [69]:
# Load the ensemble prediction file; the training loop in the next cell re-reads
# and rewrites this same file on every iteration.
sub = pd.read_csv('xgb_final.csv')

In [81]:
# Train several XGBoost models with randomly drawn hyperparameters and append
# each model's test predictions to 'xgb_final.csv' as a new column.

# Candidate values for the randomly drawn hyperparameters. The subsample list
# has its own name so it no longer clobbers the `sub` DataFrame.
eta_choices = [.02,.025,.01,.015]
depth_choices = [4,5,6,7]
subsample_choices = [.5,.6,.7,.4]
colsample_choices = [1,.7]

# The DMatrix objects do not depend on the sampled hyperparameters, so they
# are built once here instead of on every iteration.
d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)
d_test = xgb.DMatrix(X_test)

# The last watchlist entry ('valid') is the one used for early stopping.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

for i in range(9):
    # Seed per model so each run draws a reproducible hyperparameter set.
    # The order of the np.random.choice calls below determines the draws.
    np.random.seed(i+1)
    params = {}
    params['objective'] = 'binary:logistic'
    params['eval_metric'] = 'logloss'
    params['eta'] = np.random.choice(eta_choices)
    # NOTE(review): the native xgb.train API documents `nthread`; confirm that
    # `n_jobs` is honoured by the installed xgboost version.
    params['n_jobs'] = 5
    params['max_depth'] = np.random.choice(depth_choices)
    params['subsample'] = np.random.choice(subsample_choices)
    params['base_score'] = 0.2
    params['colsample_bytree'] = np.random.choice(colsample_choices)
    #params['scale_pos_weight'] = 0.36

    print ('training for fold = '+ str(i+1))
    bst = xgb.train(params, d_train, 2500, watchlist, early_stopping_rounds=50, verbose_eval=50)
    print ('-> predicting')
    # Predict with the trees up to the best early-stopping iteration only.
    p_test = bst.predict(d_test, ntree_limit=bst.best_ntree_limit)
    print('saving to csv')
    # Re-read and rewrite the file each iteration so finished models are
    # already on disk if a later iteration is interrupted.
    sub = pd.read_csv('xgb_final.csv')
    sub['fold' + str(i+1)] = p_test
    sub.to_csv('xgb_final.csv', index=False)

training for fold = 3
[0]	train-logloss:0.481289	valid-logloss:0.4806
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.354035	valid-logloss:0.35419
[100]	train-logloss:0.295429	valid-logloss:0.295922
[150]	train-logloss:0.264295	valid-logloss:0.265022
[200]	train-logloss:0.24581	valid-logloss:0.246716
[250]	train-logloss:0.233024	valid-logloss:0.234021
[300]	train-logloss:0.224399	valid-logloss:0.225483
[350]	train-logloss:0.218405	valid-logloss:0.21954
[400]	train-logloss:0.21409	valid-logloss:0.215287
[450]	train-logloss:0.210928	valid-logloss:0.21218
[500]	train-logloss:0.208236	valid-logloss:0.209555
[550]	train-logloss:0.20598	valid-logloss:0.20737
[600]	train-logloss:0.203961	valid-logloss:0.205427
[650]	train-logloss:0.20218	valid-logloss:0.203758
[700]	train-logloss:0.200623	valid-logloss:0.20231
[750]	train-logloss:0.199224	valid-logloss:0.201009
[800]	tra

KeyboardInterrupt: 

## Training single model

In [24]:
    # Fixed hyperparameters for the single XGBoost model.
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'eta': 0.02,
        'n_jobs': 5,
        'max_depth': 6,
        'subsample': 0.6,
        'base_score': 0.2,
        #'scale_pos_weight': 0.36,
    }

    # Wrap the train and validation splits for the native xgboost API.
    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_valid, label=y_valid)

    # Evaluation sets reported during training, labelled 'train' and 'valid'.
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

In [37]:
# Train for up to 2500 rounds, stopping once the validation log-loss has not
# improved for 50 rounds; progress is printed every 50 rounds.
bst = xgb.train(params, d_train, 2500, watchlist, early_stopping_rounds=50, verbose_eval=50)

# Score the validation set with the same tree limit used for the test
# predictions, so the reported log-loss describes the model that is submitted
# rather than one that also includes the rounds trained after the best iteration.
p_valid = bst.predict(d_valid, ntree_limit=bst.best_ntree_limit)
print(log_loss(y_valid, p_valid))

[0]	train-logloss:0.717087	valid-logloss:0.716136
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.401809	valid-logloss:0.40393
[100]	train-logloss:0.330451	valid-logloss:0.334219
[150]	train-logloss:0.30513	valid-logloss:0.309944
[200]	train-logloss:0.293646	valid-logloss:0.299264
[250]	train-logloss:0.286529	valid-logloss:0.293
[300]	train-logloss:0.280016	valid-logloss:0.28747
[350]	train-logloss:0.274416	valid-logloss:0.283213
[400]	train-logloss:0.269995	valid-logloss:0.280187
[450]	train-logloss:0.266099	valid-logloss:0.277612
[500]	train-logloss:0.262861	valid-logloss:0.275592
[550]	train-logloss:0.259844	valid-logloss:0.273687
[600]	train-logloss:0.257057	valid-logloss:0.27208
[650]	train-logloss:0.254442	valid-logloss:0.270704
[700]	train-logloss:0.252072	valid-logloss:0.269568
[750]	train-logloss:0.250111	valid-logloss:0.268627
[800]	train-logloss:0.24804

KeyboardInterrupt: 

## Test

In [14]:
# Predict the test set with the trees up to the best early-stopping iteration.
d_test = xgb.DMatrix(X_test)
p_test = bst.predict(d_test, ntree_limit=bst.best_ntree_limit)

# `test_ids` is not defined in any visible cell, so a fresh kernel would raise
# NameError here. Fall back to reading the ids from disk when it is missing.
# NOTE(review): assumes a 'test.csv' with a 'test_id' column sits next to
# 'train.csv' -- TODO confirm.
try:
    test_ids
except NameError:
    test_ids = pd.read_csv('test.csv', usecols=['test_id'])['test_id'].values

# Columns are assigned one by one to keep the order test_id, is_duplicate.
sub = pd.DataFrame()
sub['test_id'] = test_ids
sub['is_duplicate'] = p_test

In [15]:
# Preview the first rows of the submission frame.
sub.head()

Unnamed: 0,test_id,is_duplicate
0,0,0.000103
1,1,0.163087
2,2,0.202292
3,3,0.000119
4,4,0.067887


In [16]:
# Write the submission without the index column, then zip it with shell commands.
sub.to_csv('sub_xgb_1.csv',index = False)
# NOTE(review): this removes 'test.zip', but the archive written on the next
# line is 'test_try.zip' -- confirm which file was meant to be cleared.
! rm -rf test.zip
! zip -r test_try.zip sub_xgb_1.csv

  adding: sub_xgb_1.csv (deflated 59%)
