In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import pandas_datareader as pdr
import pickle
import time
import gc

from multiprocessing import *
from xgboost import XGBClassifier
#from lightgbm import LBGMClassifier
from sklearn import *
from numba import jit
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score



In [2]:
# Default locations of the train / test CSV files, relative to the notebook.
DATA_TRAIN_PATH = './Data/train.csv'
DATA_TEST_PATH = './Data/test.csv'


def load_data(train_path=DATA_TRAIN_PATH, test_path=DATA_TEST_PATH):
    """Read the train and test CSV files and separate features, target and ids.

    Both files are read with ``na_values="-1"``, so cells containing the
    text ``-1`` are loaded as missing values.

    Parameters
    ----------
    train_path : str
        CSV with an ``id`` column, a ``target`` column and the feature columns.
    test_path : str
        CSV with an ``id`` column and the feature columns.

    Returns
    -------
    tuple
        ``(x_train, y_train, id_train, x_test, id_test)``: the train features
        (DataFrame without ``target``/``id``), the ``target`` column (Series),
        the train ids (ndarray), the test features (DataFrame without ``id``)
        and the test ids (ndarray).
    """
    train_df = pd.read_csv(train_path, na_values="-1")
    test_df = pd.read_csv(test_path, na_values="-1")

    # Training side: everything except the label and the row identifier is a feature.
    x_train = train_df.drop(columns=['target', 'id'])
    y_train = train_df['target']
    id_train = train_df['id'].values
    print('Train data shape: ', x_train.shape)

    # Test side: there is no label, only the identifier has to be removed.
    x_test = test_df.drop(columns=['id'])
    id_test = test_df['id'].values
    print('Test data shape: ', x_test.shape)

    return x_train, y_train, id_train, x_test, id_test

def feature_info(df):
    """Group the columns of ``df`` by the suffix of their name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are inspected; the data itself is not read.

    Returns
    -------
    tuple of list
        ``(cat_features, bin_features, num_features)`` where
        ``cat_features`` are the columns whose name ends with ``'cat'``,
        ``bin_features`` the ones ending with ``'bin'`` and ``num_features``
        every remaining column. Each list keeps the column order of ``df``.
    """
    columns = list(df.columns)
    # str() keeps the suffix test working for non-string labels (e.g. the
    # integer labels of a frame built without column names).
    cat_features = [col for col in columns if str(col).endswith('cat')]
    bin_features = [col for col in columns if str(col).endswith('bin')]

    # Set lookup avoids rescanning both lists for every column.
    suffixed = set(cat_features) | set(bin_features)
    num_features = [col for col in columns if col not in suffixed]
    return cat_features, bin_features, num_features

def submit(filename, id_test, pred_test):
    """Write a two-column submission CSV (``id``, ``target``) to ``filename``.

    Parameters
    ----------
    filename : str
        Destination path of the CSV file.
    id_test : array-like
        Values for the ``id`` column.
    pred_test : array-like
        Values for the ``target`` column.

    The DataFrame index is not written to the file.
    """
    submission = pd.DataFrame()
    # Columns are attached one after the other, in the order they appear in the file.
    for column, values in (('id', id_test), ('target', pred_test)):
        submission[column] = values
    submission.to_csv(filename, index=False)

def gini(y_true, y_prob):
    """Gini coefficient of the ranking that ``y_prob`` induces on ``y_true``.

    The labels are ordered by decreasing score. Every label is weighted by
    the accumulated ``1 - label`` of the rows ranked above it, and the total
    ``s`` is turned into ``1 - 2 * s / (ntrue * (n - ntrue))`` where ``ntrue``
    is the sum of the labels. For 0/1 labels ``s`` counts the
    (positive, negative) pairs in which the negative is ranked higher, so the
    result equals ``2 * AUC - 1`` when the scores contain no ties.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_prob : array-like
        Scores used only for ranking; same length as ``y_true``. Tied scores
        are ordered however ``np.argsort`` orders them.

    Returns
    -------
    numpy.float64
        The Gini value, or ``nan`` when it is undefined (empty input, or
        ``ntrue * (n - ntrue) == 0``, e.g. only one class present).
    """
    # Accumulate in float64 regardless of the label dtype (e.g. float32
    # labels), so the result does not depend on NumPy's scalar promotion rules.
    labels = np.asarray(y_true, dtype=np.float64)
    n = len(labels)
    if n == 0:
        return np.float64(np.nan)

    # Highest score first.
    ranked = labels[np.argsort(np.asarray(y_prob))][::-1]

    # Exclusive running sum of (1 - label): for each row, the amount
    # contributed by the rows ranked strictly above it.
    complement = 1.0 - ranked
    above = np.cumsum(complement) - complement

    weighted = np.dot(ranked, above)
    ntrue = ranked.sum()
    denominator = ntrue * (n - ntrue)
    if denominator == 0:
        return np.float64(np.nan)
    return 1.0 - 2.0 * weighted / denominator

def gini_normalized(y_true, y_pred):
    """Gini of ``y_pred`` divided by the Gini obtained when ranking by ``y_true`` itself.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted scores.

    Returns
    -------
    The ratio ``gini(y_true, y_pred) / gini(y_true, y_true)``.
    """
    model_gini = gini(y_true, y_pred)
    # Ranking the labels by themselves gives the reference value to scale by.
    reference_gini = gini(y_true, y_true)
    return model_gini / reference_gini

def gini_xgb(preds, dtrain):
    """Custom evaluation function in the shape passed to ``xgb.train(feval=...)``.

    Parameters
    ----------
    preds : array-like
        Predictions for the rows of ``dtrain``.
    dtrain : object exposing ``get_label()``
        Data container the labels are read from.

    Returns
    -------
    list
        A single ``('gini', score)`` pair holding the normalized Gini.
    """
    score = gini_normalized(dtrain.get_label(), preds)
    return [('gini', score)]

def gini_lgb(preds, dtrain):
    """Custom evaluation function in the shape passed to ``lgb.train(feval=...)``.

    Parameters
    ----------
    preds : array-like
        Predictions for the rows of ``dtrain``.
    dtrain : object exposing ``get_label()``
        Data container the labels are read from.

    Returns
    -------
    tuple
        ``('gini', score, True)``; the trailing ``True`` marks the metric as
        "higher is better".
    """
    score = gini_normalized(dtrain.get_label(), preds)
    higher_is_better = True
    return 'gini', score, higher_is_better
    
def add_noise(series, noise_level):
    """Scale every element of ``series`` by ``1 + noise_level * z``.

    ``z`` is drawn per element from ``np.random.randn`` (NumPy's global
    random state), so the result is only reproducible if that state is
    seeded by the caller.

    Parameters
    ----------
    series : array-like supporting ``len`` and element-wise ``*``
        Values to perturb.
    noise_level : float
        Multiplier applied to the standard-normal draws.

    Returns
    -------
    The perturbed values, same length as ``series``.
    """
    draws = np.random.randn(len(series))
    factors = 1 + noise_level * draws
    return series * factors

In [3]:
# params = {'objective' : 'binary:logistic',
#           'eta' : 0.015, 
#           'silent' : False,
#           'max_depth' : 7, 
#           'subsample' : 0.9,
#           'colsample_bytree' : 0.9
# }

# xgboost training parameters used by the xgb cross-validation cell.
params = {
    'eta': 0.02,                      # learning rate
    'max_depth': 5,                   # maximum tree depth
    'subsample': 0.9,                 # row sampling ratio per tree
    'colsample_bytree': 0.9,          # column sampling ratio per tree
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'seed': 99,
    'silent': True,
}

In [4]:
# Load both CSV files once; the returned pieces are reused by every cell below.
(x_train, y_train, id_train,
 x_test, id_test) = load_data(DATA_TRAIN_PATH, DATA_TEST_PATH)

Train data shape:  (595212, 57)
Test data shape:  (892816, 57)


In [None]:
# Cross-validation settings shared by the training cells below.
kfold = 5           # number of stratified folds
num_round = 2000    # upper bound on boosting rounds per fold
stratkfold = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=0)

# Running sum of test-set predictions: each fold of each model adds to it,
# and the final cell divides by the number of models.
y_pred = np.zeros(x_test.shape[0])

In [None]:
# Train one xgboost model per stratified fold and add its test-set
# predictions to the running sum in `y_pred`.

# The test matrix is the same for every fold, so it is built once.
d_test = xgb.DMatrix(x_test)

for i, (train_index, test_index) in enumerate(stratkfold.split(x_train, y_train)):
    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
    d_train = xgb.DMatrix(x_train.iloc[train_index], y_train.iloc[train_index])
    d_valid = xgb.DMatrix(x_train.iloc[test_index], y_train.iloc[test_index])
    # The last entry of the watchlist ('valid') drives early stopping.
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    model = xgb.train(params, d_train, num_round, watchlist, early_stopping_rounds=50,
                      feval=gini_xgb, maximize=True, verbose_eval=10)

    # NOTE(review): the +50 predicts with trees past the best validation
    # round; kept as written, confirm it is intentional.
    y_pred += model.predict(d_test, ntree_limit=(model.best_ntree_limit + 50))

# Only the model of the last fold is persisted. The context manager makes
# sure the file is flushed and closed once the dump is done.
with open("xgb_model.pickle.dat", "wb") as model_file:
    pickle.dump(model, model_file)

 xgb kfold: 1  of  5 : 
[0]	train-gini:0.216161	valid-gini:0.205707
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 50 rounds.
[10]	train-gini:0.246681	valid-gini:0.236489
[20]	train-gini:0.255201	valid-gini:0.242475
[30]	train-gini:0.255973	valid-gini:0.242937
[40]	train-gini:0.257625	valid-gini:0.244077
[50]	train-gini:0.261214	valid-gini:0.246571
[60]	train-gini:0.26166	valid-gini:0.245787
[70]	train-gini:0.263165	valid-gini:0.247258
[80]	train-gini:0.264795	valid-gini:0.248616
[90]	train-gini:0.266621	valid-gini:0.250027
[100]	train-gini:0.267636	valid-gini:0.250143
[110]	train-gini:0.268771	valid-gini:0.250526
[120]	train-gini:0.270903	valid-gini:0.251459
[130]	train-gini:0.27309	valid-gini:0.252376
[140]	train-gini:0.276704	valid-gini:0.254462
[150]	train-gini:0.278946	valid-gini:0.255162
[160]	train-gini:0.283036	valid-gini:0.257442
[170]	train-gini:0.28655	valid-gini:0.259284
[180]	train-gini:

In [None]:
# LightGBM training parameters used by the LightGBM cross-validation cell.
# NOTE: this rebinds `params`, which previously held the xgboost settings;
# re-running the xgboost cell after this one would pick up these values.
params = {'learning_rate': 0.02,
          'max_depth': 5,
          'max_bin': 10,
          'feature_fraction': 0.9,
          'bagging_fraction': 0.9,
          # LightGBM's key is `bagging_freq`. The former 'bagging_frequency'
          # is not a recognised parameter, so bagging stayed disabled and
          # `bagging_fraction` / `bagging_seed` had no effect.
          'bagging_freq': 10,
          'min_data': 500,
          'objective': 'binary',
          'metric': 'auc',
          'bagging_seed': 99
}

In [None]:
# Train one LightGBM model per stratified fold and add its test-set
# predictions to the running sum in `y_pred`.
for i, (train_index, test_index) in enumerate(stratkfold.split(x_train, y_train)):

    print(' light gbm kfold: {}  of  {} : '.format(i+1, kfold))
    d_train = lgb.Dataset(x_train.iloc[train_index], label=y_train.iloc[train_index])
    d_valid = lgb.Dataset(x_train.iloc[test_index], label=y_train.iloc[test_index])

    model = lgb.train(params, d_train, num_round, d_valid, early_stopping_rounds=50,
                      feval=gini_lgb, verbose_eval=10)

    # `num_iteration` is a tree count, so it must not be divided by the number
    # of models: the averaging is done once, on `y_pred`, in the final cell.
    # The previous `(best_iteration + 50) / (2*kfold)` produced a float about
    # ten times too small.
    # NOTE(review): the +50 mirrors the xgboost cell and predicts with trees
    # past the best validation round; confirm it is intentional.
    y_pred += model.predict(x_test, num_iteration=model.best_iteration + 50)

# Only the model of the last fold is persisted.
model.save_model('lgb_model.txt')
    

In [None]:
# `y_pred` holds the sum of one prediction per fold from each of the two
# model families (xgboost and LightGBM), hence 2 * kfold terms to average.
# The division is in place, so this cell must be run exactly once per
# training run.
n_models = 2 * kfold
y_pred /= n_models
submit('xgb1.csv', id_test, y_pred)