In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import pandas_datareader as pdr
import pickle
import time
import gc

from multiprocessing import *
from xgboost import XGBClassifier
from sklearn import *
from numba import jit
from sklearn.datasets import make_friedman1
from sklearn.svm import SVC, SVR
from sklearn.feature_selection import RFE, RFECV
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, Imputer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score



In [2]:
DATA_TRAIN_PATH = './Data/train.csv'
DATA_TEST_PATH =  './Data/test.csv'

def load_data(train_path = DATA_TRAIN_PATH, test_path = DATA_TEST_PATH):
    train = pd.read_csv(train_path, na_values = "-1", dtype = np.float64)
    train = train.fillna(-1)
    test = pd.read_csv(test_path, na_values = "-1")
    test = test.fillna(-1)
    
    x_train = train.drop(['target', 'id'], axis = 1)
    y_train = train['target']
    id_train = train['id'].values
    print('Train data shape: ', x_train.shape)
    
    x_test = test.drop(['id'], axis = 1)
    id_test = test['id'].values
    print('Test data shape: ', x_test.shape)   
    
    return x_train, y_train, id_train, x_test, id_test

def feature_info(df):
    cat_features = df.columns[df.columns.str.endswith('cat')].tolist()
    bin_features = df.columns[df.columns.str.endswith('bin')].tolist()
    num_features = [feature for feature in df.columns.tolist() 
                    if feature not in cat_features and feature not in bin_features]
    return cat_features, bin_features, num_features

def submit(filename, id_test, pred_test):
    sub = pd.DataFrame()
    sub['id'] = id_test
    sub['target'] = pred_test
    sub.to_csv(filename, index = False)

def gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

def gini_normalized(y_true, y_pred):
    return gini(y_true, y_pred)/gini(y_true, y_true)

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return[('gini', gini_score)]

def gini_lgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return 'gini', gini_score, True    
    
def add_noise(series, noise_level):
    return series*( 1+ noise_level * np.random.randn(len(series)))


def target_encoder(trn_series=None, tst_series=None, target=None, 
                    min_samples_leaf=1, smoothing=1, noise_level=0):
    
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior  
    """ 
        
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    
    # Apply average function to all target data
    prior = target.mean()
    
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    
    # pd.merge does not keep the index so restore it    
    ft_trn_series.index = trn_series.index 
    
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)   

def rfe_svc(x_train, y_train):
    estimator = SVC(kernel='rbf', C=1, verbose = True)    
    #rfe = RFE(estimator = estimator, n_features_to_select=5, step = 1, verbose = 1)    
    rfe = RFECV(estimator = estimator, step = 1, cv = 5 , verbose = 1)
    rfe = rfe.fit(x_train, y_train)    
    return rfe.ranking_, rfe.support_

def rfe_gb(x_train, y_train):
    estimator = ExtraTreesClassifier(n_estimators = 50, max_depth=None, min_samples_split=2, random_state=0, verbose=1)    
    estimator = estimator.fit(x_train, y_train)
    model = SelectFromModel(estimator, prefit = True)
    
    return model


def transform_df(df_train, df_test, df_label):             
    
    df_train = pd.DataFrame(df_train)   
    df_test = pd.DataFrame(df_test)
    
    cat_features, bin_features, num_features = feature_info(df_train)
    
    for i in cat_features:       
        imp = Imputer(missing_values = -1, strategy = 'most_frequent', axis = 0, verbose =1, copy = False)       
        df_train[i] = imp.fit_transform(df_train[i].reshape(-1,1))        
        df_test[i] = imp.fit_transform(df_test[i].reshape(-1,1))        
        
        df_train[i], df_test[i] = target_encoder(df_train[i], df_test[i], df_label, 
                                                min_samples_leaf = 100, smoothing = 10,
                                               noise_level = 0.01)    
    
    df_train['ps_car_13_x_ps_reg_03'] = df_train['ps_car_13'] * df_train['ps_reg_03']    
    df_train['ps_car_02_cat_x_ps_reg_01'] = df_train['ps_car_02_cat'] * df_train['ps_reg_01']
    df_train['ps_car_04_cat_x_ps_reg_01'] = df_train['ps_car_04_cat'] * df_train['ps_reg_01']
        
    df_test['ps_car_13_x_ps_reg_03'] = df_test['ps_car_13'] * df_test['ps_reg_03']
    df_test['ps_car_02_cat_x_ps_reg_01'] = df_test['ps_car_02_cat'] * df_test['ps_reg_01']
    df_test['ps_car_04_cat_x_ps_reg_01'] = df_test['ps_car_04_cat'] * df_test['ps_reg_01']            
    
    
    print('Train data shape: ', df_train.shape)
    print('Test data shape: ', df_test.shape)
    
    return df_train, df_test   



In [3]:
x_train, y_train, id_train, x_test, id_test = load_data(train_path = DATA_TRAIN_PATH, test_path = DATA_TEST_PATH)
x_train, x_test=transform_df(x_train, x_test, y_train)

Train data shape:  (595212, 57)
Test data shape:  (892816, 57)
Train data shape:  (595212, 60)
Test data shape:  (892816, 60)


In [4]:
# ranking, support = rfe_svc(x_train, y_train)

In [5]:
# rfe_model = rfe_gb(x_train, y_train)
# x_train = pd.DataFrame(rfe_model.transform(x_train))
# x_test =  pd.DataFrame(rfe_model.transform(x_test))
# print(x_train.shape, x_test.shape)

In [6]:
kfold = 5
num_round = 10000
early_stopping_rounds = 100

In [None]:
lgb_params = {}

#best parameter
lgb_params[0] = {}
lgb_params[0]['boosting'] = 'gbdt'
lgb_params[0]['learning_rate'] = 0.01
lgb_params[0]['max_depth'] = 8
lgb_params[0]['max_bin'] = 10
lgb_params[0]['feature_fraction'] = 0.8
lgb_params[0]['bagging_fraction'] = 0.9
lgb_params[0]['bagging_frequency'] = 5
lgb_params[0]['min_data'] = 500
lgb_params[0]['objective'] = 'binary'
lgb_params[0]['metric'] = 'auc'
lgb_params[0]['bagging_seed'] = 99

lgb_params[1] = {}
lgb_params[1]['boosting'] = 'gbdt'
lgb_params[1]['learning_rate'] = 0.02
lgb_params[1]['max_depth'] = 6
lgb_params[1]['max_bin'] = 10
lgb_params[1]['feature_fraction'] = 0.8
lgb_params[1]['bagging_fraction'] = 0.9
lgb_params[1]['bagging_frequency'] = 5
lgb_params[1]['min_data'] = 500
lgb_params[1]['objective'] = 'binary'
lgb_params[1]['metric'] = 'auc'
lgb_params[1]['bagging_seed'] = 99

lgb_params[2] = {}
lgb_params[2]['boosting'] = 'gbdt'
lgb_params[2]['learning_rate'] = 0.02
lgb_params[2]['max_depth'] = 4
lgb_params[2]['num_leaves'] = 0.70 * (2**lgb_params[2]['max_depth'])
lgb_params[2]['max_bin'] = 10
lgb_params[2]['feature_fraction'] = 0.8
lgb_params[2]['bagging_fraction'] = 0.9
lgb_params[2]['bagging_frequency'] = 5
lgb_params[2]['min_data'] = 500
lgb_params[2]['objective'] = 'binary'
lgb_params[2]['metric'] = 'auc'
lgb_params[2]['bagging_seed'] = 99

lgb_params[3] = {}
lgb_params[3]['boosting'] = 'gbdt'
lgb_params[3]['learning_rate'] = 0.02
lgb_params[3]['max_depth'] = 10
lgb_params[3]['num_leaves'] = 0.50 * (2**lgb_params[2]['max_depth'])
lgb_params[3]['max_bin'] = 10
lgb_params[3]['feature_fraction'] = 0.8
lgb_params[3]['bagging_fraction'] = 0.9
lgb_params[3]['bagging_frequency'] = 5
lgb_params[3]['min_data'] = 500
lgb_params[3]['objective'] = 'binary'
lgb_params[3]['metric'] = 'auc'
lgb_params[3]['bagging_seed'] = 99
lgb_params[3]['lambda_l2'] = 0.00005

lgb_params[4] = {}
lgb_params[4]['boosting'] = 'gbdt'
lgb_params[4]['learning_rate'] = 0.02
lgb_params[4]['max_depth'] = 10
lgb_params[4]['num_leaves'] = 0.70 * (2**lgb_params[2]['max_depth'])
lgb_params[4]['max_bin'] = 10
lgb_params[4]['feature_fraction'] = 0.8
lgb_params[4]['bagging_fraction'] = 0.9
lgb_params[4]['bagging_frequency'] = 5
lgb_params[4]['min_data'] = 500
lgb_params[4]['objective'] = 'binary'
lgb_params[4]['metric'] = 'auc'
lgb_params[4]['bagging_seed'] = 99
lgb_params[4]['lambda_l1'] = 0.00001

lgb_params[5] = {}
lgb_params[5]['boosting'] = 'dart'
lgb_params[5]['xgboost_dart_mode'] = True
lgb_params[5]['learning_rate'] = 0.02
lgb_params[5]['max_depth'] = 6
lgb_params[5]['max_bin'] = 10
lgb_params[5]['feature_fraction'] = 0.8
lgb_params[5]['bagging_fraction'] = 0.9
lgb_params[5]['bagging_frequency'] = 10
lgb_params[5]['min_data'] = 500
lgb_params[5]['objective'] = 'binary'
lgb_params[5]['metric'] = 'auc'
lgb_params[5]['bagging_seed'] = 99

lgb_params[6] = {}
lgb_params[6]['boosting'] = 'dart'
lgb_params[5]['xgboost_dart_mode'] = True
lgb_params[6]['learning_rate'] = 0.02
lgb_params[6]['max_depth'] = 10
lgb_params[6]['max_bin'] = 10
lgb_params[6]['feature_fraction'] = 0.8
lgb_params[6]['bagging_fraction'] = 0.9
lgb_params[6]['bagging_frequency'] = 10
lgb_params[6]['min_data'] = 500
lgb_params[6]['objective'] = 'binary'
lgb_params[6]['metric'] = 'auc'
lgb_params[6]['bagging_seed'] = 99

In [None]:

y_pred = []
for k in range(2):       
    stratkfold = StratifiedKFold(n_splits=kfold, random_state=None, shuffle = True)
    
    for i, (train_index, test_index) in enumerate(stratkfold.split(x_train, y_train)):   
        print(' light gbm kfold: {}  of  {} : '.format(i+1, kfold))    
        d_train = lgb.Dataset(x_train.iloc[train_index], label = y_train.iloc[train_index]) 
        d_valid = lgb.Dataset(x_train.iloc[test_index], label = y_train.iloc[test_index]) 

        model = lgb.train(lgb_params[k], d_train, num_round, d_valid, early_stopping_rounds = early_stopping_rounds, 
                      feval = gini_lgb, verbose_eval = 100)
        y_pred.append(model.predict(x_test, num_iteration=(model.best_iteration)))

        print("_"*90 + '\n')

    model.save_model('./Model/lgb_model_'+ str(k) + '.txt')
    print("*"*90)
    print("*"*90 + '\n')

 light gbm kfold: 1  of  5 : 
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.621082	valid_0's gini: 0.242169
[200]	valid_0's auc: 0.622571	valid_0's gini: 0.245141
[300]	valid_0's auc: 0.626138	valid_0's gini: 0.252275
[400]	valid_0's auc: 0.631174	valid_0's gini: 0.262347
[500]	valid_0's auc: 0.633941	valid_0's gini: 0.267883


In [None]:
y_pred_lgb = np.mean(np.array(y_pred), axis=0)

In [None]:
xgb_params = {}

xgb_params[0] = {}
xgb_params[0]['eta'] = 0.02 
xgb_params[0]['max_depth'] = 6
xgb_params[0]['subsample'] = 0.8
xgb_params[0]['colsample_bytree'] = 0.8
xgb_params[0]['objective'] = 'binary:logistic'
xgb_params[0]['eval_metric'] = 'auc'
xgb_params[0]['seed'] = 99
xgb_params[0]['silent'] = True

xgb_params[1] = {}
xgb_params[1]['eta'] = 0.02 
xgb_params[1]['gamma'] = 0.0005
xgb_params[1]['max_depth'] = 5
xgb_params[1]['subsample'] = 0.8
xgb_params[1]['colsample_bytree'] = 0.8
xgb_params[1]['objective'] = 'binary:logistic'
xgb_params[1]['eval_metric'] = 'auc'
xgb_params[1]['seed'] = 99
xgb_params[1]['silent'] = True

xgb_params[2] = {}
xgb_params[2]['eta'] = 0.05 
xgb_params[1]['alpha'] = 0.0005
xgb_params[2]['max_depth'] = 10
xgb_params[2]['subsample'] = 0.8
xgb_params[2]['colsample_bytree'] = 0.8
xgb_params[2]['objective'] = 'binary:logistic'
xgb_params[2]['eval_metric'] = 'auc'
xgb_params[2]['seed'] = 99
xgb_params[2]['silent'] = True

xgb_params[3] = {}
xgb_params[3]['eta'] = 0.05
xgb_params[1]['alpha'] = 0.0002
xgb_params[3]['max_depth'] = 6
xgb_params[3]['subsample'] = 0.8
xgb_params[3]['colsample_bytree'] = 0.8
xgb_params[3]['objective'] = 'binary:logistic'
xgb_params[3]['eval_metric'] = 'auc'
xgb_params[3]['seed'] = 99
xgb_params[3]['silent'] = True

# xgb_params[4] = {}
# xgb_params[4]['eta'] = 0.05 
# xgb_params[4]['max_depth'] = 10
# xgb_params[4]['subsample'] = 0.8
# xgb_params[4]['colsample_bytree'] = 0.8
# xgb_params[4]['objective'] = 'binary:logistic'
# xgb_params[4]['eval_metric'] = 'auc'
# xgb_params[4]['seed'] = 99
# xgb_params[4]['silent'] = True

# xgb_params[5] = {}
# xgb_params[5]['eta'] = 0.05 
# xgb_params[5]['max_depth'] = 10
# xgb_params[5]['subsample'] = 0.8
# xgb_params[5]['colsample_bytree'] = 0.8
# xgb_params[5]['objective'] = 'binary:logistic'
# xgb_params[5]['eval_metric'] = 'auc'
# xgb_params[5]['seed'] = 99
# xgb_params[5]['silent'] = True

# xgb_params[6] = {}
# xgb_params[6]['eta'] = 0.05 
# xgb_params[6]['max_depth'] = 10
# xgb_params[6]['subsample'] = 0.8
# xgb_params[6]['colsample_bytree'] = 0.8
# xgb_params[6]['objective'] = 'binary:logistic'
# xgb_params[6]['eval_metric'] = 'auc'
# xgb_params[6]['seed'] = 99
# xgb_params[6]['silent'] = True

In [None]:
y_pred = []

for k in range(2):
    stratkfold = StratifiedKFold(n_splits=kfold, random_state=None, shuffle = True)
    
    for i, (train_index, test_index) in enumerate(stratkfold.split(x_train, y_train)):
        print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))    
        d_train = xgb.DMatrix(x_train.iloc[train_index], y_train.iloc[train_index]) 
        d_valid = xgb.DMatrix(x_train.iloc[test_index], y_train.iloc[test_index]) 
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]

        model = xgb.train(xgb_params[k], d_train, num_round, watchlist, early_stopping_rounds = early_stopping_rounds, 
                      feval = gini_xgb, maximize = True, verbose_eval = 50)

        y_pred.append(model.predict(xgb.DMatrix(x_test), ntree_limit=model.best_ntree_limit))
        print("_"*90 + '\n')
        
    pickle.dump(model, open("./Model/xgb_model_" + str(k) + ".pickle.dat", "wb")) 
    print("*"*90)
    print("*"*90 + '\n')

In [None]:
y_pred_xgb = np.mean(np.array(y_pred), axis=0)

In [None]:
y_final = np.mean(np.array([y_pred_lgb, y_pred_xgb]), axis=0)
submit('gb_mean.csv', id_test, y_final)