In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import pandas_datareader as pdr
import pickle
import time
import gc

from multiprocessing import *
from xgboost import XGBClassifier
from sklearn import *
from numba import jit
from sklearn.datasets import make_friedman1
from sklearn.svm import SVC, SVR
from sklearn.feature_selection import RFE, RFECV
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score



In [2]:
DATA_TRAIN_PATH = './Data/train.csv'
DATA_TEST_PATH =  './Data/test.csv'

def load_data(train_path = DATA_TRAIN_PATH, test_path = DATA_TEST_PATH):
    """Read the train and test CSV files and split them into features, labels and ids.

    Cells equal to "-1" are parsed as missing and then filled back in with -1.
    The train file is read with every column forced to float64; the test file
    keeps whatever dtypes pandas infers.

    Parameters
    ----------
    train_path : path of the training CSV; needs 'id' and 'target' columns.
    test_path : path of the test CSV; needs an 'id' column.

    Returns
    -------
    x_train : DataFrame with the training columns except 'target' and 'id'.
    y_train : Series holding the 'target' column.
    id_train : ndarray with the training 'id' values.
    x_test : DataFrame with the test columns except 'id'.
    id_test : ndarray with the test 'id' values.
    """
    train_frame = pd.read_csv(train_path, na_values = "-1", dtype = np.float64).fillna(-1)
    test_frame = pd.read_csv(test_path, na_values = "-1").fillna(-1)

    # Training split: ids and labels are pulled out, the rest are features.
    id_train = train_frame['id'].values
    y_train = train_frame['target']
    x_train = train_frame.drop(columns = ['target', 'id'])
    print('Train data shape: ', x_train.shape)

    # Test split: only the id column is separated.
    id_test = test_frame['id'].values
    x_test = test_frame.drop(columns = ['id'])
    print('Test data shape: ', x_test.shape)

    return x_train, y_train, id_train, x_test, id_test

def feature_info(df):
    """Group the columns of ``df`` by name suffix.

    Parameters
    ----------
    df : DataFrame whose column labels are inspected (the data is not read).

    Returns
    -------
    cat_features : list of column names ending in 'cat'.
    bin_features : list of column names ending in 'bin'.
    num_features : list of every remaining column, in original order.

    Column labels that are not strings (e.g. the integer labels of a frame
    built from a bare array) cannot carry a suffix and land in ``num_features``.
    """
    names = df.columns.tolist()

    def _with_suffix(suffix):
        # Only string labels can match a suffix; anything else is skipped.
        return [name for name in names
                if isinstance(name, str) and name.endswith(suffix)]

    cat_features = _with_suffix('cat')
    bin_features = _with_suffix('bin')

    # Set lookup keeps the "everything else" pass linear in the column count.
    tagged = set(cat_features) | set(bin_features)
    num_features = [feature for feature in names if feature not in tagged]
    return cat_features, bin_features, num_features

def submit(filename, id_test, pred_test):
    """Write a two-column submission CSV ('id', 'target') to ``filename``.

    Parameters
    ----------
    filename : destination path handed to ``DataFrame.to_csv``.
    id_test : values for the 'id' column.
    pred_test : values for the 'target' column.

    The row index is not written to the file.
    """
    submission = pd.DataFrame()
    # Columns are added in this order so the CSV header reads "id,target".
    submission['id'] = id_test
    submission['target'] = pred_test
    submission.to_csv(filename, index = False)

def gini(y_true, y_prob):
    """Gini score of ``y_true`` when ordered by ``y_prob``.

    The labels are sorted by ascending ``y_prob`` and then walked from the
    highest score down. For every label the running total of ``1 - label``
    seen so far (i.e. at higher scores) is weighted by the label and summed;
    the result is ``1 - 2 * sum / (sum(labels) * (n - sum(labels)))``.

    Parameters
    ----------
    y_true : array-like of labels.
    y_prob : array-like of scores, same length as ``y_true``.

    Returns
    -------
    The score as a number. The denominator is zero when the labels sum to
    0 or to ``n``; that case is not handled specially.
    """
    ranked = np.asarray(y_true)[np.argsort(y_prob)]
    total = len(ranked)

    label_sum = 0      # running sum of labels
    misordered = 0     # label-weighted count of (1 - label) mass ranked above
    complement = 0     # running sum of (1 - label)
    for label in ranked[::-1]:
        label_sum += label
        misordered += label * complement
        complement += 1 - label

    return 1 - 2 * misordered / (label_sum * (total - label_sum))

def gini_normalized(y_true, y_pred):
    """Gini of the predictions divided by the Gini of the labels ranked by themselves."""
    achieved = gini(y_true, y_pred)
    # Ranking the labels by their own values gives the reference score.
    reference = gini(y_true, y_true)
    return achieved / reference

def gini_xgb(preds, dtrain):
    """Custom evaluation callback: normalized Gini of ``preds`` against ``dtrain``'s labels.

    Returns a one-element list holding the pair ``('gini', score)``.
    """
    score = gini_normalized(dtrain.get_label(), preds)
    return [('gini', score)]

def gini_lgb(preds, dtrain):
    """Custom evaluation callback: normalized Gini of ``preds`` against ``dtrain``'s labels.

    Returns the triple ``('gini', score, True)``; this notebook passes the
    function as ``feval`` to ``lgb.train``, where the trailing ``True`` marks
    the metric as higher-is-better.
    """
    score = gini_normalized(dtrain.get_label(), preds)
    return ('gini', score, True)
    
def add_noise(series, noise_level):
    """Multiply each element of ``series`` by ``1 + noise_level * z``.

    ``z`` is one standard-normal draw per element, taken from NumPy's global
    random state, so results depend on the current seed.
    """
    draws = np.random.randn(len(series))
    scale = 1 + noise_level * draws
    return series * scale


def rfe_svc(x_train, y_train):
    """Rank features with cross-validated recursive feature elimination around an SVC.

    Parameters
    ----------
    x_train : training feature matrix.
    y_train : training labels.

    Returns
    -------
    ranking_ : the fitted selector's per-feature ranking.
    support_ : the fitted selector's boolean mask of kept features.
    """
    # RFECV scores features through the estimator's coef_ or
    # feature_importances_. SVC only exposes coef_ with a linear kernel, so
    # the previous rbf kernel made the fit fail.
    estimator = SVC(kernel='linear', C=1, verbose = True)
    # One feature is dropped per step; 5-fold CV decides how many to keep.
    selector = RFECV(estimator = estimator, step = 1, cv = 5 , verbose = 1)
    selector = selector.fit(x_train, y_train)
    return selector.ranking_, selector.support_

def rfe_gb(x_train, y_train):
    """Fit an ExtraTreesClassifier and wrap it in a prefit SelectFromModel.

    Despite the name, no recursive elimination or gradient boosting happens
    here: the returned selector filters features using the fitted forest.

    Parameters
    ----------
    x_train : training feature matrix.
    y_train : training labels.

    Returns
    -------
    The ``SelectFromModel`` instance built around the fitted forest.
    """
    forest = ExtraTreesClassifier(
        n_estimators = 50,
        max_depth = None,
        min_samples_split = 2,
        random_state = 0,
        verbose = 1,
    ).fit(x_train, y_train)

    return SelectFromModel(forest, prefit = True)

In [3]:
# Load both CSV files once; every later cell works from these five objects.
(x_train, y_train, id_train,
 x_test, id_test) = load_data(DATA_TRAIN_PATH, DATA_TEST_PATH)

Train data shape:  (595212, 57)
Test data shape:  (892816, 57)


In [None]:
# ranking, support = rfe_svc(x_train, y_train)

In [None]:
# Fit the tree-based selector on the training data, then keep only the
# selected columns in both frames. The frames are rebuilt from bare arrays,
# so the original column names are replaced by integer labels.
# NOTE: x_train / x_test are overwritten, so re-running this cell refits the
# selector on the already-reduced features.
rfe_model = rfe_gb(x_train, y_train)
x_train, x_test = (pd.DataFrame(rfe_model.transform(frame))
                   for frame in (x_train, x_test))
print(x_train.shape, x_test.shape)

In [None]:
# Shared cross-validation settings for the training cells below.
kfold = 5            # number of stratified folds
num_round = 1000     # upper bound on boosting rounds per fold
y_pred = []          # collects one test-set prediction vector per fold
stratkfold = StratifiedKFold(n_splits = kfold, shuffle = True, random_state = 0)

In [None]:
# params ={'learning_rate' : 0.02,
#          'max_depth' : 6,
#          'max_bin' : 10,
#          'feature_fraction' : 0.9,
#          'bagging_fraction' : 0.9,
#          'bagging_frequency': 10,
#          'min_data' : 500,
#          'objective' : 'binary',
#          'metric' : 'auc',
#          'bagging_seed' : 99
# }

   
# LightGBM settings shared by every fold.
params = {
    'learning_rate': 0.02,
    'max_depth': 10,
    'max_bin': 20,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    # LightGBM's key is 'bagging_freq'. The former 'bagging_frequency' is not
    # a recognised parameter, which left bagging_fraction without effect.
    'bagging_freq': 10,
    'min_data': 500,
    'objective': 'binary',
    'metric': 'auc',
    'bagging_seed': 99,
}

# Train one booster per stratified fold, predict the test set with it and
# keep the prediction vector in y_pred for later averaging.
for i, (train_index, test_index) in enumerate(stratkfold.split(x_train, y_train)):
    fold = i + 1
    print(' light gbm kfold: {}  of  {} : '.format(fold, kfold))
    d_train = lgb.Dataset(x_train.iloc[train_index], label = y_train.iloc[train_index])
    d_valid = lgb.Dataset(x_train.iloc[test_index], label = y_train.iloc[test_index])

    model = lgb.train(params, d_train, num_round, d_valid, early_stopping_rounds = 50,
                      feval = gini_lgb, verbose_eval = 100)
    # NOTE(review): predicting with best_iteration + 50 uses trees built after
    # the best validation round; confirm this is intended rather than
    # num_iteration = model.best_iteration.
    y_pred.append(model.predict(x_test, num_iteration=(model.best_iteration+50)))

    # Save inside the loop, one file per fold. The old call sat after the loop
    # and used an undefined name `k`, so it raised NameError and could only
    # ever have written the last fold's model.
    model.save_model('lgb_model_' + str(fold) + '.txt')

In [None]:
# params = {'eta': 0.02, 
#           'max_depth': 5, 
#           'subsample': 0.9, 
#           'colsample_bytree': 0.9, 
#           'objective': 'binary:logistic', 
#           'eval_metric': 'auc', 
#           'seed': 99, 
#           'silent': True}


# for i, (train_index, test_index) in enumerate(stratkfold.split(x_train, y_train)):
#     print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))    
#     d_train = xgb.DMatrix(x_train.iloc[train_index], y_train.iloc[train_index]) 
#     d_valid = xgb.DMatrix(x_train.iloc[test_index], y_train.iloc[test_index]) 
#     watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    
#     model = xgb.train(params, d_train, num_round, watchlist, early_stopping_rounds = 50, 
#                   feval = gini_xgb, maximize = True, verbose_eval = 10)

#     y_pred.append(model.predict(xgb.DMatrix(x_test), ntree_limit=(model.best_ntree_limit+50)))

# pickle.dump(model, open("xgb_model.pickle.dat", "wb")) 

In [None]:
# Turn the list of per-fold prediction vectors into a single array
# (one row per fold) so it can be averaged along axis 0.
y_pred = np.asarray(y_pred)

In [None]:
# Average the folds' predictions for each test row and write the submission.
# NOTE(review): the file name says "xgb" although the active training cell
# uses LightGBM; confirm the intended name.
y_final = np.mean(y_pred, axis = 0)
submit(filename = 'xgb1_mean.csv', id_test = id_test, pred_test = y_final)