In [10]:
%config IPCompleter.greedy=True

In [11]:
import pandas as pd

df_train = pd.read_csv('train_kaggle.csv')
df_test = pd.read_csv('sample_solution.csv')
y = df_train['label']
y = y.values

In [34]:
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

NUM_OF_COL = 40

def load_features(col):
    return pd.read_parquet("filtered_features/filtered_feature{}.gzip".format(col))

def load_test_features(col):
    return pd.read_parquet("filtered_test_features/filtered_test_feature{}.gzip".format(col))

def select_train(selected_features):
    train = []
    for col in tqdm(range(NUM_OF_COL)):
        if selected_features[col] == False:
            continue
        tf = load_features(col)
        train.append(tf)
    train = np.concatenate(train,  axis=1)
    return train

def select_test(selected_features):
    test = []
    for col in range(NUM_OF_COL):
        if selected_features[col] == False:
            continue
        tf = load_test_features(col)
        tf = tf.replace([np.inf], np.finfo('float32').max).replace([np.inf, -np.inf], np.finfo('float32').min).fillna(0)
        test.append(tf)
    test = np.concatenate(test,  axis=1)
    return test

def calculate_auc(train, y):
    X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=0.2, random_state=42)
    model = LGBMClassifier()
    model.fit(X_train, y_train)
    pred = model.predict_proba(X_valid)[:,1]
    fpr, tpr, thresholds = metrics.roc_curve(y_valid, pred)
    auc_score = metrics.auc(fpr, tpr)
    return auc_score

def show_index(selected_features):
    return np.where(np.array(selected_features) == False)

def get_extra_tree(train, y):
    clf = ExtraTreesClassifier(n_estimators=50)
    clf = clf.fit(train, y)
    return clf

def get_feature_model(train, y, threshold=None):
    clf = get_extra_tree(train, y)
    print(clf.feature_importances_)
    modelSelection = SelectFromModel(clf, prefit=True, max_features=8000, threshold=threshold)
    return modelSelection

## Frequency importance

In [35]:
import numpy as np

selected_features = [ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True]
print(show_index(selected_features))
train = select_train(selected_features)
print(train.shape)

(array([10, 16, 25, 34], dtype=int64),)


100%|█████████████████████████████████████████████████████████| 40/40 [00:09<00:00,  4.17it/s]


(30336, 12794)


In [None]:
calculate_auc(train, y)
# 0.9559280694365483

In [33]:
modelSelection = get_feature_model(train, y)
train_trans = modelSelection.transform(train)
calculate_auc(train_trans, y)
# 0.9531738028709031

100%|█████████████████████████████████████████████████████████| 40/40 [00:16<00:00,  2.50it/s]


[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 2.98439752e-05
 2.75254402e-05 4.96375509e-05]


0.9531738028709031

In [42]:
train_trans.shape

(30336, 2934)

In [36]:
clf = get_extra_tree(train, y)
clf.feature_importances_

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
       1.97294762e-05, 2.71300905e-05, 1.60803685e-05])

In [41]:
np.where(clf.feature_importances_ > 0.0001)[0].shape

(1704,)

## Some random trial

In [26]:
import numpy as np

selected_features = [ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, True,  True,  True,  True,  True,  True, True,  True,
        True,  True,  True,  True,  True,  True,  True, True,  True,
        False,  True,  True,  True,  True,  True,  True, True,  True,
        True,  True,  True,  True]
print(show_index(selected_features))
train = select_train(selected_features)
print(train.shape)

(array([27], dtype=int64),)


100%|█████████████████████████████████████████████████████████| 40/40 [00:12<00:00,  3.29it/s]


(30336, 14288)


0.9578389014310529

In [None]:
calculate_auc(train, y)
# 0.9578389014310529

In [31]:
modelSelection = get_feature_model(train, y)
train_trans = modelSelection.transform(train)
calculate_auc(train_trans, y)
# 0.9569397510603413

[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 5.14626514e-05
 2.03923644e-05 7.09797664e-05]


0.9569397510603413

## Some random trial 2

In [55]:
import numpy as np

selected_features = [ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, True,  True,  True,  True,  True,  True, True,  True,
        True,  True,  True,  True,  False,  True,  True, True,  True,
        False,  True,  True,  True,  True,  True,  True, True,  True,
        True,  True,  True,  True]
print(show_index(selected_features))
train = select_train(selected_features)
print(train.shape)

(array([22, 27], dtype=int64),)


100%|█████████████████████████████████████████████████████████| 40/40 [00:18<00:00,  2.17it/s]


(30336, 13864)


In [56]:
calculate_auc(train, y)


0.9558481612503212

## Best Feature Group so far

In [45]:
import numpy as np

selected_features = [ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, True,  True,  True,  True,  True,  True, True,  True,
        False,  True,  True,  True,  True,  True,  True, True,  True,
        True,  True,  True,  True,  True,  True,  True, True,  True,
        True,  True,  True,  True]
print(show_index(selected_features))
train = select_train(selected_features)
print(train.shape)

(array([18], dtype=int64),)


100%|█████████████████████████████████████████████████████████| 40/40 [00:14<00:00,  2.81it/s]


(30336, 14136)


In [46]:
calculate_auc(train, y)
# 0.9547855289940256

0.9547855289940256

In [47]:
modelSelection = get_feature_model(train, y)
train_trans = modelSelection.transform(train)
calculate_auc(train_trans, y)
# 0.9561219750811085

[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 1.71421378e-05
 2.05976304e-05 4.93606398e-05]


0.9561219750811085

## Trial 3

In [48]:
import numpy as np

selected_features = [ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True, False,  True,
        False,  True,  True,  True,  True,  True,  True, False,  True,
        False,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True]
print(show_index(selected_features))
train = select_train(selected_features)
print(train.shape)

(array([10, 16, 18, 25, 27, 34], dtype=int64),)


100%|█████████████████████████████████████████████████████████| 40/40 [00:09<00:00,  4.14it/s]


(30336, 12090)


In [49]:
calculate_auc(train, y)

0.9549387474428463

## Trial 4

In [57]:
import numpy as np

selected_features = [ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, True,  True,  True,  False,  True,  True, True,  True,
        True,  True,  True,  True,  True,  True,  True, True,  True,
        False,  True,  True,  True,  True,  True,  True, True,  True,
        True,  True,  True,  True]
print(show_index(selected_features))
train = select_train(selected_features)
print(train.shape)

(array([13, 27], dtype=int64),)


100%|█████████████████████████████████████████████████████████| 40/40 [00:12<00:00,  3.10it/s]


(30336, 13821)


In [58]:
calculate_auc(train, y)

0.9508630633939499

## Trial 5

In [74]:
import numpy as np

selected_features = [ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, True,  True,  True,  True,  True,  True, True,  True,
        False,  True,  True,  True,  True,  True,  True, True,  True,
        False,  True,  True,  True,  True,  True,  True, True,  True,
        True,  True,  True,  True]
print(show_index(selected_features))
train = select_train(selected_features)
print(train.shape)

(array([18, 27], dtype=int64),)


100%|█████████████████████████████████████████████████████████| 40/40 [00:12<00:00,  3.27it/s]


(30336, 13860)


In [None]:
calculate_auc(train, y)

In [None]:
modelSelection = get_feature_model(train, y)
train_trans = modelSelection.transform(train)
calculate_auc(train_trans, y)

## Frequency importance 2 

In [50]:
import numpy as np

selected_features = [False,  True,  True,  True, False, False,  True, False,  True,
       False, False,  True,  True,  True,  True,  True, False,  True,
        True, False,  True, False,  True,  True,  True, False, False,
       False, False,  True, False,  True,  True,  True, False, False,
       False,  True, False,  True]
print(show_index(selected_features))
train = select_train(selected_features)
print(train.shape)

(array([ 0,  4,  5,  7,  9, 10, 16, 19, 21, 25, 26, 27, 28, 30, 34, 35, 36,
       38], dtype=int64),)


100%|█████████████████████████████████████████████████████████| 40/40 [00:05<00:00,  6.69it/s]


(30336, 7396)


In [51]:
calculate_auc(train, y)

0.9523124072854272

## Real Training

### Training with untuned

In [64]:
selected_features = [ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, True,  True,  True,  True,  True,  True, True,  True,
        False,  True,  True,  True,  True,  True,  True, True,  True,
        True,  True,  True,  True,  True,  True,  True, True,  True,
        True,  True,  True,  True]

XTrain = select_train(selected_features)
XTest = select_test(selected_features)

100%|█████████████████████████████████████████████████████████| 40/40 [00:13<00:00,  3.01it/s]


modelSelection = get_feature_model(XTrain, y)
XTrain = modelSelection.transform(XTrain)
XTest = modelSelection.transform(XTest)

In [63]:
print(XTrain.shape)
print(XTest.shape)

(30336, 3255)
(10000, 3255)


In [61]:
model = LGBMClassifier()
model.fit(XTrain, y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [62]:
probs = model.predict_proba(XTest)
YTest = probs[:, 0]
df_test['Predicted'] = YTest
df_test.to_csv('test.csv', index=False)
df_test

Unnamed: 0,Id,Predicted
0,0,0.997732
1,1,0.989903
2,2,0.990316
3,3,0.981817
4,4,0.995938
5,5,0.987671
6,6,0.983008
7,7,0.666072
8,8,0.994786
9,9,0.997367


### Split

In [65]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(XTrain, y, test_size=0.2, random_state=42)

In [66]:
import lightgbm as lgb

train_set = lgb.Dataset(X_train, y_train)
valid_set = lgb.Dataset(X_valid, y_valid)

In [67]:
param = {'num_leaves': 31, 'objective': 'binary'}
param['metric'] = 'auc'
bst = lgb.train(param, train_set, 5, valid_sets=valid_set, early_stopping_rounds=5)
bst.best_iteration 

[1]	valid_0's auc: 0.854788
Training until validation scores don't improve for 5 rounds
[2]	valid_0's auc: 0.883097
[3]	valid_0's auc: 0.905796
[4]	valid_0's auc: 0.913159
[5]	valid_0's auc: 0.920977
Did not meet early stopping. Best iteration is:
[5]	valid_0's auc: 0.920977


5

In [68]:
bst.best_score['valid_0']['auc']

0.9209772184693477

In [69]:
import lightgbm as lgb
from hyperopt import STATUS_OK

ITER = 50
STOP_ROUND = 5

# Create the dataset
def objective(params):
    """Objective function for Gradient Boosting Machine Hyperparameter Tuning"""
    params['num_leaves'] = int(params['num_leaves'])
    params['subsample_for_bin'] = int(params['subsample_for_bin'])
    params['min_child_samples'] = int(params['min_child_samples'])
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evalute based on ROC AUC
    bst = lgb.train(params, train_set, ITER, valid_sets=valid_set, early_stopping_rounds=STOP_ROUND)
    bst.save_model('model.txt', num_iteration=bst.best_iteration)
  
    # Extract the best score
    best_score = bst.best_score['valid_0']['auc']
    
    # Loss must be minimized
    loss = np.log(1 - best_score)
    
    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

In [70]:
from hyperopt import hp

# Define the search space
space = {
    'boosting_type': 'dart',
    'num_leaves': hp.quniform('num_leaves', 30, 150, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'subsample_for_bin': hp.quniform('subsample_for_bin', 20000, 300000, 20000),
    'min_child_samples': hp.quniform('min_child_samples', 20, 500, 5),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
    'metric': 'auc'
}

space

{'boosting_type': 'dart',
 'num_leaves': <hyperopt.pyll.base.Apply at 0x2469af63780>,
 'learning_rate': <hyperopt.pyll.base.Apply at 0x2469b0e0160>,
 'subsample_for_bin': <hyperopt.pyll.base.Apply at 0x2469b0e01d0>,
 'min_child_samples': <hyperopt.pyll.base.Apply at 0x246a14d24e0>,
 'reg_alpha': <hyperopt.pyll.base.Apply at 0x246a14d2668>,
 'reg_lambda': <hyperopt.pyll.base.Apply at 0x246a14d27b8>,
 'colsample_bytree': <hyperopt.pyll.base.Apply at 0x246a14d2908>,
 'metric': 'auc'}

In [71]:
from hyperopt import Trials
# Trials object to track progress
bayes_trials = Trials()

In [72]:
from hyperopt import fmin
from hyperopt import tpe

MAX_EVALS = 500

# Optimize
bestDict = fmin(fn = objective, space = space, algo = tpe.suggest, 
            max_evals = MAX_EVALS, trials = bayes_trials)
bestDict

[1]	valid_0's auc: 0.894167                                                                   
  0%|                                                   | 0/500 [00:13<?, ?it/s, best loss: ?]




[2]	valid_0's auc: 0.91234                                                                    
[3]	valid_0's auc: 0.918737                                                                   
[4]	valid_0's auc: 0.927582                                                                   
[5]	valid_0's auc: 0.928691                                                                   
[6]	valid_0's auc: 0.930408                                                                   
[7]	valid_0's auc: 0.934816                                                                   
[8]	valid_0's auc: 0.93631                                                                    
[9]	valid_0's auc: 0.937947                                                                   
[10]	valid_0's auc: 0.938284                                                                  
[11]	valid_0's auc: 0.937886                                                                  
[12]	valid_0's auc: 0.941127                      

[38]	valid_0's auc: 0.933668                                                                  
[39]	valid_0's auc: 0.933655                                                                  
[40]	valid_0's auc: 0.934                                                                     
[41]	valid_0's auc: 0.933982                                                                  
[42]	valid_0's auc: 0.934339                                                                  
[43]	valid_0's auc: 0.934252                                                                  
[44]	valid_0's auc: 0.934319                                                                  
[45]	valid_0's auc: 0.93458                                                                   
[46]	valid_0's auc: 0.934631                                                                  
[47]	valid_0's auc: 0.934545                                                                  
[48]	valid_0's auc: 0.93484                       

[24]	valid_0's auc: 0.934559                                                                  
[25]	valid_0's auc: 0.935624                                                                  
[26]	valid_0's auc: 0.93593                                                                   
[27]	valid_0's auc: 0.936511                                                                  
[28]	valid_0's auc: 0.936301                                                                  
[29]	valid_0's auc: 0.937284                                                                  
[30]	valid_0's auc: 0.937594                                                                  
[31]	valid_0's auc: 0.937785                                                                  
[32]	valid_0's auc: 0.938929                                                                  
[33]	valid_0's auc: 0.938917                                                                  
[34]	valid_0's auc: 0.939429                      

[10]	valid_0's auc: 0.938497                                                                  
[11]	valid_0's auc: 0.938936                                                                  
[12]	valid_0's auc: 0.939547                                                                  
[13]	valid_0's auc: 0.940381                                                                  
[14]	valid_0's auc: 0.941411                                                                  
[15]	valid_0's auc: 0.941924                                                                  
[16]	valid_0's auc: 0.942938                                                                  
[17]	valid_0's auc: 0.943552                                                                  
[18]	valid_0's auc: 0.9433                                                                    
[19]	valid_0's auc: 0.943274                                                                  
[20]	valid_0's auc: 0.944345                      

[46]	valid_0's auc: 0.944265                                                                  
[47]	valid_0's auc: 0.944194                                                                  
[48]	valid_0's auc: 0.944332                                                                  
[49]	valid_0's auc: 0.944321                                                                  
[50]	valid_0's auc: 0.944423                                                                  
[1]	valid_0's auc: 0.897067                                                                   
[2]	valid_0's auc: 0.909763                                                                   
[3]	valid_0's auc: 0.914631                                                                   
[4]	valid_0's auc: 0.918637                                                                   
[5]	valid_0's auc: 0.918692                                                                   
[6]	valid_0's auc: 0.92026                        

[32]	valid_0's auc: 0.942601                                                                  
[33]	valid_0's auc: 0.943062                                                                  
[34]	valid_0's auc: 0.943188                                                                  
[35]	valid_0's auc: 0.943073                                                                  
[36]	valid_0's auc: 0.943197                                                                  
[37]	valid_0's auc: 0.943299                                                                  
[38]	valid_0's auc: 0.943509                                                                  
[39]	valid_0's auc: 0.944245                                                                  
[40]	valid_0's auc: 0.944142                                                                  
[41]	valid_0's auc: 0.944206                                                                  
[42]	valid_0's auc: 0.944064                      

[18]	valid_0's auc: 0.940064                                                                  
[19]	valid_0's auc: 0.9399                                                                    
[20]	valid_0's auc: 0.940411                                                                  
[21]	valid_0's auc: 0.940261                                                                  
[22]	valid_0's auc: 0.94038                                                                   
[23]	valid_0's auc: 0.940862                                                                  
[24]	valid_0's auc: 0.941384                                                                  
[25]	valid_0's auc: 0.942208                                                                  
[26]	valid_0's auc: 0.942071                                                                  
[27]	valid_0's auc: 0.94176                                                                   
[28]	valid_0's auc: 0.941473                      

[4]	valid_0's auc: 0.926743                                                                   
[5]	valid_0's auc: 0.930577                                                                   
[6]	valid_0's auc: 0.932661                                                                   
[7]	valid_0's auc: 0.934049                                                                   
[8]	valid_0's auc: 0.934506                                                                   
[9]	valid_0's auc: 0.935709                                                                   
[10]	valid_0's auc: 0.937142                                                                  
[11]	valid_0's auc: 0.939903                                                                  
[12]	valid_0's auc: 0.939956                                                                  
[13]	valid_0's auc: 0.939353                                                                  
[14]	valid_0's auc: 0.939246                      

[40]	valid_0's auc: 0.945717                                                                  
[41]	valid_0's auc: 0.946002                                                                  
[42]	valid_0's auc: 0.946017                                                                  
[43]	valid_0's auc: 0.945935                                                                  
[44]	valid_0's auc: 0.945181                                                                  
[45]	valid_0's auc: 0.944463                                                                  
[46]	valid_0's auc: 0.944123                                                                  
[47]	valid_0's auc: 0.944265                                                                  
[48]	valid_0's auc: 0.944391                                                                  
[49]	valid_0's auc: 0.944323                                                                  
[50]	valid_0's auc: 0.944549                      

KeyboardInterrupt: 

In [None]:
from hyperopt import space_eval
best = space_eval(space, bestDict)
best['num_leaves'] = int(best['num_leaves'])
best['subsample_for_bin'] = int(best['subsample_for_bin'])
best['min_child_samples'] = int(best['min_child_samples'])
best['metric'] = 'auc'
bestModel = lgb.train(best, train_set, ITER, valid_sets=valid_set, early_stopping_rounds=STOP_ROUND)

In [None]:
bestModel.save_model('model.txt', num_iteration=bestModel.best_iteration)

In [None]:
bestModel.num_trees()

In [None]:
best

In [None]:
from lightgbm import LGBMClassifier

gb_clf = LGBMClassifier(boosting_type=best['boosting_type'],
                        num_leaves=best['num_leaves'],
                        learning_rate=best['learning_rate'],
                        subsample_for_bin=best['subsample_for_bin'],
                        min_child_samples=best['min_child_samples'],
                        reg_alpha=best['reg_alpha'],
                        reg_lambda=best['reg_lambda'],
                        colsample_bytree=best['colsample_bytree'])
gb_clf.fit(XTrain, y)

In [None]:
probs = gb_clf.predict_proba(XTest, num_iteration=bestModel.best_iteration)
YTest = probs[:, 0]
YTest

In [None]:
np.max(YTest)

In [None]:
df_test['Predicted'] = YTest
df_test

In [None]:
df_test.to_csv('test.csv', index=False)

In [None]:
from lightgbm import LGBMModel
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

clf = LGBMModel()
parameters = {'learning_rate': sp_randFloat(),
                  'subsample'    : sp_randFloat(),
                  'n_estimators' : sp_randInt(100, 1000),
                  'max_depth'    : sp_randInt(4, 10) 
                 }

randm = RandomizedSearchCV(estimator=gb_clf, param_distributions = parameters, 
                           cv = 5, n_iter = 10, n_jobs=-1)
randm.fit(XTrain, y)

# Results from Random Search
print("\n========================================================")
print(" Results from Random Search " )
print("========================================================")    

print("\n The best estimator across ALL searched params:\n",
      randm.best_estimator_)

print("\n The best score across ALL searched params:\n",
      randm.best_score_)

print("\n The best parameters across ALL searched params:\n",
      randm.best_params_)

print("\n ========================================================")

gb_clf = randm.best_estimator_