In [1]:
import datetime
import gc
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import time
import warnings

from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold


import lightgbm as lgb
import numpy as np
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split

import optuna
from scipy.stats import ks_2samp
from tqdm import tqdm

from lightgbm import LGBMRegressor
from boruta import BorutaPy

warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)


@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

# rmse
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns
    
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

# reduce memory
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df
    

Using TensorFlow backend.


In [2]:
#train_df = pd.read_csv('train_clean.csv')

train_df = pd.read_csv('train_clean3.csv')



feats =  [col for col in train_df.columns if col not in ['target', 'ID_code']]

In [3]:
random_state = 42
np.random.seed(random_state)

In [4]:
#train_df[feats] = train_df[feats].fillna(0)

lgbmclf = LGBMRegressor(
    device='gpu',
    gpu_platform_id= 1,
    gpu_device_id= 0,
    objective= "binary",
    metric= "auc",
    boosting= 'gbdt',
    max_depth= -1,
    num_leaves= 13,
    learning_rate= 0.01,
    bagging_freq= 5,
    bagging_fraction= 0.4,
    feature_fraction= 0.05,
    min_data_in_leaf= 80,
    min_sum_heassian_in_leaf= 10,
    tree_learner= "serial",
    boost_from_average= "false",
    #"lambda_l1" : 5,
    #"lambda_l2" : 5,
    bagging_seed= random_state,
    verbosity= 1,
    seed= random_state
)

borutaselector = BorutaPy(lgbmclf, n_estimators=2000, verbose=2)

start_time = timer(None)
borutaselector.fit(train_df[feats].values, train_df['target'].values) 
timer(start_time)





Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	208
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	208
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	208
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	208
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	208
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	208
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	208
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	170
Tentative: 	9
Rejected: 	29



invalid value encountered in greater



Iteration: 	9 / 100
Confirmed: 	170
Tentative: 	9
Rejected: 	29



invalid value encountered in greater



Iteration: 	10 / 100
Confirmed: 	170
Tentative: 	9
Rejected: 	29



invalid value encountered in greater



Iteration: 	11 / 100
Confirmed: 	170
Tentative: 	9
Rejected: 	29



invalid value encountered in greater



Iteration: 	12 / 100
Confirmed: 	170
Tentative: 	9
Rejected: 	29



invalid value encountered in greater



Iteration: 	13 / 100
Confirmed: 	170
Tentative: 	9
Rejected: 	29



invalid value encountered in greater



Iteration: 	14 / 100
Confirmed: 	170
Tentative: 	9
Rejected: 	29



invalid value encountered in greater



Iteration: 	15 / 100
Confirmed: 	170
Tentative: 	9
Rejected: 	29



invalid value encountered in greater



Iteration: 	16 / 100
Confirmed: 	170
Tentative: 	9
Rejected: 	29



invalid value encountered in greater



Iteration: 	17 / 100
Confirmed: 	170
Tentative: 	9
Rejected: 	29



invalid value encountered in greater



Iteration: 	18 / 100
Confirmed: 	170
Tentative: 	9
Rejected: 	29



invalid value encountered in greater



Iteration: 	19 / 100
Confirmed: 	172
Tentative: 	7
Rejected: 	29



invalid value encountered in greater



Iteration: 	20 / 100
Confirmed: 	172
Tentative: 	7
Rejected: 	29



invalid value encountered in greater



Iteration: 	21 / 100
Confirmed: 	172
Tentative: 	7
Rejected: 	29



invalid value encountered in greater



Iteration: 	22 / 100
Confirmed: 	174
Tentative: 	5
Rejected: 	29



invalid value encountered in greater



Iteration: 	23 / 100
Confirmed: 	174
Tentative: 	4
Rejected: 	30



invalid value encountered in greater



Iteration: 	24 / 100
Confirmed: 	174
Tentative: 	4
Rejected: 	30



invalid value encountered in greater



Iteration: 	25 / 100
Confirmed: 	174
Tentative: 	4
Rejected: 	30



invalid value encountered in greater



Iteration: 	26 / 100
Confirmed: 	174
Tentative: 	4
Rejected: 	30



invalid value encountered in greater



Iteration: 	27 / 100
Confirmed: 	174
Tentative: 	4
Rejected: 	30



invalid value encountered in greater



Iteration: 	28 / 100
Confirmed: 	174
Tentative: 	4
Rejected: 	30



invalid value encountered in greater



Iteration: 	29 / 100
Confirmed: 	174
Tentative: 	3
Rejected: 	31



invalid value encountered in greater



Iteration: 	30 / 100
Confirmed: 	174
Tentative: 	3
Rejected: 	31



invalid value encountered in greater



Iteration: 	31 / 100
Confirmed: 	174
Tentative: 	3
Rejected: 	31



invalid value encountered in greater



Iteration: 	32 / 100
Confirmed: 	174
Tentative: 	3
Rejected: 	31



invalid value encountered in greater



Iteration: 	33 / 100
Confirmed: 	174
Tentative: 	3
Rejected: 	31



invalid value encountered in greater



Iteration: 	34 / 100
Confirmed: 	174
Tentative: 	3
Rejected: 	31



invalid value encountered in greater



Iteration: 	35 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	36 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	37 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	38 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	39 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	40 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	41 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	42 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	43 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	44 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	45 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	46 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	47 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	48 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	49 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	50 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	51 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	52 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	53 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	54 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	55 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	56 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	57 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	58 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	59 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	60 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	61 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	62 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	63 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	64 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	65 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	66 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	67 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	68 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	69 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	70 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	71 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	72 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	73 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	74 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	75 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	76 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	77 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	78 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	79 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	80 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	81 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	82 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	83 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	84 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	85 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	86 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	87 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	88 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	89 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	90 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	91 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	92 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	93 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	94 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	95 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	96 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	97 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	98 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32



invalid value encountered in greater



Iteration: 	99 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32


BorutaPy finished running.

Iteration: 	100 / 100
Confirmed: 	174
Tentative: 	2
Rejected: 	32


<contextlib._GeneratorContextManager at 0x11ec3b70>

In [5]:
print(borutaselector.support_)
print(train_df[feats].columns[borutaselector.support_])
print ('\n Initial features: ', train_df[feats].columns.tolist() )

# number of selected features
print ('\n Number of selected features:')
print (borutaselector.n_features_)

feature_df = pd.DataFrame(train_df[feats].columns.tolist(), columns=['features'])
feature_df['rank']=borutaselector.ranking_
feature_df = feature_df.sort_values('rank', ascending=True).reset_index(drop=True)
print ('\n Top %d features:' % borutaselector.n_features_)
print (feature_df.head(borutaselector.n_features_))
feature_df.to_csv('boruta-low_feature3-ranking.csv', index=False)

# check ranking of features
print ('\n Feature ranking:')
print (borutaselector.ranking_)

[ True  True  True  True  True  True  True False  True  True False  True
  True  True False  True  True False  True  True  True  True  True  True
  True  True  True False  True False False  True  True  True  True  True
  True  True False False  True False False  True  True  True False False
  True  True  True  True  True  True  True  True  True  True  True  True
  True False  True  True  True False  True  True  True  True  True  True
  True False  True  True  True  True  True False  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
 False  True False  True False  True  True False  True  True  True  True
  True  True  True  True  True  True  True  True  True False  True  True
  True  True  True  True False  True False  True  True False  True  True
  True  True  True  True False  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True False  True  True
  True  True False  True False False  True  True  T

In [6]:
features = np.array(feature_df[['features']][feature_df['rank'] == 1]).tolist()
print(features)
features2 = []
for i in features:
    print(i)
    features2.append(i[0])
print(train_df[features2])
train_df[features2].to_csv('train_clean4.csv', index=False)

test_df = pd.read_csv('test_clean3.csv')
test_df[features2].to_csv('test_clean4.csv', index=False)

[['var_0'], ['var_128'], ['var_130'], ['var_131'], ['var_132'], ['var_133'], ['var_134'], ['var_135'], ['var_137'], ['var_138'], ['var_127'], ['var_139'], ['var_141'], ['var_142'], ['var_143'], ['var_144'], ['var_145'], ['var_146'], ['var_147'], ['var_148'], ['var_149'], ['var_140'], ['var_150'], ['var_125'], ['var_122'], ['var_101'], ['var_102'], ['kurt'], ['var_104'], ['var_105'], ['var_106'], ['var_107'], ['var_108'], ['var_109'], ['var_123'], ['var_110'], ['var_112'], ['var_113'], ['var_114'], ['var_115'], ['var_116'], ['var_118'], ['var_119'], ['var_120'], ['var_121'], ['var_111'], ['var_99'], ['var_151'], ['var_154'], ['var_186'], ['var_187'], ['var_188'], ['var_189'], ['var_190'], ['var_191'], ['var_192'], ['var_193'], ['var_194'], ['var_184'], ['var_195'], ['var_197'], ['var_198'], ['var_199'], ['sum'], ['min'], ['max'], ['mean'], ['std'], ['skew'], ['var_196'], ['var_152'], ['var_181'], ['var_179'], ['var_155'], ['var_156'], ['var_157'], ['var_159'], ['var_162'], ['var_163'], 

[200000 rows x 174 columns]
