In [3]:
import datetime
import gc
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import time
import warnings

from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold


import lightgbm as lgb
import numpy as np
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split

import optuna
from scipy.stats import ks_2samp
from tqdm import tqdm

from lightgbm import LGBMRegressor
from boruta import BorutaPy

# Suppress pandas SettingWithCopyWarning and all FutureWarning messages for the
# rest of the session so they do not flood the cell output.
warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

# Column names that are filtered out whenever a feature list is built from a
# frame's columns (see the `feats = [...]` comprehensions in this notebook).
FEATS_EXCLUDED = [
    'first_active_month',
    'target',
    'card_id',
    'outliers',
    'hist_purchase_date_max',
    'hist_purchase_date_min',
    'hist_card_id_size',
    'new_purchase_date_max',
    'new_purchase_date_min',
    'new_card_id_size',
    'OOF_PRED',
    'month_0',
]

# Disabled alternative (shorter exclusion list), kept as a bare string so it does not execute.
'''FEATS_EXCLUDED = ['first_active_month', 'target', 'card_id', 'outliers',
                  ]'''

@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    Parameters
    ----------
    title : str
        Label printed in front of the elapsed time.

    Yields
    ------
    None
        Control is handed to the body of the ``with`` statement.

    Notes
    -----
    Must be used as ``with timer("label"): ...``.  Calling ``timer(...)`` on
    its own only builds the context-manager object; nothing is measured or
    printed until it is entered and exited by a ``with`` statement.
    """
    t0 = time.time()
    try:
        yield
    finally:
        # Report the elapsed time even when the timed block raises, so a
        # failed long-running step still shows how long it ran before failing.
        print("{} - done in {:.0f}s".format(title, time.time() - t0))

# rmse
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    Both arguments are forwarded unchanged to
    ``sklearn.metrics.mean_squared_error``; the square root of that value is
    returned.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every ``object``-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.  It is not modified; ``pd.get_dummies`` returns a new frame.
    nan_as_category : bool, default True
        Passed to ``pd.get_dummies`` as ``dummy_na``: when true, an extra
        indicator column is produced for missing values.

    Returns
    -------
    (pandas.DataFrame, list)
        The encoded frame and the list of column names that were not present
        in the input, in the order they appear in the encoded frame.
    """
    # A set makes the "is this column new?" check O(1) per column instead of
    # scanning a list of every original column name each time.
    original_columns = set(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns
    
# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Plot the 40 features with the highest mean importance and save the figure.

    Parameters
    ----------
    feature_importance_df_ : pandas.DataFrame
        Must contain ``"feature"`` and ``"importance"`` columns.  Importances
        are averaged per feature to pick the top 40; all rows belonging to
        those features are then plotted.

    Side effects
    ------------
    Creates a new matplotlib figure and writes it to ``lgbm_importances.png``
    in the current working directory.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance[:40].index

    is_top_feature = feature_importance_df_["feature"].isin(top_features)
    plot_data = feature_importance_df_.loc[is_top_feature].sort_values(
        by="importance", ascending=False
    )

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=plot_data)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

# reduce memory
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that fits.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; each is cast to the first candidate dtype whose range
    strictly contains the column's min and max.  Other columns are untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.  Columns are reassigned on this object, so the
        caller's frame is modified.
    verbose : bool, default True
        When true, print the memory usage after optimisation and the
        percentage saved.  When false, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Float columns may be cast to ``float16``, which keeps only about three
    significant decimal digits; values are rounded accordingly.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Try integer widths from narrowest to widest; bounds are strict,
            # so a column touching a type's limit moves to the next width.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN (comparisons are False).
                df[col] = df[col].astype(np.float64)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df
    

In [4]:
#train_df = pd.read_csv('train_clean.csv')

# Load the feature table that the Boruta selection below is fitted on.
train_df = pd.read_csv('low_features3.csv')

# Candidate model inputs: every column of train_df that is not listed in FEATS_EXCLUDED.
feats = [column for column in train_df.columns if column not in FEATS_EXCLUDED]

In [5]:
#train_df[feats] = train_df[feats].fillna(0)

# Base estimator handed to BorutaPy.
# NOTE(review): device / gpu_platform_id / gpu_device_id are tied to the
# machine this was run on — confirm before re-running elsewhere.
lgbmclf = LGBMRegressor(device= 'gpu',
            gpu_platform_id= 1,
            gpu_device_id= 0,
            objective= 'regression_l2',
            boosting_type= 'gbdt',
            n_jobs= 4, max_depth= 7,
            n_estimators= 2000,
            subsample_freq= 2,
            subsample_for_bin= 200000,
            min_data_per_group= 100,
            max_cat_to_onehot= 4,
            cat_l2= 10.0,
            cat_smooth= 10.0,
            max_cat_threshold= 32,
            metric_freq= 10,
            verbosity= -1,
            metric= 'rmse',
            colsample_bytree= 0.5,
            learning_rate= 0.0061033234451294376,
            min_child_samples= 80,
            min_child_weight= 100.0,
            min_split_gain = 1e-06,
            num_leaves= 47,
            reg_alpha= 10.0,
            reg_lambda= 10.0,
            subsample= 0.9)
# NOTE(review): no random_state is passed here, so repeated runs may not
# select the same features — consider fixing a seed.
borutaselector = BorutaPy(lgbmclf, n_estimators=2000, verbose=2)

# timer() is a context manager, so it has to wrap the call in a `with` block;
# calling it bare only builds the object and never measures or prints anything.
with timer("Boruta feature selection"):
    borutaselector.fit(train_df[feats].values, train_df['target'].values)





Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	10
Tentative: 	4
Rejected: 	44



invalid value encountered in greater



Iteration: 	9 / 100
Confirmed: 	10
Tentative: 	4
Rejected: 	44



invalid value encountered in greater



Iteration: 	10 / 100
Confirmed: 	10
Tentative: 	4
Rejected: 	44



invalid value encountered in greater



Iteration: 	11 / 100
Confirmed: 	10
Tentative: 	4
Rejected: 	44



invalid value encountered in greater



Iteration: 	12 / 100
Confirmed: 	10
Tentative: 	4
Rejected: 	44



invalid value encountered in greater



Iteration: 	13 / 100
Confirmed: 	10
Tentative: 	4
Rejected: 	44



invalid value encountered in greater



Iteration: 	14 / 100
Confirmed: 	10
Tentative: 	4
Rejected: 	44



invalid value encountered in greater



Iteration: 	15 / 100
Confirmed: 	10
Tentative: 	4
Rejected: 	44



invalid value encountered in greater



Iteration: 	16 / 100
Confirmed: 	10
Tentative: 	3
Rejected: 	45



invalid value encountered in greater



Iteration: 	17 / 100
Confirmed: 	10
Tentative: 	3
Rejected: 	45



invalid value encountered in greater



Iteration: 	18 / 100
Confirmed: 	10
Tentative: 	3
Rejected: 	45



invalid value encountered in greater



Iteration: 	19 / 100
Confirmed: 	11
Tentative: 	2
Rejected: 	45



invalid value encountered in greater



Iteration: 	20 / 100
Confirmed: 	11
Tentative: 	2
Rejected: 	45



invalid value encountered in greater



Iteration: 	21 / 100
Confirmed: 	11
Tentative: 	2
Rejected: 	45



invalid value encountered in greater



Iteration: 	22 / 100
Confirmed: 	11
Tentative: 	2
Rejected: 	45



invalid value encountered in greater



Iteration: 	23 / 100
Confirmed: 	11
Tentative: 	2
Rejected: 	45



invalid value encountered in greater



Iteration: 	24 / 100
Confirmed: 	11
Tentative: 	2
Rejected: 	45



invalid value encountered in greater



Iteration: 	25 / 100
Confirmed: 	11
Tentative: 	2
Rejected: 	45



invalid value encountered in greater



Iteration: 	26 / 100
Confirmed: 	12
Tentative: 	1
Rejected: 	45



invalid value encountered in greater



Iteration: 	27 / 100
Confirmed: 	12
Tentative: 	1
Rejected: 	45



invalid value encountered in greater



Iteration: 	28 / 100
Confirmed: 	12
Tentative: 	1
Rejected: 	45



invalid value encountered in greater



Iteration: 	29 / 100
Confirmed: 	12
Tentative: 	1
Rejected: 	45



invalid value encountered in greater



Iteration: 	30 / 100
Confirmed: 	12
Tentative: 	1
Rejected: 	45



invalid value encountered in greater



Iteration: 	31 / 100
Confirmed: 	12
Tentative: 	1
Rejected: 	45



invalid value encountered in greater



Iteration: 	32 / 100
Confirmed: 	12
Tentative: 	0
Rejected: 	46


BorutaPy finished running.

Iteration: 	33 / 100
Confirmed: 	12
Tentative: 	0
Rejected: 	46


<contextlib._GeneratorContextManager at 0x12361ba8>

In [6]:
# Selection mask over `feats`, followed by the names of the selected columns.
print(borutaselector.support_)
print(train_df[feats].columns[borutaselector.support_])
print('\n Initial features: ', train_df[feats].columns.tolist())

# number of selected features
print('\n Number of selected features:')
print(borutaselector.n_features_)

# One row per candidate feature with its Boruta rank, sorted by ascending rank.
feature_df = (
    pd.DataFrame({
        'features': train_df[feats].columns.tolist(),
        'rank': borutaselector.ranking_,
    })
    .sort_values('rank', ascending=True)
    .reset_index(drop=True)
)
print('\n Top %d features:' % borutaselector.n_features_)
print(feature_df.head(borutaselector.n_features_))
feature_df.to_csv('boruta-low_feature3-ranking.csv', index=False)

# check ranking of features
print('\n Feature ranking:')
print(borutaselector.ranking_)

[False False  True False False False False False False False False False
 False  True False False False False  True  True  True  True False False
 False False  True False False False False False  True False False  True
 False False False False False False False False  True False False False
 False False False  True False False  True False False False]
Index(['new_purchase_amount_max', 'hist_first_buy',
       'new_purchase_date_uptonow', 'new_purchase_date_uptomin',
       'hist_month_diff_mean', 'hist_authorized_flag_mean',
       'new_purchase_amount_mean', 'hist_category_1_mean',
       'new_purchase_date_diff', 'hist_purchase_date_uptonow',
       'hist_installments_sum', 'hist_purchase_amount_mean'],
      dtype='object')

 Initial features:  ['new_month_lag_var', 'hist_month_lag_var', 'new_purchase_amount_max', 'new_weekofyear_nunique', 'hist_month_diff_var', 'feature_1', 'hist_hour_nunique', 'new_purchase_amount_sum', 'hist_purchase_date_uptomin', 'feature_2', 'new_month_lag_min

In [11]:
def kfold_lightgbm(train_df, test_df, num_folds, stratified = False, debug= False):
    """Set up K-fold LightGBM training; the training loop itself is currently disabled.

    As written, the only statements that execute are the shape printout, the
    construction of the fold splitter, the allocation of the out-of-fold and
    test prediction buffers plus an empty importance frame, and the feature
    list.  Everything after that (KS-test feature screening, NaN/inf cleanup,
    the per-fold training loop, the importance plot and the submission
    export) sits inside bare triple-quoted strings, which Python evaluates as
    no-op expressions.  The function therefore returns ``None`` without
    training anything.

    Parameters
    ----------
    train_df : DataFrame-like
        Training rows; the live code reads only ``shape`` and ``columns``.
    test_df : DataFrame-like
        Test rows; the live code reads only ``shape``.
    num_folds : int
        Passed as ``n_splits`` to the fold splitter.
    stratified : bool, default False
        Build a ``StratifiedKFold`` instead of a ``KFold``.
    debug : bool, default False
        Referenced only inside the disabled block, where a true value skips
        writing the submission file.
    """
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=326)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    # NOTE: with the loop below disabled, none of these four names is used again.
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # Disabled (bare string): per-feature two-sample KS test between test and
    # train, printing the features whose p-value is below 0.1.
    '''#Feats selecting
    
    list_p_value =[]

    for i in tqdm(train_df[feats].columns):
        list_p_value.append(ks_2samp(test_df[feats][i] , train_df[feats][i])[1])

    Se = pd.Series(list_p_value, index = train_df[feats].columns).sort_values() 
    list_discarded = list(Se[Se < .1].index)
    print(feats)
    print(list_discarded)'''
    
    # Disabled (bare string): NaN / inf handling for the feature columns.
    '''train_df[feats] = train_df[feats][~train_df[feats].isin([np.nan, np.inf, -np.inf]).any(1)]
    train_df[feats] = train_df[feats].fillna(0)
    test_df[feats] = test_df[feats][~test_df[feats].isin([np.nan, np.inf, -np.inf]).any(1)]
    test_df[feats] = test_df[feats].fillna(0)'''

    

    # Disabled (bare string): per-fold training with lgb.train, out-of-fold and
    # test predictions, importance plot and submission export.  If re-enabled,
    # it reads the module-level name `submission_file_name`.
    '''# k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['outliers'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['target'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x,
                                label=train_y,
                                free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,
                               label=valid_y,
                               free_raw_data=False)

        # params optimized by optuna
        
        params ={
                'task': 'train',
                'boosting': 'goss',
                'objective': 'regression',
                'metric': 'rmse',
                'learning_rate': 0.01,
                'subsample':  0.11405295162945166,
                'max_depth': 8,
                'top_rate': 0.986655469613098,
                'num_leaves': 35,
                'min_child_weight': 18.323652043184435,
                'other_rate':  0.00025052072832994557,
                'reg_alpha': 9.28121761424456,
                'colsample_bytree': 0.5842626131999885,
                'min_split_gain':  0.40814934734314257,
                'reg_lambda': 8.641682583175792,
                'min_data_in_leaf':  22,
                'verbose': -1,
                'seed':int(2**n_fold),
                'bagging_seed':int(2**n_fold),
                'drop_seed':int(2**n_fold),
                'device': 'gpu',
                  'gpu_platform_id': 1,
                  'gpu_device_id': 0,
                  'num_thread' : 1,
                  'sparse_threshold' : 1,
                }


       

        reg = lgb.train(
                        params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds= 200,
                        verbose_eval=100
                        )

        oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
        sub_preds += reg.predict(test_df[feats], num_iteration=reg.best_iteration) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    print("CV score: {:<8.5f}".format(mean_squared_error(oof_preds, train_df['target'])**0.5))

    # display importances
    display_importances(feature_importance_df)

    if not debug:
        # save submission file
        test_df.loc[:,'target'] = sub_preds
        test_df = test_df.reset_index()
        test_df[['card_id', 'target']].to_csv(submission_file_name, index=False)'''


In [12]:
def main(debug=False):
    """Run the pipeline driver: a timed (currently empty) split step, then ``kfold_lightgbm``.

    The feature-building and train/test split code is kept inside bare
    triple-quoted strings and does not execute.  ``train_df`` and ``test_df``
    are therefore never assigned inside this function; they are looked up as
    module-level (notebook) globals when ``kfold_lightgbm`` is called.

    Parameters
    ----------
    debug : bool, default False
        Forwarded to ``kfold_lightgbm``.  Also selects ``num_rows`` (10000
        when true, ``None`` otherwise), which only the disabled code consumes.
    """
    num_rows = 10000 if debug else None
    # Disabled (bare string): build the merged feature frame from the raw tables.
    '''with timer("train & test"):
        df = train_test(num_rows)
    with timer("historical transactions"):
        df = pd.merge(df, historical_transactions(num_rows), on='card_id', how='outer')
    with timer("new merchants"):
        df = pd.merge(df, new_merchant_transactions(num_rows), on='card_id', how='outer')
    with timer("additional features"):
        df = additional_features(df)'''
    with timer("split train & test"):
        # The split itself is disabled (bare string); only gc.collect() runs here.
        '''train_df = df[df['target'].notnull()]
        test_df = df[df['target'].isnull()]
        
        train_df = pd.read_csv('train_clean.csv')
        test_df = pd.read_csv('test_clean.csv')
        
        train_df = train_df.dropna(axis=1)
        test_df = test_df.dropna(axis=1)'''
        #del df
        gc.collect()
    
    with timer("Run LightGBM with kfold"):
        # NOTE(review): train_df / test_df are globals here; test_df is not
        # assigned in any cell visible in this part of the notebook — confirm
        # it exists in the kernel, otherwise this raises NameError.
        kfold_lightgbm(train_df, test_df, num_folds=5, stratified=False, debug=debug)

# Entry point: run the whole pipeline under a single outer timer.
if __name__ == "__main__":
    # Module-level name; the only reference to it visible here is the
    # (currently disabled) submission export inside kfold_lightgbm.
    submission_file_name = "parmssubmission4-GPU-feat.csv"
    with timer("Full model run"):
        main(debug=False)


split train & test - done in 0s
Starting LightGBM. Train shape: (201917, 203), test shape: (123623, 203)



Found `num_iteration` in params. Will use it instead of argument


Found `num_iteration` in params. Will use it instead of argument


Found `num_iteration` in params. Will use it instead of argument


Found `num_iteration` in params. Will use it instead of argument


Found `num_iteration` in params. Will use it instead of argument


Found `num_iteration` in params. Will use it instead of argument


Found `num_iteration` in params. Will use it instead of argument


Found `num_iteration` in params. Will use it instead of argument


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will u


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it 


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found `num_iteration` in params. Will use it instead of argument


invalid value encountered in greater


Found 

Run LightGBM with kfold - done in 22600s
Full model run - done in 22600s
