In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import optuna
import lightgbm as lgb
from path import Path
from sklearn.model_selection import StratifiedKFold
class Config:
    """Central place for the settings used throughout this notebook."""

    # Directory the competition CSV files are read from.
    input_path = Path('../input/porto-seguro-safe-driver-prediction')
    # When True, the Optuna search cell runs; otherwise `params` below is used.
    optuna_lgb = False
    # Maximum number of boosting rounds passed to LGBMClassifier.
    n_estimators = 1500
    # Early-stopping patience (in boosting rounds) passed to LightGBM.
    early_stopping_round = 150
    # Number of StratifiedKFold splits.
    cv_folds = 5
    # Single seed used for the CV splitter and for the LightGBM model.
    random_state = 0
    # Default LightGBM hyper-parameters.
    # NOTE: this dict is a class attribute, so in-place changes
    # (e.g. `config.params.update(...)`) are visible through every reference.
    params = {'objective': 'binary',
              'boosting_type': 'gbdt',
              'learning_rate': 0.01,
              'max_bin': 25,
              'num_leaves': 31,
              'min_child_samples': 1500,
              'colsample_bytree': 0.7,
              'subsample_freq': 1,
              'subsample': 0.7,
              'reg_alpha': 1.0,
              'reg_lambda': 1.0,
              'verbosity': 0,
              # Derived from the attribute above so the seed is defined once.
              'random_state': random_state}

config = Config()

In [None]:
# Read the training, test and sample-submission tables, using the `id` column
# of each file as the DataFrame index.
train, test, submission = (
    pd.read_csv(config.input_path / csv_name, index_col='id')
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
# Column groups are identified by the "_calc" / "_cat" substring in the name.
calc_features = [col for col in train.columns if "_calc" in col]
cat_features = [col for col in train.columns if "_cat" in col]


In [None]:
# Keep the label column as its own Series, then rebind `train` to a copy of the
# frame without that column so only features remain.
target = train["target"]
train = train.drop(columns=["target"])

In [None]:
# Remove every "_calc" column from both the training and the test features.
train = train.drop(columns=calc_features)
test = test.drop(columns=calc_features)


In [None]:
# One-hot encode the categorical features.
train = pd.get_dummies(train, columns=cat_features)
test = pd.get_dummies(test, columns=cat_features)
# The two frames are encoded independently, so a category level that occurs in
# only one of them would produce different column sets. Align test to the
# training layout: levels absent from test become all-zero columns and levels
# seen only in test (which the model cannot use) are dropped.
test = test.reindex(columns=train.columns, fill_value=0)
# `Index.equals` is safe for indexes of different lengths, unlike `==`.
assert train.columns.equals(test.columns), \
    "train and test feature columns differ after one-hot encoding"

A useful [link](https://pythonsimplified.com/difference-between-onehotencoder-and-get_dummies/) that discusses the difference between scikit-learn's `OneHotEncoder` and pandas' `get_dummies`.

Define the evaluation metric, the normalized Gini coefficient. The code is taken from CPMP's [post](https://www.kaggle.com/code/cpmpml/extremely-fast-gini-computation).

In [None]:
from numba import jit

@jit
def eval_gini(y_true, y_prob):
    """Compute the normalized Gini coefficient of scores against labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; converted with ``np.asarray``. Presumably binary
        0/1 values (each entry is summed as a count of positives) — confirm
        against callers.
    y_prob : array-like
        Predicted scores. Only their ordering is used, via ``np.argsort``;
        ties are not given any special treatment.

    Returns
    -------
    float
        ``1 - 2 * gini / (ntrue * (n - ntrue))`` where ``gini`` is the
        accumulated sum described in the loop below. The denominator is zero
        when ``ntrue`` is 0 or equals ``n`` (e.g. all labels identical).
    """
    y_true = np.asarray(y_true)
    # Reorder the labels by ascending predicted score.
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0   # running sum of the labels (number of positives for 0/1 labels)
    gini = 0    # running sum of y_i * delta
    delta = 0   # running sum of (1 - y_i) over the entries already visited
    n = len(y_true)
    # Walk from the highest-scored entry down to the lowest-scored one.
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        # Weight the current label by the (1 - y) mass ranked above it.
        gini += y_i * delta
        delta += 1 - y_i
    # Normalize by ntrue * (n - ntrue) and map onto the Gini scale.
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

def gini_lgb(y_true, y_pred):
    """Wrap ``eval_gini`` as a custom evaluation metric.

    Returns a ``(name, value, is_higher_better)`` tuple: the metric name
    ``'normalized_gini_coef'``, the score computed by ``eval_gini`` and
    ``True`` to signal that larger values are better.
    """
    return 'normalized_gini_coef', eval_gini(y_true, y_pred), True

Tune the training parameters with Optuna. Set the `optuna_lgb` flag to `True` in the `Config` class first; each trial takes around 4 minutes.

In [None]:
if config.optuna_lgb:

    def objective(trial):
        """Return the mean cross-validated normalized Gini for one Optuna trial.

        Samples a set of LightGBM hyper-parameters from `trial`, fits one
        LGBMClassifier per stratified fold with early stopping, and averages
        the best validation `normalized_gini_coef` over the folds.
        """
        trial_params = {
            'learning_rate': trial.suggest_float("learning_rate", 0.01, 1.0),
            'num_leaves': trial.suggest_int("num_leaves", 3, 255),
            'min_child_samples': trial.suggest_int("min_child_samples",
                                                   3, 3000),
            'colsample_bytree': trial.suggest_float("colsample_bytree",
                                                    0.1, 1.0),
            'subsample_freq': trial.suggest_int("subsample_freq", 0, 10),
            'subsample': trial.suggest_float("subsample", 0.1, 1.0),
            'reg_alpha': trial.suggest_float("reg_alpha", 1e-9, 10.0,
                                             log=True),
            'reg_lambda': trial.suggest_float("reg_lambda", 1e-9, 10.0,
                                              log=True),
        }

        scores = []
        skf = StratifiedKFold(n_splits=config.cv_folds, shuffle=True,
                              random_state=config.random_state)
        for train_idx, valid_idx in skf.split(train, target):
            X_train = train.iloc[train_idx]
            y_train = target.iloc[train_idx]
            X_valid = train.iloc[valid_idx]
            y_valid = target.iloc[valid_idx]
            # Round budget and patience come from Config so the search uses
            # the same settings as the final training cell.
            model = lgb.LGBMClassifier(
                **trial_params,
                n_estimators=config.n_estimators,
                early_stopping_round=config.early_stopping_round,
                force_row_wise=True)
            callbacks = [lgb.early_stopping(
                stopping_rounds=config.early_stopping_round,
                verbose=False)]
            model.fit(X_train, y_train,
                      eval_set=[(X_valid, y_valid)],
                      eval_metric=gini_lgb, callbacks=callbacks)

            scores.append(
                model.best_score_['valid_0']['normalized_gini_coef'])
        return np.mean(scores)

    # Seed the sampler so that repeated runs explore the same trial sequence.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=config.random_state))
    study.optimize(objective, n_trials=300)
    print("Best Gini Normalized Score", study.best_value)
    print("Best parameters", study.best_params)

    # Fixed settings, completed with the best hyper-parameters found.
    params = {'objective': 'binary',
              'boosting_type': 'gbdt',
              'verbosity': 0,
              'random_state': config.random_state}

    params.update(study.best_params)

else:
    # Deliberately the same dict object as config.params (no copy): later
    # in-place updates of config.params are therefore seen through `params`.
    params = config.params


I manually stopped the run at trial 148 since the best trial is trial 77 with value 0.2884217536815791.

>  Trial 147 finished with value: 0.28603083152555736 and parameters: {'learning_rate': 0.05304487091205322, 'num_leaves': 55, 'min_child_samples': 2774, 'colsample_bytree': 0.5997781187584867, 'subsample_freq': 4, 'subsample': 0.9578570860177447, 'reg_alpha': 0.34649717192668644, 'reg_lambda': 0.04990565230921181}. Best is trial 77 with value: 0.2884217536815791.

Update `config.params` with the values from the best trial (trial 77):
> Trial 77 finished with value: 0.2884217536815791 and parameters: {'learning_rate': 0.034839525105479385, 'num_leaves': 27, 'min_child_samples': 2336, 'colsample_bytree': 0.5365060782638853, 'subsample_freq': 7, 'subsample': 0.9202881612831774, 'reg_alpha': 3.9121189920106114, 'reg_lambda': 0.0004977611982062695}. Best is trial 77 with value: 0.2884217536815791.

In [None]:
# Overwrite the default hyper-parameters in place with the tuned values.
config.params.update(
    learning_rate=0.034839525105479385,
    num_leaves=27,
    min_child_samples=2336,
    colsample_bytree=0.5365060782638853,
    subsample_freq=7,
    subsample=0.9202881612831774,
    reg_alpha=3.9121189920106114,
    reg_lambda=0.0004977611982062695,
)

In [None]:
# Display the hyper-parameter dictionary as the cell output.
config.params

Train one model per cross-validation fold. Each model is validated on its held-out fold and contributes an equal share to the averaged test predictions. The full run takes around 10 minutes.

In [None]:
# Accumulators: fold-averaged test predictions, out-of-fold predictions for
# the training rows, and the best validation Gini of each fold.
preds = np.zeros(len(test))
oof = np.zeros(len(train))
metric_evaluations = list()
skf = StratifiedKFold(n_splits=config.cv_folds, shuffle=True,
                      random_state=config.random_state)
for idx, (train_idx, valid_idx) in enumerate(skf.split(train, target)):
    print(f"CV fold {idx}")
    X_train, y_train = train.iloc[train_idx], target.iloc[train_idx]
    X_valid, y_valid = train.iloc[valid_idx], target.iloc[valid_idx]

    model = lgb.LGBMClassifier(
        **params,
        n_estimators=config.n_estimators,
        early_stopping_round=config.early_stopping_round,
        force_row_wise=True)

    # Patience is read from Config here as well, so the callback and the
    # constructor argument can no longer disagree when the config changes.
    callbacks = [
        lgb.early_stopping(stopping_rounds=config.early_stopping_round),
        lgb.log_evaluation(period=100, show_stdv=False)]

    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric=gini_lgb, callbacks=callbacks)
    metric_evaluations.append(
        model.best_score_['valid_0']['normalized_gini_coef'])
    # Each fold contributes 1 / n_splits of the final test prediction.
    preds += (model.predict_proba(test,
              num_iteration=model.best_iteration_)[:, 1]
              / skf.n_splits)
    # Store positive-class probabilities for the rows held out in this fold.
    oof[valid_idx] = model.predict_proba(
        X_valid, num_iteration=model.best_iteration_)[:, 1]



In [None]:
# Display the list of per-fold best validation scores as the cell output.
metric_evaluations

Print the mean and standard deviation of the normalized Gini coefficient across the cross-validation folds.

In [None]:
# Report the cross-validation score as "mean (standard deviation)" over folds.
print(f'LightGBM CV normalized Gini coefficient:{np.mean(metric_evaluations):0.3f} ({np.std(metric_evaluations):0.3f})')

In [None]:
# Replace the sample-submission targets with the averaged test predictions and
# write the file; the index is written as the first column.
submission = submission.assign(target=preds)
submission.to_csv('lgb_submission.csv')

# Save the out-of-fold predictions next to the training ids.
oof_columns = {'id': train.index, 'target': oof}
oofs = pd.DataFrame(oof_columns)
oofs.to_csv('lgb_oof.csv', index=False)