# Model 0v0: Light GBM
Model 0v0 explores the utility of a LightGBM regressor that features extensive parameter tuning and averaging of multiple models. The inspiration for this notebook is taken from a portion of this public Kaggle kernel: https://www.kaggle.com/prashantkikani/ensembling-has-always-been-the-answer/code.

## Load Libraries:

In [9]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import pickle as pkl

import h5py

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from IPython.display import display

## Helper Functions:

In [2]:
# Load h5py file
def loadh5(fname, dname):
    """Read one dataset from an HDF5 file completely into memory.

    Parameters
    ----------
    fname : str
        Path of the HDF5 file; it is opened read-only.
    dname : str
        Key of the dataset inside the file.

    Returns
    -------
    The full contents of the dataset, obtained by slicing it with ``[:]``.
    """
    # The context manager closes the file even when the dataset lookup or the
    # read raises; the earlier explicit close() was skipped in that case.
    with h5py.File(fname, 'r') as h5f:
        return h5f[dname][:]

In [11]:
# Load pickle file
def loadpickle(fname):
    """Deserialize and return the object stored in the pickle file ``fname``.

    NOTE(review): unpickling executes arbitrary code, so only point this at
    files produced by this project.
    """
    handle = open(fname, 'rb')
    try:
        return pkl.load(handle)
    finally:
        # Always release the file handle, whether or not unpickling succeeded
        handle.close()

## Load Data:

In [56]:
# Location and naming scheme of the preprocessed inputs
data_path = '../data/'
train_dname = 'train_s0'
test_dname = 'test_s0'
f_ext = '_vanilla.h5'

# Raw value arrays: each HDF5 file is read through the key that matches its file stem
train_h5_path = data_path + train_dname + f_ext
test_h5_path = data_path + test_dname + f_ext
train_data = loadh5(train_h5_path, train_dname)
test_data = loadh5(test_h5_path, test_dname)

# Row indexes and column names were pickled separately from the values
train_idx, test_idx = [loadpickle(data_path + split + '_idx.pkl')
                       for split in ('train', 'test')]
train_cols, test_cols = [loadpickle(data_path + split + '_cols.pkl')
                         for split in ('train', 'test')]

In [16]:
# Reassemble labelled frames from the raw arrays plus their saved index / column names
train_df = pd.DataFrame(train_data, index=train_idx, columns=train_cols)
test_df = pd.DataFrame(test_data, index=test_idx, columns=test_cols)

In [17]:
# Pull the 'target' column out of the training frame: pop() returns the column
# and removes it from train_df in place, leaving only feature columns behind.
# NOTE: this cell is not re-runnable, because 'target' no longer exists afterwards.
labels = train_df.pop('target')

In [18]:
# Report the dimensions of both feature matrices
n_train_rows, n_train_feats = train_df.shape
n_test_rows, n_test_feats = test_df.shape
print('Shape of training dataset: {} Rows, {} Columns'.format(n_train_rows, n_train_feats))
print('Shape of test dataset: {} Rows, {} Columns'.format(n_test_rows, n_test_feats))

Shape of training dataset: 4459 Rows, 4730 Columns
Shape of test dataset: 49342 Rows, 4730 Columns


### Format Data for Modeling:

In [27]:
# Model the target on a log1p scale; the submission cells map predictions back with expm1
log_labels = np.log1p(labels)

In [28]:
# Hold out 20% of the rows (fixed random_state for repeatability).
# NOTE(review): xtrain / xval / ytrain / yval are not read by any later cell shown in
# this notebook; cross-validation and final training both use the full train_df.
xtrain, xval, ytrain, yval = train_test_split(train_df, log_labels, test_size=0.2, random_state=0)

## Train Models:

### Light GBM Regressor:
Process flow for LightGBM regressor training:
* Find the optimal number of boosting rounds for a baseline learning rate (0.01)
* Repeat the search for alternative learning rates (0.012, 0.008, 0.016)
* Select the best-scoring combination of parameters and number of rounds from the trials above
* Train the final model several times with different seed initializations and average the predictions

In [29]:
# Wrap the complete training frame and the log-scale labels for LightGBM;
# this Dataset is what both lgb.cv and lgb.train consume in the cells below.
lgtrain = lgb.Dataset(train_df, label=log_labels, feature_name='auto')

In [33]:
# Log of cross-validation trials: one row is appended per parameter setting tried.
# NOTE(review): 'LB' is only ever written as None in this notebook. It is presumably
# a slot for a leaderboard score entered by hand; confirm.
results = pd.DataFrame(columns=['Rounds', 'Score', 'STD', 'LB', 'Parameters'])

In [30]:
# Baseline LightGBM configuration shared by every cross-validation trial
lgbm_params = dict(
    # Task definition
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    # Tree growth
    learning_rate=0.01,
    num_leaves=200,
    # Column / row subsampling (rows are re-sampled every `bagging_freq` iterations)
    feature_fraction=0.50,
    bagging_fraction=0.50,
    bagging_freq=4,
    # -1 leaves the tree depth unbounded
    max_depth=-1,
    # Regularisation
    lambda_l1=0.3,
    lambda_l2=0.1,
    min_sum_hessian_in_leaf=10,
    # Handle zeros the same way as missing values
    zero_as_missing=True,
)

In [31]:
# Baseline search: 5-fold CV with the learning rate currently set in lgbm_params.
# Boosting stops once the CV metric has not improved for 75 consecutive rounds
# (2500 rounds at most); progress is logged every 50 rounds.
cv_settings = dict(
    num_boost_round=2500,
    nfold=5,
    stratified=False,
    seed=23,
    early_stopping_rounds=75,
    verbose_eval=50,
)
lgb_cv = lgb.cv(params=lgbm_params, train_set=lgtrain, **cv_settings)

[50]	cv_agg's rmse: 1.58742 + 0.0305161
[100]	cv_agg's rmse: 1.50458 + 0.0316924
[150]	cv_agg's rmse: 1.46337 + 0.0325538
[200]	cv_agg's rmse: 1.44265 + 0.0327991
[250]	cv_agg's rmse: 1.43354 + 0.032863
[300]	cv_agg's rmse: 1.43088 + 0.0337591
[350]	cv_agg's rmse: 1.42942 + 0.0337614
[400]	cv_agg's rmse: 1.43018 + 0.0342158


In [36]:
# Locate the best iteration of the CV run. `optimal_rounds` is the 0-based position
# of the lowest mean RMSE; the final training cell adds 1 to turn it into a round count.
cv_means = lgb_cv['rmse-mean']
cv_stds = lgb_cv['rmse-stdv']
optimal_rounds = np.argmin(cv_means)
best_cv_score = min(cv_means)
print('Optimal Round: {}\nOptimal Score: {} + {}'.format(
    optimal_rounds, best_cv_score, cv_stds[optimal_rounds]))

Optimal Round: 339
Optimal Score: 1.42926761227 + 0.0336902524524


In [37]:
# Record the baseline trial in the results log.
# A *copy* of lgbm_params is stored: the dict itself is modified by later cells,
# and storing the live object would silently rewrite the parameters logged here.
results = results.append({'Rounds': optimal_rounds,
                          'Score': best_cv_score,
                          'STD': lgb_cv['rmse-stdv'][optimal_rounds],
                          'LB': None,
                          'Parameters': dict(lgbm_params)}, ignore_index=True)

In [39]:
# Repeat the CV search for alternative learning rates around the baseline
learning_rates = [0.012, 0.008, 0.016]
for param in learning_rates:
    print('Learning rate: {}'.format(param))
    # Build an independent parameter dict for this trial instead of editing
    # lgbm_params in place. Every row of `results` must keep the parameters it
    # was actually scored with; sharing one mutable dict made all rows end up
    # reporting the last learning rate tried.
    trial_params = dict(lgbm_params, learning_rate=param)
    # Get cross validated results
    lgb_cv = lgb.cv(
        params=trial_params,
        train_set=lgtrain,
        num_boost_round=10000,
        stratified=False,
        nfold=5,
        verbose_eval=200,
        seed=23,
        early_stopping_rounds=75)
    # 0-based position of the lowest mean CV RMSE, and the score at that position
    optimal_rounds = np.argmin(lgb_cv['rmse-mean'])
    best_cv_score = min(lgb_cv['rmse-mean'])
    print('Optimal Round: {}\nOptimal Score: {} + {}'.format(optimal_rounds, best_cv_score,
                                                             lgb_cv['rmse-stdv'][optimal_rounds]))
    # Append results to results dataframe
    results = results.append({'Rounds': optimal_rounds,
                              'Score': best_cv_score,
                              'STD': lgb_cv['rmse-stdv'][optimal_rounds],
                              'LB': None,
                              'Parameters': trial_params}, ignore_index=True)

Learning rate: 0.012
[200]	cv_agg's rmse: 1.4344 + 0.0334958
Optimal Round: 284
Optimal Score: 1.42870101453 + 0.034950291968
Learning rate: 0.008
[200]	cv_agg's rmse: 1.45785 + 0.0314299
[400]	cv_agg's rmse: 1.42848 + 0.0341142
Optimal Round: 445
Optimal Score: 1.42693548377 + 0.034321978767
Learning rate: 0.016
[200]	cv_agg's rmse: 1.42889 + 0.0343656
Optimal Round: 195
Optimal Score: 1.42871922782 + 0.0341465784269


In [40]:
# Pick the trial with the lowest CV score.
# idxmin() returns an index *label*, so the row is fetched with .loc (label-based)
# rather than .iloc (position-based); the two only coincide for a default RangeIndex.
# 'Score' is cast to float because the results frame was created without dtypes.
best_idx = results['Score'].astype(float).idxmin()
best_trial = results.loc[best_idx]
# Copy the dict so that later edits (e.g. setting a seed) do not alter the results log
final_model_params = dict(best_trial['Parameters'])
optimal_rounds = int(best_trial['Rounds'])

In [42]:
# Train the final configuration once per seed and collect test predictions
multi_seed_pred = dict()
all_feature_importance_list = []
all_seeds = [27, 22, 300, 401, 7]
# Names of the columns the LightGBM Dataset was built from. train_cols cannot be
# used here: it is the column list the frame was created with and still contains
# 'target', so it has one more entry than there are feature importances.
feature_names = list(train_df.columns)
for seed in all_seeds:
    print('Seed: {}'.format(seed))
    # Per-seed copy, so the selected parameter dict itself is left untouched
    seeded_params = dict(final_model_params, seed=seed)
    # optimal_rounds is a 0-based iteration index, hence the +1 for a round count
    lgb_reg = lgb.train(seeded_params,
                        lgtrain,
                        num_boost_round=optimal_rounds + 1,
                        verbose_eval=200)
    all_feature_importance_list.append((feature_names, lgb_reg.feature_importance()))
    # Predict on test data
    multi_seed_pred[seed] = list(lgb_reg.predict(test_df))

Seed: 27
Seed: 22
Seed: 300
Seed: 401
Seed: 7


In [45]:
# One column of (log-scale) test predictions per seed; exact zeros are nudged to 1e-6
per_seed_preds = pd.DataFrame.from_dict(multi_seed_pred)
sub_preds = per_seed_preds.replace(0, 0.000001)

In [53]:
# Average the seeds row-wise on the log scale, then undo the log1p transform
avg_log_pred = sub_preds.mean(axis=1)
mean_sub = np.expm1(avg_log_pred).rename('target')
# Label the rows with the test identifiers under the index name 'ID'
mean_sub.index = test_idx
mean_sub.index.name = 'ID'

In [54]:
# Write the averaged predictions to the submission CSV: the 'ID' index is kept as
# the first column (index=True) and a header row is included (header=True).
mean_sub.to_csv('../submissions/lgb_0v0_submit.csv', index=True, header=True)