# Model 2v1: Time Series Sequencing with Light GBM
This model is inspired by the public kernel: https://www.kaggle.com/the1owl/love-is-the-answer

Model 2v1 is a combination of original machine learning along with a sequence of weighted blending with top public leaderboard results. As a disclaimer, the public results I've chosen differ from the ones used in the public kernel linked above. The scores I will be blending together are:
1. *ts_lgb_2v0_submit_0.66.csv* - **PL Score: 0.66** (Created from my own Model 2v0)
2. *best_pub_blend.csv* - **PL Score: 0.63** (Taken from: https://www.kaggle.com/ashishpatel26/blending)
3. *seq_ext_blend.csv* - **PL Score: 0.63** (Taken from: https://www.kaggle.com/prashantkikani/santad-label-is-present-in-row)

At its core, the machine learning portion of Model 2v1 is an extension of Model 2v0's results. In Model 2v0, a "feature scoring" algorithm using XGBoost was used to determine the predictive value of each individual feature in the Santander dataset. Model 2v1 employs a different strategy. Model 2v1's strategy for choosing important features to include in the LightGBM training phase utilizes two concepts:
1. Prioritizing by number of values that a feature shares with the target variable
2. Checking that enough of a candidate feature's values (more than 3,500 rows in the code below) lie less than 5% above their respective target variable value

This second concept's idea of "sequencing" is conducted column-wise. In addition, Model 2v1 incorporates a notion of row-wise sequencing by adding features to the training set that capture the relative positions of the samples within the dataset (factoring in the **default lag value of 2**).

Another point of deviation between Model 2v0 and Model 2v1 (apart from the blending involved in Model 2v1) is that Model 2v1 trains an LGB model on the combined training and test sets rather than on the training set alone. The consequence of this training method is something to watch out for in this model.

By itself (i.e. without the usage of blending), it is not guaranteed that Model 2v1 will outperform Model 2v0. The important-feature selection process in 2v1 is based on looser criteria than that of 2v0, which relied entirely upon XGBoost results. However, with the inclusion of a cascaded blending procedure, Model 2v1 should be expected to comfortably outperform Model 2v0.

In [None]:
# Load libraries
import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pdb
import os
import h5py
import pickle

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [None]:
# Debug flag: passed to get_dataframes() in the main script. True loads the
# 'debug_*' data files, False loads the 'full_*' data files.
debug = False

## Helper Functions:

In [None]:
# Function for loading h5py file
def load_h5py(fname):
    '''
    Read the whole 'data' dataset out of the HDF5 file at `fname`

    The file is opened read-only and closed again before returning.
    '''
    with h5py.File(fname, 'r') as h5_file:
        contents = h5_file['data'][:]
    return contents
# Function for loading pickle file
def load_pickle(fname):
    '''
    Unpickle and return the object stored in the file at `fname`

    NOTE: unpickling can execute arbitrary code, so only point this at
    files you created yourself.
    '''
    with open(fname, 'rb') as pkl_file:
        loaded = pickle.load(pkl_file)
    return loaded
# Function for saving pickle file
def save_pickle(fname, data):
    '''
    Serialize `data` into the file at `fname` (overwriting it)

    Uses the highest pickle protocol available to the running interpreter.
    Returns None.
    '''
    with open(fname, 'wb') as pkl_file:
        pickle.dump(data, pkl_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Function for setting up
def get_input(debug=False):
    '''
    Function for loading either debug or full datasets

    Changes the working directory to '../data/compressed/', reads the
    train/test matrices (HDF5), the id/index/target pickles and the feature
    names, then changes to '../../scripts/'. That last step also runs when
    loading fails, so a missing file does not leave the session inside the
    data directory.

    debug: if True load the 'debug_*' files, otherwise the 'full_*' files
    returns: (fnames, train, id_train, train_idx, target, test, id_test, test_idx)
    '''
    pkl_files = ['train_id.pickle', 'trainidx.pickle', 'target.pickle', 'test_id.pickle', 'testidx.pickle']
    # File prefix and the word used in the progress message
    if debug:
        prefix, label = 'debug', 'debug'
    else:
        prefix, label = 'full', 'original'

    os.chdir('../data/compressed/')
    try:
        print(os.getcwd())
        print('Loading %s train and test datasets...' % label)
        # h5py files
        train = load_h5py('%s_train.h5' % prefix)
        test = load_h5py('%s_test.h5' % prefix)
        # pickle files
        id_train, train_idx, target, id_test, test_idx = [load_pickle('%s_%s' % (prefix, f)) for f in pkl_files]
        # Load feature names
        fnames = load_pickle('feature_names.pickle')
        # Find shape of loaded datasets
        print('Shape of training dataset: {} Rows, {} Columns'.format(*train.shape))
        print('Shape of test dataset: {} Rows, {} Columns'.format(*test.shape))
    finally:
        # Always leave the data directory, even if one of the loads raised
        os.chdir('../../scripts/')
    print(os.getcwd())
    return fnames, train, id_train, train_idx, target, test, id_test, test_idx

In [None]:
# Function for getting datasets in dataframe format
def get_dataframes(debug=False):
    '''
    Load the raw arrays through get_input and wrap them in dataframes

    debug: forwarded to get_input to choose the debug or full files
    returns: (fnames, train_df, test_df) where train_df holds the feature
             columns plus 'ID' and 'target', and test_df holds the feature
             columns plus 'ID'
    '''
    fnames, x_train, train_ids, train_index, y_train, x_test, test_ids, test_index = get_input(debug)

    # Training frame: features first, then identifier and target columns
    train_df = pd.DataFrame(data=x_train, index=train_index, columns=fnames)
    train_df['ID'] = train_ids
    train_df['target'] = y_train

    # Test frame: features plus identifier only
    test_df = pd.DataFrame(data=x_test, index=test_index, columns=fnames)
    test_df['ID'] = test_ids

    train_rows, train_width = train_df.shape
    test_rows, test_width = test_df.shape
    print('\nShape of training dataframe: {} Rows, {} Columns'.format(train_rows, train_width))
    print('Shape of test dataframe: {} Rows, {} Columns'.format(test_rows, test_width))
    return fnames, train_df, test_df

## Main Script:

In [None]:
# Drop dataframes left over from a previous run of this cell before reloading
try:
    del fnames, train, test
except NameError:
    # Nothing (or not everything) was loaded yet, so there is nothing to clear
    pass
else:
    print('Clearing loaded dataframes from memory...\n')
fnames, train, test = get_dataframes(debug=debug)

In [None]:
# Load Model 2v0's results (a submission CSV). Later cells join it onto the
# test set by 'ID' and read its 'target' column as the 'res_2v0' blend input.
model_2v0_name = '../submissions/ts_lgb_2v0_submit_0.66.csv'
res_2v0 = pd.read_csv(model_2v0_name)

In [None]:
# Find important columns
# A feature is kept when it passes both count thresholds:
#   crit1 - number of rows where the feature equals the target exactly
#   crit2 - number of rows where the signed relative difference
#           (feature - target) / target is below 0.05
# NOTE(review): crit2 is one-sided, so every value below the target
# (including 0) counts towards it - confirm this is intended.
# NOTE(review): assumes the target is never 0 (division) - verify.
target_col = train['target']  # does not change inside the loop
important_cols = []
for col in fnames:
    feature = train[col]
    crit1 = np.sum(feature == target_col).astype(int)
    crit2 = np.sum(((feature - target_col) / target_col) < 0.05).astype(int)
    if crit1 > 30 and crit2 > 3500:
        important_cols.append(col)
print('\nNumber of important features: %s\n' % len(important_cols))

In [None]:
# Rewrite train and test data
# Copy the list: `cols` is extended later with engineered feature names, and
# that must not grow important_cols as a side effect
cols = list(important_cols)
# Explicit copies, because new columns are assigned to these frames later on
train_data = train[cols + ['ID', 'target']].copy()
test_data = test[cols + ['ID']].copy()

In [None]:
# Function for calculating row-wise metadata
def get_meta(df, cols):
    '''
    Add row-wise summary statistics of the `cols` columns to `df`

    New columns:
      'nz_mean', 'nz_max', 'nz_min' - mean/max/min over the entries of each
                                      row that are not 0 (NaN if none)
      'num_zero'                    - number of entries equal to 0
      'mean', 'max', 'min'          - mean/max/min over all entries

    df: dataframe to extend; it is modified in place
    cols: list of column names to summarise
    returns: the same `df` object
    '''
    # Slice once and reduce along axis=1 instead of calling a Python lambda
    # per row; the statistics are the same (up to float summation order)
    values = df[cols]
    # Zeros become NaN so that the NaN-skipping reductions ignore them
    non_zero = values.where(values != 0)

    df['nz_mean'] = non_zero.mean(axis=1)
    df['nz_max'] = non_zero.max(axis=1)
    df['nz_min'] = non_zero.min(axis=1)
    df['num_zero'] = (values == 0).sum(axis=1)
    df['mean'] = values.mean(axis=1)
    df['max'] = values.max(axis=1)
    df['min'] = values.min(axis=1)
    return df

In [None]:
# Add metadata to train and test data
print('Adding metadata to train set...')
train_data = get_meta(train_data, important_cols)
print('Adding metadata to test set...')
test_data = get_meta(test_data, important_cols)
# Update column list
# Rebind instead of extending in place: `cols` may be the very same list object
# as important_cols, and growing that list would feed the metadata columns back
# into get_meta if this cell is run again. Names already present are skipped.
meta_cols = ['nz_mean', 'nz_max', 'nz_min', 'num_zero', 'mean', 'max', 'min']
cols = cols + [c for c in meta_cols if c not in cols]

In [None]:
# Add in sequencing information row-wise
# One 0/1 flag column per modulus 2..99: 1 where (row index + 2) is divisible
# by the modulus, 0 otherwise. Each flag is also registered as a model feature.
for modulus in range(2, 100):
    flag_name = 'index_%s' % str(modulus)
    for frame in (train_data, test_data):
        frame[flag_name] = ((frame.index + 2) % modulus == 0).astype(int)
    cols.append(flag_name)

In [None]:
# Add leak to test set
# Left-join the Model 2v0 results onto the test rows by 'ID', so every test row
# is kept. Presumably this is where test_data gets its 'target' column - the
# CSV schema is not visible here, verify.
test_data = pd.merge(test_data, res_2v0, how='left', on='ID')

In [None]:
# Format datasets for training and predicting
# Zeros become NaN in both frames (all columns, modified in place)
for frame in (train_data, test_data):
    frame.replace(0, np.nan, inplace=True)
# Concat the two dataframes together: test rows go underneath the training
# rows and the result gets a fresh 0..n-1 index
train_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

### Train LightGBM:

In [None]:
num_folds = 5
folds = KFold(n_splits=num_folds, shuffle=True, random_state=0)

# test_pred accumulates the SUM of the per-fold test predictions (original
# scale); train_pred holds out-of-fold predictions on the log1p scale
test_pred = np.zeros(test_data.shape[0])
train_pred = np.zeros(train_data.shape[0])

# Loop-invariant inputs, built once instead of once per fold
train_features = train_data[cols]
test_features = test_data[cols]
log_target = np.log1p(train_data.target.values)  # model is fit on log1p(target)

# Parameters shared by every fold; only the seed differs per fold
base_params = {'learning_rate': 0.02,
               'max_depth': 7,
               'boosting': 'gbdt',
               'objective': 'regression',
               'metric': 'rmse',
               'is_training_metric': True,
               'feature_fraction': 0.9,
               'bagging_fraction': 0.8,
               'bagging_freq': 5}

# Begin iteration over folds
for i, (trn, val) in enumerate(folds.split(train_data)):
    print('Iterating on fold %s...' % i)
    # Define parameters
    params = dict(base_params, seed=i)
    # Define LGB datasets
    dtrain = lgb.Dataset(train_features.iloc[trn], log_target[trn])
    dval = lgb.Dataset(train_features.iloc[val], log_target[val])
    # Train model
    model = lgb.train(params=params,
                      train_set=dtrain,
                      valid_sets=dval,
                      num_boost_round=3000,
                      early_stopping_rounds=100,
                      verbose_eval=200)
    # Update predictions: test predictions are mapped back with expm1 before
    # being summed; validation predictions stay on the log1p scale
    test_pred += np.expm1(model.predict(test_features, num_iteration=model.best_iteration))
    train_pred[val] = model.predict(train_features.iloc[val], num_iteration=model.best_iteration)
    # Find validation error (RMSE on the log1p scale)
    val_error = np.sqrt(mean_squared_error(log_target[val], train_pred[val]))
    print('Validation error for %s fold is: %f' % (i, val_error))

### Implement Cascaded Blending:

In [None]:
# Import leak values
# The path is relative to the current working directory
leak_path = './time_series/stats/'
path_test_leak = leak_path + 'test_leak.csv'
# Add test leak
# The 'compiled_leak' column of this file is what later overwrites the model
# predictions wherever it is non-null
test_leak = pd.read_csv(path_test_leak)

In [None]:
# Import files for blending
# Two public submission CSVs; their 'target' columns are read below as the
# 'res_best' and 'res_seq' inputs of the blend
best_pub_name = '../submissions/public/best_pub_blend.csv'
seq_ext_name = '../submissions/public/seq_ext_blend.csv'
res_best = pd.read_csv(best_pub_name)
res_seq = pd.read_csv(seq_ext_name)

In [None]:
# Format LightGBM results
# test_pred is a sum over the folds, so dividing by the fold count averages it
test_data['target'] = test_pred / folds.n_splits
# Replace predictions with time-series results from imported leak
# Rows are matched by index label between test_leak and test_data
has_leak = test_leak['compiled_leak'].notnull()
test_data.loc[has_leak, 'target'] = test_leak.loc[has_leak, 'compiled_leak']

In [None]:
# Make dataframe of targets
sub_df = pd.DataFrame()
sub_df['ID'] = test_data['ID']
# One prediction column per source, each taken from that source's 'target'
prediction_sources = [('res_ml', test_data),
                      ('res_2v0', res_2v0),
                      ('res_seq', res_seq),
                      ('res_best', res_best)]
for column, source in prediction_sources:
    sub_df[column] = source['target']
# Start blending
# Each step is w_left*left + w_right*right; later steps build on earlier blends
blend_steps = [('blend1', 0.8, 'res_2v0', 0.2, 'res_ml'),
               ('blend2', 0.8, 'res_seq', 0.2, 'blend1'),
               ('blend3', 0.5, 'res_seq', 0.5, 'blend2'),
               ('blend4', 0.6, 'res_best', 0.4, 'blend3')]
for blended, w_left, left, w_right, right in blend_steps:
    sub_df[blended] = w_left*sub_df[left] + w_right*sub_df[right]
# Define target variable as latest blend
sub_df['target'] = sub_df['blend4']

In [None]:
# Save final results
# Only the 'ID' and 'target' columns are written; the row index is left out
sub_name = '../submissions/ts_lgb_2v1_submit.csv'
sub_df[['ID', 'target']].to_csv(sub_name, index=False)

In [None]:
# Preview the first 10 rows of the blending table
sub_df.head(10)