# Model 2v4: LightGBM with Test-Only Data

In [None]:
# Load libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

import matplotlib.pyplot as plt

import pdb
import os
import gc; gc.enable()
import h5py
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Read the 'data' dataset of an h5py file into memory
def load_h5py(fname):
    '''
    Opens the file at fname read-only and returns a full slice ([:]) of its
    'data' dataset; the file is closed again before returning
    '''
    with h5py.File(fname, 'r') as h5_file:
        contents = h5_file['data'][:]
    return contents
# Read back the object stored in a pickle file
def load_pickle(fname):
    '''
    Opens fname in binary mode and returns the object unpickled from it
    '''
    with open(fname, 'rb') as pkl_file:
        loaded = pickle.load(pkl_file)
    return loaded


# Function for setting up
def get_input(debug=False):
    '''
    Function for loading either debug or full datasets

    Switches the working directory to ../data/compressed/, reads the train and
    test matrices (h5py) together with their ids, indexes, the target and the
    feature names (pickle), then moves to ../../scripts/.

    - debug: when True the 'debug_' files are read, otherwise the 'full_' ones
    - returns: fnames, train, id_train, train_idx, target, test, id_test, test_idx

    The move to ../../scripts/ happens even when a file fails to load, so a
    failed call does not leave the process inside the data directory.
    '''
    os.chdir('../data/compressed/')
    try:
        print(os.getcwd())
        pkl_files = ['train_id.pickle', 'trainidx.pickle', 'target.pickle', 'test_id.pickle', 'testidx.pickle']
        if debug:
            prefix = 'debug'
            print('Loading debug train and test datasets...')
        else:
            prefix = 'full'
            print('Loading original train and test datasets...')
        # h5py files
        train = load_h5py('%s_train.h5' % prefix)
        test = load_h5py('%s_test.h5' % prefix)
        # pickle files (same order as pkl_files)
        id_train, train_idx, target, id_test, test_idx = [load_pickle('%s_%s' % (prefix, f)) for f in pkl_files]
        # Load feature names
        fnames = load_pickle('feature_names.pickle')
        # Find shape of loaded datasets
        print('Shape of training dataset: {} Rows, {} Columns'.format(*train.shape))
        print('Shape of test dataset: {} Rows, {} Columns'.format(*test.shape))
    finally:
        # Leave the data directory whether or not loading succeeded
        os.chdir('../../scripts/')
    print(os.getcwd())
    return fnames, train, id_train, train_idx, target, test, id_test, test_idx


# Function for getting datasets in dataframe format
def get_dataframes(debug=False):
    '''
    Loads the raw inputs through get_input(debug) and wraps them in dataframes
    - train dataframe: feature columns plus 'ID' and 'target'
    - test dataframe: feature columns plus 'ID'
    - returns: fnames, train_df, test_df
    '''
    (fnames, trn_values, trn_ids, trn_index, trn_target,
     tst_values, tst_ids, tst_index) = get_input(debug)

    # Training frame carries both the ids and the target
    train_df = pd.DataFrame(data=trn_values, index=trn_index, columns=fnames)
    train_df['ID'] = trn_ids
    train_df['target'] = trn_target

    # Test frame only carries the ids
    test_df = pd.DataFrame(data=tst_values, index=tst_index, columns=fnames)
    test_df['ID'] = tst_ids

    trn_rows, trn_cols = train_df.shape
    tst_rows, tst_cols = test_df.shape
    print('\nShape of training dataframe: {} Rows, {} Columns'.format(trn_rows, trn_cols))
    print('Shape of test dataframe: {} Rows, {} Columns'.format(tst_rows, tst_cols))
    return fnames, train_df, test_df

In [None]:
# Function for loading leaks
def load_leaks(leak_val):
    '''
    Reads the 'compiled_leak' column of the train and test leak files stored
    under ./time_series/stats/ for the given leak_val
    - returns: (train_leak, test_leak)
    '''
    leak_dir = './time_series/stats/'

    def read_leak(fname):
        return pd.read_csv(leak_dir + fname).compiled_leak

    train_leak = read_leak('train_leak_%s.csv'%leak_val)
    test_leak = read_leak('test_leak_%s.csv'%leak_val)
    return train_leak, test_leak


# Function for applying statistical transformations to data
def calculate_metadata(df):
    '''
    Builds a dataframe holding one set of summary statistics per row of df
    - 'nz_*' columns are computed on the non-zero entries of the row only
    - the remaining columns are computed on the whole row
    '''
    def nonzero(row):
        return row[row!=0]

    # (column name, row-wise statistic), in output column order
    row_stats = [
        # Calculations that disregard zeros
        ('nz_mean', lambda row: nonzero(row).mean()),
        ('nz_log_mean_exp', lambda row: np.expm1(np.mean(np.log1p(nonzero(row))))),
        ('nz_median', lambda row: nonzero(row).median()),
        ('nz_std', lambda row: nonzero(row).std()),
        ('nz_kurtosis', lambda row: nonzero(row).kurtosis()),
        ('nz_min', lambda row: np.min(nonzero(row))),
        # Calculations independent of zeros
        ('sum', lambda row: np.sum(row)),
        ('max', lambda row: np.max(row)),
        # Calculations factoring in zeros
        ('zero_count', lambda row: np.count_nonzero(row==0)),
        ('mean', lambda row: row.mean()),
        ('log_mean_exp', lambda row: np.expm1(np.mean(np.log1p(row)))),
        ('median', lambda row: row.median()),
        ('std', lambda row: row.std()),
        ('kurtosis', lambda row: row.kurtosis()),
    ]

    meta = pd.DataFrame()
    for stat_name, stat_func in row_stats:
        meta[stat_name] = df.apply(stat_func, axis=1)
    return meta


# Function for feature engineering
def format_for_training(train, test, f, trn_res, tst_res, lagval=38, threshold=0.017233):
    '''
    - Formats train and test dataframes for training

    Each output frame holds, in order: 'ID', the selected features, the
    metadata columns and 'log_leak'.

    - train, test: dataframes holding 'ID' and the feature columns
    - f: not used by this function (kept so existing calls keep working)
    - trn_res, tst_res: dataframes whose 'target' column becomes 'log_leak'
      after a log1p transform
    - lagval: selects ./model_data/model_2v4_featscores_test_<lagval>.csv
    - threshold: features whose 'rmse' score is <= threshold are kept
      (the default is the value that was used for lag 38)
    - returns: (train frame, test frame); the inputs are not modified

    The metadata is read from ./model_data/train_meta.csv and
    ./model_data/test_meta.csv and joined column-wise on the index.
    '''
    tmp_trn = train.copy(deep=True)
    tmp_trn['log_leak'] = np.log1p(trn_res['target'].values)

    tmp_tst = test.copy(deep=True)
    tmp_tst['log_leak'] = np.log1p(tst_res['target'].values)

    score_name = './model_data/model_2v4_featscores_test_%s.csv'%lagval
    print('Loading file:\n%s' % score_name)
    score_df = pd.read_csv(score_name, index_col=0)

    # Select good features
    good_features = score_df.loc[score_df['rmse']<=threshold].index

    print('\nLoading metadata for training set...')
    trn_meta = pd.read_csv('./model_data/train_meta.csv')
    print('Loading metadata for test set...')
    tst_meta = pd.read_csv('./model_data/test_meta.csv')

    # Format training and test datasets
    # The train metadata column names are used to select columns in both frames
    cols = ['ID'] + list(good_features) + list(trn_meta.columns.values) + ['log_leak']
    tmp_trn = pd.concat([tmp_trn, trn_meta], axis=1)
    tmp_tst = pd.concat([tmp_tst, tst_meta], axis=1)

    return tmp_trn[cols], tmp_tst[cols]


# Function for scaling datasets
def scale_for_training(train, test, real):
    '''
    Standardizes three dataframes with one StandardScaler
    - the scaler is fitted on train only, then applied to test and real
    - NaN values are replaced by 0 and the 'ID' column is dropped first
    - the inputs are not modified (deep copies are scaled)
    - returns: scaled_train, scaled_test, scaled_real
    '''
    print('Scaling data...')

    def prepare(frame):
        # Work on a copy so the caller's dataframe keeps its NaNs and 'ID'
        prepared = frame.copy(deep=True)
        prepared.replace(np.nan, 0, inplace=True)
        prepared.drop(labels=['ID'], axis=1, inplace=True)
        return prepared

    tmp_trn = prepare(train)
    tmp_tst = prepare(test)
    tmp_rel = prepare(real)

    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(tmp_trn)
    scaled_test = scaler.transform(tmp_tst)
    scaled_real = scaler.transform(tmp_rel)

    # Release the intermediate copies before returning
    del tmp_trn, tmp_tst, tmp_rel
    gc.collect()
    return scaled_train, scaled_test, scaled_real

In [None]:
# Main Script
# Drop dataframes left over from a previous run of this cell before reloading
try:
    del fnames, train, test
    print('Clearing loaded dataframes from memory...\n')
except NameError:
    # Nothing (or not everything) was loaded yet
    pass
fnames, train, test = get_dataframes(debug=False)

# Load leaks
leak_val = 38
print('\nLoading train and test leaks...')
train_leak, test_leak = load_leaks(leak_val)
print('Nonzero elements in train: %s' % np.count_nonzero(train_leak))
print('Nonzero elements in test: %s' % np.count_nonzero(test_leak))

# Load good result
print('\nLoading good results for train and test predictions...')

tst_res_name = './model_data/tstest_lgb_2v2_lag38_good_test.csv'
trn_res_name = './model_data/tstest_lgb_2v2_lag38_good_train.csv'

tst_res = pd.read_csv(tst_res_name)
trn_res = pd.read_csv(trn_res_name)
print('Shape of test results import: %s' % (tst_res.shape,))
print('Shape of train results import: %s' % (trn_res.shape,))

# Find hit and miss indexes
# (positions where the leak is nonzero / zero)
trn_leak_idx = np.where(train_leak!=0)[0]
trn_miss_idx = np.where(train_leak==0)[0]

tst_leak_idx = np.where(test_leak!=0)[0]
tst_miss_idx = np.where(test_leak==0)[0]

In [None]:
# Get train and test in format for training booster

# Real train target
y = train['target'].values
train_target = np.log1p(y)

# Define target in terms of test set leaks
target = test_leak[tst_leak_idx].values
target_log = np.log1p(target)

# pp_flag=True rebuilds the booster inputs and caches them as CSV,
# pp_flag=False reloads the cached CSV files
pp_flag = False
train_name = './model_data/btrain_%s_test.csv'%leak_val
test_name = './model_data/btest_%s_test.csv'%leak_val
if pp_flag:
    btrain, btest = format_for_training(train, test, fnames, trn_res, tst_res, leak_val)
    print('\nSaving generated datasets...')
    btrain.to_csv(train_name, index=False)
    btest.to_csv(test_name, index=False)
else:
    print('\nLoading generated datasets...')
    btrain = pd.read_csv(train_name)
    btest = pd.read_csv(test_name)

In [None]:
# Create dataset for boosting
# tst_leak_idx / tst_miss_idx are row positions (from np.where), so the rows
# are picked by position; this also holds when btest does not carry a 0..n-1 index
tst_training = btest.iloc[tst_leak_idx]
tst_testing = btest.iloc[tst_miss_idx]

# The scaler is fitted on the test rows that have a leak
boost_train, boost_test, real_train = scale_for_training(tst_training, tst_testing, btrain)

print('New shape of boost train: %s' % (boost_train.shape,))
print('New shape of boost test: %s' % (boost_test.shape,))
print('New shape of real train: %s' % (real_train.shape,))

In [None]:
# Custom evaluation metric for LGBM
def RMSLE(preds, train_data):
    '''
    Root of the mean squared error between the labels stored in train_data
    and preds
    - returns: ('RMSLE', score, False)
    '''
    labels = train_data.get_label()
    score = np.sqrt(mean_squared_error(labels, preds))
    return 'RMSLE', score, False


# Function for evaluating mean errors
def average_best_scores(scores):
    '''
    Averages the best (lowest) score of every round
    - scores: dataframe with an 'index' column plus one score column per
      round; NaN entries are ignored
    - returns: mean over the rounds of each round's minimum score
    '''
    s_cols = [c for c in scores.columns if c!='index']
    # min() skips NaN; comparing against np.nan cannot filter anything
    # because NaN != NaN is always True
    errs = scores[s_cols].min(axis=0)
    return np.mean(errs)


# Function for plotting RMSLE averaged over all iterations
def plot_scores(t_scores, v_scores):
    '''
    Plots the training and validation error averaged over all rounds
    - t_scores, v_scores: dataframes with an 'index' column plus one score
      column per round; the score columns are taken from t_scores and looked
      up in both frames
    - iterations whose averaged score is NaN are left out
    - returns: None (the figure is shown)
    '''
    s_cols = [c for c in t_scores.columns if c!='index']
    t_mean = np.mean(t_scores[s_cols], axis=1)
    t_mean = t_mean[~np.isnan(t_mean)]
    v_mean = np.mean(v_scores[s_cols], axis=1)
    v_mean = v_mean[~np.isnan(v_mean)]

    plt.figure(figsize=(10, 6))
    plt.plot(v_mean, label='validation', c='orange', alpha=0.7)
    # Separate colour so the two curves can be told apart
    plt.plot(t_mean, label='training', c='blue', alpha=0.7)
    plt.title('Averaged Training and Validation Error')
    plt.xlabel('Training Iteration')
    plt.ylabel('RMSLE')
    plt.legend(loc='upper right')
    plt.show()
    return None


# Turn the 'RMSLE' history of one evaluation set into a frame keyed by 'index'
def _fold_history(evals_result, set_name, round_name):
    history = pd.DataFrame.from_dict(evals_result[set_name], orient='columns')
    history.rename(columns={'RMSLE': round_name}, inplace=True)
    history.reset_index(inplace=True)
    return history


# Function for training a LGB Regressor
def train_lgb_regressor(train, target, test, real, params, n_boost=500, n_splits=4):
    '''
    Trains one LightGBM booster per KFold split and averages their predictions
    - train, target: feature matrix and target used for fitting; rows are
      selected with train[idx, :] and target[idx]
    - test, real: additional feature matrices to predict on
    - params: parameter dictionary passed to lgb.train
    - n_boost: maximum number of boosting rounds (early_stopping_rounds=50)
    - n_splits: number of KFold splits
    - returns: train_result, valid_result, test_predictions,
      train_predictions, real_predictions

    train_result / valid_result hold one 'round_<i>' column of RMSLE values
    per fold, merged on the boosting iteration ('index'). Every prediction is
    the mean over the fold models; train_predictions are made on the complete
    train matrix by every fold model.
    '''
    valid_result = pd.DataFrame(data=np.arange(n_boost), columns=['index'])
    train_result = pd.DataFrame(data=np.arange(n_boost), columns=['index'])
    test_predictions = np.zeros(test.shape[0])
    train_predictions = np.zeros(train.shape[0])
    real_predictions = np.zeros(real.shape[0])
    kfold = KFold(n_splits=n_splits)

    for i, (trn, val) in enumerate(kfold.split(train)):
        print('\nTraining on fold: %s' % i)
        round_name = 'round_%s'%i
        xtrain = train[trn, :]
        ytrain = target[trn]

        xval = train[val, :]
        yval = target[val]

        lgb_train = lgb.Dataset(xtrain, ytrain)
        lgb_eval = lgb.Dataset(xval, yval, reference=lgb_train)

        evals_result = {}
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=n_boost,
                        valid_sets= (lgb_train, lgb_eval),
                        verbose_eval=500,
                        feval=RMSLE,
                        evals_result=evals_result,
                        early_stopping_rounds=50)

        # Get evaluation results
        valid_res_df = _fold_history(evals_result, 'valid_1', round_name)
        train_res_df = _fold_history(evals_result, 'training', round_name)

        valid_result = pd.merge(valid_result, valid_res_df, how='outer', on='index')
        train_result = pd.merge(train_result, train_res_df, how='outer', on='index')

        # Make predictions
        test_predictions += (gbm.predict(test, num_iteration=gbm.best_iteration))/kfold.n_splits
        train_predictions += (gbm.predict(train, num_iteration=gbm.best_iteration))/kfold.n_splits
        real_predictions += (gbm.predict(real, num_iteration=gbm.best_iteration))/kfold.n_splits

    return train_result, valid_result, test_predictions, train_predictions, real_predictions


# Function for calculating real scores for training set
def get_real_scores(trn_preds, train_target, trn_miss_idx):
    '''
    Scores the predictions against train_target twice
    - full_score: root mean squared error over all rows
    - miss_score: root mean squared error over the rows in trn_miss_idx
    - returns: (full_score, miss_score)
    '''
    def rmse(predicted, actual):
        return np.sqrt(mean_squared_error(predicted, actual))

    # Restrict both arrays to the missed rows
    miss_preds, miss_target = trn_preds[trn_miss_idx], train_target[trn_miss_idx]

    full_score = rmse(trn_preds, train_target)
    miss_score = rmse(miss_preds, miss_target)
    return full_score, miss_score

In [None]:
# LGB Regressor Parameters
# NOTE(review): 'RMSLE' is also the name returned by the custom RMSLE feval
# defined in this file - confirm how LightGBM treats it as a 'metric' value
params = dict(
    task='train',
    boosting='gbdt',
    objective='regression',
    metric='RMSLE',
    # Tree shape
    num_leaves=7,
    max_depth=-1,
    learning_rate=0.08,
    # Column / row subsampling
    feature_fraction=0.9,
    bagging_fraction=0.73,
    bagging_freq=5,
    # Regularisation
    min_sum_hessian_in_leaf=1e-3,
    lambda_l2=30,
    verbose=0,
)

In [None]:
# Tune LGB Regressor Hyper-Parameters
# One parameter (cv_entry) is varied over cv_set while the others keep the
# values configured in params
tune_flag = False
if tune_flag:
    cv_results = {}

#     cv_entry = 'num_leaves'
#     cv_set = [5, 8, 11]

#     cv_entry = 'learning_rate'
#     cv_set = [0.6, 0.8, 0.9]

#     cv_entry = 'max_depth'
#     cv_set = [-1, 6, 10, 16]

#     cv_entry = 'feature_fraction'
#     cv_set = [0.8, 0.99]

#     cv_entry = 'bagging_fraction'
#     cv_set = [0.70, 0.73, 0.75]

#     cv_entry = 'min_sum_hessian_in_leaf'
#     cv_set = [1e-4, 1e-3, 1e-2]

    cv_entry = 'lambda_l2'
    cv_set = [27, 30, 33]

    for cv_val in cv_set:
        print('\nCross validating with %s: %s'%(cv_entry, cv_val))
        # Try the value on a copy so params keeps its configured setting
        # for the cells that run after the tuning
        cv_params = dict(params)
        cv_params[cv_entry] = cv_val
        # Get predictions
        train_res, valid_res, tst_preds, trn_preds, rel_preds = train_lgb_regressor(train=boost_train,
                                                                                    target=target_log,
                                                                                    test=boost_test,
                                                                                    real=real_train,
                                                                                    params=cv_params,
                                                                                    n_boost=30000)
        # Get real and train / val scores
        trn_full, trn_miss = get_real_scores(trn_preds=rel_preds,
                                             train_target=train_target,
                                             trn_miss_idx=trn_miss_idx)
        train_res_score = average_best_scores(train_res)
        valid_res_score = average_best_scores(valid_res)
        # Compile all scores
        cv_results[cv_entry+'_full_miss_train_val_%s'%cv_val] = (trn_full, trn_miss,
                                                                 train_res_score, valid_res_score)

    # Show cv results
    print('\nCV Results:')
    for key in cv_results:
        print('%s %s' % (key, cv_results[key]))

In [None]:
# Confirm current parameter settings
train_res, valid_res, tst_preds, trn_preds, rel_preds = train_lgb_regressor(train=boost_train,
                                                                            target=target_log,
                                                                            test=boost_test,
                                                                            real=real_train,
                                                                            params=params,
                                                                            n_boost=30000)
# Get real and train / val scores
train_res_score = average_best_scores(train_res)
valid_res_score = average_best_scores(valid_res)

# Score the predictions for the real training set against its log1p target
trn_full, trn_miss = get_real_scores(trn_preds=rel_preds,
                                     train_target=train_target,
                                     trn_miss_idx=trn_miss_idx)

print('\nTraining score: %s' % train_res_score)
print('Validation score: %s' % valid_res_score)

print('\nReal full training score: %s' % trn_full)
print('Real training missed score: %s' % trn_miss)

In [None]:
def make_test_target(tst_leak, tst_preds):
    '''
    Fills the zero entries of tst_leak with the predictions
    - tst_preds are transformed back with expm1 and written, in order, to
      the positions where tst_leak is 0
    - returns: a filled copy; tst_leak itself is left untouched
    '''
    filled = tst_leak.copy()
    missing = np.where(tst_leak==0)[0]
    filled[missing] = np.expm1(tst_preds)
    return filled

In [None]:
# Make submission
save_flag = True
sub_name_tst = '../submissions/tstest_lgb_2v4_lag%s_submit.csv'%leak_val

# Leak values where the leak is nonzero, model predictions everywhere else
tst_target = make_test_target(test_leak, tst_preds)

# NOTE(review): the column assignments below follow pandas index alignment;
# confirm test and test_leak share the same index
tst_df = pd.DataFrame()
tst_df['ID'] = test['ID']
tst_df['target'] = tst_target

# Write the submission file only when requested
if save_flag:
    tst_df.to_csv(sub_name_tst, index=False)

tst_df.head(15)

In [None]:
def root_mean_square_log_diff(sub1, sub2):
    '''
    Root of the mean squared difference between log1p(sub1) and log1p(sub2)
    '''
    log_diff = np.subtract(np.log1p(sub1), np.log1p(sub2))
    mean_square = np.mean(np.square(log_diff))
    return np.sqrt(mean_square)

def sum_abs_diff(sub1, sub2):
    '''
    Sum of the absolute element-wise differences between sub1 and sub2
    '''
    abs_diff = np.abs(np.subtract(sub1, sub2))
    return np.sum(abs_diff)

# Earlier submissions used as references for the comparisons
fri_df = pd.read_csv('../submissions/tstest_lgb_2v2_lag36_submit_0.52.csv')
sat_df = pd.read_csv('../submissions/tstest_lgb_2v2_lag38_bad_submit_0.52.csv')
sun_df = pd.read_csv('../submissions/tstest_lgb_2v2_lag38_good_submit.csv')
v5_df = pd.read_csv('../submissions/tstest_lgb_2v5_lag38_submit.csv')

# 'right' rows have a nonzero test leak, 'wrong' rows have a zero test leak
right_idx = np.where(test_leak!=0)[0]
wrong_idx = np.where(test_leak==0)[0]


# Split the 'target' column of a submission into its right / wrong rows
def _split_targets(frame):
    return frame['target'].values[right_idx], frame['target'].values[wrong_idx]


sub_right, sub_wrong = _split_targets(tst_df)
sun_right, sun_wrong = _split_targets(sun_df)
fri_right, fri_wrong = _split_targets(fri_df)
sat_right, sat_wrong = _split_targets(sat_df)
v5_right, v5_wrong = _split_targets(v5_df)

In [None]:
# Compare this submission with sun_df, first on the rows with a nonzero
# test leak ('right'), then on the rows with a zero test leak ('wrong')
print('Sum abs diff:')
print(sum_abs_diff(sub_right, sun_right))
print(sum_abs_diff(sub_wrong, sun_wrong))

print('\nRoot mean square log diff:')
print(root_mean_square_log_diff(sub_right, sun_right))
print(root_mean_square_log_diff(sub_wrong, sun_wrong))

In [None]:
# Compare this submission with v5_df, first on the rows with a nonzero
# test leak ('right'), then on the rows with a zero test leak ('wrong')
print('Sum abs diff:')
print(sum_abs_diff(sub_right, v5_right))
print(sum_abs_diff(sub_wrong, v5_wrong))

print('\nRoot mean square log diff:')
print(root_mean_square_log_diff(sub_right, v5_right))
print(root_mean_square_log_diff(sub_wrong, v5_wrong))