# Model 2v2: LightGBM with Robust Time-Series Reconstruction

In [None]:
# Load libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

import matplotlib.pyplot as plt

import pdb
import os
import gc; gc.enable()
import h5py
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Function for loading h5py file
def load_h5py(fname):
    '''
    Read the 'data' dataset of the HDF5 file at `fname` fully into memory
    and return it. The file is opened read-only and closed before returning.
    '''
    with h5py.File(fname, 'r') as h5_file:
        contents = h5_file['data'][:]
    return contents
# Function for loading pickle file
def load_pickle(fname):
    '''
    Unpickle and return the object stored in the file at `fname`.
    '''
    with open(fname, 'rb') as pkl_file:
        loaded = pickle.load(pkl_file)
    return loaded


# Function for setting up
def get_input(debug=False):
    '''
    Function for loading either debug or full datasets

    Params:
        debug: when True the 'debug_*' files are loaded, otherwise the
               'full_*' files.
    Returns:
        Tuple (fnames, train, id_train, train_idx, target, test, id_test,
        test_idx) exactly as read from the h5/pickle files.
    Notes:
        The files are read relative to '../data/compressed/'. On success the
        working directory ends up in '../../scripts/' (relative to the data
        directory). If anything fails while loading, the directory the call
        started from is restored before the exception propagates, so a
        re-run resolves the relative paths correctly.
    '''
    pkl_files = ['train_id.pickle', 'trainidx.pickle', 'target.pickle', 'test_id.pickle', 'testidx.pickle']
    # File-name prefix and the word used in the progress message
    if debug:
        prefix, label = 'debug', 'debug'
    else:
        prefix, label = 'full', 'original'

    start_dir = os.getcwd()
    os.chdir('../data/compressed/')
    try:
        print(os.getcwd())
        print('Loading %s train and test datasets...' % label)
        # h5py files
        train = load_h5py('%s_train.h5' % prefix)
        test = load_h5py('%s_test.h5' % prefix)
        # pickle files
        id_train, train_idx, target, id_test, test_idx = [load_pickle('%s_%s' % (prefix, f)) for f in pkl_files]
        # Load feature names
        fnames = load_pickle('feature_names.pickle')
        # Find shape of loaded datasets
        print('Shape of training dataset: {} Rows, {} Columns'.format(*train.shape))
        print('Shape of test dataset: {} Rows, {} Columns'.format(*test.shape))
        os.chdir('../../scripts/')
    except BaseException:
        # Do not leave the process stranded inside the data directory
        os.chdir(start_dir)
        raise
    print(os.getcwd())
    return fnames, train, id_train, train_idx, target, test, id_test, test_idx


# Function for getting datasets in dataframe format
def get_dataframes(debug=False):
    '''
    Load the raw arrays through get_input(debug) and wrap them in dataframes.

    Returns (fnames, train_df, test_df). Both frames use the loaded row
    indices and `fnames` as feature columns, plus an 'ID' column; train_df
    additionally carries a 'target' column.
    '''
    loaded = get_input(debug)
    fnames, train, id_train, train_idx, target, test, id_test, test_idx = loaded

    # Training frame: features first, then ID and target
    train_df = pd.DataFrame(train, index=train_idx, columns=fnames)
    train_df['ID'] = id_train
    train_df['target'] = target

    # Test frame: features first, then ID
    test_df = pd.DataFrame(test, index=test_idx, columns=fnames)
    test_df['ID'] = id_test

    trn_rows, trn_cols = train_df.shape
    tst_rows, tst_cols = test_df.shape
    print('\nShape of training dataframe: {} Rows, {} Columns'.format(trn_rows, trn_cols))
    print('Shape of test dataframe: {} Rows, {} Columns'.format(tst_rows, tst_cols))
    return fnames, train_df, test_df

In [None]:
# Function for loading leaks
def load_leaks(leak_val):
    '''
    Read the 'compiled_leak' column of the train and test leak CSV files
    for the given `leak_val` from './time_series/stats/' and return them
    as (train_leak, test_leak).
    '''
    leak_dir = './time_series/stats/'
    train_leak, test_leak = [
        pd.read_csv(leak_dir + split + '_leak_%s.csv'%leak_val)['compiled_leak']
        for split in ('train', 'test')
    ]
    return train_leak, test_leak


# Function for applying statistical transformations to data
def calculate_metadata(df):
    '''
    Compute one row of summary statistics per row of `df`.

    Statistics prefixed with 'nz_' are computed over the non-zero entries of
    the row only; the remaining ones use every entry. Returns a new dataframe
    with one column per statistic, in the order listed below.
    '''
    def nonzero(row):
        # Entries of the row that are different from zero
        return row[row != 0]

    row_stats = [
        # Calculations that disregard zeros
        ('nz_mean', lambda x: nonzero(x).mean()),
        ('nz_log_mean_exp', lambda x: np.expm1(np.mean(np.log1p(nonzero(x))))),
        ('nz_median', lambda x: nonzero(x).median()),
        ('nz_std', lambda x: nonzero(x).std()),
        ('nz_kurtosis', lambda x: nonzero(x).kurtosis()),
        ('nz_min', lambda x: np.min(nonzero(x))),
        # Calculations independent of zeros
        ('sum', lambda x: np.sum(x)),
        ('max', lambda x: np.max(x)),
        # Calculations factoring in zeros
        ('zero_count', lambda x: np.count_nonzero(x == 0)),
        ('mean', lambda x: x.mean()),
        ('log_mean_exp', lambda x: np.expm1(np.mean(np.log1p(x)))),
        ('median', lambda x: x.median()),
        ('std', lambda x: x.std()),
        ('kurtosis', lambda x: x.kurtosis()),
    ]

    meta = pd.DataFrame()
    for stat_name, stat_fn in row_stats:
        meta[stat_name] = df.apply(stat_fn, axis=1)
    return meta


# Function for feature engineering
def format_for_training_v1(train, test, f, trn_leak, tst_leak, lagval=38):
    '''
    - Formats train and test dataframes for training

    Params:
        train, test: dataframes containing an 'ID' column and the raw features.
        f: not used by this function (kept so existing callers still work).
        trn_leak, tst_leak: leak values stored in a 'leak' column, from which
                            'log_leak' = log1p(leak) is derived.
        lagval: selects which feature-score file is read.
    Returns:
        (train, test) frames restricted to 'ID', the selected features, the
        metadata columns read from './model_data/*_meta.csv' and 'log_leak'.
    Notes:
        NOTE(review): the leak and metadata frames are combined with the
        inputs by index alignment -- this assumes all of them share the
        same row index; confirm against the callers.
    '''
    tmp_trn = train.copy(deep=True)
    tmp_trn['leak'] = trn_leak
    tmp_trn['log_leak'] = np.log1p(tmp_trn['leak'])

    tmp_tst = test.copy(deep=True)
    tmp_tst['leak'] = tst_leak
    tmp_tst['log_leak'] = np.log1p(tmp_tst['leak'])

    score_name = './model_data/model_2v2_featscores_%s.csv'%lagval
    print('Loading file:\n%s' % score_name)
    score_df = pd.read_csv(score_name, index_col=0)

    # Select good features
    # The score file and threshold below only feed the disabled rmse-based
    # selection; the active selection comes from the pickled feature list.
    if lagval==36:
        threshold = 1.756
    elif lagval==37:
        threshold = 0.6255
    else:
        threshold = 0.625  # lag 38
#     good_features = score_df.loc[score_df['rmse']<=threshold].index
    feature_list = load_pickle('./time_series/aaron_test_v0.pickle')
    # Concatenate the first 55 feature groups into one flat list
    good_features = []
    for i in range(55):
        good_features += feature_list[i]

    print('\nLoading metadata for training set...')
    trn_meta = pd.read_csv('./model_data/train_meta.csv')
    print('Loading metadata for test set...')
    tst_meta = pd.read_csv('./model_data/test_meta.csv')

    # Format training and test datasets
    # The same column list (taken from the training metadata) is used for both frames
    cols = ['ID'] + list(good_features) + list(trn_meta.columns.values) + ['log_leak']
    tmp_trn = pd.concat([tmp_trn, trn_meta], axis=1)
    tmp_tst = pd.concat([tmp_tst, tst_meta], axis=1)

    return tmp_trn[cols], tmp_tst[cols]


# Function for scaling datasets 
def scale_for_training(train, test):
    '''
    Standardise the train and test frames for booster training.

    NaN values are replaced by 0 and the 'ID' column is dropped from both
    frames. A StandardScaler is fitted on the training data only and then
    applied to both sets. The input frames are not modified.

    Returns (scaled_train, scaled_test) as produced by the scaler.
    '''
    print('Scaling data...')
    # replace/drop return new frames, so the caller's data stays untouched
    tmp_trn = train.replace(np.nan, 0).drop(labels=['ID'], axis=1)
    tmp_tst = test.replace(np.nan, 0).drop(labels=['ID'], axis=1)

    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(tmp_trn)
    scaled_test = scaler.transform(tmp_tst)

    # Release the intermediate copies before returning
    del tmp_trn, tmp_tst
    gc.collect()
    return scaled_train, scaled_test

In [None]:
# Main Script
# Drop dataframes left over from a previous run of this cell before reloading
try:
    del fnames, train, test
except NameError:
    # Nothing (or not everything) was loaded yet -- nothing more to clear
    pass
else:
    print('Clearing loaded dataframes from memory...\n')
fnames, train, test = get_dataframes(debug=False)

# Load leaks
leak_val = 38
print('\nLoading train and test leaks...')
train_leak, test_leak = load_leaks(leak_val)
print('Nonzero elements in train: {}'.format(np.count_nonzero(train_leak)))
print('Nonzero elements in test: {}'.format(np.count_nonzero(test_leak)))

In [None]:
# Get train and test in format for training booster
# Format target variable
target = train['target'].values
# The booster is trained on log1p(target)
target_log = np.log1p(target)

# pp_flag=True regenerates the formatted datasets and saves them to disk;
# pp_flag=False reloads the previously saved CSV files instead.
pp_flag = False
# train_name = './model_data/btrain_%s.csv'%leak_val
# test_name = './model_data/btest_%s.csv'%leak_val

train_name = './model_data/btrain_fullfeat_%s.csv'%leak_val
test_name = './model_data/btest_fullfeat_%s.csv'%leak_val
if pp_flag:
    btrain, btest = format_for_training_v1(train, test, fnames, train_leak, test_leak, leak_val)
    print('\nSaving generated datasets...')
    btrain.to_csv(train_name, index=False)
    btest.to_csv(test_name, index=False)
else:
    print('\nLoading generated datasets...')
    btrain = pd.read_csv(train_name)
    btest = pd.read_csv(test_name)

# Scale dataset for booster training
boost_train, boost_test = scale_for_training(btrain, btest)

In [None]:
# Custom evaluation metric for LGBM
def RMSLE(preds, train_data):
    '''
    Root of the mean squared error between the dataset labels and `preds`.

    Returns the tuple ('RMSLE', score, False). The labels this file trains
    on are log1p-transformed targets, hence the metric name.
    '''
    labels = train_data.get_label()
    score = np.sqrt(mean_squared_error(labels, preds))
    return 'RMSLE', score, False


# Function for evaluating mean errors
def average_best_scores(scores):
    '''
    Average, over the per-fold columns of `scores`, of each column's minimum.

    Params:
        scores: dataframe with an 'index' column (ignored) and one score
                column per fold; folds that stopped early contain NaN.
    Returns:
        Mean of the column minima, with NaN entries ignored.
    '''
    s_cols = [c for c in scores.columns if c!='index']
    # NaN never compares equal to anything, so masking with `x != np.nan`
    # filters nothing; skip the missing entries explicitly instead.
    errs = scores[s_cols].min(axis=0, skipna=True)
    return np.mean(errs)


# Function for plotting RMSLE averaged over all iterations
def plot_scores(t_scores, v_scores):
    '''
    Plot the training and validation error averaged across folds.

    Params:
        t_scores, v_scores: dataframes with an 'index' column (ignored) and
                            one score column per fold.
    Returns:
        None; the figure is shown with plt.show().
    '''
    # Each frame uses its own score columns
    t_cols = [c for c in t_scores.columns if c!='index']
    v_cols = [c for c in v_scores.columns if c!='index']
    # Row-wise mean over folds; rows where no fold has a value are dropped
    t_mean = np.mean(t_scores[t_cols], axis=1)
    t_mean = t_mean[~np.isnan(t_mean)]
    v_mean = np.mean(v_scores[v_cols], axis=1)
    v_mean = v_mean[~np.isnan(v_mean)]

    plt.figure(figsize=(10, 6))
    # Distinct colours so the two curves can be told apart
    plt.plot(v_mean, label='validation', c='orange', alpha=0.7)
    plt.plot(t_mean, label='training', c='blue', alpha=0.7)
    plt.title('Averaged Training and Validation Error')
    plt.xlabel('Training Iteration')
    plt.ylabel('RMSLE')
    plt.legend(loc='upper right')
    plt.show()
    return None


# Function for training a LGB Regressor
def train_lgb_regressor(train, target, test, params, n_boost=500):
    '''
    Train one LightGBM model per fold of a 4-fold split and average the
    predictions of the fold models.

    Params:
        train, test: 2-D arrays of features (rows are indexed positionally).
        target: array of training labels aligned with the rows of `train`.
        params: LightGBM parameter dictionary passed to lgb.train.
        n_boost: maximum number of boosting rounds per fold.
    Returns:
        (train_result, valid_result, test_predictions, train_predictions)
        - train_result / valid_result: dataframes with an 'index' column and
          one 'round_<fold>' column of per-iteration scores for each fold.
        - test_predictions / train_predictions: average over the fold models
          of their predictions on `test` / on the whole of `train`.
    Notes:
        NOTE(review): train_predictions are not out-of-fold -- every fold
        model also predicts the rows it was trained on, so errors computed
        from them are likely optimistic.
    '''
    num_boosting = n_boost
    valid_result = pd.DataFrame(data=np.arange(num_boosting), columns=['index'])
    train_result = pd.DataFrame(data=np.arange(num_boosting), columns=['index'])
    test_predictions = np.zeros(test.shape[0])
    train_predictions = np.zeros(train.shape[0])
    kfold = KFold(n_splits=4)

    for i, (trn, val) in enumerate(kfold.split(train)):
        print('\nTraining on fold: {}'.format(i))
        round_name = 'round_%s'%i
        xtrain = train[trn, :]
        ytrain = target[trn]

        xval = train[val, :]
        yval = target[val]

        lgb_train = lgb.Dataset(xtrain, ytrain)
        lgb_eval = lgb.Dataset(xval, yval, reference=lgb_train)

        evals_result = {}
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=num_boosting,
                        valid_sets= (lgb_train, lgb_eval),
                        verbose_eval=500,
                        feval=RMSLE, 
                        evals_result=evals_result,
                        early_stopping_rounds=100)

        # Get evaluation results
        # 'training' and 'valid_1' are the keys read for the two valid_sets entries
        valid_res_df = pd.DataFrame.from_dict(evals_result['valid_1'], orient='columns')
        valid_res_df.rename(columns={'RMSLE': round_name}, inplace=True)
        valid_res_df.reset_index(inplace=True)
        train_res_df = pd.DataFrame.from_dict(evals_result['training'], orient='columns')
        train_res_df.rename(columns={'RMSLE': round_name}, inplace=True)
        train_res_df.reset_index(inplace=True)

        # Outer merge on the iteration number: folds that stopped early leave NaN
        valid_result = pd.merge(valid_result, valid_res_df, how='outer', on='index')
        train_result = pd.merge(train_result, train_res_df, how='outer', on='index')

        # Make predictions
        test_predictions += (gbm.predict(test, num_iteration=gbm.best_iteration))/kfold.n_splits
        train_predictions += (gbm.predict(train, num_iteration=gbm.best_iteration))/kfold.n_splits

    return train_result, valid_result, test_predictions, train_predictions


# Function for evaluating trouble samples
def selective_eval(preds, target, idx):
    '''
    Mean squared error between `preds` and `target`, restricted to the
    positions given by `idx`.
    '''
    return mean_squared_error(preds[idx], target[idx])


# Positions of the training samples whose leak value equals zero
trouble_idx = np.where(train_leak==0)[0]

In [None]:
# LGB Regressor Parameters
params = {
    # Task / model type
    'task': 'train',
    'boosting': 'gbdt',
    'objective': 'regression',
    # NOTE(review): 'RMSLE' is also the name returned by the custom feval in
    # this file -- confirm how the installed LightGBM treats this metric name.
    'metric': 'RMSLE',
    # Tree shape and step size
    'num_leaves': 18,
    'learning_rate': 0.05,
    'max_depth': -1,
    # Feature / row subsampling
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    # Regularisation
    'min_sum_hessian_in_leaf': 1e-3,
    'lambda_l2': 10,
    # Logging
    'verbose': 0,
}

In [None]:
# Tune LGB Regressor Hyper-Parameters
# One-parameter-at-a-time sweep: each value in cv_set is written into
# params[cv_entry] and the cross-validated scores are collected.
tune_flag = False
if tune_flag:
    cv_results = {}

    # Uncomment exactly ONE cv_entry/cv_set pair below before enabling
    # tune_flag; with all of them commented out the loop raises NameError.

#     cv_entry = 'num_leaves'
#     cv_set = [23, 24]

#     cv_entry = 'learning_rate'
#     cv_set = [0.0005, 0.001]

#     cv_entry = 'max_depth'
#     cv_set = [-1, 9, 10, 11]

#     cv_entry = 'feature_fraction'
#     cv_set = [0.8, 0.99]

#     cv_entry = 'bagging_fraction'
#     cv_set = [0.70, 0.73, 0.75]

#     cv_entry = 'min_sum_hessian_in_leaf'
#     cv_set = [1e-4, 1e-3, 1e-2]

#     cv_entry = 'lambda_l2'
#     cv_set = [27, 30, 33]

    for cv_val in cv_set:
        print('\nCross validating with %s: %s'%(cv_entry, cv_val))
        # NOTE(review): this mutates the module-level `params`, so the last
        # value tried stays in effect for any code that runs afterwards.
        params[cv_entry] = cv_val
        train_res, valid_res, tst_preds, trn_preds = train_lgb_regressor(train=boost_train, 
                                                                         target=target_log, 
                                                                         test=boost_test, 
                                                                         params=params, 
                                                                         n_boost=30000)
        # Stored per value: (error on zero-leak rows, mean best train score, mean best validation score)
        cv_results[cv_entry+'miss_train_val_%s'%cv_val] = (selective_eval(trn_preds, target_log, trouble_idx),
                                                           average_best_scores(train_res), 
                                                           average_best_scores(valid_res))

    # Show cv results
    print('\nCV Results:')
    for key in cv_results:
        print('{} {}'.format(key, cv_results[key]))

In [None]:
# Confirm current parameter settings
# Train with the current `params` and report the fold-averaged best scores
train_res, valid_res, tst_preds, trn_preds = train_lgb_regressor(train=boost_train, 
                                                                 target=target_log, 
                                                                 test=boost_test, 
                                                                 params=params, 
                                                                 n_boost=30000)
train_score = average_best_scores(train_res)
valid_score = average_best_scores(valid_res)
print('\nTraining score: {}'.format(train_score))
print('Validation score: {}'.format(valid_score))
# Error restricted to the training rows whose leak value is zero
print('\nTrouble score: {}'.format(selective_eval(trn_preds, target_log, trouble_idx)))

In [None]:
def make_target_v1(leak, preds):
    '''
    Fill the zero entries of `leak` with the model predictions.

    Params:
        leak: 1-D numpy array or pandas Series of leak values; 0 marks a
              sample for which no leak value is available.
        preds: predictions on the log1p scale, aligned by position with `leak`.
    Returns:
        A copy of `leak` (same type) in which every zero entry is replaced
        by expm1(prediction) at the same position. `leak` is not modified.
    '''
    exp_preds = np.asarray(np.expm1(preds))

    # A boolean mask selects by position for arrays and Series alike;
    # integer positions would be treated as labels by a Series and break
    # whenever its index is not 0..n-1.
    fill_mask = np.asarray(leak == 0)

    tmp_leak = leak.copy()
    tmp_leak[fill_mask] = exp_preds[fill_mask]
    return tmp_leak

In [None]:
# Make submission
# Builds three ID/target tables and, when save_flag is set, writes them to CSV:
#   tst_df - test leak values with zero entries filled from the predictions
#   ori_df - raw model predictions on the test set
#   trn_df - raw model predictions on the training set
save_flag=True

# Leak values where available, expm1(prediction) where the leak is zero
tst_target = make_target_v1(test_leak, tst_preds)

# Predictions were made on the log1p scale; convert back with expm1
ori_target = np.expm1(tst_preds)
trn_target = np.expm1(trn_preds)

sub_name_tst = '../submissions/tstest_lgb_2v2_lag%s_fullfeat_submit.csv'%leak_val

sub_name_ori = './model_data/tstest_lgb_2v2_lag%s_fullfeat_test.csv'%leak_val
sub_name_trn = './model_data/tstest_lgb_2v2_lag%s_fullfeat_train.csv'%leak_val

tst_df = pd.DataFrame()
ori_df = pd.DataFrame()
trn_df = pd.DataFrame()

# The ID column is assigned first, so each frame takes over the index of test/train
tst_df['ID'] = test['ID']
ori_df['ID'] = test['ID']
trn_df['ID'] = train['ID']

# NOTE(review): tst_target is derived from test_leak; if that is a pandas Series
# this assignment aligns on index labels rather than position -- confirm that
# test's index matches the leak file's row index.
tst_df['target'] = tst_target
ori_df['target'] = ori_target
trn_df['target'] = trn_target

if save_flag:
    tst_df.to_csv(sub_name_tst, index=False)
    ori_df.to_csv(sub_name_ori, index=False)
    trn_df.to_csv(sub_name_trn, index=False)

# Notebook display of the first rows of the submission
tst_df.head(15)

In [None]:
def mean_log_diff(sub1, sub2):
    '''
    Root mean squared difference between log1p(sub1) and log1p(sub2).
    '''
    log_gap = np.subtract(np.log1p(sub1), np.log1p(sub2))
    mean_sq_gap = np.mean(np.square(log_gap))
    return np.sqrt(mean_sq_gap)

# Earlier submissions to compare the current one against
fri_df = pd.read_csv('../submissions/tstest_lgb_2v2_lag36_submit_0.52.csv')
sat_df = pd.read_csv('../submissions/tstest_lgb_2v2_lag38_bad_submit_0.52.csv')
sun_df = pd.read_csv('../submissions/tstest_lgb_2v2_lag38_good_submit.csv')

# "right": test rows with a non-zero leak value; "wrong": rows where the leak is zero
right_idx = np.where(test_leak!=0)[0]
wrong_idx = np.where(test_leak==0)[0]

# Targets of every submission, split into the two groups of rows
sub_right, sub_wrong = tst_df['target'].values[right_idx], tst_df['target'].values[wrong_idx]
sun_right, sun_wrong = sun_df['target'].values[right_idx], sun_df['target'].values[wrong_idx]
fri_right, fri_wrong = fri_df['target'].values[right_idx], fri_df['target'].values[wrong_idx]
sat_right, sat_wrong = sat_df['target'].values[right_idx], sat_df['target'].values[wrong_idx]

In [None]:
# Log-scale RMS difference between the 'sun' submission and the current one,
# first on the rows with a leak value, then on the rows without
print(mean_log_diff(sun_right, sub_right))
print(mean_log_diff(sun_wrong, sub_wrong))

In [None]:
def load_lexicons():
    '''
    Load and return the pickled (train, test) lexicons from './model_data/'.
    '''
    lexicon_paths = ('./model_data/lexi_train.pickle', './model_data/lexi_test.pickle')
    lexi_train, lexi_test = [load_pickle(path) for path in lexicon_paths]
    return lexi_train, lexi_test


def preds_to_lexicon(preds, lexicon):
    '''
    Snap every prediction to the closest value of `lexicon`.

    Params:
        preds: 1-D array of predictions.
        lexicon: 1-D array of allowed values.
    Returns:
        `lexicon` indexed by the position of the nearest entry for each
        prediction (the first such entry on ties).
    '''
    pred_col = np.asarray(preds, dtype=float)[:, np.newaxis]
    lex_row = np.asarray(lexicon, dtype=float)[np.newaxis, :]

    # Broadcasting yields the (n_preds, n_lexicon) distance table directly,
    # without materialising a ones-matrix and transposing it twice.
    diff = np.abs(lex_row - pred_col)
    mins = np.argmin(diff, axis=1)

    return lexicon[mins]


def train_lexicon(data, target, leak, preds, lexi):
    '''
    Build an evaluation table comparing raw and lexicon-snapped predictions.

    The returned dataframe holds the columns 'ID' (from data['ID']),
    'target', 'leak', 'pred' and 'lexi_pred' (predictions mapped through
    preds_to_lexicon), plus the squared log1p error of each kind of
    prediction ('SLE', 'lexi_SLE') and their absolute difference ('SLE_diff').
    '''
    snapped = preds_to_lexicon(preds, lexi)

    lexi_df = pd.DataFrame()
    for col_name, col_values in (('ID', data['ID']),
                                 ('target', target),
                                 ('leak', leak),
                                 ('pred', preds),
                                 ('lexi_pred', snapped)):
        lexi_df[col_name] = col_values

    # Squared log errors of the raw and the snapped predictions
    log_target = np.log1p(lexi_df['target'])
    lexi_df['SLE'] = np.square(np.subtract(log_target, np.log1p(lexi_df['pred'])))
    lexi_df['lexi_SLE'] = np.square(np.subtract(log_target, np.log1p(lexi_df['lexi_pred'])))
    lexi_df['SLE_diff'] = np.abs(np.subtract(lexi_df['SLE'], lexi_df['lexi_SLE']))

    return lexi_df


def test_lexicon(data, leak, preds, lexi):
    '''
    Build a table comparing raw and lexicon-snapped predictions on test data.

    Params:
        data: dataframe providing the 'ID' column.
        leak: leak values stored in the 'leak' column.
        preds: predictions stored in the 'pred' column.
        lexi: lexicon passed to preds_to_lexicon.
    Returns:
        Dataframe with 'ID', 'leak', 'pred', 'lexi_pred' and
        'pred_abs_diff' = |pred - lexi_pred|.
    '''
    lexi_transform = preds_to_lexicon(preds, lexi)

    lexi_df = pd.DataFrame()
    lexi_df['ID'] = data['ID']
    lexi_df['leak'] = leak
    lexi_df['pred'] = preds
    lexi_df['lexi_pred'] = lexi_transform

    # The column is named 'lexi_pred'; the former 'lexi_red' lookup raised KeyError
    lexi_df['pred_abs_diff'] = np.abs(np.subtract(lexi_df['pred'], lexi_df['lexi_pred']))

    return lexi_df


def get_errors(lexi):
    '''
    Print the root mean squared log error on the rows whose leak is zero,
    once for the raw predictions ('SLE' column) and once for the
    lexicon-snapped predictions ('lexi_SLE' column).

    Params:
        lexi: dataframe with 'leak', 'SLE' and 'lexi_SLE' columns.
    Returns:
        None
    '''
    zero_idx = lexi['leak']==0

    pred_sle_missed = lexi.loc[zero_idx, 'SLE']
    lexi_sle_missed = lexi.loc[zero_idx, 'lexi_SLE']

    print('Trouble error w/o lexicon: {}'.format(np.sqrt(np.mean(pred_sle_missed))))
    print('Trouble error w/ lexicon: {}'.format(np.sqrt(np.mean(lexi_sle_missed))))
    return None


# lexi_train, lexi_test = load_lexicons()

# train_lexi_eval = train_lexicon(train, target, train_leak, np.expm1(trn_preds), lexi_train)

# get_errors(train_lexi_eval)