# Model 2v3: Time-Series Reconstruction with CatBoost

In [None]:
# Load libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from catboost import CatBoostRegressor, Pool

import matplotlib.pyplot as plt

import pdb
import os
import gc; gc.enable()
import h5py
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Function for loading h5py file
def load_h5py(fname):
    """Read the whole ``'data'`` dataset out of an HDF5 file.

    The file is opened read-only and the dataset stored under the key
    ``'data'`` is sliced with ``[:]`` while the file is still open; the
    sliced result is returned after the file has been closed.
    """
    with h5py.File(fname, 'r') as h5_file:
        dataset = h5_file['data']
        contents = dataset[:]
    return contents
# Function for loading pickle file
def load_pickle(fname):
    """Unpickle and return the object stored in the file ``fname``.

    NOTE: unpickling can execute arbitrary code, so this should only be
    pointed at files produced by this project.
    """
    with open(fname, 'rb') as pkl_file:
        obj = pickle.load(pkl_file)
    return obj


# Function for setting up
def get_input(debug=False):
    '''
    Function for loading either debug or full datasets

    Switches the working directory to ``../data/compressed/``, loads the
    train/test matrices (h5py files) together with their id, index and
    target pickles and the feature names, then switches the working
    directory to ``../../scripts/``.

    debug : when True the ``debug_*`` files are read, otherwise the
            ``full_*`` files.

    Returns the tuple
    (fnames, train, id_train, train_idx, target, test, id_test, test_idx).

    If anything fails while loading, the working directory that was active
    on entry is restored before the exception propagates, so the cell can
    simply be run again.
    '''
    pkl_files = ['train_id.pickle', 'trainidx.pickle', 'target.pickle', 'test_id.pickle', 'testidx.pickle']
    # The debug and full variants share one layout and differ only in the
    # file prefix and in the progress message.
    if debug:
        prefix, label = 'debug', 'debug'
    else:
        prefix, label = 'full', 'original'

    start_dir = os.getcwd()
    os.chdir('../data/compressed/')
    try:
        print(os.getcwd())
        print('Loading %s train and test datasets...' % label)
        # h5py files
        train = load_h5py('%s_train.h5' % prefix)
        test = load_h5py('%s_test.h5' % prefix)
        # pickle files
        id_train, train_idx, target, id_test, test_idx = [
            load_pickle('%s_%s' % (prefix, f)) for f in pkl_files
        ]
        # Load feature names
        fnames = load_pickle('feature_names.pickle')
        # Find shape of loaded datasets
        print('Shape of training dataset: {} Rows, {} Columns'.format(*train.shape))
        print('Shape of test dataset: {} Rows, {} Columns'.format(*test.shape))
    except BaseException:
        # Without this the process would stay inside the data directory and
        # the relative chdir above would fail on the next attempt.
        os.chdir(start_dir)
        raise
    os.chdir('../../scripts/')
    print(os.getcwd())
    return fnames, train, id_train, train_idx, target, test, id_test, test_idx


# Function for getting datasets in dataframe format
def get_dataframes(debug=False):
    """Load the datasets via ``get_input`` and wrap them in DataFrames.

    The train frame gets the feature columns plus ``'ID'`` and ``'target'``;
    the test frame gets the feature columns plus ``'ID'``.

    Returns ``(fnames, train_df, test_df)``.
    """
    loaded = get_input(debug)
    fnames, trn_values, trn_ids, trn_index, trn_target = loaded[:5]
    tst_values, tst_ids, tst_index = loaded[5:]

    # Training frame: features, then ID, then target
    train_df = pd.DataFrame(data=trn_values, index=trn_index, columns=fnames)
    train_df['ID'] = trn_ids
    train_df['target'] = trn_target

    # Test frame: features, then ID
    test_df = pd.DataFrame(data=tst_values, index=tst_index, columns=fnames)
    test_df['ID'] = tst_ids

    trn_shape = train_df.shape
    tst_shape = test_df.shape
    print('\nShape of training dataframe: {} Rows, {} Columns'.format(*trn_shape))
    print('Shape of test dataframe: {} Rows, {} Columns'.format(*tst_shape))
    return fnames, train_df, test_df

In [None]:
# Function for loading leaks
def load_leaks(leak_val):
    """Read the ``compiled_leak`` column of the train and test leak files.

    The files are ``./time_series/stats/train_leak_<leak_val>.csv`` and
    ``./time_series/stats/test_leak_<leak_val>.csv``.

    Returns ``(train_leak, test_leak)`` as two pandas Series.
    """
    leak_dir = './time_series/stats/'

    def read_leak(file_name):
        # Only the compiled leak column of each file is used
        return pd.read_csv(leak_dir + file_name)['compiled_leak']

    train_leak = read_leak('train_leak_%s.csv'%leak_val)
    test_leak = read_leak('test_leak_%s.csv'%leak_val)
    return train_leak, test_leak


# Function for applying statistical transformations to data
def calculate_metadata(df):
    '''
    Function for calculating metadata across pandas dataframe row

    df : numeric DataFrame; every statistic is computed per row.

    Returns a DataFrame that shares ``df``'s index and has the columns
    nz_mean, nz_log_mean_exp, nz_median, nz_std, nz_kurtosis, nz_min
    (zeros excluded), sum, max, zero_count, mean, log_mean_exp, median,
    std and kurtosis (zeros included), in that order.

    All statistics are computed with vectorised ``axis=1`` reductions
    instead of one Python-level ``apply`` pass per statistic.  NaN entries
    are skipped by the reductions, so rows without any non-zero value get
    NaN for the ``nz_*`` columns.
    '''
    meta = pd.DataFrame(index=df.index)

    # Masking zeros with NaN lets the NaN-skipping reductions ignore them
    nonzero = df.where(df != 0)

    # Calculations that disregard zeros
    meta['nz_mean'] = nonzero.mean(axis=1)
    meta['nz_log_mean_exp'] = np.expm1(np.log1p(nonzero).mean(axis=1))
    meta['nz_median'] = nonzero.median(axis=1)
    meta['nz_std'] = nonzero.std(axis=1)
    meta['nz_kurtosis'] = nonzero.kurtosis(axis=1)
    meta['nz_min'] = nonzero.min(axis=1)

    # Calculations independent of zeros
    meta['sum'] = df.sum(axis=1)
    meta['max'] = df.max(axis=1)

    # Calculations factoring in zeros
    meta['zero_count'] = (df == 0).sum(axis=1)
    meta['mean'] = df.mean(axis=1)
    meta['log_mean_exp'] = np.expm1(np.log1p(df).mean(axis=1))
    meta['median'] = df.median(axis=1)
    meta['std'] = df.std(axis=1)
    meta['kurtosis'] = df.kurtosis(axis=1)

    return meta


# Function for feature engineering
def format_for_training_v1(train, test, f, trn_leak, tst_leak, trn_res, tst_res, lagval=38):
    '''
    - Formats train and test dataframes for training

    train, test      : dataframes holding the feature columns and 'ID'
    f, trn_leak,
    tst_leak         : accepted for interface compatibility; not used here
    trn_res, tst_res : dataframes whose 'target' column becomes the 'leak'
                       column (and, after log1p, the 'log_leak' column)
    lagval           : lag value used to pick the feature-score file

    Returns (train_subset, test_subset); both contain 'ID', the selected
    features, every column of the train metadata file and 'log_leak'.
    '''
    tmp_trn = train.copy(deep=True)
    tmp_trn['leak'] = trn_res['target'].values
    tmp_trn['log_leak'] = np.log1p(tmp_trn['leak'])

    tmp_tst = test.copy(deep=True)
    tmp_tst['leak'] = tst_res['target'].values
    tmp_tst['log_leak'] = np.log1p(tmp_tst['leak'])

    score_name = './model_data/model_2v2_featscores_%s.csv'%lagval
    print('Loading file:\n%s' % score_name)
    score_df = pd.read_csv(score_name, index_col=0)

    # Select good features
    # NOTE: the score table and threshold only feed the rmse-based
    # selection below, which is currently disabled in favour of the
    # pickled feature list.
    threshold = {36: 1.756, 37: 0.6255}.get(lagval, 0.625)  # default: lag 38
#     good_features = score_df.loc[score_df['rmse']<=threshold].index
    feature_list = load_pickle('./time_series/aaron_test_v0.pickle')
    # Concatenate entries 0..54 of the loaded feature list
    good_features = [feat for i in range(55) for feat in feature_list[i]]

    print('\nLoading metadata for training set...')
    trn_meta = pd.read_csv('./model_data/train_meta.csv')
    print('Loading metadata for test set...')
    tst_meta = pd.read_csv('./model_data/test_meta.csv')

    # Format training and test datasets
    cols = ['ID'] + list(good_features) + list(trn_meta.columns.values) + ['log_leak']
    # NOTE(review): concat aligns on the index; the metadata frames come
    # straight from read_csv, so this assumes train/test carry the same
    # 0..n-1 index - confirm against the index pickles.
    tmp_trn = pd.concat([tmp_trn, trn_meta], axis=1)
    tmp_tst = pd.concat([tmp_tst, tst_meta], axis=1)

    return tmp_trn[cols], tmp_tst[cols]



# Function for scaling datasets 
def scale_for_training(train, test):
    '''
    Standardise the train and test feature matrices.

    The 'ID' column is dropped and NaN is replaced by 0 in both frames.
    The scaler is fitted on the training data only and then applied to
    both sets.  The input frames are left untouched.

    Returns (scaled_train, scaled_test) as produced by the scaler.
    '''
    print('Scaling data...')
    # drop()/replace() already return new frames, so no explicit deep copy
    # of the inputs is needed to keep them unmodified.
    trn_features = train.drop(labels=['ID'], axis=1).replace(np.nan, 0)
    tst_features = test.drop(labels=['ID'], axis=1).replace(np.nan, 0)

    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(trn_features)
    scaled_test = scaler.transform(tst_features)
    return scaled_train, scaled_test

In [None]:
# Main Script
# Drop frames left over from a previous run of this cell before reloading
try:
    del fnames, train, test
    print('Clearing loaded dataframes from memory...\n')
except NameError:
    # First run: nothing has been loaded yet
    pass
fnames, train, test = get_dataframes(debug=False)

# Load leaks
leak_val = 38
print('\nLoading train and test leaks...')
train_leak, test_leak = load_leaks(leak_val)
print('Nonzero elements in train: {}'.format(np.count_nonzero(train_leak)))
print('Nonzero elements in test: {}'.format(np.count_nonzero(test_leak)))

# Load good result
print('\nLoading good results for train and test predictions...')

tst_res_name = './model_data/tstest_lgb_2v2_lag38_fullfeat_test.csv'
trn_res_name = './model_data/tstest_lgb_2v2_lag38_fullfeat_train.csv'

tst_res = pd.read_csv(tst_res_name)
trn_res = pd.read_csv(trn_res_name)
print('Shape of test results import: {}'.format(tst_res.shape))
print('Shape of train results import: {}'.format(trn_res.shape))

In [None]:
# Get train and test in format for training booster
# Format target variable (the model is trained on log1p of the target)
target = train['target'].values
target_log = np.log1p(target)

# pp_flag=True regenerates the formatted datasets and saves them;
# pp_flag=False reloads the previously saved CSV files instead.
pp_flag = True
train_name = './model_data/btrain_fullfeat_%s.csv'%leak_val
test_name = './model_data/btest_fullfeat_%s.csv'%leak_val
if pp_flag:
    btrain, btest = format_for_training_v1(train, test, fnames, train_leak, test_leak, trn_res, tst_res, leak_val)
    print('\nSaving generated datasets...')
    btrain.to_csv(train_name, index=False)
    btest.to_csv(test_name, index=False)
else:
    print('\nLoading generated datasets...')
    btrain = pd.read_csv(train_name)
    btest = pd.read_csv(test_name)

# Scale dataset for booster training
boost_train, boost_test = scale_for_training(btrain, btest)

In [None]:
# Function for training a CatBoost Regressor
def train_catboost_regressor(train, target, test, params):
    '''
    Function for training a catboost regressor

    Fits one model per fold of an unshuffled 4-fold split and averages the
    per-fold predictions.

    train  : 2-D array of training features (indexed as train[rows, :])
    target : 1-D array of training targets
    test   : 2-D array of test features
    params : keyword arguments for CatBoostRegressor.  The dict is NOT
             modified; each fold works on a copy whose 'random_seed' is
             set to the fold number.

    Returns (trn_errors, val_errors, test_predictions, train_predictions):
    per-fold RMSE arrays followed by the fold-averaged predictions.  Note
    that the "train" error and predictions are computed on the full
    `train` array, i.e. they include that fold's validation rows.
    '''
    kfold = KFold(n_splits=4)
    n_folds = kfold.n_splits

    test_predictions = np.zeros(test.shape[0])
    train_predictions = np.zeros(train.shape[0])
    val_errors = np.zeros(n_folds)
    trn_errors = np.zeros(n_folds)

    for i, (trn, val) in enumerate(kfold.split(train)):
        # Copy so the per-fold seed does not leak into the caller's dict
        fold_params = dict(params)
        fold_params['random_seed'] = i
        print('\nTraining on fold: %s' % i)

        xtrain, ytrain = train[trn, :], target[trn]
        xval, yval = train[val, :], target[val]

        train_pool = Pool(xtrain, ytrain)
        validate_pool = Pool(xval, yval)

        model = CatBoostRegressor(**fold_params)
        model.fit(train_pool, eval_set=validate_pool)

        # Make predictions
        test_predictions += model.predict(test)

        # Get validation and train errors
        valid_pred = model.predict(xval)
        train_pred = model.predict(train)
        train_predictions += train_pred
        val_errors[i] = np.sqrt(mean_squared_error(yval, valid_pred))
        trn_errors[i] = np.sqrt(mean_squared_error(target, train_pred))

    # Average over cv folds
    test_predictions /= n_folds
    train_predictions /= n_folds

    return trn_errors, val_errors, test_predictions, train_predictions


# Positions of the training rows whose compiled leak is zero; these are
# the "trouble" samples scored separately further down.
trouble_idx = np.where(train_leak==0)[0]


def selective_eval(preds, target, idx):
    """Return the mean squared error restricted to the entries at ``idx``.

    ``preds`` and ``target`` are both indexed with ``idx`` before being
    compared.  The value is the plain MSE; no square root is taken.
    """
    return mean_squared_error(preds[idx], target[idx])

In [None]:
# Keyword arguments handed to CatBoostRegressor by the training function.
# 'random_seed' is only a placeholder: it is set per fold during training.
params = dict(
    iterations=30000,
    random_seed=0,
    loss_function='RMSE',
    eval_metric='RMSE',
    # overfitting-detector settings
    od_type='Iter',
    od_wait=20,
    learning_rate=0.043,
    depth=6,
    bagging_temperature=9,
    l2_leaf_reg=0.113,
    verbose=1000,
    use_best_model=True,
)

In [None]:
# Tune CatBoost Regressor Hyper-Parameters
# Set tune_flag to True to sweep one hyper-parameter (cv_entry) over the
# candidate values in cv_set.
tune_flag = False
if tune_flag:
    cv_results = {}

    cv_entry = 'l2_leaf_reg'
    cv_set = [0.113, 0.115, 0.117]

    # Alternative sweeps (swap in for the two lines above):
#     cv_entry = 'learning_rate'
#     cv_set = [0.042, 0.043, 0.044]

#     cv_entry = 'depth'
#     cv_set = [9, 10, 11]

#     cv_entry = 'bagging_temperature'
#     cv_set = [1, 5, 10]

    for cv_val in cv_set:
        print('\nCross validating with %s: %s'%(cv_entry, cv_val))
        # Sweep on a copy so the last candidate does not silently become
        # the value used by the final training run.
        trial_params = dict(params)
        trial_params[cv_entry] = cv_val
        train_err, valid_err, tst_preds, trn_preds = train_catboost_regressor(train=boost_train,
                                                                              target=target_log,
                                                                              test=boost_test,
                                                                              params=trial_params)
        # (error on zero-leak rows, mean train RMSE, mean validation RMSE)
        cv_results[cv_entry+'_miss_train_val_%s'%cv_val] = (selective_eval(trn_preds, target_log, trouble_idx),
                                                            np.mean(train_err),
                                                            np.mean(valid_err))

    # Show cv results
    print('\nCV Results:')
    for key, scores in cv_results.items():
        print('{} {}'.format(key, scores))

In [None]:
# Train the final model with the chosen parameters and report the scores
train_err, valid_err, tst_preds, trn_preds = train_catboost_regressor(train=boost_train,
                                                                      target=target_log,
                                                                      test=boost_test,
                                                                      params=params)
print('Train score: {}'.format(np.mean(train_err)))
print('Validation score: {}'.format(np.mean(valid_err)))
# Error restricted to the training rows whose leak is zero
print('\nTrouble score: {}'.format(selective_eval(trn_preds, target_log, trouble_idx)))

In [None]:
def make_target_v1(leak, preds):
    """Fill the zero entries of ``leak`` with model predictions.

    leak  : 1-D numpy array or pandas Series; entries equal to 0 are the
            ones to be filled.
    preds : 1-D array of predictions in log1p space, same length as
            ``leak``; they are mapped back with ``expm1`` before use.

    Returns a copy of ``leak`` (same type, and same index for a Series)
    with the zero entries replaced.  ``leak`` itself is not modified.

    A positional boolean mask is used for the assignment, so the result is
    correct for a Series whose index is not the default 0..n-1 range.
    """
    exp_preds = np.expm1(np.asarray(preds))

    # Plain ndarray mask: selects by position for arrays and Series alike
    fill_mask = np.asarray(leak == 0)

    tmp_leak = leak.copy()
    tmp_leak[fill_mask] = exp_preds[fill_mask]
    return tmp_leak

In [None]:
# Make submission
# Three files are produced when save_flag is set: the submission itself
# (leak values, with predictions wherever the leak is zero) plus the pure
# model predictions for the test and the training set.
save_flag=True

# Output locations
sub_name_tst = '../submissions/tstest_cat_2v3_lag%s_fullfeat_submit.csv'%leak_val
sub_name_ori = './model_data/tstest_cat_2v3_lag%s_fullfeat_test.csv'%leak_val
sub_name_trn = './model_data/tstest_cat_2v3_lag%s_fullfeat_train.csv'%leak_val

# Targets: predictions are mapped back from log1p space with expm1
tst_target = make_target_v1(test_leak, tst_preds)
ori_target = np.expm1(tst_preds)
trn_target = np.expm1(trn_preds)

# Submission frame
tst_df = pd.DataFrame()
tst_df['ID'] = test['ID']
tst_df['target'] = tst_target

# Pure test predictions
ori_df = pd.DataFrame()
ori_df['ID'] = test['ID']
ori_df['target'] = ori_target

# Pure train predictions
trn_df = pd.DataFrame()
trn_df['ID'] = train['ID']
trn_df['target'] = trn_target

if save_flag:
    tst_df.to_csv(sub_name_tst, index=False)
    ori_df.to_csv(sub_name_ori, index=False)
    trn_df.to_csv(sub_name_trn, index=False)

tst_df.head(15)

In [None]:
def root_mean_square_log_diff(sub1, sub2):
    """Return the root mean square difference of two inputs in log1p space."""
    log_first = np.log1p(sub1)
    log_second = np.log1p(sub2)
    squared_gap = np.square(np.subtract(log_first, log_second))
    return np.sqrt(np.mean(squared_gap))

def sum_abs_diff(sub1, sub2):
    """Return the sum of the absolute element-wise differences."""
    gap = np.subtract(sub1, sub2)
    return np.sum(np.abs(gap))

# Row positions split by whether the test leak is non-zero ("right") or
# zero ("wrong").
right_idx = np.where(test_leak!=0)[0]
wrong_idx = np.where(test_leak==0)[0]

# Current submission
sub_right = tst_df['target'].values[right_idx]
sub_wrong = tst_df['target'].values[wrong_idx]

# Earlier submissions, each split the same way
fri_df = pd.read_csv('../submissions/tstest_lgb_2v2_lag36_submit_0.52.csv')
fri_right = fri_df['target'].values[right_idx]
fri_wrong = fri_df['target'].values[wrong_idx]

sat_df = pd.read_csv('../submissions/tstest_lgb_2v2_lag38_bad_submit_0.52.csv')
sat_right = sat_df['target'].values[right_idx]
sat_wrong = sat_df['target'].values[wrong_idx]

sun_df = pd.read_csv('../submissions/tstest_lgb_2v2_lag38_good_submit.csv')
sun_right = sun_df['target'].values[right_idx]
sun_wrong = sun_df['target'].values[wrong_idx]

tmp_df = pd.read_csv('../submissions/tstest_cat_2v3_lag38_submit.csv')
tmp_right = tmp_df['target'].values[right_idx]
tmp_wrong = tmp_df['target'].values[wrong_idx]

In [None]:
# Current submission vs. the tmp_df submission: first the rows with a
# non-zero test leak, then the rows with a zero test leak.
print('Sum abs diff:')
print(sum_abs_diff(sub_right, tmp_right))
print(sum_abs_diff(sub_wrong, tmp_wrong))

print('\nRoot mean square log diff:')
print(root_mean_square_log_diff(sub_right, tmp_right))
print(root_mean_square_log_diff(sub_wrong, tmp_wrong))

In [None]:
# Current submission vs. the sun_df submission: first the rows with a
# non-zero test leak, then the rows with a zero test leak.
print('Sum abs diff:')
print(sum_abs_diff(sub_right, sun_right))
print(sum_abs_diff(sub_wrong, sun_wrong))

print('\nRoot mean square log diff:')
print(root_mean_square_log_diff(sub_right, sun_right))
print(root_mean_square_log_diff(sub_wrong, sun_wrong))

In [None]:
def load_lexicons():
    """Load the pickled train and test lexicons from ``./model_data/``.

    Returns ``(lexi_train, lexi_test)``.
    """
    train_path = './model_data/lexi_train.pickle'
    test_path = './model_data/lexi_test.pickle'
    return load_pickle(train_path), load_pickle(test_path)


def preds_to_lexicon(preds, lexicon):
    """Snap every prediction to the closest value in ``lexicon``.

    preds   : 1-D array of predictions
    lexicon : 1-D array of allowed values (must not be empty)

    Returns an array with one lexicon value per prediction.  When two
    lexicon values are equally close, the one that comes first in
    ``lexicon`` is chosen.
    """
    preds = np.asarray(preds)
    lexicon = np.asarray(lexicon)

    # Broadcasting gives the (n_preds, n_lexicon) distance table directly,
    # without building an explicit matrix of ones and transposing twice.
    distance = np.abs(lexicon[np.newaxis, :] - preds[:, np.newaxis])
    nearest = np.argmin(distance, axis=1)

    return lexicon[nearest]


def evaluate_lexicon(data, target, leak, preds, lexi):
    """Build a per-row table comparing raw and lexicon-snapped predictions.

    The returned frame has the columns ID, target, leak, pred, lexi_pred,
    SLE (squared log1p error of the raw prediction), lexi_SLE (same for the
    snapped prediction) and SLE_diff (absolute difference of the two).
    """
    snapped = preds_to_lexicon(preds, lexi)

    lexi_df = pd.DataFrame()
    for column, values in (('ID', data['ID']),
                           ('target', target),
                           ('leak', leak),
                           ('pred', preds),
                           ('lexi_pred', snapped)):
        lexi_df[column] = values

    # Squared errors measured in log1p space
    log_target = np.log1p(lexi_df['target'])
    lexi_df['SLE'] = np.square(log_target - np.log1p(lexi_df['pred']))
    lexi_df['lexi_SLE'] = np.square(log_target - np.log1p(lexi_df['lexi_pred']))
    lexi_df['SLE_diff'] = np.abs(lexi_df['SLE'] - lexi_df['lexi_SLE'])

    return lexi_df


def get_errors(lexi):
    """Print the root mean 'SLE' and 'lexi_SLE' over the zero-leak rows.

    lexi : dataframe with the columns 'leak', 'SLE' and 'lexi_SLE'.

    Only rows whose 'leak' equals 0 are taken into account.  Returns None.
    """
    zero_leak = lexi['leak'] == 0

    pred_error = np.sqrt(np.mean(lexi.loc[zero_leak, 'SLE']))
    lexi_error = np.sqrt(np.mean(lexi.loc[zero_leak, 'lexi_SLE']))

    print('Trouble error w/o lexicon: %s' % pred_error)
    print('Trouble error w/ lexicon: %s' % lexi_error)
    return None


# Load the pickled train and test lexicons
lexi_train, lexi_test = load_lexicons()

# Evaluate the lexicon on the training set: the log-space predictions are
# mapped back with expm1 before being compared against the raw targets,
# then the errors on the zero-leak rows are printed.
train_lexi_eval = evaluate_lexicon(train, target, train_leak, np.expm1(trn_preds), lexi_train)
get_errors(train_lexi_eval)

In [None]:
# Show the evaluation table with the largest 'SLE_diff' rows first
train_lexi_eval.sort_values(by='SLE_diff', ascending=False)