# Model 2v0: Feature Scoring and LGBM with Time Series
This model is based on the public kernel: https://www.kaggle.com/ogrellier/feature-scoring-vs-zeros

This model uses machine learning to supplement time-series predictions. A key feature of this model is that, wherever the time-series reconstruction process produced a value, that value replaces the corresponding prediction generated by LightGBM. The approach therefore prioritizes the time-series reconstruction's results and uses LightGBM only to fill in the values that the reconstruction was unable to obtain.

In [1]:
# Load libraries
import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pdb
import os
import h5py
import pickle

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [2]:
# Debug flag: when True, get_dataframes() loads the 'debug_*' files instead of the 'full_*' files
debug = False
# Feature-score flag: when True, the scores are recomputed with feature_score() and saved;
# when False, previously saved scores are loaded from disk
get_scores = False

## Helper Functions:

In [3]:
# Function for loading h5py file
def load_h5py(fname):
    '''
    Read the whole 'data' dataset of the HDF5 file `fname` into memory and
    return it. The file is opened read-only and closed before returning.
    '''
    with h5py.File(fname, 'r') as h5_file:
        contents = h5_file['data'][:]
    return contents
# Function for loading pickle file
def load_pickle(fname):
    '''
    Return the object stored in the pickle file `fname`.
    Unpickling can execute arbitrary code, so only use this on trusted files.
    '''
    with open(fname, 'rb') as stream:
        obj = pickle.load(stream)
    return obj
# Function for saving pickle file
def save_pickle(fname, data):
    '''
    Serialize `data` to the file `fname` (overwriting it) using the highest
    pickle protocol available. Returns None.
    '''
    with open(fname, 'wb') as sink:
        pickle.dump(data, sink, protocol=pickle.HIGHEST_PROTOCOL)

In [4]:
# Function for setting up
def get_input(debug=False):
    '''
    Function for loading either debug or full datasets

    Changes into '../data/compressed/', loads the train/test arrays (h5py),
    the id/index/target pickles and the feature names, then changes into
    '../../scripts/' before returning.

    debug: when True the 'debug_*' files are read, otherwise the 'full_*' files.

    Returns the tuple
    (fnames, train, id_train, train_idx, target, test, id_test, test_idx).

    If anything fails while loading, the working directory that was current
    on entry is restored before the error propagates, so later relative
    paths are not silently resolved against the data directory.
    '''
    pkl_files = ['train_id.pickle', 'trainidx.pickle', 'target.pickle', 'test_id.pickle', 'testidx.pickle']
    # The two dataset flavours differ only in file prefix and log message
    if debug:
        prefix, label = 'debug', 'debug'
    else:
        prefix, label = 'full', 'original'

    start_dir = os.getcwd()
    os.chdir('../data/compressed/')
    try:
        print(os.getcwd())
        print('Loading %s train and test datasets...' % label)
        # h5py files
        train = load_h5py('%s_train.h5' % prefix)
        test = load_h5py('%s_test.h5' % prefix)
        # pickle files
        id_train, train_idx, target, id_test, test_idx = [
            load_pickle('%s_%s' % (prefix, f)) for f in pkl_files]
        # Load feature names
        fnames = load_pickle('feature_names.pickle')
        # Find shape of loaded datasets
        print('Shape of training dataset: {} Rows, {} Columns'.format(*train.shape))
        print('Shape of test dataset: {} Rows, {} Columns'.format(*test.shape))
    except BaseException:
        # Undo the chdir on any failure (including interrupts), then re-raise
        os.chdir(start_dir)
        raise
    os.chdir('../../scripts/')
    print(os.getcwd())
    return fnames, train, id_train, train_idx, target, test, id_test, test_idx

In [5]:
# Function for getting datasets in dataframe format
def get_dataframes(debug=False):
    '''
    Load the datasets through get_input(debug) and wrap them in DataFrames.

    Both frames use the feature names as columns and the loaded index arrays
    as row index, plus an 'ID' column; the training frame additionally gets
    a 'target' column.

    Returns (fnames, train_df, test_df).
    '''
    fnames, train, id_train, train_idx, target, test, id_test, test_idx = get_input(debug)

    def build_frame(values, index, ids):
        # Feature matrix first, then the ID column appended at the end
        frame = pd.DataFrame(data=values, index=index, columns=fnames)
        frame['ID'] = ids
        return frame

    train_df = build_frame(train, train_idx, id_train)
    train_df['target'] = target
    test_df = build_frame(test, test_idx, id_test)

    print('\nShape of training dataframe: {} Rows, {} Columns'.format(*train_df.shape))
    print('Shape of test dataframe: {} Rows, {} Columns'.format(*test_df.shape))
    return fnames, train_df, test_df

In [6]:
# Function for calculating ROOT mean squared error
def rmse(y_true, y_pred):
    '''
    Return the square root of sklearn's mean_squared_error(y_true, y_pred).
    '''
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

## Main Script:

In [7]:
# Drop any dataframes left over from a previous run of this cell before
# reloading. On a first run the names do not exist yet, which raises
# NameError; that is the only error expected here, so nothing else is
# swallowed.
try:
    del fnames, train, test
    print('Clearing loaded dataframes from memory...\n')
except NameError:
    pass
fnames, train, test = get_dataframes(debug=debug)

/Users/cheng-haotai/Projects_Data/santander-value-prediction/data/compressed
Loading original train and test datasets...
Shape of training dataset: 4459 Rows, 4991 Columns
Shape of test dataset: 49342 Rows, 4991 Columns
/Users/cheng-haotai/Projects_Data/santander-value-prediction/scripts

Shape of training dataframe: 4459 Rows, 4993 Columns
Shape of test dataframe: 49342 Rows, 4992 Columns


In [8]:
# Locations of the leak CSV files for the train and test sets
leak_path = './time_series/stats/'
path_train_leak = os.path.join(leak_path, 'train_leak.csv')
path_test_leak = os.path.join(leak_path, 'test_leak.csv')

In [9]:
# Add train leak
# Reads the 'compiled_leak' column, turns missing values into 0.0, and stores
# it as 'leak' together with its log1p transform 'log_leak'.
# NOTE(review): assigning a Series to a DataFrame column aligns on index labels.
# `train` is indexed by train_idx (see get_dataframes) while read_csv produces a
# default integer index -- confirm the labels match, otherwise unmatched rows
# silently become NaN after the NaN->0.0 replacement has already been applied.
train_leak = pd.read_csv(path_train_leak)
train['leak'] = train_leak['compiled_leak'].replace(np.nan, 0.0)
train['log_leak'] = np.log1p(train['leak'].values)

In [10]:
# Add test leak
# Reads the 'compiled_leak' column, turns missing values into 0.0, and stores
# it as 'leak' together with its log1p transform 'log_leak'.
# NOTE(review): assigning a Series to a DataFrame column aligns on index labels.
# `test` is indexed by test_idx (see get_dataframes) while read_csv produces a
# default integer index -- confirm the labels match, otherwise unmatched rows
# silently become NaN after the NaN->0.0 replacement has already been applied.
test_leak = pd.read_csv(path_test_leak)
test['leak'] = test_leak['compiled_leak'].replace(np.nan, 0.0)
test['log_leak'] = np.log1p(test['leak'].values)

In [11]:
# Isolate and format target
# The models below are fit on log1p(target); test predictions are mapped back
# with expm1 when the submission is written.
target = np.log1p(train['target'].values)

### Feature Scoring using XGBoost with Leak Feature:

In [12]:
# Function for finding feature scores
def feature_score(num_splits=5):
    '''
    Score every feature in `fnames` by how well it predicts `target` when
    paired with the 'log_leak' column.

    For each feature an XGBRegressor is fit on ['log_leak', feature] in each
    of `num_splits` shuffled KFold splits of `train` (random_state=0), with
    early stopping on the held-out fold. The feature's score is the mean of
    the held-out RMSE over the folds.

    num_splits: number of KFold splits.

    Returns a list of (feature_name, mean_rmse) tuples in `fnames` order.
    '''
    # Initialize XGBRegressor object (re-fit for every feature/fold pair)
    reg = xgb.XGBRegressor(n_estimators=1000)

    folds = KFold(n_splits=num_splits, shuffle=True, random_state=0)
    # Materialize the splits once so every feature is scored on identical folds
    fold_idx = list(folds.split(train))

    scores = []

    for f in fnames:
        # The two-column selection does not depend on the fold: slice it once
        pair = train[['log_leak', f]]
        score = 0
        for trn, val in fold_idx:
            X_val = pair.iloc[val]
            y_val = target[val]
            reg.fit(X = pair.iloc[trn],
                    y = target[trn],
                    eval_set = [(X_val, y_val)],
                    eval_metric = 'rmse',
                    early_stopping_rounds = 50,
                    verbose=False)
            # Predict with the number of trees chosen by early stopping
            val_pred = reg.predict(data=X_val, ntree_limit=reg.best_ntree_limit)
            score += rmse(y_val, val_pred) / folds.n_splits
        scores.append((f, score))

    return scores

In [13]:
# Feature scores are expensive to compute, so they are cached on disk
score_name = './model_data/model_2v0_featscores.pickle'
if not get_scores:
    # Reuse the previously saved scores
    scores = load_pickle(score_name)
else:
    # Recompute the scores and refresh the cache
    scores = feature_score(num_splits=5)
    save_pickle(score_name, scores)

In [14]:
# Tabulate the (feature, rmse) pairs, indexed by feature, best (lowest) RMSE first
score_df = pd.DataFrame(data=scores, columns=['feature', 'rmse'])
score_df = score_df.set_index('feature').sort_values(by='rmse', ascending=True)

In [15]:
# Keep the features whose cross-validated RMSE does not exceed the threshold
threshold = 0.7925
is_good = score_df['rmse'] <= threshold
good_features = score_df.loc[is_good].index
good_rmse = score_df.loc[is_good, 'rmse'].values

In [16]:
# Hand-written list of 22 feature names; the next cell counts how many of them
# were also selected into good_features.
# NOTE(review): the origin of this list is not shown here -- document where it comes from.
tmp = ['6eef030c1', 'ba42e41fa', '703885424', 'eeb9cd3aa', '3f4a39818',
       '371da7669', 'b98f3e0d7', 'fc99f9426', '2288333b4', '324921c7b',
       '66ace2992', '84d9d1228', '491b9ee45', 'de4e75360', '9fd594eec',
       'f190486d6', '62e59a501', '20aa07010', 'c47340d97', '1931ccfdd',
       'c2dae3a5a', 'e176a204a']

In [17]:
# Number of names in tmp that also appear in good_features
len(np.intersect1d(good_features, tmp))

21

In [18]:
# Number of features that passed the RMSE threshold
len(good_features)

57

### Train LightGBM:

In [19]:
# Function for calculating row-wise metadata
def add_metadata(df, feature_names=None):
    '''
    Add row-wise summary statistics of the feature columns to `df`.

    `df` is modified IN PLACE and also returned. Every 0 in the whole frame
    (not only in the feature columns) is first replaced by NaN, so zeros are
    treated as missing by all statistics below; this also turns zero 'leak'
    values into NaN, which the later leak-override cells rely on via notnull().

    df: DataFrame containing at least the feature columns.
    feature_names: columns to summarize; defaults to the module-level `fnames`.

    Columns added:
      log_of_mean   -- log1p of the row mean
      mean_of_log   -- row mean of log1p(values)
      log_of_median -- log1p of the row median
      num_nans      -- number of missing (originally zero or NaN) features
      sum           -- log1p of the row sum
      std           -- row standard deviation
      kurtosis      -- row kurtosis
    '''
    if feature_names is None:
        feature_names = fnames
    df.replace(0, np.nan, inplace=True)
    # Slice the feature block once. After the replace above it contains no
    # zeros (and log1p(x) is 0 only for x == 0), so repeating the 0 -> NaN
    # replacement for each statistic would be a costly no-op.
    feats = df[feature_names]
    # Calculate new metadata
    df['log_of_mean'] = np.log1p(feats.mean(axis=1))
    df['mean_of_log'] = np.log1p(feats).mean(axis=1)
    df['log_of_median'] = np.log1p(feats.median(axis=1))
    df['num_nans'] = feats.isnull().sum(axis=1)
    df['sum'] = np.log1p(feats.sum(axis=1))
    df['std'] = feats.std(axis=1)
    df['kurtosis'] = feats.kurtosis(axis=1)
    return df

In [20]:
# Build the modelling frames: add_metadata mutates its argument, so it is
# given copies and the original train/test frames stay untouched
print('\nAdding metadata to train set...')
data_train = add_metadata(train.copy())
print('Adding metadata to test set...\n')
data_test = add_metadata(test.copy())


Adding metadata to train set...
Adding metadata to test set...



In [21]:
# Give the test set a placeholder target column
data_test['target'] = 0
# Features used to train LGBM: the selected raw features, then the leak and
# the row-wise metadata columns
flist = good_features.tolist()
flist.extend(['log_leak', 'log_of_mean', 'mean_of_log', 'log_of_median',
              'num_nans', 'sum', 'std', 'kurtosis'])

In [22]:
# Function for training and evaluating LightGBM
def run_lgb(lgb_params):
    '''
    Train LightGBM with 5-fold cross-validation on data_train[flist] against
    `target` and predict data_test[flist].

    lgb_params: parameter dict passed to lgb.train.

    Returns (test_pred, train_pred, mean_val_error) where
      test_pred      -- test predictions averaged over the folds,
      train_pred     -- out-of-fold predictions for every training row,
      mean_val_error -- mean of the per-fold validation RMSEs.
    '''
    folds = KFold(n_splits=5, shuffle=True, random_state=0)
    # The column selections do not change between folds: slice them once
    X_train = data_train[flist]
    X_test = data_test[flist]
    # Initialize placeholder prediction vectors
    test_pred = np.zeros(X_test.shape[0])
    train_pred = np.zeros(X_train.shape[0])
    val_error = np.zeros(folds.n_splits)

    # Iterate through folds
    for i, (trn, val) in enumerate(folds.split(X_train)):
        X_val = X_train.iloc[val]
        # Define train and val for current fold
        lgb_train = lgb.Dataset(X_train.iloc[trn], label=target[trn])
        lgb_eval = lgb.Dataset(X_val, label=target[val])

        # Train regressor (early stopping monitors the held-out fold)
        reg = lgb.train(params=lgb_params,
                        train_set = lgb_train,
                        valid_sets = lgb_eval,
                        num_boost_round = 10000,
                        early_stopping_rounds = 100,
                        verbose_eval = 0)

        # Get out-of-fold training predictions
        train_pred[val] = reg.predict(X_val)
        # Accumulate this fold's share of the averaged test predictions
        test_pred += reg.predict(X_test) / folds.n_splits
        # Get validation error
        val_error[i] = rmse(target[val], train_pred[val])
        print('Validation error for fold %s is: %f' % (i, val_error[i]))

    # Print validation error
    mean_val_error = val_error.mean()
    print('Overall validation error: %s' % mean_val_error)

    return test_pred, train_pred, mean_val_error

In [23]:
# Define LGBM training set
# NOTE(review): dtrain is not referenced by run_lgb, which builds its own
# per-fold Datasets -- confirm whether it is still needed.
dtrain = lgb.Dataset(data=data_train[flist],
                     label=target, free_raw_data=True)

# LightGBM hyper-parameters; several are specified as powers of ten
lgb_params = dict(
    objective='regression',
    metric='mean_squared_error',
    boosting_type='gbdt',
    random_seed=3,
    verbose=-1,
    learning_rate=0.05,
    num_leaves=58,
    subsample=0.6143,
    colsample_bytree=0.6453,
    min_split_gain=np.power(10, -2.5988),
    reg_alpha=np.power(10, -2.2887),
    reg_lambda=np.power(10, 1.7570),
    min_child_weight=np.power(10, -0.1477),
    max_depth=-1,
)

# Train LightGBM with cross-validation
test_pred, train_pred, v_acc = run_lgb(lgb_params)

Validation error for fold 0 is: 0.647939
Validation error for fold 1 is: 0.681281
Validation error for fold 2 is: 0.620527
Validation error for fold 3 is: 0.729893
Validation error for fold 4 is: 0.605220
Overall validation error: 0.6569718036541922


### Evaluate Results and Save Submission:

In [24]:
# Score the out-of-fold predictions, with and without the leak override
data_train['predictions'] = train_pred
# Rows with a non-null leak take log1p(leak) in place of the model prediction
train_has_leak = data_train['leak'].notnull()
data_train.loc[train_has_leak, 'predictions'] = np.log1p(data_train.loc[train_has_leak, 'leak'])
print('Train score: %s' % (mean_squared_error(target, train_pred)**.5))
print('Train score with leak: %s' % (mean_squared_error(target, data_train['predictions'])**.5))

Train score: 0.6585031116467885
Train score with leak: 0.651826372560223


In [25]:
# Write the submission file
sub_name = '../submissions/ts_lgb_2v0_submit.csv'

# Predictions were made on the log1p scale, so map them back with expm1
data_test['target'] = np.expm1(test_pred)
# Rows with a non-null leak take the leak value in place of the model prediction
test_has_leak = data_test['leak'].notnull()
data_test.loc[test_has_leak, 'target'] = data_test.loc[test_has_leak, 'leak']
submission = data_test[['ID', 'target']]
submission.to_csv(sub_name, index=False, float_format='%.2f')