# Stage 0 Model: Encoding with CatBoost
Inspired by: https://www.kaggle.com/tanreinama/catboost-stackedae-with-mxnet-meta-1-40lb

In [None]:
# Load libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import pickle as pkl

from catboost import CatBoostRegressor

from sklearn.model_selection import KFold
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

import time
import os
import h5py

## Helper Functions:

In [None]:
# Load h5py file
def loadh5(fname, dname):
    """Read one whole dataset from an HDF5 file into memory.

    Args:
        fname: Path of the HDF5 file; it is opened read-only.
        dname: Name of the dataset inside the file.

    Returns:
        The complete contents of the dataset, obtained by slicing it
        with ``[:]``.
    """
    # The context manager closes the file even when the dataset lookup or
    # the read raises, which the explicit close() call did not guarantee.
    with h5py.File(fname, 'r') as h5f:
        return h5f[dname][:]

In [None]:
# Load pickle file
def loadpickle(fname):
    """Return the object stored in the pickle file at ``fname``.

    The file is opened in binary mode and closed again before returning.
    NOTE: unpickling can run arbitrary code, so only load trusted files.
    """
    with open(fname, 'rb') as pickle_file:
        return pkl.load(pickle_file)

In [None]:
# Row-wise metadata generator
def generate_meta(df):
    """Compute row-wise summary statistics over the non-zero entries of ``df``.

    Args:
        df: Numeric DataFrame; every row is summarised independently.

    Returns:
        DataFrame indexed like ``df`` with one column per statistic:
        ``nonsparse_count`` (number of entries in the row that are not
        equal to zero, as float) and ``sum``, ``mean``, ``std``, ``median``,
        ``max``, ``min``, ``var`` taken over the non-zero entries only.
    """
    # Entries that differ from zero; computed once and reused below.
    is_nonzero = df != 0
    # Mask the zeros as NaN a single time. The pandas reductions skip NaN by
    # default, so each statistic only sees the non-zero values of a row.
    nonzero = df[is_nonzero]
    meta = pd.DataFrame({
        # Cast to float so the column keeps a floating-point dtype.
        'nonsparse_count': is_nonzero.sum(axis=1).astype(float),
        'sum': nonzero.sum(axis=1),
        'mean': nonzero.mean(axis=1),
        'std': nonzero.std(axis=1),
        'median': nonzero.median(axis=1),
        'max': nonzero.max(axis=1),
        'min': nonzero.min(axis=1),
        'var': nonzero.var(axis=1)})
    return meta

## Load Data:

In [None]:
data_path = '../data/'
train_dname = 'train_s0'
test_dname = 'test_s0'
f_ext = '_vanilla.h5'

load_start = time.time()
# Single-argument print in parentheses runs unchanged on Python 2 and 3.
print('Loading Stage 0 vanilla train and test datasets...')
# Load h5py data (the dataset inside each file is named after the file stem)
train_data = loadh5(data_path + train_dname + f_ext, train_dname)
test_data = loadh5(data_path + test_dname + f_ext, test_dname)
# Load dataframe indexes
train_idx = loadpickle(data_path + 'train_idx.pkl')
test_idx = loadpickle(data_path + 'test_idx.pkl')
# Load dataframe column names
train_cols = loadpickle(data_path + 'train_cols.pkl')
test_cols = loadpickle(data_path + 'test_cols.pkl')

# Rebuild the dataframes from the raw arrays plus their saved index/columns
train_df = pd.DataFrame(data=train_data, index=train_idx, columns=train_cols)
test_df = pd.DataFrame(data=test_data, index=test_idx, columns=test_cols)

print('Loading completed in %s seconds' % (time.time() - load_start))

In [None]:
# Row-wise metadata CSV, stored next to the input data
meta_path = data_path + 'meta_stage0v1.csv'
# Autoencoder data CSV: <autodir_path>data_<auto_type>.csv
autodir_path = './autoencoder/data/'
auto_type = '155'
autodata_path = ''.join([autodir_path, 'data_', auto_type, '.csv'])

### Some data preprocessing:

In [None]:
# Labels: log1p-transform the target column. pop() returns the column and
# removes it from train_df in place, leaving only feature columns behind.
Y = np.log1p(train_df.pop('target').values)

In [None]:
# Create master dataset (train rows first, then test rows, fresh 0..n-1 index)
X = pd.concat([train_df, test_df], axis=0, ignore_index=True)
# Scale every column by its maximum over the combined data.
# A column whose maximum is 0 would be divided by zero and turn into NaN/inf,
# which would poison the metadata and the estimators fitted later; dividing
# such a column by 1 instead leaves it unchanged.
col_max = X.max().replace(0, 1)
scaled_X = X.div(col_max, axis='columns')
# Min-max scale labels; y_min and y_max are kept so predictions can be
# mapped back to the log1p scale afterwards
y_min = np.min(Y)
y_max = np.max(Y)
scaled_Y = (Y - y_min)/(y_max - y_min)

### Get autoencoder and metadata results:

In [None]:
# Autoencoder data
# NOTE(review): this frame is later concatenated column-wise with the scaled
# train+test features, so it presumably holds one row per train/test row in
# the same order -- confirm against the autoencoder export.
autodata = pd.read_csv(autodata_path)

In [None]:
# Row-wise metadata: computed once from the scaled features and cached as a
# CSV at meta_path; later runs load the cached file instead of recomputing.
# Single-argument print in parentheses runs unchanged on Python 2 and 3.
if not os.path.exists(meta_path):
    print('Generating row-wise metadata...')
    meta_X = generate_meta(scaled_X)
    # The index is not written, so a reload gets a default 0..n-1 index
    meta_X.to_csv(meta_path, index=False)
    print('Row-wise metadata generated!')
else:
    print('Loading row-wise metadata...')
    meta_X = pd.read_csv(meta_path)
    print('Row-wise metadata loaded!')

## Get Encodings:

In [None]:
# Seed passed to the clustering / decomposition / projection estimators
# and to the KFold splitter below
random_state=0
# n_components for the Gaussian and sparse random projections
num_clusters1 = 24
# n_clusters for Mini-Batch KMeans and n_components for PCA / Truncated SVD
num_clusters2 = 128

In [None]:
# Each encoder is fitted on the scaled train+test features and its transformed
# output is wrapped in a DataFrame with a default 0..n-1 index.
# Single-argument print in parentheses runs unchanged on Python 2 and 3.
print('Running Mini-Batch KMeans...')
mbkm = MiniBatchKMeans(n_clusters=num_clusters2, random_state=random_state)
mbkm_X = pd.DataFrame(mbkm.fit_transform(scaled_X))
print('Running PCA...')
pca = PCA(n_components=num_clusters2, random_state=random_state)
pca_X = pd.DataFrame(pca.fit_transform(scaled_X))
print('Running Truncated SVD...')
tsvd = TruncatedSVD(n_components=num_clusters2, random_state=random_state)
tsvd_X = pd.DataFrame(tsvd.fit_transform(scaled_X))
print('Running Gaussian Random Projection...')
grp = GaussianRandomProjection(n_components=num_clusters1, eps=0.1, random_state=random_state)
grp_X = pd.DataFrame(grp.fit_transform(scaled_X))
print('Running Sparse Random Projection...')
srp = SparseRandomProjection(n_components=num_clusters1, dense_output=True, random_state=random_state)
srp_X = pd.DataFrame(srp.fit_transform(scaled_X))

## Training CatBoost with Cross-Validation:

In [None]:
# Aggregate all data: place the scaled raw features, row-wise metadata,
# autoencoder data and the five encodings side by side (axis=1).
# NOTE(review): concat with axis=1 aligns rows on the index, so every frame
# is presumably expected to carry the same default 0..n-1 index -- confirm
# for the CSV-loaded frames.
X_all = pd.concat([scaled_X, meta_X, autodata, mbkm_X, pca_X, tsvd_X, grp_X, srp_X], axis=1)

In [None]:
submissions = []
# Single-argument print in parentheses runs unchanged on Python 2 and 3.
print('Training Catboost model...')
# Rows of X_all beyond the number of training labels are the test rows (train
# was concatenated first). They are identical for every fold, so extract them
# once, as a plain array like the ones passed to fit().
X_submit = X_all.iloc[scaled_Y.shape[0]:].values
# Folds are contiguous and unshuffled. random_state is deliberately not passed:
# it has no effect without shuffling, and recent scikit-learn releases raise a
# ValueError when it is combined with shuffle=False.
kfold = KFold(n_splits=5, shuffle=False)
for fold_id, (IDX_train, IDX_test) in enumerate(kfold.split(scaled_Y)):
    # Partition training and held-out sets for this fold
    X_train = X_all.iloc[IDX_train].values
    X_test = X_all.iloc[IDX_test].values
    Y_train = scaled_Y[IDX_train]
    Y_test = scaled_Y[IDX_test]

    # Define Catboost model (seeded per fold)
    cb_reg = CatBoostRegressor(iterations=500,
                               learning_rate=0.05,
                               depth=10,
                               eval_metric='RMSE',
                               random_seed=fold_id,
                               bagging_temperature=0.2,
                               od_type='Iter',
                               metric_period=50,  # Rounds to process before calculating objective
                               od_wait=20)
    # The held-out fold is the evaluation set used for best-model selection
    cb_reg.fit(X_train, Y_train, eval_set=(X_test, Y_test), cat_features=[], use_best_model=True, verbose=True)
    target_pred = cb_reg.predict(X_submit)
    # Undo the min-max scaling, then undo log1p, and keep this fold's predictions
    target_pred = target_pred * (y_max - y_min) + y_min
    submissions.append(np.expm1(target_pred))

In [None]:
# Average the per-fold predictions element-wise and pair them with the test IDs
fold_predictions = np.asarray(submissions)
mean_submissions = fold_predictions.mean(axis=0)
result = pd.DataFrame({'ID': test_idx, 'target': mean_submissions})

In [None]:
# Save submissions
result_path = '../submissions/encat_0v1_submit.csv'
# Create the output directory if it is missing, so the write cannot fail on a
# nonexistent folder after the whole training run has already finished.
result_dir = os.path.dirname(result_path)
if not os.path.isdir(result_dir):
    os.makedirs(result_dir)
result.to_csv(result_path, index=False)