# Stage 0 Model: Encoding with CatBoost

In [39]:
# Load libraries
import pandas as pd
import numpy as np

from catboost import CatBoostRegressor

from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

import time
import os

## Helper Functions:

In [36]:
# Row-wise metadata generator
def generate_meta(df):
    """Compute per-row summary statistics over the non-sparse entries of `df`.

    Entries equal to -1 are treated as the sparse marker and are excluded
    from every statistic. NaN entries are not equal to -1, so they are
    included in 'nonsparse_count', but pandas skips them when computing the
    other statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame in which -1 marks a sparse entry.

    Returns
    -------
    pandas.DataFrame
        One row per row of `df` (same index) with the columns
        'nonsparse_count', 'mean', 'std', 'median', 'max', 'min' and 'var'.
    """
    # Build the mask and the masked frame once; the frame can be large and
    # every statistic below reads the same masked view.
    is_dense = df != -1
    dense = df[is_dense]
    meta = pd.DataFrame({
        # Cast to float64 to keep the dtype the count has always had.
        'nonsparse_count': is_dense.sum(axis=1).astype('float64'),
        'mean': dense.mean(axis=1),
        'std': dense.std(axis=1),
        'median': dense.median(axis=1),
        'max': dense.max(axis=1),
        'min': dense.min(axis=1),
        'var': dense.var(axis=1)})
    return meta

## Load Data:

In [4]:
# Time the load so the cost of reading the raw CSVs shows up in the output.
load_start = time.time()
data_path = '../data/'
# Single-argument, parenthesised print behaves identically on Python 2 and 3.
print('Loading original train and test datasets...')
train_df = pd.read_csv(data_path + 'train.csv')
test_df = pd.read_csv(data_path + 'test.csv')
print('Loading completed in %s seconds' % (time.time() - load_start))

Loading original train and test datasets...
Loading completed in 54.9546971321 seconds


In [43]:
# Cached row-wise metadata CSV, stored next to the raw data
meta_path = ''.join([data_path, 'meta_stage0v1.csv'])
# Autoencoder outputs; the train/test file names are keyed by `auto_type`
auto_type = '155'
autodir_path = './autoencoder/data/'
autotrain_path = ''.join([autodir_path, 'train_', auto_type, '.csv'])
autotest_path = ''.join([autodir_path, 'test_', auto_type, '.csv'])

### Some data preprocessing:

In [5]:
# Find constant columns in the training set (standard deviation == 0).
# train_df still contains the 'ID' column here (it is dropped in a later cell).
# numeric_only=True skips non-numeric columns explicitly instead of relying on
# pandas dropping them silently, which newer pandas versions no longer do.
std_df = train_df.std(axis=0, numeric_only=True)
drop_cols = std_df[std_df==0].index

In [8]:
# Log-transform the training labels: log(1 + target)
Y = np.log1p(train_df['target'].values)

In [9]:
# Keep the test-set IDs before the ID column is removed from test_df
ID = test_df['ID'].values

In [10]:
# Remove the non-feature columns in place: ID and target from train, ID from test
train_df.drop(['ID', 'target'], axis=1, inplace=True)
test_df.drop(['ID'], axis=1, inplace=True)

In [24]:
# Create master dataset (train rows followed by test rows) and perform scaling
X = pd.concat([train_df, test_df], axis=0, ignore_index=True)
# Divide every column by its maximum. A column whose maximum is 0 would give
# 0/0 = NaN; dividing by 1 instead keeps its zeros as zeros, so they receive
# the -1 marker below like every other zero.
col_max = X.max().replace(0, 1)
scaled_X = X.div(col_max, axis='columns')
# Re-code exact zeros as -1, the value generate_meta excludes from its statistics
scaled_X[scaled_X==0.0] = -1.0
# Min-max scale the labels
y_min = np.min(Y)
y_max = np.max(Y)
scaled_Y = (Y - y_min)/(y_max - y_min)

### Get autoencoder and metadata results:

In [48]:
# Autoencoder data: read the train and test encodings (first CSV column is the
# index), then stack them train-first under a fresh 0..n-1 index
autotrain, autotest = (pd.read_csv(csv_path, index_col=0)
                       for csv_path in (autotrain_path, autotest_path))
auto_all = pd.concat([autotrain, autotest], ignore_index=True)

In [37]:
# Row-wise metadata, cached on disk: generate and save it on the first run,
# reload the saved CSV on later runs.
if not os.path.exists(meta_path):
    print('Generating row-wise metadata...')
    meta_X = generate_meta(scaled_X)
    meta_X.to_csv(meta_path, index=False)
else:
    print('Loading row-wise metadata...')
    # read_csv is only reachable through the pandas namespace; the bare name
    # raised NameError whenever the cache file already existed.
    meta_X = pd.read_csv(meta_path)

## Get Encodings:

In [40]:
# Shared seed plus the two output sizes used by the encoders
random_state = 0
num_clusters1, num_clusters2 = 24, 128

In [41]:
# Fit each encoder on the combined, scaled train+test matrix and keep both the
# fitted estimator and its transformed output as a DataFrame.
# Single-argument, parenthesised print behaves identically on Python 2 and 3.
print('Running Mini-Batch KMeans...')
mbkm = MiniBatchKMeans(n_clusters=num_clusters2, random_state=random_state)
mbkm_X = pd.DataFrame(mbkm.fit_transform(scaled_X))
print('Running PCA...')
pca = PCA(n_components=num_clusters2, random_state=random_state)
pca_X = pd.DataFrame(pca.fit_transform(scaled_X))
print('Running Truncated SVD...')
tsvd = TruncatedSVD(n_components=num_clusters2, random_state=random_state)
tsvd_X = pd.DataFrame(tsvd.fit_transform(scaled_X))
print('Running Gaussian Random Projection...')
# NOTE(review): per the scikit-learn docs, eps only matters when
# n_components='auto'; it is kept here unchanged -- confirm it can be dropped.
grp = GaussianRandomProjection(n_components=num_clusters1, eps=0.1, random_state=random_state)
grp_X = pd.DataFrame(grp.fit_transform(scaled_X))
print('Running Sparse Random Projection...')
srp = SparseRandomProjection(n_components=num_clusters1, dense_output=True, random_state=random_state)
srp_X = pd.DataFrame(srp.fit_transform(scaled_X))

Running Mini-Batch KMeans...
Running PCA...
Running Truncated SVD...
Running Gaussian Random Projection...
Running Sparse Random Projection...


In [None]:
# 