# XGBoost with Covariate Shift Correction
Extension of Model 0v2 that integrates covariate shift correction via the Kullback-Leibler Importance Estimation Procedure (KLIEP).

In [1]:
# Load libraries
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import xgboost as xgb

import pickle as pkl
import pdb
import os
import h5py

from scipy.stats import skew, kurtosis

from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin  # For making custom classes
from sklearn.externals.joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.model_selection import KFold

from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import FeatureUnion

  from ._conv import register_converters as _register_converters


In [2]:
# High-level parameters
debug = True         # Passed to get_input: True selects the debug datasets
random_state = 17    # Seed read by the KFold splitting inside ClassifierTransformer
# KLIEP weight parameters
num_kernels = 1000   # Kernel count, embedded in the weight file name
gw_val = 130         # Width value, embedded in the weight file name

## Helper Functions and Classes

In [3]:
def load_h5py(fname):
    '''
    Function for loading h5py file
    Opens **fname** read-only and returns everything stored under its 'data' key
    '''
    with h5py.File(fname, 'r') as h5_file:
        contents = h5_file['data'][:]
    return contents
def load_pickle(fname):
    '''
    Function for loading pickle file
    Opens **fname** in binary mode and returns the unpickled object
    NOTE: unpickling can execute arbitrary code, so only use this on trusted files
    '''
    with open(fname, 'rb') as stream:
        payload = pkl.load(stream)
    return payload
def save_pickle(fname, data):
    '''
    Function for saving pickle file
    Writes **data** to **fname** (binary mode) using the highest pickle protocol
    Returns None
    '''
    with open(fname, 'wb') as stream:
        pkl.dump(data, stream, pkl.HIGHEST_PROTOCOL)

In [4]:
class UniqueTransformer(BaseEstimator, TransformerMixin):
    '''
    Class with fit and transform methods for removing duplicate columns from a dataset
    **fit** finds the indexes of unique columns using numpy unique
    **transform** returns the dataset with the indexes of unique columns
    NOTE: transform always indexes along columns, so only the default axis=1 is
    consistent with it. The kept columns come back in the order of the indexes
    produced by numpy unique, which is not necessarily their original order.
    '''
    def __init__(self, axis=1):
        self.axis = axis

    def fit(self, X, y=None):
        # print(...) with a single argument prints the same text on Python 2 and 3
        print('Finding unique indexes...')
        # Only the first-occurrence indexes are needed; the unique values are discarded
        _, self.unique_indexes_ = np.unique(X, axis=self.axis, return_index=True)
        return self

    def transform(self, X, y=None):
        print('Filtering for only unique columns...')
        return X[:, self.unique_indexes_]

In [5]:
class ClassifierTransformer(BaseEstimator, TransformerMixin):
    '''
    Class describing an object that transforms datasets via estimator results
    **_get_labels** specifies target value bins and transforms target vector into bin values
    **fit** trains one clone of the estimator on the training part of each KFold split
    **transform** returns, per row, the class probabilities followed by the predicted label

    NOTE(review): fit builds its folds with shuffle=True while transform assigns rows to
    estimators using contiguous (unshuffled) folds, so on the training data an estimator
    may predict rows it was trained on - confirm this is intended.
    '''
    def __init__(self, estimator=None, n_classes=2, cv=3):
        self.estimator = estimator
        self.n_classes = n_classes
        self.cv = cv

    def _get_labels(self, y):
        '''
        Map each target value to a bin index in [0, n_classes)
        Bin edges are taken every **step** entries of the sorted unique target values
        '''
        y_labels = np.zeros(len(y))
        y_us = np.unique(y)  # np.unique already returns sorted values
        step = len(y_us) // self.n_classes
        for i_class in range(self.n_classes):
            lower = y_us[i_class * step]
            if i_class + 1 == self.n_classes:
                # Last bin is open-ended so the largest target values are not left unlabeled
                y_labels[y >= lower] = i_class
            else:
                upper = y_us[(i_class + 1) * step]
                y_labels[np.logical_and(y >= lower, y < upper)] = i_class
        return y_labels

    def fit(self, X, y):
        print('Fitting random forest classifier with n_classes = %s' % self.n_classes)
        y_labels = self._get_labels(y)
        # Seed comes from the module-level random_state
        kf = KFold(n_splits=self.cv, shuffle=True, random_state=random_state)
        self.estimators_ = []
        # Train individual classifiers
        for train, _ in kf.split(X, y_labels):
            self.estimators_.append(clone(self.estimator).fit(X[train], y_labels[train]))
        return self

    def transform(self, X, y=None):
        print('Applying classifier transformation with n_classes = %s' % self.n_classes)
        # No random_state here: it has no effect without shuffling, and recent
        # scikit-learn versions raise a ValueError when it is combined with shuffle=False
        kf = KFold(n_splits=self.cv, shuffle=False)

        X_prob = np.zeros((X.shape[0], self.n_classes))
        X_pred = np.zeros(X.shape[0])

        # Estimator i scores the i-th contiguous chunk of rows
        for estimator, (_, test) in zip(self.estimators_, kf.split(X)):
            X_prob[test] = estimator.predict_proba(X[test])
            X_pred[test] = estimator.predict(X[test])
        # Append the predicted label as one extra column after the probabilities
        return np.hstack([X_prob, X_pred[:, np.newaxis]])

In [6]:
def apply_stats_to_row(row):
    '''
    Function for transforming a row into statistical values
    Calls every entry of the module-level **stat_functions** list on **row**
    and returns the results as a list in the same order
    '''
    return [stat_fun(row) for stat_fun in stat_functions]

In [7]:
class StatsTransformer(BaseEstimator, TransformerMixin):
    '''
    Class describing an object for transforming datasets into statistical values row-wise
    NOTE: This class is dependent on the function **apply_stats_to_row**
    **fit** does nothing and returns the object unchanged
    **transform** applies **apply_stats_to_row** to every row through a joblib Parallel
    runner configured with n_jobs, pre_dispatch and verbose
    '''
    def __init__(self, verbose=0, n_jobs=-1, pre_dispatch='2*n_jobs'):
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.pre_dispatch = pre_dispatch

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # print(...) with a single argument prints the same text on Python 2 and 3
        print('Applying statistical transformation to dataset...')
        parallel = Parallel(n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch, verbose=self.verbose)
        # Get statistics transformation (one list of statistics per row)
        stats_list = parallel(delayed(apply_stats_to_row)(X[i_smpl, :]) for i_smpl in range(len(X)))
        return np.array(stats_list)

In [8]:
class _StatFunAdaptor:
    '''
    Class describing an object that wraps pre-processing functions with a main statistical function
    **__init__** sets up the object parameters
    **__call__** describes routine steps when object is called
    '''
    def __init__(self, stat_fun, *funs, **stat_fun_kwargs):
        self.stat_fun = stat_fun
        self.funs = funs
        self.stat_fun_kwargs = stat_fun_kwargs

    def __call__(self, x):
        x = x[x != 0]  # Only look at nonzero entries
        # Transform row with cached functions
        for fun in self.funs:
            x = fun(x)
        if x.size == 0:
            return -99999  # Edge case default
        return self.stat_fun(x, **self.stat_fun_kwargs)  # Returns result of a run

In [9]:
def diff2(x):
    '''
    Second-order discrete difference of **x** (the difference of the differences)
    '''
    first_order = np.diff(x)
    return np.diff(first_order)

In [10]:
def get_stat_funs():
    '''
    Function for defining all the statistical functions used for evaluating elements in a row-wise manner
    Functions include: length, minimum, maximum, median, standard deviation, skew, kurtosis,
    and the 5th to 95th percentiles in steps of 5
    Every statistic is wrapped six times: on the raw row, its 1-diff, its 2-diff, its unique
    values, and the 1-diff and 2-diff of its unique values
    Returns a list of **_StatFunAdaptor** objects
    '''
    plain_stats = [len, np.min, np.max, np.median, np.std, skew, kurtosis]
    # np.percentile expects q on a 0-100 scale; 0.05..0.95 would all sit next to the minimum
    percentiles = np.linspace(5, 95, 19)

    stats = plain_stats + len(percentiles) * [np.percentile]
    # Dictionary arguments (nontrivial only for percentile function)
    stats_kwargs = [{} for _ in plain_stats] + [{'q': q} for q in percentiles]

    # Pre-processing chains applied to the row before each statistic, in output order
    pre_transforms = [
        (),                    # Apply to the row itself
        (np.diff,),            # Apply to 1-diff of row
        (diff2,),              # Apply to 2-diff of row
        (np.unique,),          # Apply to unique vals of row
        (np.unique, np.diff),  # Apply to unique, 1-diff row vals
        (np.unique, diff2),    # Apply to unique, 2-diff row vals
    ]

    stat_funs = []
    for stat, stat_kwargs in zip(stats, stats_kwargs):
        for funs in pre_transforms:
            stat_funs.append(_StatFunAdaptor(stat, *funs, **stat_kwargs))
    return stat_funs

In [11]:
def get_rfc():
    '''
    Function for retrieving a Random Forest Classifier object
    Each call returns a new, unfitted classifier with the fixed settings below
    '''
    rfc_settings = {
        'n_estimators': 100,
        'max_features': 0.5,
        'max_depth': None,
        'max_leaf_nodes': 270,
        'min_impurity_decrease': 0.0001,
        'random_state': 123,
        'n_jobs': -1,
    }
    return RandomForestClassifier(**rfc_settings)

In [15]:
def get_input(debug=False):
    '''
    Function for loading either debug or full datasets
    **debug** selects the 'debug_*' files when True, otherwise the 'full_*' files
    Returns (train, y_train_log, test, id_test), where y_train_log is log1p of the last
    column of the loaded training array and train is that array without its last column
    '''
    # print(...) with a single argument prints the same text on Python 2 and 3,
    # and matches the call form already used for the shape messages below
    if debug:
        print('Loading debug train and test datasets...')
        prefix = 'debug'
    else:
        print('Loading original train and test datasets...')
        prefix = 'full'
    data_dir = '../data/compressed/'
    train = load_h5py('%s%s_train.h5' % (data_dir, prefix))
    test = load_h5py('%s%s_test.h5' % (data_dir, prefix))
    id_test = load_pickle('%s%s_test_id.pickle' % (data_dir, prefix))
    # Isolate target variable (stored as the last column of the training array)
    y_train_log = np.log1p(train[:, -1])
    # Drop the target column so only features remain
    train = np.delete(train, -1, axis=1)
    # Find shape of loaded datasets
    print('Shape of training dataset: {} Rows, {} Columns'.format(*train.shape))
    print('Shape of test dataset: {} Rows, {} Columns'.format(*test.shape))

    return train, y_train_log, test, id_test

## Main Script

In [16]:
# Get data
X_train, y_train_log, X_test, id_test = get_input(debug)

# Remove constant columns (fit on train only, then apply the same mask to test)
variance_checker = VarianceThreshold(threshold=0.0)
xtrain_nonconst = variance_checker.fit_transform(X_train)
xtest_nonconst = variance_checker.transform(X_test)

# Remove duplicate columns
# Work on the variance-filtered arrays: fitting on the raw X_train would overwrite
# xtrain/xtest and silently discard the constant-column removal above
unique_transformer = UniqueTransformer()
unique_transformer.fit(xtrain_nonconst)
xtrain = unique_transformer.transform(xtrain_nonconst)
xtest = unique_transformer.transform(xtest_nonconst)

Loading debug train and test datasets...
Shape of training dataset: 100 Rows, 4991 Columns
Shape of test dataset: 200 Rows, 4991 Columns
Finding unique indexes...
Filtering for only unique columns...
Filtering for only unique columns...


In [None]:
# Display the log1p-transformed training target returned by get_input
y_train_log

In [None]:
# Define stat functions
# NOTE: apply_stats_to_row reads this module-level list, so it has to exist
# before the StatsTransformer step is used to transform any data
stat_functions = get_stat_funs()

In [None]:
# Define feature union
# One ClassifierTransformer per target-bin count (2 to 5), each with its own random forest
classifier_steps = [
    ('ct-%d' % n_bins, ClassifierTransformer(get_rfc(), n_classes=n_bins, cv=5))
    for n_bins in (2, 3, 4, 5)
]
data_union = FeatureUnion(
    [('pca', PCA(n_components=100))]
    + classifier_steps
    + [('st', StatsTransformer(verbose=2))]
)

In [None]:
# Transform data
# The union is fitted on the training features only and then applied to both sets
data_union.fit(X=xtrain, y=y_train_log)
# print(...) with a single argument prints the same text on Python 2 and 3
print('\nCreating processed training set...\n')
train_data = data_union.transform(xtrain)
print('\nCreating processed test set...\n')
test_data = data_union.transform(xtest)

In [None]:
# Scale data
# Train and test rows are stacked, standardised with a single scaler fitted on
# the stacked array, then split back at the training row count
n_train_rows = len(X_train)
scaler = StandardScaler()
xdata = np.vstack([train_data, test_data])
xdata_scaled = scaler.fit_transform(xdata)
train_scaled = xdata_scaled[:n_train_rows]
test_scaled = xdata_scaled[n_train_rows:]

In [None]:
# Location of the KLIEP importance weights
# (this cell only builds the directory and file name; nothing is read from disk here)
weights_path = './covariate_shift/cs_weights_v1/'
# File name embeds the width value (gw_val) and the number of kernels (num_kernels)
weights_temp = '0_width%s_numk%s.pickle'%(gw_val, num_kernels)

In [None]:
# XGBoost regressor parameters
xgb_params = {
    # Task and ensemble size
    'n_estimators': 1000,
    'objective': 'reg:linear',
    'booster': 'gbtree',
    'learning_rate': 0.02,
    # Tree shape and split constraints
    'max_depth': 22,
    'min_child_weight': 57,
    'gamma': 1.45,
    # Regularisation terms (both set to zero)
    'alpha': 0.0,
    'lambda': 0.0,
    # Row and column subsampling
    'subsample': 0.67,
    'colsample_bytree': 0.054,
    'colsample_bylevel': 0.50,
    # Execution settings
    'n_jobs': -1,
    'random_state': 456,
}
# Fitting XGB Regressor parameters
fit_params = {
    'early_stopping_rounds': 15,
    'eval_metric': 'rmse',
    'verbose': False,
}

In [None]:
# Train XGBoost Regressor
