# Collinearity Reducer HCDR Pipeline

In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Setup

### Imports

In [15]:
# import packages

import os
import time
import warnings
import zipfile
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from pandas.plotting import scatter_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

warnings.filterwarnings('ignore')

### Custom Classes

In [16]:
# transformer reduces the list of columns by a subset
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

# transformer produces a reduced column list by collinearity reduction
class CollinearityReducer(BaseEstimator, TransformerMixin):
    
    '''
    This class reduces features by measuring collinearity between the input variables and target.
    Works on numerical features based on the correlations between each variable pair.
    Of the var1iable pairs with absolute correlations above the threshold value...
    ...the variables with the lowest target variable correlation are dropped from the input X.
    The process is repeated until there are no more colinear pairs with absolute correlations above the threshold.
    ...Or max_iter. 
    
    The transformation returns a subset of feature names... 
    ...to be used with the DataFrameSelector() Class. 

    This class is meant to be run at the end of the numerical pipeline
    PRIOR TO THE ACTUAL PIPELINE - only returns subset for DataFrameSelector().

    NOTE! The function receives a dataframe structured with the target variable in first column.
    '''
    
    def __init__(self, attribute_names, threshold=0.5, max_iter=None):
        self.attribute_names = attribute_names
        self.threshold = threshold
        self.max_iter = max_iter
            
    def fit(self, X, y):
        return self
    
    def transform(self, X, y=None): 
        
        dataframe = pd.concat([y, pd.DataFrame(X)], axis=1)
        
        i = 0
        while i <= self.max_iter:

            # read-in and assign columns
            # gets correlation matrix between variables and pivots to a longer df
            # identify target variable
            # drop same-name and target correlations pairs
              
            df = dataframe
            features = df.iloc[:,1:].columns
            target_name = df.iloc[:,0].name

            df = pd.melt(abs(df.corr()).reset_index(), id_vars='index', value_vars=features)
            targets = df[df['index']==target_name]
            df = df[(df['index'] != df['variable']) & (df['index'] != target_name) & (df['variable'] != target_name)]

            # combine the correlated variables into ordered pairs
            # aggregate the max correlation and sort pairs
            # split out the variables from the pair
            # join the target variable correlations for each variable pair, rename columns

            df['joined'] = df[['index', 'variable']].apply(lambda row: '::'.join(np.sort(row.values.astype(str))), axis=1)

            df = df.groupby('joined', as_index=False) \
                   .agg({'value':'max'}) \
                   .sort_values(by='value', ascending=False)

            df[['var_1','var_2']] = df['joined'].str.split("::",expand=True).astype(int)

            df = df.merge(targets, how='left', left_on='var_1', right_on='variable') \
                   .merge(targets, how='left', left_on='var_2', right_on='variable')
            df.rename(columns = {'value_x':'var_pair_corr', 'value_y':'var_1_target_corr', 'value':'var_2_target_corr'}, inplace = True)

            # This section takes all variable pairs with a correlation greater than threshold
            # tests to determine which variable has a higher correlation with the target.
            # The higher of the two gets marked as a win
            # While the other gets marked as a loss
            # the wins and losses for each variable are then grouped and summed

            exceeds = df[df['var_pair_corr']>self.threshold]

            # break if none above threshold
            if len(exceeds['var_pair_corr'])==0:
                break

            # "correlation competition"
            exceeds['var_1_win'] = exceeds.apply(lambda row: 1 if row["var_1_target_corr"] >= row["var_2_target_corr"] else 0, axis=1)
            exceeds['var_1_loss'] = exceeds.apply(lambda row: 1 if row["var_2_target_corr"] >= row["var_1_target_corr"] else 0, axis=1)
            exceeds['var_2_win'] = exceeds.apply(lambda row: 1 if row["var_1_target_corr"] < row["var_2_target_corr"] else 0, axis=1)
            exceeds['var_2_loss'] = exceeds.apply(lambda row: 1 if row["var_2_target_corr"] < row["var_1_target_corr"] else 0, axis=1)

            # aggregate scores
            var1 = exceeds[['var_1', 'var_1_win', 'var_1_loss']].groupby('var_1', as_index=False) \
                                                                .agg({'var_1_win':'sum', 'var_1_loss':'sum'})
            var1.rename(columns = {'var_1':'var', 'var_1_win':'win', 'var_1_loss':'loss'}, inplace=True)

            var2 = exceeds[['var_2', 'var_2_win', 'var_2_loss']].groupby('var_2', as_index=False) \
                                                                .agg({'var_2_win':'sum', 'var_2_loss':'sum'})
            var2.rename(columns = {'var_2':'var', 'var_2_win':'win', 'var_2_loss':'loss'}, inplace=True)

            corrcomps = pd.concat([var1,var2], axis=0).groupby('var', as_index=False) \
                                                      .agg({'win':'sum', 'loss':'sum'})

            # drop variables which had 0 wins - IE collinear variables which were always least related to the target
            dropvars = corrcomps[corrcomps['win']==0]['var']

            dataframe = dataframe.drop(dropvars, axis=1)  

            i += 1  
        
        X = [self.attribute_names[col] for col in dataframe.columns]

        return X

### Custom Functions

In [17]:
# function identifies missing data
def missing_data(data):
    total = data.isnull().sum().sort_values(ascending = False)
    percent = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending = False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent']) 


# function to identify different feature types and summary EDA
def id_num_cat_feature(df,text = True):
    numerical = df.select_dtypes(include=np.number).columns
    categorical = df.select_dtypes(include=['object', 'bool', 'category']).columns
    feat_num = list(numerical)
    feat_cat = list(categorical)
    
    id_cols = ['SK_ID_CURR','SK_ID_BUREAU']
    
    id_cols = [cols for cols in  list(df.columns.intersection(id_cols))] 
    features = list(set(df.columns) - set(id_cols))

    if text == True:
          # print eda
        print('--------')
        print(f"# of ID's: {len(id_cols)}")
        print(f" ID's:")
        print(id_cols)
        print('')
        print('--------')
        print(f"# All features: {len(features)}")
        print(f"All features:")
        print(features)
        print('')
        print(f"Missing data:")
        print(missing_data(df[features]))
        print('')
        print('--------')
        print(f"# of Numerical features: {len(feat_num)}")
        print(f"Numerical features:")
        print(feat_num)
        print('')
        print(f"Numerical Statistical Summary:")
        print('')
        print(df[feat_num].describe())
        print('')
        print('--------')
        print(f"# of Categorical features: {len(feat_cat)}")
        print(f"Categorical features:")
        print(feat_cat)
        print('')
        print(f"Categorical Statistical Summary:")
        print('')
        #print(df[feat_cat].describe(include='all'))
        print('')
        print("Categories:")
        print('')
        print(df[feat_cat].apply(lambda col: col.unique()))
        print('')
        print('--------')
        
    return id_cols,feat_num,feat_cat,features


# https://pythonsimplified.com/how-to-handle-large-datasets-in-python-with-pandas/

def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**3
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**3
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

## Data Preparation

### Read-In and Merge

In [18]:
# read-in
DATA_DIR =  "/drive/MyDrive/ColabNotebooks/"

ds_names = (
    # ["bureau_ip_ccb_prev_pos_merged"]
    "application_train", "bureau_agg_data_trans_untrans",  # "application_test", 
    "ip_agg_data_tr", "pos_agg_data_tr", "prevapp_agg_data_tr"
    # "ccb_agg_data_tr",
)  

datasets_agg = {}

for ds_name in ds_names:
    print('---')
    print(ds_name)
    datasets_agg[ds_name] = pd.read_csv(os.getcwd() + DATA_DIR + f'{ds_name}.csv')
    datasets_agg[ds_name] = reduce_mem_usage(datasets_agg[ds_name])

---
application_train
Memory usage of dataframe is 0.28 MB
Memory usage after optimization is: 0.06 MB
Decreased by 79.2%
---
bureau_agg_data_trans_untrans
Memory usage of dataframe is 0.72 MB
Memory usage after optimization is: 0.21 MB
Decreased by 70.6%
---
ip_agg_data_tr
Memory usage of dataframe is 0.17 MB
Memory usage after optimization is: 0.06 MB
Decreased by 63.4%
---
pos_agg_data_tr
Memory usage of dataframe is 0.34 MB
Memory usage after optimization is: 0.09 MB
Decreased by 73.3%
---
prevapp_agg_data_tr
Memory usage of dataframe is 1.73 MB
Memory usage after optimization is: 0.46 MB
Decreased by 73.2%


In [19]:
# denormalize and clean text
for ds_name in datasets_agg:
    if ds_name == 'application_train':
        agg_data = datasets_agg['application_train'].replace(to_replace='\s+', value='_', regex=True) \
                                                    .replace(to_replace='\-', value='_', regex=True) \
                                                    .replace(to_replace='\/', value='_', regex=True) \
                                                    .replace(to_replace='\(', value='', regex=True) \
                                                    .replace(to_replace='\)', value='', regex=True) \
                                                    .replace(to_replace='\:', value='', regex=True) \
                                                    .replace(to_replace='\,', value='', regex=True)
    else:
        agg_data = agg_data.merge(datasets_agg[ds_name], on='SK_ID_CURR', how='left')


agg_data = agg_data.loc[:,~agg_data.columns.str.startswith('Unnamed:')]
agg_data = agg_data.loc[:,~agg_data.columns.str.startswith('SK_ID_PREV')]


In [20]:
agg_data.columns

Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY',
       ...
       'PA_O_NAME_GOODS_CATEGORY_Other_median',
       'PA_O_NAME_GOODS_CATEGORY_Other_mean',
       'PA_O_NAME_GOODS_CATEGORY_Other_var',
       'PA_O_NAME_SELLER_INDUSTRY_XNA_median',
       'PA_O_NAME_SELLER_INDUSTRY_XNA_mean',
       'PA_O_NAME_SELLER_INDUSTRY_XNA_var',
       'PA_O_NAME_CASH_LOAN_PURPOSE_Urgent needs_median',
       'PA_O_NAME_CASH_LOAN_PURPOSE_Urgent needs_mean',
       'PA_O_NAME_CASH_LOAN_PURPOSE_Urgent needs_var',
       'PS_O_SK_ID_PREV_count_y'],
      dtype='object', length=1318)

## Pipeline

### Main Pipeline

In [22]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
import time

# create train, validation, and test sets
y = agg_data['TARGET']
X = agg_data.drop(['SK_ID_CURR', 'TARGET'], axis = 1) #drop some features with questionable value


_, X, _, y = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

print(f"X train           shape: {X_train.shape}")
print(f"X validation      shape: {X_valid.shape}")
print(f"X test            shape: {X_test.shape}")

## Pipeline

### Collinear Feature Reduction

# determine feature types, reduce numerical features by collinearity reduction
id_col, feat_num, feat_cat, feature =  id_num_cat_feature(X, text = False)

# cr = make_pipeline(
#     SimpleImputer(strategy='median'),
#     StandardScaler(),    
#     CollinearityReducer(attribute_names=feat_num, threshold = 0.5, max_iter=25)
# )

# tic = time.perf_counter()
# reduced_feat_num = cr.fit_transform(X_train[feat_num], y_train) 
# toc = time.perf_counter()

# print(f"Collinearity Reduction completed in {toc - tic:0.4f} seconds.")
# print(f'Reduced numerical column count from {len(feat_num)}...')
# print(f'...to {len(reduced_feat_num)} by collinearity reduction.')

### Main Pipeline

# Pipeline

num_pipeline = Pipeline([
    # ('selector', DataFrameSelector(reduced_feat_num)),
    ('imputer',SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown="ignore"))
])

data_pipeline = ColumnTransformer(transformers=[
    ("num_pipeline", num_pipeline, feat_num),
    ("cat_pipeline", cat_pipeline, feat_cat)],
    remainder='drop',
    n_jobs=-1
)

full_pipeline_with_predictor = Pipeline([
    ("preparation", data_pipeline),
    ("rf", RandomForestClassifier())
])

max_depth = [5, 10, 25, 50, 100]
min_samples_leaf = [5, 10, 25, 50, 100]

parameters = dict(
    rf__max_depth = max_depth,
    rf__min_samples_leaf = min_samples_leaf
)

grid = GridSearchCV(
    full_pipeline_with_predictor, param_grid= parameters, 
    cv = 3, n_jobs=4, scoring='roc_auc', verbose=2
)

grid.fit(X_train, y_train)

print(grid.best_params_)

# get results of pipeline from validation and test sets for accuracy and AUC-ROC
try:
    expLog
except NameError:
    expLog = pd.DataFrame(columns=["exp_name", 
                                   "Train Acc", 
                                   "Valid Acc",
                                   "Test  Acc",
                                   "Train AUC", 
                                   "Valid AUC",
                                   "Test  AUC"
                                  ])

exp_name = f"0.1RF_agg_trans_{grid.best_params_}"
expLog.loc[len(expLog)] = [f"{exp_name}"] + list(np.round(
               [accuracy_score(y_train, grid.predict(X_train)), 
                accuracy_score(y_valid, grid.predict(X_valid)),
                accuracy_score(y_test, grid.predict(X_test)),
                roc_auc_score(y_train, grid.predict_proba(X_train)[:, 1]),
                roc_auc_score(y_valid, grid.predict_proba(X_valid)[:, 1]),
                roc_auc_score(y_test, grid.predict_proba(X_test)[:, 1])],
    4)) 
expLog


X train           shape: (19680, 1316)
X validation      shape: (4921, 1316)
X test            shape: (6151, 1316)
Fitting 3 folds for each of 25 candidates, totalling 75 fits
{'rf__max_depth': 100, 'rf__min_samples_leaf': 50}


Unnamed: 0,exp_name,Train Acc,Valid Acc,Test Acc,Train AUC,Valid AUC,Test AUC
0,0.1xgb_agg_trans_no-cr_{'xgb__subsample': 0.8},0.9227,0.9193,0.9192,0.859,0.7521,0.7472
1,0.1xgb_agg_trans_cr:0.5-10_{'xgb__subsample': ...,0.9227,0.9195,0.9187,0.8522,0.7612,0.7438
2,0.1xgb_agg_trans_cr:0.5-25_{'xgb__subsample': ...,0.9224,0.9187,0.9185,0.8459,0.757,0.7492
3,"0.1RF_agg_trans_{'rf__max_depth': 100, 'rf__mi...",0.9193,0.9193,0.9192,0.9135,0.7133,0.7119


In [23]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
import time

# create train, validation, and test sets
y = agg_data['TARGET']
X = agg_data.drop(['SK_ID_CURR', 'TARGET'], axis = 1) #drop some features with questionable value


_, X, _, y = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

print(f"X train           shape: {X_train.shape}")
print(f"X validation      shape: {X_valid.shape}")
print(f"X test            shape: {X_test.shape}")

## Pipeline

### Collinear Feature Reduction

# determine feature types, reduce numerical features by collinearity reduction
id_col, feat_num, feat_cat, feature =  id_num_cat_feature(X, text = False)

cr = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),    
    CollinearityReducer(attribute_names=feat_num, threshold = 0.5, max_iter=25)
)

tic = time.perf_counter()
reduced_feat_num = cr.fit_transform(X_train[feat_num], y_train) 
toc = time.perf_counter()

print(f"Collinearity Reduction completed in {toc - tic:0.4f} seconds.")
print(f'Reduced numerical column count from {len(feat_num)}...')
print(f'...to {len(reduced_feat_num)} by collinearity reduction.')

### Main Pipeline

# Pipeline

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(reduced_feat_num)),
    ('imputer',SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown="ignore"))
])

data_pipeline = ColumnTransformer(transformers=[
    ("num_pipeline", num_pipeline, feat_num),
    ("cat_pipeline", cat_pipeline, feat_cat)],
    remainder='drop',
    n_jobs=-1
)

full_pipeline_with_predictor = Pipeline([
    ("preparation", data_pipeline),
    ("rf", RandomForestClassifier())
])

max_depth = [5, 10, 25, 50, 100]
min_samples_leaf = [5, 10, 25, 50, 100]

parameters = dict(
    rf__max_depth = max_depth,
    rf__min_samples_leaf = min_samples_leaf
)

grid = GridSearchCV(
    full_pipeline_with_predictor, param_grid= parameters, 
    cv = 3, n_jobs=4, scoring='roc_auc', verbose=2
)

grid.fit(X_train, y_train)

print(grid.best_params_)

# get results of pipeline from validation and test sets for accuracy and AUC-ROC
try:
    expLog
except NameError:
    expLog = pd.DataFrame(columns=["exp_name", 
                                   "Train Acc", 
                                   "Valid Acc",
                                   "Test  Acc",
                                   "Train AUC", 
                                   "Valid AUC",
                                   "Test  AUC"
                                  ])

exp_name = f"0.1RF_agg_trans_cr:0.5-25_{grid.best_params_}"
expLog.loc[len(expLog)] = [f"{exp_name}"] + list(np.round(
               [accuracy_score(y_train, grid.predict(X_train)), 
                accuracy_score(y_valid, grid.predict(X_valid)),
                accuracy_score(y_test, grid.predict(X_test)),
                roc_auc_score(y_train, grid.predict_proba(X_train)[:, 1]),
                roc_auc_score(y_valid, grid.predict_proba(X_valid)[:, 1]),
                roc_auc_score(y_test, grid.predict_proba(X_test)[:, 1])],
    4)) 
expLog


X train           shape: (19680, 1316)
X validation      shape: (4921, 1316)
X test            shape: (6151, 1316)
Collinearity Reduction completed in 984.5254 seconds.
Reduced numerical column count from 1300...
...to 525 by collinearity reduction.
Fitting 3 folds for each of 25 candidates, totalling 75 fits
{'rf__max_depth': 50, 'rf__min_samples_leaf': 50}


Unnamed: 0,exp_name,Train Acc,Valid Acc,Test Acc,Train AUC,Valid AUC,Test AUC
0,0.1xgb_agg_trans_no-cr_{'xgb__subsample': 0.8},0.9227,0.9193,0.9192,0.859,0.7521,0.7472
1,0.1xgb_agg_trans_cr:0.5-10_{'xgb__subsample': ...,0.9227,0.9195,0.9187,0.8522,0.7612,0.7438
2,0.1xgb_agg_trans_cr:0.5-25_{'xgb__subsample': ...,0.9224,0.9187,0.9185,0.8459,0.757,0.7492
3,"0.1RF_agg_trans_{'rf__max_depth': 100, 'rf__mi...",0.9193,0.9193,0.9192,0.9135,0.7133,0.7119
4,0.1RF_agg_trans_cr:0.5-25_{'rf__max_depth': 50...,0.9193,0.9193,0.9192,0.909,0.7188,0.7045


In [24]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
import time

# create train, validation, and test sets
y = agg_data['TARGET']
X = agg_data.drop(['SK_ID_CURR', 'TARGET'], axis = 1) #drop some features with questionable value


_, X, _, y = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

print(f"X train           shape: {X_train.shape}")
print(f"X validation      shape: {X_valid.shape}")
print(f"X test            shape: {X_test.shape}")

## Pipeline

### Collinear Feature Reduction

# determine feature types, reduce numerical features by collinearity reduction
id_col, feat_num, feat_cat, feature =  id_num_cat_feature(X, text = False)

cr = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),    
    CollinearityReducer(attribute_names=feat_num, threshold = 0.35, max_iter=25)
)

tic = time.perf_counter()
reduced_feat_num = cr.fit_transform(X_train[feat_num], y_train) 
toc = time.perf_counter()

print(f"Collinearity Reduction completed in {toc - tic:0.4f} seconds.")
print(f'Reduced numerical column count from {len(feat_num)}...')
print(f'...to {len(reduced_feat_num)} by collinearity reduction.')

### Main Pipeline

# Pipeline

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(reduced_feat_num)),
    ('imputer',SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown="ignore"))
])

data_pipeline = ColumnTransformer(transformers=[
    ("num_pipeline", num_pipeline, feat_num),
    ("cat_pipeline", cat_pipeline, feat_cat)],
    remainder='drop',
    n_jobs=-1
)

full_pipeline_with_predictor = Pipeline([
    ("preparation", data_pipeline),
    ("rf", RandomForestClassifier())
])

max_depth = [5, 10, 25, 50, 100]
min_samples_leaf = [5, 10, 25, 50, 100]

parameters = dict(
    rf__max_depth = max_depth,
    rf__min_samples_leaf = min_samples_leaf
)

grid = GridSearchCV(
    full_pipeline_with_predictor, param_grid= parameters, 
    cv = 3, n_jobs=4, scoring='roc_auc', verbose=2
)

grid.fit(X_train, y_train)

print(grid.best_params_)

# get results of pipeline from validation and test sets for accuracy and AUC-ROC
try:
    expLog
except NameError:
    expLog = pd.DataFrame(columns=["exp_name", 
                                   "Train Acc", 
                                   "Valid Acc",
                                   "Test  Acc",
                                   "Train AUC", 
                                   "Valid AUC",
                                   "Test  AUC"
                                  ])

exp_name = f"0.1RF_agg_trans_cr:0.35-25_{grid.best_params_}"
expLog.loc[len(expLog)] = [f"{exp_name}"] + list(np.round(
               [accuracy_score(y_train, grid.predict(X_train)), 
                accuracy_score(y_valid, grid.predict(X_valid)),
                accuracy_score(y_test, grid.predict(X_test)),
                roc_auc_score(y_train, grid.predict_proba(X_train)[:, 1]),
                roc_auc_score(y_valid, grid.predict_proba(X_valid)[:, 1]),
                roc_auc_score(y_test, grid.prebdict_proba(X_test)[:, 1])],
    4)) 
expLog


X train           shape: (19680, 1316)
X validation      shape: (4921, 1316)
X test            shape: (6151, 1316)
Collinearity Reduction completed in 1245.9714 seconds.
Reduced numerical column count from 1300...
...to 599 by collinearity reduction.
Fitting 3 folds for each of 25 candidates, totalling 75 fits
{'rf__max_depth': 100, 'rf__min_samples_leaf': 50}


Unnamed: 0,exp_name,Train Acc,Valid Acc,Test Acc,Train AUC,Valid AUC,Test AUC
0,0.1xgb_agg_trans_no-cr_{'xgb__subsample': 0.8},0.9227,0.9193,0.9192,0.859,0.7521,0.7472
1,0.1xgb_agg_trans_cr:0.5-10_{'xgb__subsample': ...,0.9227,0.9195,0.9187,0.8522,0.7612,0.7438
2,0.1xgb_agg_trans_cr:0.5-25_{'xgb__subsample': ...,0.9224,0.9187,0.9185,0.8459,0.757,0.7492
3,"0.1RF_agg_trans_{'rf__max_depth': 100, 'rf__mi...",0.9193,0.9193,0.9192,0.9135,0.7133,0.7119
4,0.1RF_agg_trans_cr:0.5-25_{'rf__max_depth': 50...,0.9193,0.9193,0.9192,0.909,0.7188,0.7045
5,0.1RF_agg_trans_cr:0.35-25_{'rf__max_depth': 1...,0.9193,0.9193,0.9192,0.9061,0.7073,0.7


In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
import time

# create train, validation, and test sets
y = agg_data['TARGET']
X = agg_data.drop(['SK_ID_CURR', 'TARGET'], axis = 1) #drop some features with questionable value


_, X, _, y = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

print(f"X train           shape: {X_train.shape}")
print(f"X validation      shape: {X_valid.shape}")
print(f"X test            shape: {X_test.shape}")

## Pipeline

### Collinear Feature Reduction

# determine feature types, reduce numerical features by collinearity reduction
id_col, feat_num, feat_cat, feature =  id_num_cat_feature(X, text = False)

cr = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),    
    CollinearityReducer(attribute_names=feat_num, threshold = 0.6, max_iter=50)
)

tic = time.perf_counter()
reduced_feat_num = cr.fit_transform(X_train[feat_num], y_train) 
toc = time.perf_counter()

print(f"Collinearity Reduction completed in {toc - tic:0.4f} seconds.")
print(f'Reduced numerical column count from {len(feat_num)}...')
print(f'...to {len(reduced_feat_num)} by collinearity reduction.')

### Main Pipeline

# Pipeline

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(reduced_feat_num)),
    ('imputer',SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown="ignore"))
])

data_pipeline = ColumnTransformer(transformers=[
    ("num_pipeline", num_pipeline, feat_num),
    ("cat_pipeline", cat_pipeline, feat_cat)],
    remainder='drop',
    n_jobs=-1
)

full_pipeline_with_predictor = Pipeline([
    ("preparation", data_pipeline),
    ('xgb', xgb.XGBClassifier(n_jobs=-1))
])

max_depth = [5, 10, 25, 50, 100]
min_samples_leaf = [5, 10, 25, 50, 100]

# https://www.datasnips.com/5/tuning-xgboost-with-grid-search/
parameters = dict(
    xgb__subsample = [0.5, 0.75, 1],
    xgb__colsample_bytree = [0.5, 0.75, 1],
    xgb__max_depth = [5, 15, 30],
    xgb__min_child_weight = [1,5,15],
    xgb__learning_rate = [0.3, 0.1, 0.03],
    xgb__n_estimators = [100]
)

grid = GridSearchCV(
    full_pipeline_with_predictor, param_grid= parameters, 
    cv = 3, n_jobs=4, scoring='roc_auc', verbose=2
)

eval_set = [(X_train, y_train), (X_valid, y_valid)]

grid.fit(X_train, y_train, early_stopping_rounds=10, eval_metric=["error", "logloss"],eval_set=eval_set, verbose=3)


In [None]:
print(grid.best_params_)

# get results of pipeline from validation and test sets for accuracy and AUC-ROC
try:
    expLog
except NameError:
    expLog = pd.DataFrame(columns=["exp_name", 
                                   "Train Acc", 
                                   "Valid Acc",
                                   "Test  Acc",
                                   "Train AUC", 
                                   "Valid AUC",
                                   "Test  AUC"
                                  ])

exp_name = f"0.1xgb_agg_trans_cr:0.6-50_{grid.best_params_}"
expLog.loc[len(expLog)] = [f"{exp_name}"] + list(np.round(
               [accuracy_score(y_train, grid.predict(X_train)), 
                accuracy_score(y_valid, grid.predict(X_valid)),
                accuracy_score(y_test, grid.predict(X_test)),
                roc_auc_score(y_train, grid.predict_proba(X_train)[:, 1]),
                roc_auc_score(y_valid, grid.predict_proba(X_valid)[:, 1]),
                roc_auc_score(y_test, grid.prebdict_proba(X_test)[:, 1])],
    4)) 
expLog


In [None]:
from matplotlib import pyplot

# retrieve performance metrics
results = grid.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)
# plot log loss
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
ax.plot(x_axis, results['validation_1']['logloss'], label='Valid')
ax.legend()
pyplot.ylabel('Log Loss')
pyplot.title('XGBoost Log Loss')
pyplot.grid()
pyplot.show()
# plot classification error
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['error'], label='Train')
ax.plot(x_axis, results['validation_1']['error'], label='Valid')
ax.legend()
pyplot.ylabel('Classification Error')
pyplot.title('XGBoost Classification Error')
pyplot.grid()
pyplot.show()

In [27]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
import time

# create train, validation, and test sets
y = agg_data['TARGET']
X = agg_data.drop(['SK_ID_CURR', 'TARGET'], axis = 1) #drop some features with questionable value


_, X, _, y = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

print(f"X train           shape: {X_train.shape}")
print(f"X validation      shape: {X_valid.shape}")
print(f"X test            shape: {X_test.shape}")

## Pipeline

### Collinear Feature Reduction

# determine feature types, reduce numerical features by collinearity reduction
id_col, feat_num, feat_cat, feature =  id_num_cat_feature(X, text = False)

cr = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),    
    CollinearityReducer(attribute_names=feat_num, threshold = 0.5, max_iter=50)
)

tic = time.perf_counter()
reduced_feat_num = cr.fit_transform(X_train[feat_num], y_train) 
toc = time.perf_counter()

print(f"Collinearity Reduction completed in {toc - tic:0.4f} seconds.")
print(f'Reduced numerical column count from {len(feat_num)}...')
print(f'...to {len(reduced_feat_num)} by collinearity reduction.')

### Main Pipeline

# Pipeline

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(reduced_feat_num)),
    ('imputer',SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown="ignore"))
])

data_pipeline = ColumnTransformer(transformers=[
    ("num_pipeline", num_pipeline, feat_num),
    ("cat_pipeline", cat_pipeline, feat_cat)],
    remainder='drop',
    n_jobs=-1
)

full_pipeline_with_predictor = Pipeline([
    ("preparation", data_pipeline),
    ("rf", RandomForestClassifier())
])

max_depth = [5, 10, 25, 50, 100]
min_samples_leaf = [5, 10, 25, 50, 100]

parameters = dict(
            rf__max_depth = [9, 15, 22, 26, 30],
            rf__max_features = [1, 3, 5],
            rf__min_samples_split= [5, 10, 15],
            rf__min_samples_leaf = [3, 5, 10],
            rf__bootstrap = [False],
            rf__n_estimators = [20, 80, 150, 200, 300]
)

grid = GridSearchCV(
    full_pipeline_with_predictor, param_grid= parameters, 
    cv = 3, n_jobs=4, scoring='roc_auc', verbose=2
)

grid.fit(X_train, y_train)

print(grid.best_params_)

# get results of pipeline from validation and test sets for accuracy and AUC-ROC
try:
    expLog
except NameError:
    expLog = pd.DataFrame(columns=["exp_name", 
                                   "Train Acc", 
                                   "Valid Acc",
                                   "Test  Acc",
                                   "Train AUC", 
                                   "Valid AUC",
                                   "Test  AUC"
                                  ])

exp_name = f"0.1RF_agg_trans_cr:0.5-50_{grid.best_params_}"
expLog.loc[len(expLog)] = [f"{exp_name}"] + list(np.round(
               [accuracy_score(y_train, grid.predict(X_train)), 
                accuracy_score(y_valid, grid.predict(X_valid)),
                accuracy_score(y_test, grid.predict(X_test)),
                roc_auc_score(y_train, grid.predict_proba(X_train)[:, 1]),
                roc_auc_score(y_valid, grid.predict_proba(X_valid)[:, 1]),
                roc_auc_score(y_test, grid.predict_proba(X_test)[:, 1])],
    4)) 
expLog





X train           shape: (19680, 1316)
X validation      shape: (4921, 1316)
X test            shape: (6151, 1316)
Collinearity Reduction completed in 1473.1862 seconds.
Reduced numerical column count from 1300...
...to 481 by collinearity reduction.
Fitting 3 folds for each of 675 candidates, totalling 2025 fits
{'rf__bootstrap': False, 'rf__max_depth': 26, 'rf__max_features': 5, 'rf__min_samples_leaf': 10, 'rf__min_samples_split': 10, 'rf__n_estimators': 300}


Unnamed: 0,exp_name,Train Acc,Valid Acc,Test Acc,Train AUC,Valid AUC,Test AUC
0,0.1xgb_agg_trans_no-cr_{'xgb__subsample': 0.8},0.9227,0.9193,0.9192,0.859,0.7521,0.7472
1,0.1xgb_agg_trans_cr:0.5-10_{'xgb__subsample': ...,0.9227,0.9195,0.9187,0.8522,0.7612,0.7438
2,0.1xgb_agg_trans_cr:0.5-25_{'xgb__subsample': ...,0.9224,0.9187,0.9185,0.8459,0.757,0.7492
3,"0.1RF_agg_trans_{'rf__max_depth': 100, 'rf__mi...",0.9193,0.9193,0.9192,0.9135,0.7133,0.7119
4,0.1RF_agg_trans_cr:0.5-25_{'rf__max_depth': 50...,0.9193,0.9193,0.9192,0.909,0.7188,0.7045
5,0.1RF_agg_trans_cr:0.35-25_{'rf__max_depth': 1...,0.9193,0.9193,0.9192,0.9061,0.7073,0.7
6,0.1RF_agg_trans_cr:0.5-50_{'rf__bootstrap': Fa...,0.9193,0.9193,0.9192,0.9953,0.6873,0.6828
