# Bureau and Bureau_Balance EDA

This notebook explores and transforms the data from `bureau.csv` and `bureau_balance.csv`, preparing it for integration with `application_train|test.csv`.

## Setup

### Packages and Data

In [1]:
# import packages

import os
import time
import warnings
import zipfile
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from pandas.plotting import scatter_matrix

warnings.filterwarnings('ignore')

In [2]:
# load data

DATA_DIR =  "/../Data/"

ds_names = ("application_train", "bureau", "bureau_balance")

datasets = {}

for ds_name in ds_names:
    datasets[ds_name] = pd.read_csv(os.getcwd() + DATA_DIR + f'{ds_name}.csv')

### Functions and Classes

The FeatureSummarizer class allows for the groping of feature variables on an id column, and then aggregating them into their statistical summaries for each grouping. 

In [3]:
# Class to summarize the features specified into min, max, mean, count, sum, median, and var
class FeatureSummarizer(BaseEstimator, TransformerMixin):
    
    def __init__(self, features=None): # no *args or **kargs
        self.features = features
        self.agg_ops = ["min", "max", "count", "sum", "median", "mean", "var"]

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        keys = list(set(X.columns) - set(self.features))
        
        result = X.groupby(keys, as_index=False) \
                  .agg({ft:self.agg_ops for ft in self.features}) 
        result.columns = result.columns.map(lambda ct: '_'.join([x for x in ct if x != '']))
        
        return result 
    

def drop_null_columns(df, threshold=0.7):

    # Dropping columns with missing value rate higher than threshold
    df = df[df.columns[df.isnull().mean() < threshold]]
    
    return df


# function to run the FeatureSummarizer aggregation to prepare data for rollup
def runFeatureSummarizer(df, features):
    
    print(f"df.shape: {df.shape}\n")
    print(f"Aggregated Features:\ndf[{features}][0:5]: \n{df[features][0:5]}")
    pipeline = make_pipeline(FeatureSummarizer(features))
    return(pipeline.fit_transform(df))



In [4]:
# function to display amount of missing data from dataframe columns
def missing_data(data):
    
    total = data.isnull().sum().sort_values(ascending = False)
    percent = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending = False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])


# Data Overview EDA function
def id_num_cat_feature(df):
    
    # identify ID and feature columns
    id_cols = ['SK_ID_CURR','SK_ID_BUREAU']
    id_cols = [cols for cols in list(df.columns.intersection(id_cols))] 
    features = list(set(df.columns) - set(id_cols))
    
    # get the feature types
    numerical = df[features].select_dtypes(include=['int64', 'float64']).columns
    categorical = df[features].select_dtypes(include=['object', 'bool']).columns
    feat_num = list(numerical)
    feat_cat = list(categorical)
    
    # print eda
    print('--------')
    print(f"# of ID's: {len(id_cols)}")
    print(f" ID's:")
    print(id_cols)
    print('')
    print('--------')
    print(f"# All features: {len(features)}")
    print(f"All features:")
    print(features)
    print('')
    print(f"Missing data:")
    print(missing_data(df[features]))
    print('')
    print('--------')
    print(f"# of Numerical features: {len(feat_num)}")
    print(f"Numerical features:")
    print(feat_num)
    print('')
    print(f"Numerical Statistical Summary:")
    print('')
    print(df[feat_num].describe())
    print('')
    print('--------')
    print(f"# of Categorical features: {len(feat_cat)}")
    print(f"Categorical features:")
    print(feat_cat)
    print('')
    print(f"Categorical Statistical Summary:")
    print('')
    print(df[feat_cat].describe(include='all'))
    print('')
    print("Categories:")
    print('')
    print(df[feat_cat].apply(lambda col: col.unique()))
    print('')
    print('--------')
    
    return id_cols, feat_num, feat_cat, features



## Bureau_Balance EDA and Transformation

The only functional difference between the `bureau` and `bureau_bal` EDA summaries and transformations is the features selected post aggregation. This is handled in the function below by this snippet of code:

```python
 # drop unnecessary features based on table...
    if feat_method == 0:
        # bureau_balance
        feature_selection = [
            df[id_cols],
            df[[column for column in df.columns if column.startswith('MONTHS') and column.endswith('count')]],
            df[[column for column in df.columns if column.startswith('STATUS') and column.endswith(('mean', 'median', 'var'))]]
        ]
    elif feat_method == 1:
        # bureau
        feature_selection = [
            df[[column for column in df.columns if not column.startswith(tuple(feat_cat)) and not column.endswith('count')]],
            df[[column for column in df.columns if column.startswith('DAYS_CREDIT') and column.endswith('count')]],
            df[[column for column in df.columns if column.startswith(tuple(feat_cat)) and column.endswith(('mean', 'median', 'var'))]]
        ]
    else: 
        print('ERROR: Invalid `feat_method`. 0 for bureau_bal. 1 for bureau.')
```

The logics for each are explained below: 

### bureau_bal

+ gets the ID columns
+ then only select in count months on the account - since this field just increments by 1, this is the rowcount proxy field
+ then select in the mean, variance, and  median status - because these are OHE binary variables: min/max are meaningless, sum/count are handled by mean and the row count IE months count

### bureau

+ gets all the non-categorical columns (including ID) except counts
+ selects DAYS_CREDIT (a field with *all* the records counted - not empty...) to be the rowcount proxy field
+ then select in the mean, variance, and median categorical variables - because these are OHE binary variables: min/max are meaningless, sum/count are handled by mean and the rowcount proxy IE DAYS_CREDIT count

The function below handles EDA and ETL for both the tables. 

In [5]:
def Bureau_EDA_ETL(df, feat_method):
    
    # function to perform EDA and ETL of bureau or bureau_balance data, as indicated by the feat_method arg
    # feat_method = 0 :: bureau_balance.csv
    # feat_method = 1 :: bureau.csv
    
    df = drop_null_columns(df)
    
    # EDA overview summary
    id_cols, feat_num, feat_cat, features =  id_num_cat_feature(df)
    
    # One-Hot-Encode categorical variables
    df = pd.get_dummies(data=df, columns=feat_cat)
    
    features = list(set(df.columns) - set(id_cols))
    feat_ohe = list(set(features) - set(feat_num))

    print(f"# of OHE Categorical features: {len(feat_ohe)}")
    print(f"OHE Categorical features: ")
    print(feat_ohe)
    print('')
    print('--------')

    # Bureau Balance Transformation

    # aggregate and summarize bureau_bal features
    df = runFeatureSummarizer(df, features)

    # drop unnecessary features based on table...
    if feat_method == 0:
        # bureau_balance
        feature_selection = [
            df[id_cols],
            df[[column for column in df.columns if column.startswith('MONTHS') and column.endswith('count')]],
            df[[column for column in df.columns if column.startswith('STATUS') and column.endswith('mean')]]
        ]
    elif feat_method == 1:
        # bureau
        feature_selection = [
            df[[column for column in df.columns if not column.startswith(tuple(feat_cat)) and not column.endswith('count')]],
            df[[column for column in df.columns if column.startswith('DAYS_CREDIT') and column.endswith('count')]],
            df[[column for column in df.columns if column.startswith(tuple(feat_cat)) and column.endswith('mean')]]
        ]
    else: 
        print('ERROR: Invalid `feat_method`. 0 for bureau_bal. 1 for bureau.')
        
    # recombine selected features
    df = pd.concat(feature_selection, axis=1)
    features = list(set(df.columns) - set(id_cols))

    # report aggregated features
    print('')
    print('--------')
    print('Aggregated Feature Statistical Summary:')
    print('')
    print(df[features].describe().T)
    print('')
    print('--------')
    print('END')
    print('--------')
    print('')
    
    return df



With the function for the `bureau` and `bureau_bal` defined, we can explore and transform the datasets below:

In [6]:
### bureau_bal ###

print("bureau_bal :: EDA and transformation")
print('')

bureau_bal = datasets['bureau_balance']

bureau_bal = Bureau_EDA_ETL(bureau_bal, 0)

### bureau ###

print("bureau w/ bureau_bal rollup :: EDA and Transformation")
print('')

bureau = datasets['bureau']

# rollup bureau_bal
# gets rid of the unwanted characters in categorical columns entries - makes for nicer OHE column names later...
bureau = bureau.merge(bureau_bal, on='SK_ID_BUREAU', how='left') \
               .replace(to_replace='\s+', value='_', regex=True) \
               .replace(to_replace='\-', value='_', regex=True) \
               .replace(to_replace='\/', value='_', regex=True) \
               .replace(to_replace='\(', value='', regex=True) \
               .replace(to_replace='\)', value='', regex=True) \
               .replace(to_replace='\:', value='', regex=True) \
               .drop('SK_ID_BUREAU', axis=1)

bureau = Bureau_EDA_ETL(bureau, 1)

bureau.to_csv('agg_bureau+bureau_bal.csv')
# bureau.describe().T.to_csv('out.csv')

bureau_bal :: EDA and transformation

--------
# of ID's: 1
 ID's:
['SK_ID_BUREAU']

--------
# All features: 2
All features:
['MONTHS_BALANCE', 'STATUS']

Missing data:
                Total  Percent
MONTHS_BALANCE      0      0.0
STATUS              0      0.0

--------
# of Numerical features: 1
Numerical features:
['MONTHS_BALANCE']

Numerical Statistical Summary:

       MONTHS_BALANCE
count    2.729992e+07
mean    -3.074169e+01
std      2.386451e+01
min     -9.600000e+01
25%     -4.600000e+01
50%     -2.500000e+01
75%     -1.100000e+01
max      0.000000e+00

--------
# of Categorical features: 1
Categorical features:
['STATUS']

Categorical Statistical Summary:

          STATUS
count   27299925
unique         8
top            C
freq    13646993

Categories:

  STATUS
0      C
1      0
2      X
3      1
4      2
5      3
6      5
7      4

--------
# of OHE Categorical features: 8
OHE Categorical features: 
['STATUS_5', 'STATUS_C', 'STATUS_3', 'STATUS_4', 'STATUS_2', 'STATUS_1', 

## Combine into Application_train and Prepare for ML

In [7]:
# prepare training and test dataset
appTrain = datasets['application_train']

y = appTrain['TARGET']
X = appTrain.merge(bureau, how='left', on='SK_ID_CURR') \
            .replace(to_replace='\s+', value='_', regex=True) \
            .replace(to_replace='\-', value='_', regex=True) \
            .replace(to_replace='\/', value='_', regex=True) \
            .replace(to_replace='\(', value='', regex=True) \
            .replace(to_replace='\)', value='', regex=True) \
            .replace(to_replace='\:', value='', regex=True) \
            .replace(to_replace='\,', value='', regex=True) \
            .drop(['SK_ID_CURR', 'TARGET'], axis = 1) #drop some features with questionable value

X = drop_null_columns(X)

X_id_cols, X_feat_num, X_feat_cat, X_features = id_num_cat_feature(X)

# One-Hot-Encode categorical variables
X = pd.get_dummies(data=X, columns=X_feat_cat)

X_features = list(set(X.columns) - set(X_id_cols))
X_feat_ohe = list(set(X_features) - set(X_feat_num))

print(f"# of OHE Categorical features: {len(X_feat_ohe)}")
print(f"OHE Categorical features: ")
print(X_feat_ohe)
print('')
print('--------')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.15, random_state=42, stratify=y_train)

print(f"X train           shape: {X_train.shape}")
print(f"X validation      shape: {X_valid.shape}")
print(f"X test            shape: {X_test.shape}")

--------
# of ID's: 0
 ID's:
[]

--------
# All features: 221
All features:
['CREDIT_CURRENCY_currency_3_mean', 'NAME_CONTRACT_TYPE', 'REGION_RATING_CLIENT', 'DAYS_CREDIT_ENDDATE_sum', 'DAYS_CREDIT_ENDDATE_mean', 'CREDIT_CURRENCY_currency_2_mean', 'DAYS_CREDIT_UPDATE_mean', 'AMT_CREDIT_SUM_OVERDUE_min', 'OBS_30_CNT_SOCIAL_CIRCLE', 'FLAG_DOCUMENT_20', 'CREDIT_TYPE_Credit_card_mean', 'CNT_CREDIT_PROLONG_min', 'DAYS_LAST_PHONE_CHANGE', 'OWN_CAR_AGE', 'NONLIVINGAREA_MODE', 'FLOORSMAX_AVG', 'ORGANIZATION_TYPE', 'AMT_CREDIT_SUM_LIMIT_min', 'FLAG_DOCUMENT_8', 'YEARS_BEGINEXPLUATATION_MODE', 'REG_CITY_NOT_LIVE_CITY', 'DAYS_ENDDATE_FACT_median', 'CREDIT_DAY_OVERDUE_sum', 'AMT_CREDIT_SUM_LIMIT_sum', 'CREDIT_TYPE_Loan_for_business_development_mean', 'DAYS_CREDIT_UPDATE_var', 'YEARS_BUILD_MEDI', 'FLAG_EMP_PHONE', 'CREDIT_TYPE_Microloan_mean', 'FLAG_DOCUMENT_10', 'CREDIT_TYPE_Unknown_type_of_loan_mean', 'APARTMENTS_AVG', 'AMT_CREDIT_SUM_DEBT_var', 'FLAG_DOCUMENT_14', 'WEEKDAY_APPR_PROCESS_START', '

In [13]:
class ColinearityReducer(BaseEstimator, TransformerMixin):
    
    '''
    This class transforms numerical features based on the correlations between each variable pair and the Target.
    Of the var1iable pairs with absolute correlations above the threshold value...
    ...the variables with the lowest target variable correlation are dropped from the input X.
    The process is repeated until there are no more colinear pairs with absolute correlations above the threshold.
    
    NOTE! The function receives a dataframe structured with the target variable in first column.
    '''
    
    def __init__(self, threshold=0.5):
        self.threshold = threshold
        
        
    def fit(self, X, y):
        return self
    
    
    def transform(self, X, y=None): 
        
        dataframe = pd.concat([y,X],axis=1)
        
        while True:

            # read-in and assign columns
            # gets correlation matrix between variables and pivots to a longer df
            # identify target variable
            # drop same-name and target correlations 
            
            df = dataframe
            features = df.iloc[:,1:].columns
            target_name = df.iloc[:,0].name

            df = pd.melt(abs(df.corr()).reset_index(), id_vars='index', value_vars=features)
            targets = df[df['index']==target_name]
            df = df[(df['index'] != df['variable']) & (df['index'] != target_name) & (df['variable'] != target_name)]

            # combine the correlated variables into ordered string
            # aggregate the max correlation and sort pairs
            # split out the variables from the original string
            # join the target variable correlations for each variable pair, rename columns

            df['joined'] = df[['index', 'variable']].apply(lambda row: '::'.join(np.sort(row.values.astype(str))), axis=1)

            df = df.groupby('joined', as_index=False) \
                   .agg({'value':'max'}) \
                   .sort_values(by='value', ascending=False)

            df[['var_1','var_2']] = df['joined'].str.split("::",expand=True)

            df = df.merge(targets, how='left', left_on='var_1', right_on='variable') \
                   .merge(targets, how='left', left_on='var_2', right_on='variable')
            df.rename(columns = {'value_x':'var_pair_corr', 'value_y':'var_1_target_corr', 'value':'var_2_target_corr'}, inplace = True)

            # This section takes all variable pairs with a correlation greater than threshold
            # test to determine which has a higher correlation with the target.
            # The higher of the two gets marked as a win
            # While the other gets marked as a loss
            # the wins and losses for each variable are then grouped and summed

            exceeds = df[df['var_pair_corr']>self.threshold]

            # break if none above threshold
            if len(exceeds['var_pair_corr'])==0:
                break

            exceeds['var_1_win'] = exceeds.apply(lambda row: 1 if row["var_1_target_corr"] >= row["var_2_target_corr"] else 0, axis=1)
            exceeds['var_1_loss'] = exceeds.apply(lambda row: 1 if row["var_2_target_corr"] >= row["var_1_target_corr"] else 0, axis=1)
            exceeds['var_2_win'] = exceeds.apply(lambda row: 1 if row["var_1_target_corr"] < row["var_2_target_corr"] else 0, axis=1)
            exceeds['var_2_loss'] = exceeds.apply(lambda row: 1 if row["var_2_target_corr"] < row["var_1_target_corr"] else 0, axis=1)

            var1 = exceeds[['var_1', 'var_1_win', 'var_1_loss']].groupby('var_1', as_index=False) \
                                                                .agg({'var_1_win':'sum', 'var_1_loss':'sum'})
            var1.rename(columns = {'var_1':'var', 'var_1_win':'win', 'var_1_loss':'loss'}, inplace=True)

            var2 = exceeds[['var_2', 'var_2_win', 'var_2_loss']].groupby('var_2', as_index=False) \
                                                                .agg({'var_2_win':'sum', 'var_2_loss':'sum'})
            var2.rename(columns = {'var_2':'var', 'var_2_win':'win', 'var_2_loss':'loss'}, inplace=True)

            corrcomps = pd.concat([var1,var2], axis=0).groupby('var', as_index=False) \
                                                      .agg({'win':'sum', 'loss':'sum'})

            # drop variables which had 0 wins - IE collinear variables which were always least related to the target
            dropvars = corrcomps[corrcomps['win']==0]['var']

            dataframe = dataframe.drop(dropvars, axis=1)
            
            X = dataframe.iloc[:,1:]

        return X, y

In [12]:
# ColinearityReducer(threshold=0.98).fit_transform(X_train[X_feat_num], y_train)
SimpleImputer(strategy="median").fit_transform(X_train[X_feat_num], y_train)

array([[ 0.00000000e+00,  2.00000000e+00,  1.07170000e+04, ...,
         0.00000000e+00,  6.66666667e-02,  9.05000000e-02],
       [ 0.00000000e+00,  2.00000000e+00, -7.93300000e+03, ...,
         0.00000000e+00,  0.00000000e+00,  7.51200000e-01],
       [ 0.00000000e+00,  2.00000000e+00, -1.64600000e+03, ...,
         0.00000000e+00,  0.00000000e+00,  7.30000000e-02],
       ...,
       [ 0.00000000e+00,  2.00000000e+00,  2.08170000e+04, ...,
         0.00000000e+00,  0.00000000e+00,  7.30000000e-02],
       [ 0.00000000e+00,  3.00000000e+00, -1.96600000e+03, ...,
         0.00000000e+00,  4.41287879e-01,  7.40000000e-02],
       [ 0.00000000e+00,  2.00000000e+00, -3.77000000e+02, ...,
         0.00000000e+00,  0.00000000e+00,  2.14000000e-02]])

In [8]:
def colinearityReducer(dataframe, threshold=0.5):

    '''
    This function explores the correlation between each variable pair and the Target.
    Of the var1iable pairs with absolute correlations above the threshold value...
    ...the variable with the lowest target variable correlation is dropped from the input dataframe.
    The process is repeated until there are no more colinear pairs with absolute correlations above the threshold.
    
    NOTE! The function receives a dataframe structured with the target variable in first column.
    '''
    
    print('------------------------------------')
    print('BEGIN COLINEAR FEATURE REDUCTION')
    print('------------------------------------')
    
    i = 1
    dropped_variables = list()
    while True:
    
        # read-in and assign columns
        # gets correlation matrix between variables and pivots to a longer df
        # identify target variable
        # drop same-name and target correlations 
        
        print('------------------------------------')
        print(f"Colinearity Reduction Iteration {i}\n")
        
        df = dataframe
        features = df.iloc[:,1:].columns
        target_name = df.iloc[:,0].name
        
        print('')
        print(f'Dataframe Features ({len(features)}):')
        print(features)
        
        df = pd.melt(abs(df.corr()).reset_index(), id_vars='index', value_vars=features)
        targets = df[df['index']==target_name]
        df = df[(df['index'] != df['variable']) & (df['index'] != target_name) & (df['variable'] != target_name)]

        # combine the correlated variables into ordered string
        # aggregate the max correlation and sort pairs
        # split out the variables from the original string
        # join the target variable correlations for each variable pair, rename columns

        df['joined'] = df[['index', 'variable']].apply(lambda row: '::'.join(np.sort(row.values.astype(str))), axis=1)

        df = df.groupby('joined', as_index=False) \
               .agg({'value':'max'}) \
               .sort_values(by='value', ascending=False)

        df[['var_1','var_2']] = df['joined'].str.split("::",expand=True)

        df = df.merge(targets, how='left', left_on='var_1', right_on='variable') \
               .merge(targets, how='left', left_on='var_2', right_on='variable')
        df.rename(columns = {'value_x':'var_pair_corr', 'value_y':'var_1_target_corr', 'value':'var_2_target_corr'}, inplace = True)

        # This section takes all variable pairs with a correlation greater than threshold
        # test to determine which has a higher correlation with the target.
        # The higher of the two gets marked as a win
        # While the other gets marked as a loss
        # the wins and losses for each variable are then grouped and summed

        exceeds = df[df['var_pair_corr']>threshold]

        # break if none above threshold
        if len(exceeds['var_pair_corr'])==0:
            print('------------------------------------')
            print(f"NO VARIABLE PAIRS WITH CORRELATION > {threshold}")
            break

        exceeds['var_1_win'] = exceeds.apply(lambda row: 1 if row["var_1_target_corr"] >= row["var_2_target_corr"] else 0, axis=1)
        exceeds['var_1_loss'] = exceeds.apply(lambda row: 1 if row["var_2_target_corr"] >= row["var_1_target_corr"] else 0, axis=1)
        exceeds['var_2_win'] = exceeds.apply(lambda row: 1 if row["var_1_target_corr"] < row["var_2_target_corr"] else 0, axis=1)
        exceeds['var_2_loss'] = exceeds.apply(lambda row: 1 if row["var_2_target_corr"] < row["var_1_target_corr"] else 0, axis=1)

        var1 = exceeds[['var_1', 'var_1_win', 'var_1_loss']].groupby('var_1', as_index=False) \
                                                            .agg({'var_1_win':'sum', 'var_1_loss':'sum'})
        var1.rename(columns = {'var_1':'var', 'var_1_win':'win', 'var_1_loss':'loss'}, inplace=True)

        var2 = exceeds[['var_2', 'var_2_win', 'var_2_loss']].groupby('var_2', as_index=False) \
                                                            .agg({'var_2_win':'sum', 'var_2_loss':'sum'})
        var2.rename(columns = {'var_2':'var', 'var_2_win':'win', 'var_2_loss':'loss'}, inplace=True)

        corrcomps = pd.concat([var1,var2], axis=0).groupby('var', as_index=False) \
                                                  .agg({'win':'sum', 'loss':'sum'})

        # drop variables which had 0 wins - IE collinear variables which were always least related to the target
        dropvars = corrcomps[corrcomps['win']==0]['var']
        
        dropped_variables.extend(list(dropvars))
        
        dropvarsummary = targets[targets['variable'].isin(dropvars)].iloc[:,1:]
        dropvarsummary.rename(columns={'variable':'Dropped Variable', 'value':'Target Variable Correlation'}, inplace = True)
        
        print('')
        print('Dropped Variables:')
        print(dropvarsummary)
        # print('------------------------------------')
        # print('Exceedances:')
        # print(exceeds)
        
        dataframe = dataframe.drop(dropvars, axis=1)
        
        i += 1
    
    print('------------------------------------')
    print('Final Dropped Variable List:')
    print(dropped_variables)
    print('------------------------------------')
    print('END COLINEAR FEATURE REDUCTION')
    print('------------------------------------')
    
    return dataframe
    
    
# testing
df = pd.concat([y_train,X_train[X_feat_num]],axis=1)

colinearityReducer(df, 0.98)


------------------------------------
BEGIN COLINEAR FEATURE REDUCTION
------------------------------------
------------------------------------
Colinearity Reduction Iteration 1


Dataframe Features (205):
Index(['FLAG_DOCUMENT_21', 'AMT_CREDIT_SUM_LIMIT_mean',
       'CREDIT_ACTIVE_Bad_debt_mean', 'CREDIT_DAY_OVERDUE_var',
       'DAYS_CREDIT_var', 'LIVINGAREA_MEDI', 'AMT_CREDIT_SUM_mean',
       'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_19', 'CREDIT_DAY_OVERDUE_max',
       ...
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_CREDIT_MAX_OVERDUE_var',
       'FLAG_DOCUMENT_4', 'DAYS_CREDIT_UPDATE_var', 'AMT_CREDIT_SUM_DEBT_min',
       'CNT_CREDIT_PROLONG_max', 'NONLIVINGAPARTMENTS_AVG',
       'DAYS_CREDIT_count', 'CREDIT_ACTIVE_Active_mean',
       'OBS_30_CNT_SOCIAL_CIRCLE'],
      dtype='object', length=205)

Dropped Variables:
                  Dropped Variable  Target Variable Correlation
1030               LIVINGAREA_MEDI                     0.033231
1854        CREDIT_DAY_OVERDUE_max        

Unnamed: 0,TARGET,FLAG_DOCUMENT_21,AMT_CREDIT_SUM_LIMIT_mean,CREDIT_ACTIVE_Bad_debt_mean,CREDIT_DAY_OVERDUE_var,DAYS_CREDIT_var,AMT_CREDIT_SUM_mean,FLAG_DOCUMENT_10,FLAG_DOCUMENT_19,DAYS_CREDIT_UPDATE_median,...,REGION_RATING_CLIENT_W_CITY,FLOORSMAX_AVG,REG_REGION_NOT_LIVE_REGION,APARTMENTS_AVG,AMT_REQ_CREDIT_BUREAU_MON,FLAG_DOCUMENT_4,DAYS_CREDIT_UPDATE_var,AMT_CREDIT_SUM_DEBT_min,NONLIVINGAPARTMENTS_AVG,OBS_30_CNT_SOCIAL_CIRCLE
224603,0,0,0.0,0.0,0.0,640118.000000,336667.500000,0,0,-101.0,...,2,0.1667,0,0.0928,0.0,0,229851.000000,83187.0,,0.0
57616,0,0,0.0,0.0,0.0,656369.066667,250722.450000,0,0,-823.0,...,2,0.3333,0,0.6928,0.0,0,519842.929167,0.0,0.0232,0.0
197819,0,0,,0.0,,,18994.500000,0,0,-1646.0,...,2,,0,,0.0,0,,0.0,,1.0
278856,0,0,,,,,,0,0,,...,2,,0,,,0,,,,0.0
192451,0,0,0.0,0.0,,,129870.000000,0,0,-1088.0,...,2,,0,,0.0,0,,0.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104304,0,0,0.0,0.0,0.0,416946.910256,147855.883846,0,0,-573.0,...,2,0.1667,0,0.0722,0.0,0,232710.102564,0.0,0.0077,10.0
39206,0,0,,,,,,0,0,,...,1,0.5417,0,0.2003,,0,,,0.0058,1.0
282714,0,0,0.0,0.0,0.0,0.500000,4500.000000,0,0,-446.0,...,2,,0,,0.0,0,0.000000,0.0,,0.0
125190,0,0,0.0,0.0,0.0,289815.466667,113243.827500,0,0,-203.0,...,3,0.1667,0,0.0619,0.0,0,174067.600000,0.0,0.0000,1.0


In [11]:
# run baseline model

name = "all_features_LR" # <<<--- enter name of run here

num_features = X_feat_num
cat_features = X_feat_cat

selected_features = (num_features) + (cat_features)

# data type pipelines
num_pipeline = Pipeline([
    ('col_reducer', ColinearityReducer(threshold=0.98)),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown="ignore"))
])

data_pipeline = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, num_features),
        ("cat_pipeline", cat_pipeline, cat_features)
    ],
    remainder='drop',
    n_jobs=-1
)

# full feature pipeline
full_pipeline_with_predictor = Pipeline([
    ("preparation", data_pipeline),
    # ('select', SelectKBest()),
    ("linear", LogisticRegression())
])

# grid searching
param_grid = {
    'linear__penalty':[#'l1', 'l2', 'elasticnet',
                                'none']
    #, 'linear__C':[1.0#, 10.0, 100.0]
    #              ]
    #, 'select__k':[414, #15, 20, 30, 50, 100
    #              ]
}

print('Run GridSearch')
grid = GridSearchCV(
    full_pipeline_with_predictor, param_grid=param_grid, scoring='roc_auc', 
    cv = 3,
    # n_jobs = 2, 
    verbose = 2
)

model = grid.fit(X_train, y_train)


try:
    expLog
except NameError:
    expLog = pd.DataFrame(columns=["exp_name", 
                                   "Train Acc", 
                                   "Valid Acc",
                                   "Test  Acc",
                                   "Train AUC", 
                                   "Valid AUC",
                                   "Test  AUC"
                                  ])

exp_name = f"Baseline_{name}"
expLog.loc[len(expLog)] = [f"{exp_name}"] + list(np.round(
               [accuracy_score(y_train, model.predict(X_train)), 
                accuracy_score(y_valid, model.predict(X_valid)),
                accuracy_score(y_test, model.predict(X_test)),
                roc_auc_score(y_train, model.predict_proba(X_train)[:, 1]),
                roc_auc_score(y_valid, model.predict_proba(X_valid)[:, 1]),
                roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])],
    4)) 
expLog

Run GridSearch
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END ...............................linear__penalty=none; total time=   0.3s
[CV] END ...............................linear__penalty=none; total time=   0.3s
[CV] END ...............................linear__penalty=none; total time=   0.3s


ValueError: A given column is not a column of the dataframe