# Bureau and Bureau_Balance EDA

This notebook explores and transforms the data from `bureau.csv` and `bureau_balance.csv`, preparing it for integration with `application_train.csv` and `application_test.csv`.

## Setup

### Packages and Data

In [1]:
# import packages

import os
import time
import warnings
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import scatter_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

In [2]:
# load data

# Suffix appended to the current working directory to locate the CSV files:
# one level up from the working directory, inside a "Data" folder.
DATA_DIR = "/../Data/"

# Base names (without the ".csv" extension) of the tables this notebook reads.
ds_names = (
    "application_train",
    "application_test",
    "bureau",
    "bureau_balance",
)

# Read each table once and keep it addressable by its base name.
datasets = {}

for ds_name in ds_names:
    csv_path = f"{os.getcwd()}{DATA_DIR}{ds_name}.csv"
    datasets[ds_name] = pd.read_csv(csv_path)

### Functions and Classes

In [3]:
# Summarize how much data is missing in each column of a dataframe.
def missing_data(data):
    """Return per-column null counts and null percentages for ``data``.

    The result is a DataFrame indexed by column name with two columns:
    ``Total`` (number of null entries) and ``Percent`` (null entries divided
    by the number of rows, times 100). Both series are sorted in descending
    order before being combined.
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    # ``count()`` on the boolean mask never skips anything, so it is simply
    # the number of rows, repeated for every column.
    null_share = null_counts / null_mask.count() * 100
    summary_parts = [
        null_counts.sort_values(ascending=False),
        null_share.sort_values(ascending=False),
    ]
    return pd.concat(summary_parts, axis=1, keys=['Total', 'Percent'])



# Class to summarize the features specified into min, max, mean, count, sum, median, and var
class FeatureSummarizer(BaseEstimator, TransformerMixin):
    """Group a frame by its non-feature columns and summarize each feature.

    Every column of the input that is not listed in ``features`` is used as a
    grouping key. Each listed feature is aggregated with every operation in
    ``agg_ops``; the resulting columns are named ``<feature>_<operation>``
    while the key columns keep their original names.

    Parameters
    ----------
    features : list of str
        Names of the columns to aggregate. Must be set before ``transform``
        is called.
    """

    def __init__(self, features=None): # no *args or **kargs
        self.features = features
        self.agg_ops = ["min", "max", "count", "sum", "median", "mean", "var"]

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Aggregate ``self.features`` of ``X`` per group of the other columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the feature columns plus at least one other
            column to group by.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            One row per group, with the key columns followed by one
            ``<feature>_<operation>`` column per feature/operation pair.

        Raises
        ------
        TypeError
            If ``features`` was never provided.
        ValueError
            If every column of ``X`` is listed in ``features`` so that no
            grouping key remains.
        """
        if self.features is None:
            raise TypeError(
                "FeatureSummarizer needs `features` (a list of column names) "
                "before transform() can be called"
            )

        feature_set = set(self.features)
        # Keep the keys in the frame's own column order. A plain set
        # difference yields an order that can change between interpreter
        # sessions, which made the output column order irreproducible when
        # there is more than one key column.
        keys = [col for col in X.columns if col not in feature_set]
        if not keys:
            raise ValueError(
                "FeatureSummarizer found no grouping columns: every column "
                "of X is listed in `features`"
            )

        result = X.groupby(keys, as_index=False) \
                  .agg({ft: self.agg_ops for ft in self.features})
        # The aggregation yields two-level column labels (feature, operation);
        # key columns carry an empty second level. Join the non-empty parts
        # with "_" to get flat names.
        result.columns = result.columns.map(lambda ct: '_'.join([x for x in ct if x != '']))

        return result
    

def runFeatureSummarizer(df, features):
    """Print a short preview of ``df`` and return its summarized form.

    Reports the frame's shape and the first five rows of the ``features``
    columns, then passes ``df`` through a one-step pipeline that holds a
    ``FeatureSummarizer`` configured with ``features``.
    """
    print(f"df.shape: {df.shape}\n")
    print(f"Aggregated Features:\ndf[{features}][0:5]: \n{df[features][0:5]}")

    summarizer = FeatureSummarizer(features)
    summarizing_pipeline = make_pipeline(summarizer)
    return summarizing_pipeline.fit_transform(df)



## Bureau_Balance EDA and Transformation

In [4]:
# Bureau Balance EDA

bureau_bal = datasets['bureau_balance']

id_cols = ['SK_ID_BUREAU']

# Build the feature list in the frame's own column order. A set difference
# returns the same names in an order that can change between kernel sessions,
# which made the printed lists and the aggregated column order irreproducible.
features = [col for col in bureau_bal.columns if col not in id_cols]
num_features = list(bureau_bal[features].select_dtypes(include=['int64', 'float64']).columns)
cat_features = list(bureau_bal[features].select_dtypes(include=['object', 'bool']).columns)

print(f"ID columns: {list(id_cols)}")
print(f"Feature columns: {features}")
print('')
print(f"Missing data: {missing_data(bureau_bal[features])}")
print('--------')
print(f"# of numerical features: {len(num_features)}")
print(f"Numerical features: {num_features}")
print('')
print(bureau_bal[features].describe())
print('--------')
print(f"# of categorical features: {len(cat_features)}")
print(f"Categorical features: {cat_features}")
print('')
print(bureau_bal[cat_features].describe(include='all'))
print(bureau_bal[cat_features].apply(lambda col: col.unique()))
print('--------')

# One-Hot-Encode categorical variables
bureau_bal = pd.get_dummies(data=bureau_bal, columns=cat_features)

# Recompute the feature list now that the categorical columns have been
# replaced by their one-hot columns; anything that is not one of the original
# numerical features is a one-hot column.
features = [col for col in bureau_bal.columns if col not in id_cols]
ohe_cat_features = [col for col in features if col not in num_features]

print(f"# of OHE categorical features: {len(ohe_cat_features)}")
print(f"OHE Categorical features: {ohe_cat_features}")
print('--------')

# Bureau Balance Transformation

# aggregate and summarize bureau_bal features
# (every column other than `features`, i.e. SK_ID_BUREAU, is the grouping key)
bureau_bal = runFeatureSummarizer(bureau_bal, features)

# drop unnecessary features: keep the id, the count of MONTHS_* columns,
# and the mean of each STATUS_* one-hot column
feature_selection = [
    bureau_bal[id_cols],
    bureau_bal[[column for column in bureau_bal.columns if column.startswith('MONTHS') and column.endswith('count')]],
    bureau_bal[[column for column in bureau_bal.columns if column.startswith('STATUS') and column.endswith('mean')]]
]

bureau_bal = pd.concat(feature_selection, axis=1)
features = [col for col in bureau_bal.columns if col not in id_cols]

print('--------')
print('Aggregated bureau_bal:')
print('')
print(bureau_bal[features].describe())

ID columns: ['SK_ID_BUREAU']
Feature columns: ['MONTHS_BALANCE', 'STATUS']

Missing data:                 Total  Percent
MONTHS_BALANCE      0      0.0
STATUS              0      0.0
--------
# of numerical features: 1
Numerical features: ['MONTHS_BALANCE']

       MONTHS_BALANCE
count    2.729992e+07
mean    -3.074169e+01
std      2.386451e+01
min     -9.600000e+01
25%     -4.600000e+01
50%     -2.500000e+01
75%     -1.100000e+01
max      0.000000e+00
--------
# of categorical features: 1
Categorical features: ['STATUS']

          STATUS
count   27299925
unique         8
top            C
freq    13646993
  STATUS
0      C
1      0
2      X
3      1
4      2
5      3
6      5
7      4
--------
# of OHE categorical features: 8
OHE Categorical features: ['STATUS_0', 'STATUS_3', 'STATUS_1', 'STATUS_X', 'STATUS_C', 'STATUS_4', 'STATUS_2', 'STATUS_5']
--------
df.shape: (27299925, 10)

Aggregated Features:
df[['STATUS_0', 'STATUS_3', 'STATUS_1', 'MONTHS_BALANCE', 'STATUS_X', 'STATUS_C', 'S

In [5]:
# Correlation Heatmap
#plt.figure(figsize=(24,8))
#sns.heatmap(bureau_bal[features].corr(), cmap="viridis")

# histograms
#bureau_bal[features].hist(bins=30, figsize=(15, 10))

## Bureau EDA and Transformation

In [6]:
# Bureau

bureau = datasets['bureau']

# Attach the per-SK_ID_BUREAU summary from bureau_bal, then clean up string
# values: whitespace runs and hyphens become "_" and parentheses are removed,
# which makes for nicer OHE column names. The patterns are raw strings so the
# backslashes reach the regex engine without relying on invalid string escapes.
# SK_ID_BUREAU is dropped once the merge no longer needs it.
bureau = bureau.merge(bureau_bal, on='SK_ID_BUREAU', how='left') \
               .replace(to_replace=r'\s+', value='_', regex=True) \
               .replace(to_replace=r'\-', value='_', regex=True) \
               .replace(to_replace=r'\(', value='', regex=True) \
               .replace(to_replace=r'\)', value='', regex=True) \
               .drop('SK_ID_BUREAU', axis=1)

id_cols = ['SK_ID_CURR']

# Build the feature list in the frame's own column order. A set difference
# returns the same names in an order that can change between kernel sessions,
# which made the printed lists and the aggregated column order irreproducible.
features = [col for col in bureau.columns if col not in id_cols]
num_features = list(bureau[features].select_dtypes(include=['int64', 'float64']).columns)
cat_features = list(bureau[features].select_dtypes(include=['object', 'bool']).columns)

print(f"ID columns: {list(id_cols)}")
print(f"Feature columns: {features}")
print('')
print(f"Missing data: {missing_data(bureau[features])}")
print('--------')
print(f"# of numerical features: {len(num_features)}")
print(f"Numerical features: {num_features}")
print('')
print(bureau[features].describe())
print('--------')
print(f"# of categorical features: {len(cat_features)}")
print(f"Categorical features: {cat_features}")
print('')
print(bureau[cat_features].describe(include='all'))
print(bureau[cat_features].apply(lambda col: col.unique()))
print('--------')

# One-Hot-Encode categorical variables
bureau = pd.get_dummies(data=bureau, columns=cat_features)

# Recompute the feature list now that the categorical columns have been
# replaced by their one-hot columns; anything that is not one of the original
# numerical features is a one-hot column.
features = [col for col in bureau.columns if col not in id_cols]
ohe_cat_features = [col for col in features if col not in num_features]

print(f"# of OHE categorical features: {len(ohe_cat_features)}")
print(f"OHE Categorical features: {ohe_cat_features}")
print('--------')

# aggregate bureau variables
# (every column other than `features`, i.e. SK_ID_CURR, is the grouping key)
bureau = runFeatureSummarizer(bureau, features)

# drop unnecessary features
feature_selection = [
    # SK_ID_CURR plus every non-count statistic of the non-categorical columns
    bureau[[column for column in bureau.columns if not column.startswith(tuple(cat_features)) and not column.endswith('count')]],
    # NOTE(review): the 'DAYS_CREDIT' prefix also matches the counts of
    # DAYS_CREDIT_ENDDATE and DAYS_CREDIT_UPDATE - confirm all three are wanted.
    bureau[[column for column in bureau.columns if column.startswith('DAYS_CREDIT') and column.endswith('count')]],
    # for the one-hot columns only the mean is kept
    bureau[[column for column in bureau.columns if column.startswith(tuple(cat_features)) and column.endswith('mean')]]
]

bureau = pd.concat(feature_selection, axis=1)
features = [col for col in bureau.columns if col not in id_cols]

print('--------')
print('Aggregated Features:')
print('\n'.join(map(str, sorted(features))))
print('')
print('Aggregated bureau:')
print('')
print(bureau[features].describe().T)

ID columns: ['SK_ID_CURR']
Feature columns: ['DAYS_ENDDATE_FACT', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM_OVERDUE', 'DAYS_CREDIT_ENDDATE', 'STATUS_5_mean', 'STATUS_X_mean', 'DAYS_CREDIT', 'DAYS_CREDIT_UPDATE', 'MONTHS_BALANCE_count', 'STATUS_C_mean', 'AMT_CREDIT_MAX_OVERDUE', 'STATUS_3_mean', 'CREDIT_ACTIVE', 'AMT_CREDIT_SUM_DEBT', 'STATUS_1_mean', 'CREDIT_DAY_OVERDUE', 'CREDIT_TYPE', 'STATUS_4_mean', 'CNT_CREDIT_PROLONG', 'STATUS_0_mean', 'STATUS_2_mean', 'AMT_ANNUITY', 'CREDIT_CURRENCY']

Missing data:                           Total    Percent
AMT_ANNUITY             1226791  71.473490
AMT_CREDIT_MAX_OVERDUE  1124488  65.513264
STATUS_3_mean            942074  54.885728
STATUS_2_mean            942074  54.885728
STATUS_0_mean            942074  54.885728
STATUS_5_mean            942074  54.885728
STATUS_X_mean            942074  54.885728
STATUS_4_mean            942074  54.885728
STATUS_1_mean            942074  54.885728
MONTHS_BALANCE_count     942074  54.885728

In [7]:
# Correlation Heatmap
#plt.figure(figsize=(24,8))
#sns.heatmap(bureau[features].corr(), cmap="viridis")

# histograms
#bureau[features].hist(bins=30, figsize=(15, 10))

In [8]:
# write a csv
#bureau.describe().T.to_csv('out.csv')

## Combine into Application_train and Prepare for ML

In [9]:
# prepare training and test dataset
appTrain = datasets['application_train']
y = appTrain['TARGET']
# Join the aggregated bureau features onto the applications, then remove the
# identifier and the target so that X holds predictors only.
X = appTrain.merge(bureau, how='left', on='SK_ID_CURR') \
            .drop(['SK_ID_CURR', 'TARGET'], axis = 1)

# SK_ID_CURR has already been dropped from X above; id_cols is kept for the
# printed summary and as a guard in the feature filter below.
id_cols = ['SK_ID_CURR']

# Build the feature list in X's own column order. A set difference returns the
# same names in an order that can change between kernel sessions, and that
# order flows into num_features / cat_features and therefore into the column
# order of the matrix the model is trained on.
features = [col for col in X.columns if col not in id_cols]
num_features = list(X[features].select_dtypes(include=['int64', 'float64']).columns)
cat_features = list(X[features].select_dtypes(include=['object', 'bool']).columns)

print(f"ID columns: {list(id_cols)}")
print(f"Feature columns: {features}")
print('')
print(f"Missing data: {missing_data(X[features])}")
print('--------')
print(f"# of numerical features: {len(num_features)}")
print(f"Numerical features: {num_features}")
print('')
print(X[features].describe())
print('--------')
print(f"# of categorical features: {len(cat_features)}")
print(f"Categorical features: {cat_features}")
print('')
print(X[cat_features].describe(include='all'))
print(X[cat_features].apply(lambda col: col.unique()))
print('--------')

# Hold out 15% of the rows as a test set, then 15% of the remainder as a
# validation set; both splits are stratified on the target and seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.15, random_state=42, stratify=y_train)
print(f"X train           shape: {X_train.shape}")
print(f"X validation      shape: {X_valid.shape}")
print(f"X test            shape: {X_test.shape}")

ID columns: ['SK_ID_CURR']
Feature columns: ['LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_3', 'MONTHS_BALANCE_count_max', 'AMT_CREDIT_MAX_OVERDUE_median', 'MONTHS_BALANCE_count_median', 'STATUS_5_mean_sum', 'DAYS_ENDDATE_FACT_var', 'FLOORSMIN_MEDI', 'STATUS_4_mean_max', 'TOTALAREA_MODE', 'STATUS_5_mean_mean', 'CNT_CREDIT_PROLONG_var', 'FONDKAPREMONT_MODE', 'STATUS_X_mean_sum', 'STATUS_5_mean_min', 'CREDIT_TYPE_Cash_loan_non_earmarked_mean', 'NAME_CONTRACT_TYPE', 'AMT_CREDIT', 'NONLIVINGAPARTMENTS_AVG', 'STATUS_1_mean_median', 'DAYS_CREDIT_ENDDATE_min', 'STATUS_0_mean_var', 'CREDIT_ACTIVE_Sold_mean', 'BASEMENTAREA_MEDI', 'OBS_60_CNT_SOCIAL_CIRCLE', 'AMT_CREDIT_SUM_LIMIT_sum', 'FLAG_DOCUMENT_5', 'DAYS_CREDIT_UPDATE_var', 'REG_CITY_NOT_WORK_CITY', 'BASEMENTAREA_AVG', 'STATUS_C_mean_min', 'AMT_INCOME_TOTAL', 'CREDIT_DAY_OVERDUE_min', 'FLAG_CONT_MOBILE', 'CREDIT_ACTIVE_Active_mean', 'AMT_GOODS_PRICE', 'FLAG_PHONE', 'FLAG_DOCUMENT_16', 'AMT_CREDIT_SUM_mean', 'NONLIVINGAREA_MEDI', 'FLAG_DOCUMENT_2'

In [None]:
# run baseline model
#
# Requires accuracy_score and roc_auc_score from sklearn.metrics to be
# imported in the notebook's import cell; without them the scoring section
# below fails with a NameError.

# Numerical columns: fill gaps with the column median, then standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

# Only used to label the experiment with the number of input features.
selected_features = (num_features) + (cat_features)

# The dense-output switch is called `sparse_output` in newer scikit-learn
# releases and `sparse` in older ones; try the new name first and fall back,
# so the cell runs on either.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', ohe)
    ])

data_pipeline = ColumnTransformer(transformers=[
        ("num_pipeline", num_pipeline, num_features),
        ("cat_pipeline", cat_pipeline, cat_features)],
        remainder='drop',
        n_jobs=-1
    )


full_pipeline_with_predictor = Pipeline([
        ("preparation", data_pipeline),
        ('select', SelectKBest()),
        ("linear", LogisticRegression())
    ])

# Single-point grid for the baseline. Other values that were tried or are
# candidates: penalty in {'l1', 'l2', 'elasticnet'}, C in {1.0, 10.0, 100.0},
# k in {15, 20, 30, 50, 100}.
# NOTE(review): newer scikit-learn releases expect penalty=None instead of the
# string 'none' - confirm against the installed version.
param_grid = {
    'linear__penalty': ['none'],
    'select__k': [7],
}

gd1 = GridSearchCV(full_pipeline_with_predictor, param_grid= param_grid, cv = 3, scoring='roc_auc')

model = gd1.fit(X_train, y_train)


# Create the experiment log on first use; on later runs of this cell the
# existing log is kept so that results accumulate.
try:
    expLog
except NameError:
    expLog = pd.DataFrame(columns=["exp_name", 
                                   "Train Acc", 
                                   "Valid Acc",
                                   "Test  Acc",
                                   "Train AUC", 
                                   "Valid AUC",
                                   "Test  AUC"
                                  ])

exp_name = f"Baseline_{len(selected_features)}_features"

# Score the fitted search on each split, in the same order as the log columns.
eval_splits = [(X_train, y_train), (X_valid, y_valid), (X_test, y_test)]
accuracies = [accuracy_score(y_split, model.predict(X_split))
              for X_split, y_split in eval_splits]
aucs = [roc_auc_score(y_split, model.predict_proba(X_split)[:, 1])
        for X_split, y_split in eval_splits]

expLog.loc[len(expLog)] = [f"{exp_name}"] + list(np.round(accuracies + aucs, 4))
expLog