## Load Credit Card Balance 

In [3]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
import os
import zipfile
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from pandas.plotting import scatter_matrix
import warnings
warnings.filterwarnings('ignore')

def load_data(in_path, name):
    df = pd.read_csv(in_path)
    print(f"{name}: shape is {df.shape}")
    print(df.info())
    display(df.head(5))
    return df

DATA_DIR = "./Data"
datasets={}  # lets store the datasets in a dictionary so we can keep track of them easily
ds_name = 'credit_card_balance'
datasets[ds_name] = load_data(os.path.join(DATA_DIR, f'{ds_name}.csv'), ds_name)

datasets['credit_card_balance'].shape

credit_card_balance: shape is (3840312, 23)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3840312 entries, 0 to 3840311
Data columns (total 23 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   SK_ID_PREV                  int64  
 1   SK_ID_CURR                  int64  
 2   MONTHS_BALANCE              int64  
 3   AMT_BALANCE                 float64
 4   AMT_CREDIT_LIMIT_ACTUAL     int64  
 5   AMT_DRAWINGS_ATM_CURRENT    float64
 6   AMT_DRAWINGS_CURRENT        float64
 7   AMT_DRAWINGS_OTHER_CURRENT  float64
 8   AMT_DRAWINGS_POS_CURRENT    float64
 9   AMT_INST_MIN_REGULARITY     float64
 10  AMT_PAYMENT_CURRENT         float64
 11  AMT_PAYMENT_TOTAL_CURRENT   float64
 12  AMT_RECEIVABLE_PRINCIPAL    float64
 13  AMT_RECIVABLE               float64
 14  AMT_TOTAL_RECEIVABLE        float64
 15  CNT_DRAWINGS_ATM_CURRENT    float64
 16  CNT_DRAWINGS_CURRENT        int64  
 17  CNT_DRAWINGS_OTHER_CURRENT  float64
 18  CNT_DRAWINGS_POS_C

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,...,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,...,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,...,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,...,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0


(3840312, 23)

In [4]:
# function to display amount of missing data from dataframe columns
def missing_data(data):
    total = data.isnull().sum().sort_values(ascending = False)
    percent = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending = False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

# Class to summarize the features specified into min, max, mean, count, sum, median, and var
class FeatureSummarizer(BaseEstimator, TransformerMixin):
    def __init__(self, features=None): # no *args or **kargs
        self.features = features
        self.agg_ops = ["min", "max", "count", "sum", "median", "mean", "var"]

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        keys = list(set(X.columns) - set(self.features))
        
        result = X.groupby(keys, as_index=False) \
                  .agg({ft:self.agg_ops for ft in self.features}) 
        result.columns = result.columns.map(lambda ct: '_'.join([x for x in ct if x != '']))
        
        return result 
    

def runFeatureSummarizer(df, features):
    print(f"df.shape: {df.shape}\n")
    print(f"Aggregated Features:\ndf[{features}][0:5]: \n{df[features][0:5]}")
    pipeline = make_pipeline(FeatureSummarizer(features))
    return(pipeline.fit_transform(df))


## Creating Summarized Dataset for Credit Card Balance

In [5]:
df_ccb = datasets['credit_card_balance']
df_ccb_dummies= pd.get_dummies(data=df_ccb,
                        columns=['NAME_CONTRACT_STATUS'])
id_cols = ['SK_ID_PREV',
           'SK_ID_CURR']
df_ccb_col_list = list(set(df_ccb_dummies.columns) - set(id_cols))
df_ccb_summ = runFeatureSummarizer(df_ccb_dummies, df_ccb_col_list)

df.shape: (3840312, 29)

Aggregated Features:
df[['CNT_DRAWINGS_CURRENT', 'CNT_DRAWINGS_POS_CURRENT', 'AMT_BALANCE', 'NAME_CONTRACT_STATUS_Sent proposal', 'NAME_CONTRACT_STATUS_Completed', 'SK_DPD', 'NAME_CONTRACT_STATUS_Approved', 'CNT_INSTALMENT_MATURE_CUM', 'NAME_CONTRACT_STATUS_Active', 'AMT_DRAWINGS_CURRENT', 'AMT_TOTAL_RECEIVABLE', 'AMT_DRAWINGS_OTHER_CURRENT', 'AMT_DRAWINGS_POS_CURRENT', 'AMT_PAYMENT_CURRENT', 'AMT_RECEIVABLE_PRINCIPAL', 'AMT_DRAWINGS_ATM_CURRENT', 'MONTHS_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL', 'SK_DPD_DEF', 'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_OTHER_CURRENT', 'NAME_CONTRACT_STATUS_Signed', 'NAME_CONTRACT_STATUS_Refused', 'AMT_PAYMENT_TOTAL_CURRENT', 'AMT_RECIVABLE', 'NAME_CONTRACT_STATUS_Demand', 'AMT_INST_MIN_REGULARITY']][0:5]: 
   CNT_DRAWINGS_CURRENT  CNT_DRAWINGS_POS_CURRENT  AMT_BALANCE  \
0                     1                       1.0       56.970   
1                     1                       0.0    63975.555   
2                     0          

## EDA on credit_card_balance.csv 

### 1) A data dictionary of the raw features (test description; data type: numerical, list, etc.)

In [6]:
credit_card_balance_csv = pd.read_csv("CreditCardBalance.csv")

In [7]:
credit_card_balance_csv

Unnamed: 0,Attribute,DataType,Description
0,SK_ID_PREV,NUMBER,ID of previous credit in Home credit related t...
1,SK_ID_CURR,NUMBER,ID of loan in main sample
2,MONTHS_BALANCE,NUMBER,Month of balance relative to application date ...
3,AMT_BALANCE,NUMBER,Balance during the month of previous credit
4,AMT_CREDIT_LIMIT_ACTUAL,NUMBER,Credit card limit during the month of the prev...
5,AMT_DRAWINGS_ATM_CURRENT,NUMBER,Amount drawing at ATM during the month of the ...
6,AMT_DRAWINGS_CURRENT,NUMBER,Amount drawing during the month of the previou...
7,AMT_DRAWINGS_OTHER_CURRENT,NUMBER,Amount of other drawings during the month of t...
8,AMT_DRAWINGS_POS_CURRENT,NUMBER,Amount drawing or buying goods during the mont...
9,AMT_INST_MIN_REGULARITY,NUMBER,Minimal installment for this month of the prev...


## 2) Dataset size for CREDIT CARD BALANCE (rows columns, train, test, validation)

In [8]:
size = datasets['credit_card_balance'].shape
print("The Credit Card Balance dataset before summarized consists of " + str(size[0]) +" rows.")
print("The Credit Card Balance dataset before summarized consists of " + str(size[1]) +" columns.")

The Credit Card Balance dataset before summarized consists of 3840312 rows.
The Credit Card Balance dataset before summarized consists of 23 columns.


In [9]:
size = df_ccb_summ.shape
print("The Credit Card Balance dataset after summarized consists of " + str(size[0]) +" rows.")
print("The Credit Card Balance dataset after summarized consists of " + str(size[1]) +" columns.")

The Credit Card Balance dataset after summarized consists of 104307 rows.
The Credit Card Balance dataset after summarized consists of 191 columns.


## Train dataset joined on SK_ID_CURR with summarized Credit Card Balance 

In [10]:
ds_name = 'application_train'
datasets[ds_name] = load_data(os.path.join(DATA_DIR, f'{ds_name}.csv'), ds_name)
df_curr_target = datasets[ds_name][["SK_ID_CURR","TARGET"]]
df_summarized_joined_train_ccb = df_ccb_summ.join(datasets['application_train'].set_index('SK_ID_CURR'), on='SK_ID_CURR')
df_summarized_joined_train_ccb.shape

application_train: shape is (307511, 122)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB
None


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


(104307, 312)

## 3) Summary statistics for CREDIT CARD BALANCE 

In [11]:
def EDA(df, features, idCols):

    features = list(set(df.columns) - set(id_cols))
    num_features = list(df[features].select_dtypes(include=['int64', 'float64']).columns)
    cat_features = list(df[features].select_dtypes(include=['object', 'bool']).columns)

    print(f"ID columns: {list(id_cols)}")
    print(f"Feature columns: {features}")
    print('')
    print(f"Missing data: {missing_data(df[features])}")
    print('--------')
    print(f"# of numerical features: {len(num_features)}")
    print(f"Numerical features: {num_features}")
    print('')
    print(df[features].describe())
    print('--------')
    print(f"# of categorical features: {len(cat_features)}")
    print(f"Categorical features: {cat_features}")
    print('')
    print(df[cat_features].describe(include='all'))
    print(df[cat_features].apply(lambda col: col.unique()))
    print('--------')

def AggregatedEDA(df, features, idCols):
    print('Aggregated Features:')
    df = runFeatureSummarizer(df, features)
    features = list(set(df.columns) - set(id_cols))
    print('--------')
    print('Aggregated Features:')
    print('\n'.join(map(str, sorted(features))))
    print('')
    print('Aggregated bureau:')
    print('')
    print(df[features].describe().T)

## EDA and Aggregated EDA on Credit Card Balance Dataset

In [12]:
id_cols = ['SK_ID_PREV',
           'SK_ID_CURR']
EDA(df_ccb, df_ccb_col_list, id_cols)
AggregatedEDA(df_ccb_dummies, df_ccb_col_list, id_cols)

ID columns: ['SK_ID_PREV', 'SK_ID_CURR']
Feature columns: ['CNT_DRAWINGS_CURRENT', 'CNT_DRAWINGS_POS_CURRENT', 'AMT_BALANCE', 'SK_DPD', 'CNT_INSTALMENT_MATURE_CUM', 'AMT_DRAWINGS_CURRENT', 'AMT_TOTAL_RECEIVABLE', 'AMT_DRAWINGS_OTHER_CURRENT', 'NAME_CONTRACT_STATUS', 'AMT_DRAWINGS_POS_CURRENT', 'AMT_PAYMENT_CURRENT', 'AMT_RECEIVABLE_PRINCIPAL', 'AMT_DRAWINGS_ATM_CURRENT', 'MONTHS_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL', 'SK_DPD_DEF', 'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_OTHER_CURRENT', 'AMT_PAYMENT_TOTAL_CURRENT', 'AMT_RECIVABLE', 'AMT_INST_MIN_REGULARITY']

Missing data:                              Total    Percent
AMT_PAYMENT_CURRENT         767988  19.998063
AMT_DRAWINGS_OTHER_CURRENT  749816  19.524872
CNT_DRAWINGS_OTHER_CURRENT  749816  19.524872
CNT_DRAWINGS_ATM_CURRENT    749816  19.524872
AMT_DRAWINGS_ATM_CURRENT    749816  19.524872
CNT_DRAWINGS_POS_CURRENT    749816  19.524872
AMT_DRAWINGS_POS_CURRENT    749816  19.524872
AMT_INST_MIN_REGULARITY     305236   7.948208
CNT_INS

### Datatypes for Credit Card Balance

In [13]:
df_ccb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3840312 entries, 0 to 3840311
Data columns (total 23 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   SK_ID_PREV                  int64  
 1   SK_ID_CURR                  int64  
 2   MONTHS_BALANCE              int64  
 3   AMT_BALANCE                 float64
 4   AMT_CREDIT_LIMIT_ACTUAL     int64  
 5   AMT_DRAWINGS_ATM_CURRENT    float64
 6   AMT_DRAWINGS_CURRENT        float64
 7   AMT_DRAWINGS_OTHER_CURRENT  float64
 8   AMT_DRAWINGS_POS_CURRENT    float64
 9   AMT_INST_MIN_REGULARITY     float64
 10  AMT_PAYMENT_CURRENT         float64
 11  AMT_PAYMENT_TOTAL_CURRENT   float64
 12  AMT_RECEIVABLE_PRINCIPAL    float64
 13  AMT_RECIVABLE               float64
 14  AMT_TOTAL_RECEIVABLE        float64
 15  CNT_DRAWINGS_ATM_CURRENT    float64
 16  CNT_DRAWINGS_CURRENT        int64  
 17  CNT_DRAWINGS_OTHER_CURRENT  float64
 18  CNT_DRAWINGS_POS_CURRENT    float64
 19  CNT_INSTALMENT_MATURE

### Describe the data

In [14]:
df_ccb.describe()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,SK_DPD,SK_DPD_DEF
count,3840312.0,3840312.0,3840312.0,3840312.0,3840312.0,3090496.0,3840312.0,3090496.0,3090496.0,3535076.0,...,3840312.0,3840312.0,3840312.0,3090496.0,3840312.0,3090496.0,3090496.0,3535076.0,3840312.0,3840312.0
mean,1904504.0,278324.2,-34.52192,58300.16,153808.0,5961.325,7433.388,288.1696,2968.805,3540.204,...,55965.88,58088.81,58098.29,0.309449,0.7031439,0.004812496,0.5594791,20.82508,9.283667,0.331622
std,536469.5,102704.5,26.66775,106307.0,165145.7,28225.69,33846.08,8201.989,20796.89,5600.154,...,102533.6,105965.4,105971.8,1.100401,3.190347,0.08263861,3.240649,20.05149,97.5157,21.47923
min,1000018.0,100006.0,-96.0,-420250.2,0.0,-6827.31,-6211.62,0.0,0.0,0.0,...,-423305.8,-420250.2,-420250.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1434385.0,189517.0,-55.0,0.0,45000.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
50%,1897122.0,278396.0,-28.0,0.0,112500.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0
75%,2369328.0,367580.0,-11.0,89046.69,180000.0,0.0,0.0,0.0,0.0,6633.911,...,85359.24,88899.49,88914.51,0.0,0.0,0.0,0.0,32.0,0.0,0.0
max,2843496.0,456250.0,-1.0,1505902.0,1350000.0,2115000.0,2287098.0,1529847.0,2239274.0,202882.0,...,1472317.0,1493338.0,1493338.0,51.0,165.0,12.0,165.0,120.0,3260.0,3260.0


### Missing Data for Credit Card Balance

In [15]:
missing_data(df_ccb)

Unnamed: 0,Total,Percent
AMT_PAYMENT_CURRENT,767988,19.998063
AMT_DRAWINGS_ATM_CURRENT,749816,19.524872
CNT_DRAWINGS_POS_CURRENT,749816,19.524872
AMT_DRAWINGS_OTHER_CURRENT,749816,19.524872
AMT_DRAWINGS_POS_CURRENT,749816,19.524872
CNT_DRAWINGS_OTHER_CURRENT,749816,19.524872
CNT_DRAWINGS_ATM_CURRENT,749816,19.524872
CNT_INSTALMENT_MATURE_CUM,305236,7.948208
AMT_INST_MIN_REGULARITY,305236,7.948208
SK_ID_PREV,0,0.0


### Null values

### Summarized Dataset

In [16]:
df_ccb_summ.isna().sum()

SK_ID_PREV                          0
SK_ID_CURR                          0
CNT_DRAWINGS_CURRENT_min            0
CNT_DRAWINGS_CURRENT_max            0
CNT_DRAWINGS_CURRENT_count          0
                                 ... 
AMT_INST_MIN_REGULARITY_count       0
AMT_INST_MIN_REGULARITY_sum         0
AMT_INST_MIN_REGULARITY_median      0
AMT_INST_MIN_REGULARITY_mean        0
AMT_INST_MIN_REGULARITY_var       722
Length: 191, dtype: int64

### Unsummarized Dataset

In [17]:
df_ccb.isna().sum()

SK_ID_PREV                         0
SK_ID_CURR                         0
MONTHS_BALANCE                     0
AMT_BALANCE                        0
AMT_CREDIT_LIMIT_ACTUAL            0
AMT_DRAWINGS_ATM_CURRENT      749816
AMT_DRAWINGS_CURRENT               0
AMT_DRAWINGS_OTHER_CURRENT    749816
AMT_DRAWINGS_POS_CURRENT      749816
AMT_INST_MIN_REGULARITY       305236
AMT_PAYMENT_CURRENT           767988
AMT_PAYMENT_TOTAL_CURRENT          0
AMT_RECEIVABLE_PRINCIPAL           0
AMT_RECIVABLE                      0
AMT_TOTAL_RECEIVABLE               0
CNT_DRAWINGS_ATM_CURRENT      749816
CNT_DRAWINGS_CURRENT               0
CNT_DRAWINGS_OTHER_CURRENT    749816
CNT_DRAWINGS_POS_CURRENT      749816
CNT_INSTALMENT_MATURE_CUM     305236
NAME_CONTRACT_STATUS               0
SK_DPD                             0
SK_DPD_DEF                         0
dtype: int64

## Drop all rows where target is NA from summarized table Train merged with Credit Card Balance

In [18]:
ccb_summ_na_target_dropped= df_summarized_joined_train_ccb.dropna(subset=['TARGET'])

### Join Unsummarized Credit Card Balance Dataset with Train and drop all rows with Target as na

In [19]:
df_joined_train_ccb_unsummarized = df_ccb.join(datasets['application_train'].set_index('SK_ID_CURR'), on='SK_ID_CURR')
df_joined_train_ccb_unsummarized_natarget_dropped = df_joined_train_ccb_unsummarized.dropna(subset=['TARGET'])

## 4) Correlation Analysis on summarized dataset

### Correlation on summarized table 

In [20]:
ccb_summarized_corr = ccb_summ_na_target_dropped[ccb_summ_na_target_dropped.columns[:]].corr()['TARGET'].sort_values(ascending=False)
ccb_summarized_corr

TARGET                                 1.000000
CNT_DRAWINGS_ATM_CURRENT_mean          0.107908
CNT_DRAWINGS_CURRENT_max               0.100742
AMT_BALANCE_mean                       0.087098
AMT_TOTAL_RECEIVABLE_mean              0.086412
                                         ...   
NAME_CONTRACT_STATUS_Refused_median         NaN
NAME_CONTRACT_STATUS_Demand_min             NaN
FLAG_MOBIL                                  NaN
FLAG_DOCUMENT_10                            NaN
FLAG_DOCUMENT_12                            NaN
Name: TARGET, Length: 296, dtype: float64

### Correlation on unsummarized table where all row with na balues for Target Dropped

In [21]:
ccb_unsummarized_corr = df_joined_train_ccb_unsummarized[df_joined_train_ccb_unsummarized.columns[:]].corr()['TARGET'].sort_values(ascending=False) 
ccb_unsummarized_corr

TARGET                         1.000000
DAYS_BIRTH                     0.059450
REGION_RATING_CLIENT_W_CITY    0.056224
REGION_RATING_CLIENT           0.053804
DAYS_LAST_PHONE_CHANGE         0.051437
                                 ...   
EXT_SOURCE_1                  -0.138924
EXT_SOURCE_3                  -0.166260
FLAG_MOBIL                          NaN
FLAG_DOCUMENT_10                    NaN
FLAG_DOCUMENT_12                    NaN
Name: TARGET, Length: 127, dtype: float64

## Pipeline for Credit Card Balance

In [22]:
X = ccb_summ_na_target_dropped.loc[:, ccb_summ_na_target_dropped.columns != "TARGET"]
y = ccb_summ_na_target_dropped["TARGET"]

id_cols = ['SK_ID_CURR']
features = list(set(X.columns) - set(id_cols))
num_features = list(X[features].select_dtypes(include=['int64', 'float64']).columns)
cat_features = list(X[features].select_dtypes(include=['object', 'bool']).columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.15, random_state=42, stratify=y_train)
print(f"X train           shape: {X_train.shape}")
print(f"X validation      shape: {X_valid.shape}")
print(f"X test            shape: {X_test.shape}")

X train           shape: (63183, 311)
X validation      shape: (11151, 311)
X test            shape: (13118, 311)


In [None]:
# run baseline model
from sklearn.metrics import accuracy_score, roc_auc_score

num_pipeline =Pipeline([
    ('imputer',SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

selected_features = (num_features) + (cat_features)

cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(sparse=False, handle_unknown="ignore"))
    ])

data_pipeline = ColumnTransformer(transformers=[
        ("num_pipeline", num_pipeline, num_features),
        ("cat_pipeline", cat_pipeline, cat_features)],
        remainder='drop',
        n_jobs=-1
    )


full_pipeline_with_predictor = Pipeline([
        ("preparation", data_pipeline),
        ('select', SelectKBest()),
        ("linear", LogisticRegression())
    ])

param_grid = {
    'linear__penalty':[#'l1', 'l2', 'elasticnet',
                                 'none'],
    #,'linear__C':[1.0#, 10.0, 100.0]
    'select__k':[7, #15, 20, 30, 50, 100
                ]
}

gd1 = GridSearchCV(full_pipeline_with_predictor, param_grid= param_grid, cv = 3, scoring='roc_auc')

model = gd1.fit(X_train, y_train)


try:
    expLog
except NameError:
    expLog = pd.DataFrame(columns=["exp_name", 
                                   "Train Acc", 
                                   "Valid Acc",
                                   "Test  Acc",
                                   "Train AUC", 
                                   "Valid AUC",
                                   "Test  AUC"
                                  ])

exp_name = f"Baseline_{len(selected_features)}_features"
expLog.loc[len(expLog)] = [f"{exp_name}"] + list(np.round(
               [accuracy_score(y_train, model.predict(X_train)), 
                accuracy_score(y_valid, model.predict(X_valid)),
                accuracy_score(y_test, model.predict(X_test)),
                roc_auc_score(y_train, model.predict_proba(X_train)[:, 1]),
                roc_auc_score(y_valid, model.predict_proba(X_valid)[:, 1]),
                roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])],
    4)) 

expLog