In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
df = pd.read_csv('Data/UCI_Credit_Card.csv')

In [3]:
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [4]:
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
ID,1.0,0.026179,0.018497,0.039177,-0.029079,0.018678,-0.030575,-0.011215,-0.018494,-0.002735,-0.022199,-0.02027,0.019389,0.017982,0.024354,0.040351,0.016705,0.01673,0.009742,0.008406,0.039151,0.007793,0.000652,0.003,-0.013952
LIMIT_BAL,0.026179,1.0,0.024755,-0.219161,-0.108139,0.144713,-0.271214,-0.296382,-0.286123,-0.26746,-0.249411,-0.235195,0.28543,0.278314,0.283236,0.293988,0.295562,0.290389,0.195236,0.178408,0.210167,0.203242,0.217202,0.219595,-0.15352
SEX,0.018497,0.024755,1.0,0.014232,-0.031389,-0.090874,-0.057643,-0.070771,-0.066096,-0.060173,-0.055064,-0.044008,-0.033642,-0.031183,-0.024563,-0.02188,-0.017005,-0.016733,-0.000242,-0.001391,-0.008597,-0.002229,-0.001667,-0.002766,-0.039961
EDUCATION,0.039177,-0.219161,0.014232,1.0,-0.143464,0.175061,0.105364,0.121566,0.114025,0.108793,0.09752,0.082316,0.023581,0.018749,0.013002,-0.000451,-0.007567,-0.009099,-0.037456,-0.030038,-0.039943,-0.038218,-0.040358,-0.0372,0.028006
MARRIAGE,-0.029079,-0.108139,-0.031389,-0.143464,1.0,-0.41417,0.019917,0.024199,0.032688,0.033122,0.035629,0.034345,-0.023472,-0.021602,-0.024909,-0.023344,-0.025393,-0.021207,-0.005979,-0.008093,-0.003541,-0.012659,-0.001205,-0.006641,-0.024339
AGE,0.018678,0.144713,-0.090874,0.175061,-0.41417,1.0,-0.039447,-0.050148,-0.053048,-0.049722,-0.053826,-0.048773,0.056239,0.054283,0.05371,0.051353,0.049345,0.047613,0.026147,0.021785,0.029247,0.021379,0.02285,0.019478,0.01389
PAY_0,-0.030575,-0.271214,-0.057643,0.105364,0.019917,-0.039447,1.0,0.672164,0.574245,0.538841,0.509426,0.474553,0.187068,0.189859,0.179785,0.179125,0.180635,0.17698,-0.079269,-0.070101,-0.070561,-0.064005,-0.05819,-0.058673,0.324794
PAY_2,-0.011215,-0.296382,-0.070771,0.121566,0.024199,-0.050148,0.672164,1.0,0.766552,0.662067,0.62278,0.575501,0.234887,0.235257,0.224146,0.222237,0.221348,0.219403,-0.080701,-0.05899,-0.055901,-0.046858,-0.037093,-0.0365,0.263551
PAY_3,-0.018494,-0.286123,-0.066096,0.114025,0.032688,-0.053048,0.574245,0.766552,1.0,0.777359,0.686775,0.632684,0.208473,0.237295,0.227494,0.227202,0.225145,0.222327,0.001295,-0.066793,-0.053311,-0.046067,-0.035863,-0.035861,0.235253
PAY_4,-0.002735,-0.26746,-0.060173,0.108793,0.033122,-0.049722,0.538841,0.662067,0.777359,1.0,0.819835,0.716449,0.202812,0.225816,0.244983,0.245917,0.242902,0.239154,-0.009362,-0.001944,-0.069235,-0.043461,-0.03359,-0.026565,0.216614


In [5]:
df.describe()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,15000.5,167484.322667,1.603733,1.853133,1.551867,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,-0.2911,51223.3309,49179.075167,47013.15,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212
std,8660.398374,129747.661567,0.489129,0.790349,0.52197,9.217904,1.123802,1.197186,1.196868,1.169139,1.133187,1.149988,73635.860576,71173.768783,69349.39,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062
min,1.0,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-165580.0,-69777.0,-157264.0,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7500.75,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3558.75,2984.75,2666.25,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,15000.5,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,22381.5,21200.0,20088.5,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,22500.25,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,67091.0,64006.25,60164.75,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0
max,30000.0,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,8.0,8.0,964511.0,983931.0,1664089.0,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


### Preprocessing
1. Change ```PAY_0``` to ```PAY1```
2. Drop ```ID```
3. Retain Marriage "0" as "Others" (Value 3)
4. Group Education 0, 4, 5, 6 as 1 bin = 4 ("Others")
5. Take Standard Scale for Amounts and Log Limit Balance <br>
   Standard Scaler used for amounts due to -ve values
6. Bin ages by Information Gain
7. Dummy (education, sex, marriage, binned ages) 

In [6]:
df = df.rename(columns={'PAY_0':'PAY_1'}).drop(columns='ID')
df['EDUCATION'] = df['EDUCATION'].replace([0, 5, 6], 4)
df['MARRIAGE'] = df['MARRIAGE'].replace(0, 3)

### Pre-processing

In [7]:
# bin ages by information gain
def info_gain(data, featured, verbose=True):
    '''
    Input - dataframe, column of interest, verbose
    Output - IG value
    '''
    
    new_df = data.groupby([featured]).agg({'default.payment.next.month':['count', 'sum']}).reset_index()
    new_df.columns = [c[0] if c[1]=='' else c[1] for c in new_df.columns]
    new_df = new_df.rename(columns={'count':'total', 'sum':'bad'})
    new_df['good'] = new_df['total']-new_df['bad']
    
    new_df['P(V=v|Default=0)'] = new_df['good']/sum(new_df['good'])
    new_df['P(V=v|Default=1)'] = new_df['bad']/sum(new_df['bad'])
    new_df['WOE'] = np.log(new_df['P(V=v|Default=0)']/new_df['P(V=v|Default=1)'])
    new_df['IG'] = new_df['WOE']*(new_df['P(V=v|Default=0)']-new_df['P(V=v|Default=1)'])
    
    IG = new_df['IG'].sum()
    if verbose:
        return IG, new_df
    else:
        return IG

In [8]:
all_ig = []
max_num_bins = 15

for i in range(1, max_num_bins):
    test_df = df.copy()
    test_df['BINNED_AGE'] = pd.qcut(df['AGE'], i)
    ig = info_gain(test_df, 'BINNED_AGE', False)
    all_ig.append(ig)

_, labels = pd.qcut(df['AGE'], max(zip(all_ig, range(1,max_num_bins)))[1], retbins=True)
labels = [str(int(label[0]))+"-"+str(int(label[1])) for label in list(zip(labels[:-1], labels[1:]))]
labels = [label.replace('-75', '+') for label in labels] 
df['BINNED_AGE'] = pd.qcut(df['AGE'], max(zip(all_ig, range(1,max_num_bins)))[1], labels=labels)

#check number of bins
df['BINNED_AGE'].nunique()

9

In [9]:
# log LIMIT_BAL and PAY_AMT - No negative values
# Scale BILL_AMT using StandardScaler in Pipeline
df['LOG_LIMIT_BAL'] = np.log(df['LIMIT_BAL'])
for i in range(1, 7):
    df[f'LOG_PAY_AMT{i}'] = np.log(df[f'PAY_AMT{i}']+1)

In [10]:
dummy_cols = ['EDUCATION','SEX','MARRIAGE','BINNED_AGE']
new_df = pd.get_dummies(df.drop(columns=['AGE']), columns=dummy_cols)
new_df.head()

Unnamed: 0,LIMIT_BAL,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month,LOG_LIMIT_BAL,LOG_PAY_AMT1,LOG_PAY_AMT2,LOG_PAY_AMT3,LOG_PAY_AMT4,LOG_PAY_AMT5,LOG_PAY_AMT6,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,SEX_1,SEX_2,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3,BINNED_AGE_21-25,BINNED_AGE_25-28,BINNED_AGE_28-30,BINNED_AGE_30-32,BINNED_AGE_32-35,BINNED_AGE_35-39,BINNED_AGE_39-42,BINNED_AGE_42-48,BINNED_AGE_48-79
0,20000.0,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1,9.903488,0.0,6.536692,0.0,0.0,0.0,0.0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0
1,120000.0,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1,11.695247,0.0,6.908755,6.908755,6.908755,0.0,7.601402,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0
2,90000.0,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0,11.407565,7.325808,7.313887,6.908755,6.908755,6.908755,8.517393,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0
3,50000.0,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0,10.819778,7.601402,7.610853,7.09091,7.003974,6.975414,6.908755,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0
4,50000.0,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0,10.819778,7.601402,10.510041,9.21044,9.105091,6.536692,6.522093,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1


### Feature Engineering
- Did they exceed credit limit? 
- Change in amounts over time (absolute average change across time / mean of amounts)  
- Did they overpay/ get refund, look at negative bill amounts (Col = 0 or 1) 
- Is bill amount correlated to pay amount? -> Ratios for feature engineering = Pay amount/bill amount, impute as max value (for the entire col) for now (not individual specific) 

In [11]:
new_df['AVG_BILL_AMT'] = new_df.loc[:, 'BILL_AMT1':'BILL_AMT6'].mean(axis=1)
new_df['AVG_PAY_AMT'] = new_df.loc[:, 'PAY_AMT1':'PAY_AMT6'].mean(axis=1)
for i in range(1,7):
    new_df[f'LIMIT_UTIL{i}'] = new_df[f'BILL_AMT{i}']/new_df['LIMIT_BAL']
    new_df[f'PERC_PAID{i}'] = new_df[f'PAY_AMT{i}']/new_df[f'BILL_AMT{i}']
    ## 0 bill, 0 paid == 100% paid
    ## -ve Bill, 0 paid == 100% paid
    new_df.loc[(new_df[f'PAY_AMT{i}'] == 0) & (new_df[f'BILL_AMT{i}'] <= 0), f'PERC_PAID{i}'] = 1
    ## 0 bill, any amount paid = pay_amt% paid
    new_df.loc[(new_df[f'PAY_AMT{i}'] > 0) & 
               (new_df[f'BILL_AMT{i}'] == 0), f'PERC_PAID{i}'] = new_df.loc[(new_df[f'PAY_AMT{i}'] > 0) & 
                                                                            (new_df[f'BILL_AMT{i}'] == 0), f'PAY_AMT{i}']
    
new_df['EXCEEDED_LIMIT'] = (new_df.loc[:,'LIMIT_UTIL1':'LIMIT_UTIL6']>1).max(axis=1).astype(int)
new_df['OVERPAID'] = (new_df.loc[:,'PERC_PAID1':'PERC_PAID6']>1).max(axis=1).astype(int)
new_df['NEG_BILL'] = (new_df.loc[:,'BILL_AMT1':'BILL_AMT6']<0).max(axis=1).astype(int)
new_df.head()

Unnamed: 0,LIMIT_BAL,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month,LOG_LIMIT_BAL,LOG_PAY_AMT1,LOG_PAY_AMT2,LOG_PAY_AMT3,LOG_PAY_AMT4,LOG_PAY_AMT5,LOG_PAY_AMT6,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,SEX_1,SEX_2,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3,BINNED_AGE_21-25,BINNED_AGE_25-28,BINNED_AGE_28-30,BINNED_AGE_30-32,BINNED_AGE_32-35,BINNED_AGE_35-39,BINNED_AGE_39-42,BINNED_AGE_42-48,BINNED_AGE_48-79,AVG_BILL_AMT,AVG_PAY_AMT,LIMIT_UTIL1,PERC_PAID1,LIMIT_UTIL2,PERC_PAID2,LIMIT_UTIL3,PERC_PAID3,LIMIT_UTIL4,PERC_PAID4,LIMIT_UTIL5,PERC_PAID5,LIMIT_UTIL6,PERC_PAID6,EXCEEDED_LIMIT,OVERPAID,NEG_BILL
0,20000.0,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1,9.903488,0.0,6.536692,0.0,0.0,0.0,0.0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1284.0,114.833333,0.19565,0.0,0.1551,0.222115,0.03445,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0,0,0
1,120000.0,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1,11.695247,0.0,6.908755,6.908755,6.908755,0.0,7.601402,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,2846.166667,833.333333,0.02235,0.0,0.014375,0.57971,0.02235,0.372856,0.027267,0.305623,0.028792,0.0,0.027175,0.613309,0,0,0
2,90000.0,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0,11.407565,7.325808,7.313887,6.908755,6.908755,6.908755,8.517393,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,16942.166667,1836.333333,0.324878,0.051917,0.155856,0.106937,0.150656,0.073752,0.159233,0.069779,0.166089,0.066899,0.172767,0.321564,0,0,0
3,50000.0,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0,10.819778,7.601402,7.610853,7.09091,7.003974,6.975414,6.908755,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,38555.666667,1398.0,0.9398,0.042562,0.96466,0.041859,0.98582,0.024345,0.56628,0.03885,0.57918,0.036914,0.59094,0.033844,0,0,0
4,50000.0,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0,10.819778,7.601402,10.510041,9.21044,9.105091,6.536692,6.522093,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,18223.166667,9841.5,0.17234,0.232099,0.1134,6.469312,0.7167,0.279057,0.4188,0.429799,0.38292,0.035987,0.38262,0.035492,1,1,0


In [12]:
for i in range(1, 7):
    display(new_df[new_df[f'PERC_PAID{i}']==np.inf][[f'BILL_AMT{i}', f'PAY_AMT{i}']].drop_duplicates())

Unnamed: 0,BILL_AMT1,PAY_AMT1


Unnamed: 0,BILL_AMT2,PAY_AMT2


Unnamed: 0,BILL_AMT3,PAY_AMT3


Unnamed: 0,BILL_AMT4,PAY_AMT4


Unnamed: 0,BILL_AMT5,PAY_AMT5


Unnamed: 0,BILL_AMT6,PAY_AMT6


### Change in absolute bill amount and pay amount does not seem appropriate

In [13]:
#Average of change in amounts over time then the average of these 5 periods. 
#Did not take absolute as to track up/downward trend
# new_df['BILL_CHANGE'] = new_df.loc[:, 'BILL_AMT1': 'BILL_AMT6'].pct_change(axis='columns').drop(columns='BILL_AMT1').fillna(0).mean(axis=1)*100
# new_df['BILL_CHANGE'] = new_df['BILL_CHANGE'].replace(np.inf,new_df[new_df['BILL_CHANGE']<np.inf]['BILL_CHANGE'].max())
# new_df['BILL_CHANGE'] = new_df['BILL_CHANGE'].replace(-np.inf,new_df[new_df['BILL_CHANGE']>-np.inf]['BILL_CHANGE'].min())

# new_df['PAYMENT_CHANGE'] = new_df.loc[:, 'PAY_AMT1': 'PAY_AMT6'].pct_change(axis='columns').drop(columns='PAY_AMT1').fillna(0).mean(axis=1)*100
# new_df['PAYMENT_CHANGE'] = new_df['PAYMENT_CHANGE'].replace(np.inf,new_df[new_df['PAYMENT_CHANGE']<np.inf]['PAYMENT_CHANGE'].max())
# new_df['PAYMENT_CHANGE'] = new_df['PAYMENT_CHANGE'].replace(-np.inf,new_df[new_df['PAYMENT_CHANGE']>-np.inf]['PAYMENT_CHANGE'].min())

In [14]:
new_df.isnull().sum()

LIMIT_BAL                     0
PAY_1                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default.payment.next.month    0
LOG_LIMIT_BAL                 0
LOG_PAY_AMT1                  0
LOG_PAY_AMT2                  0
LOG_PAY_AMT3                  0
LOG_PAY_AMT4                  0
LOG_PAY_AMT5                  0
LOG_PAY_AMT6                  0
EDUCATION_1                   0
EDUCATION_2                   0
EDUCATION_3                   0
EDUCATION_4                   0
SEX_1   

In [15]:
new_df.columns.to_series()[np.isinf(new_df).any()]

Series([], dtype: object)

In [16]:
from sklearn.model_selection import train_test_split

X = new_df.drop(columns=['default.payment.next.month']).copy()
y = new_df[['default.payment.next.month']].copy()
cc_train_X, cc_test_X, cc_train_y, cc_test_y = train_test_split(X, y, test_size=0.3, shuffle=True, stratify=y)

In [17]:
cc_train = pd.concat([cc_train_X, cc_train_y], axis=1)
cc_test = pd.concat([cc_test_X, cc_test_y], axis=1)

In [19]:
cc_train.to_csv('Data/cc_train.csv', index=False)
cc_test.to_csv('Data/cc_test.csv', index=False)

### PIPELINE - copy and use this 

from sklearn.preprocessing import StandardScaler()
from sklearn.pipeline import PipeLine

pipe = Pipeline([('scale', StandardScaler(),
                  'classifier', #)])
pipe.fit(X_train, X_test)