# Initial Data Prep of Master Dataset

In [49]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ShuffleSplit
from sklearn import metrics as mt
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('DefaultCreditcardClients.csv')
df.rename(columns={'default payment next month':'default'}, inplace=True)

#set index to the "ID" value and remove the ID column
df.index = df.ID
del df['ID']

#Create Lists for Analysis
continuous_features = ['LIMIT_BAL', 'BILL_AMT1', 'BILL_AMT2','BILL_AMT3',
                       'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
                       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5',
                       'PAY_AMT6']
ordinal_features = ['EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0','PAY_2', 'PAY_3',
                    'PAY_4', 'PAY_5', 'PAY_6','default']

#Convert datatypes
df[continuous_features] = df[continuous_features].astype(np.float64)
df[ordinal_features] = df[ordinal_features].astype(np.int64)

#convert any non-identified education categories to 'OTHER'
df['EDUCATION'] = df['EDUCATION'].replace(to_replace=(0,5,6),value=4)

#convert any non-identified marriage categories to 'OTHER'
df['MARRIAGE'] = df['MARRIAGE'].replace(to_replace=(0),value=3)

#Log transform continuous variables; as they each have a mostly 
##exponential distribution
df["log_LIMIT_BAL"]=np.log(df.LIMIT_BAL)
df["log_PAY_AMT1"]=np.log(df.PAY_AMT1+1)
df["log_PAY_AMT2"]=np.log(df.PAY_AMT2+1)
df["log_PAY_AMT3"]=np.log(df.PAY_AMT3+1)
df["log_PAY_AMT4"]=np.log(df.PAY_AMT4+1)
df["log_PAY_AMT5"]=np.log(df.PAY_AMT5+1)
df["log_PAY_AMT6"]=np.log(df.PAY_AMT6+1)

#Create a separate dataset with only useful variables as identified in Lab1 and Mini-lab1.
df = df[['SEX','EDUCATION','MARRIAGE','AGE', 'default'
            ,'PAY_0','PAY_2', 'PAY_3', 'PAY_4', 'PAY_5','PAY_6', "log_LIMIT_BAL"
            ,"log_PAY_AMT1","log_PAY_AMT2","log_PAY_AMT3","log_PAY_AMT4","log_PAY_AMT5"
            ,"log_PAY_AMT6"]]

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 1 to 30000
Data columns (total 18 columns):
SEX              30000 non-null int64
EDUCATION        30000 non-null int64
MARRIAGE         30000 non-null int64
AGE              30000 non-null int64
default          30000 non-null int64
PAY_0            30000 non-null int64
PAY_2            30000 non-null int64
PAY_3            30000 non-null int64
PAY_4            30000 non-null int64
PAY_5            30000 non-null int64
PAY_6            30000 non-null int64
log_LIMIT_BAL    30000 non-null float64
log_PAY_AMT1     30000 non-null float64
log_PAY_AMT2     30000 non-null float64
log_PAY_AMT3     30000 non-null float64
log_PAY_AMT4     30000 non-null float64
log_PAY_AMT5     30000 non-null float64
log_PAY_AMT6     30000 non-null float64
dtypes: float64(7), int64(11)
memory usage: 4.3 MB


# Predicting Customer "Default"
I think it is easier to do the dependent variables seperately since we oversample with the default variable at first, so we are going to be working with a different dat set for this one.


### Default Specific data prep
##### Split data & over sample on the minority default class to help improve F1 statistic

In [50]:
# One-hot encoding of "EDUCATION" and "MARRIAGE".
tmp_df_1 = pd.get_dummies(df.EDUCATION,prefix='EDUCATION')
tmp_df_2 = pd.get_dummies(df.MARRIAGE,prefix='MARRIAGE')
dfsub1 = pd.concat((df,tmp_df_1,tmp_df_2),axis=1)
#Drop variables for which we used one-hot encoding
del dfsub1['EDUCATION']
del dfsub1['MARRIAGE']


split = np.random.rand(len(dfsub1)) < 0.8

df_train = dfsub1[split]
df_test = dfsub1[~split]

from sklearn.preprocessing import StandardScaler
# fit training for scaling after upsampling
X_train = df_train.drop(columns=['default']).values     
scl_obj = StandardScaler()
scl_obj.fit(X_train)

print("Dimensions of training data " , df_train.shape)
print("Dimensions of test are " , df_test.shape)

df_train.info()

Dimensions of training data  (24168, 23)
Dimensions of test are  (5832, 23)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 24168 entries, 1 to 30000
Data columns (total 23 columns):
SEX              24168 non-null int64
AGE              24168 non-null int64
default          24168 non-null int64
PAY_0            24168 non-null int64
PAY_2            24168 non-null int64
PAY_3            24168 non-null int64
PAY_4            24168 non-null int64
PAY_5            24168 non-null int64
PAY_6            24168 non-null int64
log_LIMIT_BAL    24168 non-null float64
log_PAY_AMT1     24168 non-null float64
log_PAY_AMT2     24168 non-null float64
log_PAY_AMT3     24168 non-null float64
log_PAY_AMT4     24168 non-null float64
log_PAY_AMT5     24168 non-null float64
log_PAY_AMT6     24168 non-null float64
EDUCATION_1      24168 non-null uint8
EDUCATION_2      24168 non-null uint8
EDUCATION_3      24168 non-null uint8
EDUCATION_4      24168 non-null uint8
MARRIAGE_1       24168 non-null uint8
MAR

In [43]:
target_count = dfsub.default.value_counts()
# Class count
df_class_0, df_class_1 = df_train.default.value_counts()

# Divide by class
df_class_0 = df_train[df_train['default'] == 0]
df_class_1 = df_train[df_train['default'] == 1]

df_class_1_over = df_class_1.sample(frac=target_count[0]/target_count[1], replace=True)
df_OverSampled = pd.concat([df_class_0, df_class_1_over], axis=0)
print('Random over-sampling:')
print(df_OverSampled.default.value_counts())


#Isolate the "default" variable into y and keep everythign else in X to use for predictions:
if 'default' in df_OverSampled:
    y_train = df_OverSampled['default'].values
    del df_OverSampled['default'] 
    X_train = df_OverSampled.values
    
if 'default' in df_test:
    y_test = df_test['default'].values
    del df_test['default'] 
    X_test = df_test.values

print("Dimensions of training features are " , X_train.shape)
print("Dimensions of training target are " , y_train.shape)
print("Dimensions of testing features are " , X_test.shape)
print("Dimensions of testing target are " , y_test.shape)

Random over-sampling:
0    18784
1    18667
Name: default, dtype: int64
Dimensions of training features are  (37451, 22)
Dimensions of training target are  (37451,)
Dimensions of testing features are  (5914, 22)
Dimensions of testing target are  (5914,)


##### apply scales to final training & test set to begin exploring models for Default

In [44]:
# use the previously fit scalines to transform the data after the over sampling
X_train_scaled = scl_obj.transform(X_train) # apply to training
X_test_scaled = scl_obj.transform(X_test) # apply those means and std to the test set (without snooping at the test set values)


## Insert model, visualizations, & interpretation for "Default Predictions here:

# Predicting Education from Credit History

In [51]:
# perform one-hot encoding of the categorical data "EDUCATION" and "MARRIAGE".
tmp_df_2 = pd.get_dummies(df.MARRIAGE,prefix='MARRIAGE')
dfsub2 = pd.concat((df,tmp_df_2),axis=1)
del dfsub2['MARRIAGE']

from sklearn.model_selection import ShuffleSplit

# we want to predict the X and y data as follows:
if 'default' in dfsub2:
    y = dfsub2['EDUCATION'].values # get the labels we want
    del dfsub2['EDUCATION'] # get rid of the class label
    X = dfsub2.values # use everything else to predict!

    ## X and y are now numpy matrices, by calling 'values' on the pandas data frames we
    #    have converted them into simple matrices to use with scikit learn
    
    
# to use the cross validation object in scikit learn, we need to grab an instance
#    of the object and set it up. This object will be able to split our data into 
#    training and testing splits
num_cv_iterations = 5
num_instances = len(y)
cv_object = ShuffleSplit(n_splits=num_cv_iterations,
                         test_size  = 0.2)
                         
print(cv_object)

ShuffleSplit(n_splits=5, random_state=None, test_size=0.2, train_size=None)


In [53]:
from sklearn.preprocessing import StandardScaler

scl_obj = StandardScaler()

for train_indices, test_indices in cv_object.split(X,y): 
    
    X_train = X[train_indices]
    y_train = y[train_indices]
    
    X_test = X[test_indices]
    y_test = y[test_indices]
    
scl_obj.fit(X_train)

X_train_scaled = scl_obj.transform(X_train)
X_test_scaled = scl_obj.transform(X_test)

print("Dimensions of training features are " , X_train.shape)
print("Dimensions of training target are " , y_train.shape)
print("Dimensions of testing features are " , X_test.shape)
print("Dimensions of testing target are " , y_test.shape)

Dimensions of training features are  (24000, 19)
Dimensions of training target are  (24000,)
Dimensions of testing features are  (6000, 19)
Dimensions of testing target are  (6000,)


# Insert models for education here