# Predicting Customer Credit Default

#identify which customer attributes relate significantly to customer default #rates and to build a predictive model #that Credit One can use to better #classify potential customers as being ‘at-risk’, compared to previously #implemented models. We will use machine learning regression methods in #Python for this task.

# Attribute Information:

#This research employed a binary variable, default payment (Yes = 1, No = 0), as the response variable. This study #reviewed the literature and used the following 23 variables as explanatory variables:
#X1: Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family ####(supplementary) credit.
#X2: Gender (1 = male; 2 = female).
#X3: Education (1 = graduate school; 2 = university; 3 = high school; 0, 4, 5, 6 = others). X4: Marital status (1 = #married; 2 = single; 3 = divorce; 0=others).
#X5: Age (year).
#X6 - X11: History of past payment. We tracked the past monthly payment records (from April to September, 2005) as #follows: X6 = the repayment status in September, 2005; X7 = the repayment status in August, 2005; . . .;X11 = the #repayment status in April, 2005.
#The measurement scale for the repayment status is:
#-2: No consumption; -1: Paid in full; 0: The use of revolving credit; 1 = payment delay for one month; 2 = payment #delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above.
#X12-X17: Amount of bill statement (NT dollar). X12 = amount of bill statement in September, 2005; X13 = amount of #bill statement in August, 2005; . . .; X17 = amount of bill statement in April, 2005.
#X18-X23: Amount of previous payment (NT dollar). X18 = amount paid in September, 2005; X19 = amount paid in #August, 2005; . . .;X23 = amount paid in April, 2005.
#Y: client's behavior; Y=0 then not default, Y=1 then default"

# Import Packages

In [1]:
# DS Basics
import numpy as np
import pandas as pd
import scipy
from math import sqrt
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns; sns.set()

#estimators
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import KFold # for cross validation

from sklearn.svm import SVR
from sklearn import linear_model

#model metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

#cross validation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

#from sklearn.metrics import precision_recallFscore_support
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

#rf_y_predictions rf.predict(X_test)
#rf_roc_auc = roc_auc_score(y_test, rf.predict(X_test))
#print(rf_roc_auc)
#import os
#print(os.listdir("../input"))

from sklearn.datasets import make_classification 
#from sklearn.learning_curve import learning_curve 
#from sklearn.cross_validation import train_test_split 
#from sklearn.grid_search import GridSearchCV
#from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import classification_report,confusion_matrix, roc_curve, roc_auc_score, auc, accuracy_score
#
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.svm import SVC

# 1 Import Data

In [2]:
credit_new = pd.read_csv('credit1_extract_data_out.csv')
credit_new.head()

Unnamed: 0.1,Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,0,440000,male,graduate school,1,79,0,0,0,0,...,447112,438187,447543,15715,16519,16513,15800,16531,15677,not default
1,1,250000,female,university,1,75,0,-1,-1,-1,...,1010,5572,794,1631,1536,1010,5572,794,1184,not default
2,2,180000,male,graduate school,1,75,1,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,default
3,3,210000,male,university,1,75,0,0,0,0,...,203776,205901,210006,9700,8810,9000,7300,7500,7600,not default
4,4,160000,female,high school,1,74,0,0,0,-1,...,16905,0,19789,3783,2268,16905,0,19789,26442,not default


In [3]:
credit_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29730 entries, 0 to 29729
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Unnamed: 0                  29730 non-null  int64 
 1   LIMIT_BAL                   29730 non-null  int64 
 2   SEX                         29730 non-null  object
 3   EDUCATION                   29730 non-null  object
 4   MARRIAGE                    29730 non-null  int64 
 5   AGE                         29730 non-null  int64 
 6   PAY_0                       29730 non-null  int64 
 7   PAY_2                       29730 non-null  int64 
 8   PAY_3                       29730 non-null  int64 
 9   PAY_4                       29730 non-null  int64 
 10  PAY_5                       29730 non-null  int64 
 11  PAY_6                       29730 non-null  int64 
 12  BILL_AMT1                   29730 non-null  int64 
 13  BILL_AMT2                   29730 non-null  in

In [4]:
credit_new.shape

(29730, 25)

In [5]:
credit_new = pd.get_dummies(credit_new)
#credit_new = pd.get_dummies(credit_new)

In [6]:
credit_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29730 entries, 0 to 29729
Data columns (total 30 columns):
 #   Column                                  Non-Null Count  Dtype
---  ------                                  --------------  -----
 0   Unnamed: 0                              29730 non-null  int64
 1   LIMIT_BAL                               29730 non-null  int64
 2   MARRIAGE                                29730 non-null  int64
 3   AGE                                     29730 non-null  int64
 4   PAY_0                                   29730 non-null  int64
 5   PAY_2                                   29730 non-null  int64
 6   PAY_3                                   29730 non-null  int64
 7   PAY_4                                   29730 non-null  int64
 8   PAY_5                                   29730 non-null  int64
 9   PAY_6                                   29730 non-null  int64
 10  BILL_AMT1                               29730 non-null  int64
 11  BILL_AMT2      

In [7]:
#features
X=credit_new.iloc[:,[4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21]]
print('Summary of feature sample')
X.head()

Summary of feature sample


Unnamed: 0,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,0,0,0,0,0,0,429309,437906,447326,447112,438187,447543,15715,16519,16513,15800,16531,15677
1,0,-1,-1,-1,-1,-1,52874,1631,1536,1010,5572,794,1631,1536,1010,5572,794,1184
2,1,-2,-2,-2,-2,-2,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,205601,203957,199882,203776,205901,210006,9700,8810,9000,7300,7500,7600
4,0,0,0,-1,-1,-1,79201,69376,66192,16905,0,19789,3783,2268,16905,0,19789,26442


In [8]:
credit_new.head()

Unnamed: 0.1,Unnamed: 0,LIMIT_BAL,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,...,PAY_AMT5,PAY_AMT6,SEX_female,SEX_male,EDUCATION_graduate school,EDUCATION_high school,EDUCATION_other,EDUCATION_university,default payment next month_default,default payment next month_not default
0,0,440000,1,79,0,0,0,0,0,0,...,16531,15677,0,1,1,0,0,0,0,1
1,1,250000,1,75,0,-1,-1,-1,-1,-1,...,794,1184,1,0,0,0,0,1,0,1
2,2,180000,1,75,1,-2,-2,-2,-2,-2,...,0,0,0,1,1,0,0,0,1,0
3,3,210000,1,75,0,0,0,0,0,0,...,7500,7600,0,1,0,0,0,1,0,1
4,4,160000,1,74,0,0,0,-1,-1,-1,...,19789,26442,1,0,0,1,0,0,0,1


In [9]:
# OOB - select predictors 
X = credit_new.iloc[:,:21]
X.head()

Unnamed: 0.1,Unnamed: 0,LIMIT_BAL,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,...,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5
0,0,440000,1,79,0,0,0,0,0,0,...,437906,447326,447112,438187,447543,15715,16519,16513,15800,16531
1,1,250000,1,75,0,-1,-1,-1,-1,-1,...,1631,1536,1010,5572,794,1631,1536,1010,5572,794
2,2,180000,1,75,1,-2,-2,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,0
3,3,210000,1,75,0,0,0,0,0,0,...,203957,199882,203776,205901,210006,9700,8810,9000,7300,7500
4,4,160000,1,74,0,0,0,-1,-1,-1,...,69376,66192,16905,0,19789,3783,2268,16905,0,19789


In [10]:
# select dependent variable
y = credit_new.loc[:,'default payment next month_default']
y.head()

0    0
1    0
2    1
3    0
4    0
Name: default payment next month_default, dtype: uint8

#### COR ds

### Split/Train/Test

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y)   
print( X_train.shape )
print( y_train.shape )
print( X_test.shape )
print( y_test.shape )

(22297, 21)
(22297,)
(7433, 21)
(7433,)


###  Create Models

In [12]:
algos = []
algos.append(('Random Forest Regressor', RandomForestRegressor(n_jobs=4)))
algos.append(('Linear Regression', LinearRegression()))
algos.append(('Support Vector Regression', SVR()))

In [13]:
# build models

results = []
names = []

for name, model in algos:
    result = cross_val_score(model, X_train, y_train, cv=3, scoring='r2')
    names.append(name)
    results.append(result)

In [14]:
# evaluate results

for i in range(len(names)):
    print(names[i],results[i].mean())

Random Forest Regressor 0.1761066668094452
Linear Regression 0.11682088425681776
Support Vector Regression -0.08465177536258632


In [15]:
# select best model(s) and tune

rfOOB = RandomForestRegressor(n_jobs=4)
rfOOBfit = rfOOB.fit(X_train,y_train)
print(cross_val_score(rfOOBfit, X_train, y_train, cv = 3))
rfOOBfit.score(X_train, y_train)

[0.1932206  0.15939808 0.17849087]


0.8846618197788311

### Predict

In [16]:
pred = rfOOB.predict(X_test)
predRsquared = r2_score(y_test, pred)
rmse = sqrt(mean_squared_error(y_test, pred))
print( predRsquared )
print( rmse )

0.17467837682703313
0.3781949595357794
