# Predicting Customer Credit Default

#The goal is to identify which customer attributes relate significantly to customer default rates and to build a predictive model that Credit One can use to better classify potential customers as being ‘at-risk’, compared to previously implemented models. We will use machine learning regression methods in Python for this task.

# Attribute Information:

#This research employed a binary variable, default payment (Yes = 1, No = 0), as the response variable. This study reviewed the literature and used the following 23 variables as explanatory variables:
#X1: Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit.
#X2: Gender (1 = male; 2 = female).
#X3: Education (1 = graduate school; 2 = university; 3 = high school; 0, 4, 5, 6 = others). X4: Marital status (1 = married; 2 = single; 3 = divorced; 0 = others).
#X5: Age (year).
#X6 - X11: History of past payment. We tracked the past monthly payment records (from April to September, 2005) as follows: X6 = the repayment status in September, 2005; X7 = the repayment status in August, 2005; . . .; X11 = the repayment status in April, 2005.
#The measurement scale for the repayment status is:
#-2 = no consumption; -1 = paid in full; 0 = the use of revolving credit; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above.
#X12-X17: Amount of bill statement (NT dollar). X12 = amount of bill statement in September, 2005; X13 = amount of bill statement in August, 2005; . . .; X17 = amount of bill statement in April, 2005.
#X18-X23: Amount of previous payment (NT dollar). X18 = amount paid in September, 2005; X19 = amount paid in August, 2005; . . .; X23 = amount paid in April, 2005.
#Y: client's behavior; Y = 0 means no default, Y = 1 means default.

# Import Packages

In [27]:
# DS Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import pandas_profiling
import seaborn as sns

# SKLearn Stuff
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# helpers
%matplotlib inline

# Graphviz
#from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
import graphviz

In [28]:
from sqlalchemy import create_engine
import pymysql
import pandas as pd

In [29]:
import os

# Credentials must not be stored in the notebook. Supply the full SQLAlchemy
# connection URL through the CREDIT_DB_URL environment variable, for example:
#   mysql+pymysql://<user>:<password>@<host>/<database>
db_connection_str = os.environ.get('CREDIT_DB_URL')
if not db_connection_str:
    # Fail here with a clear message instead of letting create_engine() fail later.
    raise RuntimeError(
        "CREDIT_DB_URL is not set; export the database connection URL "
        "before running this notebook."
    )

In [30]:
# Build the SQLAlchemy engine from the connection URL string defined above.
db_connection = create_engine(db_connection_str)

In [31]:
# Pull every row and column of the `credit` table into a DataFrame.
credit_query = 'SELECT * FROM credit'
df = pd.read_sql(sql=credit_query, con=db_connection)

In [32]:
# Preview the first ten rows of the SQL extract.
df.head(n=10)

Unnamed: 0,MyUnknownColumn,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,1,20000,female,university,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,default
2,2,120000,female,university,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,default
3,3,90000,female,university,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,not default
4,4,50000,female,university,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,not default
5,5,50000,male,university,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,not default
6,6,50000,male,graduate school,2,37,0,0,0,0,...,19394,19619,20024,2500,1815,657,1000,1000,800,not default
7,7,500000,male,graduate school,2,29,0,0,0,0,...,542653,483003,473944,55000,40000,38000,20239,13750,13770,not default
8,8,100000,female,university,2,23,0,-1,-1,0,...,221,-159,567,380,601,0,581,1687,1542,not default
9,9,140000,female,high school,1,28,0,0,2,0,...,12211,11793,3719,3329,0,432,1000,1000,1000,not default


In [33]:
# Save the SQL extract as a CSV file in the working directory.
# The DataFrame index is written as well (default behaviour of to_csv).
df.to_csv(path_or_buf='credit1_extract_data_out_2.csv')

In [34]:
# Print the current working directory and list its files
# (the CSV exported above should appear in the listing).
!pwd
!ls

/Users/debasmitamohanty/Desktop/UT-DA/Course-2/Task1
C2T1 copy 2.ipynb              Untitled.ipynb
C2T1 copy.ipynb                Untitled1.ipynb
C2T1.ipynb                     credit1_extract_data_out.csv
C2T1Aug31st2020.ipynb          credit1_extract_data_out_2.csv
C2T1Report.docx


In [35]:
# Reload the exported CSV into `df`, replacing the frame that was read from SQL.
csv_snapshot_path = 'credit1_extract_data_out_2.csv'
df = pd.read_csv(csv_snapshot_path)

In [36]:
# Inspect the dtype pandas inferred for each column of the reloaded frame.
df.dtypes

Unnamed: 0          int64
MyUnknownColumn    object
X1                 object
X2                 object
X3                 object
X4                 object
X5                 object
X6                 object
X7                 object
X8                 object
X9                 object
X10                object
X11                object
X12                object
X13                object
X14                object
X15                object
X16                object
X17                object
X18                object
X19                object
X20                object
X21                object
X22                object
X23                object
Y                  object
dtype: object

In [37]:
# The first row of the frame holds the descriptive column names
# (ID, LIMIT_BAL, SEX, ...): use it as the header and keep the rows below it.
new_header = df.iloc[0]
df = df.iloc[1:]
df.columns = new_header

In [38]:
df.head(10)

Unnamed: 0,0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,1,1,20000,female,university,1,24,2,2,-1,...,0,0,0,0,689,0,0,0,0,default
2,2,2,120000,female,university,2,26,-1,2,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,default
3,3,3,90000,female,university,2,34,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,not default
4,4,4,50000,female,university,1,37,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,not default
5,5,5,50000,male,university,1,57,-1,0,-1,...,20940,19146,19131,2000,36681,10000,9000,689,679,not default
6,6,6,50000,male,graduate school,2,37,0,0,0,...,19394,19619,20024,2500,1815,657,1000,1000,800,not default
7,7,7,500000,male,graduate school,2,29,0,0,0,...,542653,483003,473944,55000,40000,38000,20239,13750,13770,not default
8,8,8,100000,female,university,2,23,0,-1,-1,...,221,-159,567,380,601,0,581,1687,1542,not default
9,9,9,140000,female,high school,1,28,0,0,2,...,12211,11793,3719,3329,0,432,1000,1000,1000,not default
10,10,10,20000,male,high school,2,35,-2,-2,-2,...,0,13007,13912,0,0,0,13007,1122,0,not default


In [39]:
# Sort the frame by the AGE column, largest value first.
# The ordering shown below is text-based: the two embedded header rows
# (AGE values 'X5' and 'AGE') come out on top, ahead of the numeric ages.
# Rebinding `df` instead of using inplace=True avoids mutating a sliced frame
# and keeps the cell safe to re-run.
df = df.sort_values(by="AGE", axis=0, ascending=False, na_position="last")
# Display the sorted frame
df

Unnamed: 0,0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
202,202,,X1,X2,X3,X4,X5,X6,X7,X8,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
203,203,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
18449,18449,18246,440000,male,graduate school,1,79,0,0,0,...,447112,438187,447543,15715,16519,16513,15800,16531,15677,not default
25345,25345,25142,210000,male,university,1,75,0,0,0,...,203776,205901,210006,9700,8810,9000,7300,7500,7600,not default
450,450,247,250000,female,university,1,75,0,-1,-1,...,1010,5572,794,1631,1536,1010,5572,794,1184,not default
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27834,27834,27631,10000,female,university,2,21,2,2,3,...,9768,8430,20735,3905,0,0,300,2165,2666,default
1428,1428,1225,20000,male,university,2,21,0,0,-1,...,1300,0,0,1240,2333,0,0,0,0,not default
21780,21780,21577,20000,female,university,2,21,0,0,0,...,4725,9135,18355,3000,2000,1200,5000,10000,8906,not default
2416,2416,2213,10000,male,university,2,21,0,0,0,...,8880,9580,9000,1217,1000,200,700,200,0,not default


In [41]:
# The data contains extra header lines mixed in with the records: one with the
# generic names (X1, X2, ...) and one with the descriptive names (ID, AGE, ...).
# Identify them by content rather than by their position after sorting, so this
# cell does not silently mislabel the columns if the row order ever changes.
age_as_number = pd.to_numeric(df["AGE"], errors="coerce")
is_header_row = df["AGE"].notna() & age_as_number.isna()

# The descriptive header line is the one whose AGE cell literally reads "AGE".
descriptive_header_rows = df.loc[df["AGE"].eq("AGE")]
if descriptive_header_rows.empty:
    raise ValueError("No embedded header row (AGE == 'AGE') found in df")
new_credit_header = descriptive_header_rows.iloc[0]

# Keep only the real records; copy so later column assignments do not act on a view.
credit = df.loc[~is_header_row].copy()
credit.columns = new_credit_header

In [42]:
# Preview the first ten records of the cleaned-up frame.
credit.head(n=10)

203,203.1,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
18449,18449,18246,440000,male,graduate school,1,79,0,0,0,...,447112,438187,447543,15715,16519,16513,15800,16531,15677,not default
25345,25345,25142,210000,male,university,1,75,0,0,0,...,203776,205901,210006,9700,8810,9000,7300,7500,7600,not default
450,450,247,250000,female,university,1,75,0,-1,-1,...,1010,5572,794,1631,1536,1010,5572,794,1184,not default
25340,25340,25137,180000,male,graduate school,1,75,1,-2,-2,...,0,0,0,0,0,0,0,0,0,default
29379,29379,29176,160000,female,high school,1,74,0,0,0,...,16905,0,19789,3783,2268,16905,0,19789,26442,not default
29383,29383,29180,500000,female,university,1,73,-2,-2,-2,...,2835,8896,3850,2826,2652,2835,8896,3850,711,default
10903,10903,10700,140000,male,university,1,73,0,0,0,...,135772,138675,137197,5082,5008,5000,6016,5018,5264,not default
29367,29367,29164,360000,female,high school,1,73,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,default
591,591,388,80000,male,high school,1,73,-1,0,0,...,70646,72870,75018,3500,4330,3500,3500,3500,5100,default
20468,20468,20265,150000,female,high school,1,72,-2,-2,-2,...,3871,6200,336,1250,0,3871,6200,336,0,not default


In [43]:
# Treat empty strings and '?' placeholders as missing values (NaN).
credit = credit.replace(to_replace=['', '?'], value=np.nan)
credit

203,203.1,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
18449,18449,18246,440000,male,graduate school,1,79,0,0,0,...,447112,438187,447543,15715,16519,16513,15800,16531,15677,not default
25345,25345,25142,210000,male,university,1,75,0,0,0,...,203776,205901,210006,9700,8810,9000,7300,7500,7600,not default
450,450,247,250000,female,university,1,75,0,-1,-1,...,1010,5572,794,1631,1536,1010,5572,794,1184,not default
25340,25340,25137,180000,male,graduate school,1,75,1,-2,-2,...,0,0,0,0,0,0,0,0,0,default
29379,29379,29176,160000,female,high school,1,74,0,0,0,...,16905,0,19789,3783,2268,16905,0,19789,26442,not default
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27834,27834,27631,10000,female,university,2,21,2,2,3,...,9768,8430,20735,3905,0,0,300,2165,2666,default
1428,1428,1225,20000,male,university,2,21,0,0,-1,...,1300,0,0,1240,2333,0,0,0,0,not default
21780,21780,21577,20000,female,university,2,21,0,0,0,...,4725,9135,18355,3000,2000,1200,5000,10000,8906,not default
2416,2416,2213,10000,male,university,2,21,0,0,0,...,8880,9580,9000,1217,1000,200,700,200,0,not default


In [44]:
# For each column: does it contain at least one missing value?
credit.isna().any()

203
203                           False
ID                            False
LIMIT_BAL                     False
SEX                           False
EDUCATION                     False
MARRIAGE                      False
AGE                           False
PAY_0                         False
PAY_2                         False
PAY_3                         False
PAY_4                         False
PAY_5                         False
PAY_6                         False
BILL_AMT1                     False
BILL_AMT2                     False
BILL_AMT3                     False
BILL_AMT4                     False
BILL_AMT5                     False
BILL_AMT6                     False
PAY_AMT1                      False
PAY_AMT2                      False
PAY_AMT3                      False
PAY_AMT4                      False
PAY_AMT5                      False
PAY_AMT6                      False
default payment next month    False
dtype: bool

In [45]:
# For each column: how many values are missing?
credit.isna().sum()

203
203                           0
ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default payment next month    0
dtype: int64

In [46]:
# Remove every row that has a missing value in any column.
credit = credit.dropna(axis=0, how='any')
credit

203,203.1,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
18449,18449,18246,440000,male,graduate school,1,79,0,0,0,...,447112,438187,447543,15715,16519,16513,15800,16531,15677,not default
25345,25345,25142,210000,male,university,1,75,0,0,0,...,203776,205901,210006,9700,8810,9000,7300,7500,7600,not default
450,450,247,250000,female,university,1,75,0,-1,-1,...,1010,5572,794,1631,1536,1010,5572,794,1184,not default
25340,25340,25137,180000,male,graduate school,1,75,1,-2,-2,...,0,0,0,0,0,0,0,0,0,default
29379,29379,29176,160000,female,high school,1,74,0,0,0,...,16905,0,19789,3783,2268,16905,0,19789,26442,not default
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27834,27834,27631,10000,female,university,2,21,2,2,3,...,9768,8430,20735,3905,0,0,300,2165,2666,default
1428,1428,1225,20000,male,university,2,21,0,0,-1,...,1300,0,0,1240,2333,0,0,0,0,not default
21780,21780,21577,20000,female,university,2,21,0,0,0,...,4725,9135,18355,3000,2000,1200,5000,10000,8906,not default
2416,2416,2213,10000,male,university,2,21,0,0,0,...,8880,9580,9000,1217,1000,200,700,200,0,not default


In [47]:
# Is any row an exact repeat of an earlier row?
credit.duplicated(keep='first').any()

False

In [48]:
# Column labels of the cleaned frame (the same Index that `credit.dtypes` is keyed by).
header = credit.columns
print(header)

Index([                         203,                         'ID',
                        'LIMIT_BAL',                        'SEX',
                        'EDUCATION',                   'MARRIAGE',
                              'AGE',                      'PAY_0',
                            'PAY_2',                      'PAY_3',
                            'PAY_4',                      'PAY_5',
                            'PAY_6',                  'BILL_AMT1',
                        'BILL_AMT2',                  'BILL_AMT3',
                        'BILL_AMT4',                  'BILL_AMT5',
                        'BILL_AMT6',                   'PAY_AMT1',
                         'PAY_AMT2',                   'PAY_AMT3',
                         'PAY_AMT4',                   'PAY_AMT5',
                         'PAY_AMT6', 'default payment next month'],
      dtype='object', name=203)


In [53]:
# Convert the columns that hold numbers into a real integer dtype.
#
# The previous code used `(credit[col] == 1).astype('int')`, which compares each
# value with the integer 1 and stores the boolean result as 0/1. That discards
# the actual amounts, ages and payment statuses instead of casting them.
numeric_columns = [
    'ID', 'LIMIT_BAL', 'MARRIAGE', 'AGE',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]

# Work on a copy so the assignments below never target a view of another frame.
credit = credit.copy()
for column_name in numeric_columns:
    # pd.to_numeric raises on any value it cannot parse, so malformed rows
    # surface here instead of being silently turned into zeros.
    credit[column_name] = pd.to_numeric(credit[column_name])

# SEX, EDUCATION and the target column are left as text.
credit.dtypes

203
203                            int64
ID                             int64
LIMIT_BAL                      int64
SEX                           object
EDUCATION                     object
MARRIAGE                       int64
AGE                            int64
PAY_0                          int64
PAY_2                          int64
PAY_3                          int64
PAY_4                          int64
PAY_5                          int64
PAY_6                          int64
BILL_AMT1                      int64
BILL_AMT2                      int64
BILL_AMT3                      int64
BILL_AMT4                      int64
BILL_AMT5                      int64
BILL_AMT6                      int64
PAY_AMT1                       int64
PAY_AMT2                       int64
PAY_AMT3                       int64
PAY_AMT4                       int64
PAY_AMT5                       int64
PAY_AMT6                       int64
default payment next month    object
dtype: object