# Credit Default Prediction - Supervised Learning

## ML Process Flow:
### 1. Import the data into Python
### 2. Preprocess the data
### 3. Train the models
### 4. Evaluate the models

In [4]:
import numpy as np
import pandas as pd

#### dataset source: https://www.kaggle.com/uciml/default-of-credit-card-clients-dataset

## Data Dictionary
1. ID: ID of each client
2. LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
3. SEX: Gender (1=male, 2=female)
4. EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
5. MARRIAGE: Marital status (1=married, 2=single, 3=others; the value 0 also occurs in the data but is not described by the source dictionary)
6. AGE: Age in years
7. PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, ... 8=payment delay for eight months, 9=payment delay for nine months and above; the values -2 and 0 also occur in the data but are not described by the source dictionary)
8. PAY_2: Repayment status in August, 2005 (scale same as above)
9. PAY_3: Repayment status in July, 2005 (scale same as above)
10. PAY_4: Repayment status in June, 2005 (scale same as above)
11. PAY_5: Repayment status in May, 2005 (scale same as above)
12. PAY_6: Repayment status in April, 2005 (scale same as above)
13. BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)
14. BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)
15. BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)
16. BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)
17. BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)
18. BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)
19. PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)
20. PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)
21. PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)
22. PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)
23. PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)
24. PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)
25. default.payment.next.month: Default payment (1=yes, 0=no)

## Import Data

In [5]:
def ImportData(data):
    """Load a CSV into a DataFrame and drop fully duplicated rows.

    Parameters
    ----------
    data : path or file-like
        Forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded table without exact duplicate rows. The shape is printed
        once right after loading and once after de-duplication.
    """
    raw_frame = pd.read_csv(data)
    print("Original data: (# of observations, # of features)", raw_frame.shape)

    deduplicated = raw_frame.drop_duplicates()
    print("Original data after removing duplicates: (# of observations, # of features)", deduplicated.shape)
    return deduplicated

In [6]:
data = ImportData("credit-data1.csv")

Original data: (# of observations, # of features) (30000, 25)
Original data after removing duplicates: (# of observations, # of features) (30000, 25)


In [7]:
data.head()

Unnamed: 0.1,Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,0,,2.0,2.0,1.0,24.0,2.0,2.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,1,120000.0,2.0,2.0,2.0,26.0,-1.0,2.0,,0.0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,,1
2,2,,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,3,50000.0,2.0,,1.0,37.0,0.0,,0.0,0.0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,,1069.0,1000.0,0
4,4,50000.0,,,1.0,57.0,-1.0,0.0,-1.0,0.0,...,20940.0,19146.0,19131.0,2000.0,,10000.0,9000.0,689.0,679.0,0


In [121]:
data.shape

(30000, 25)

## Data Preprocessing

In [8]:
# Prediction target: the default flag for the following month
# (1=yes, 0=no according to the data dictionary above).
data_output = data['default.payment.next.month']

In [9]:
# Columns excluded from the model inputs: the target itself and 'Unnamed: 0'
# (presumably a row-index column written out when the CSV was saved -- verify).
data_to_drop = ['default.payment.next.month','Unnamed: 0']

In [10]:
def InputOutputSplit(data, output_column, drop_column):
    """Split a DataFrame into model inputs and the prediction target.

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset containing both the features and the target.
    output_column : label, pandas.Series or pandas.DataFrame
        Either the label of the target column in ``data`` or the
        already-extracted target values (the form the existing call in this
        notebook passes).
    drop_column : label or list of labels
        Columns removed from ``data`` to build the input frame.

    Returns
    -------
    (pandas.DataFrame, pandas.Series or pandas.DataFrame)
        ``data`` without ``drop_column``, and the target.
    """
    # The target used to be read from a notebook-level global, silently
    # ignoring ``output_column``; derive it from the argument instead so the
    # function no longer depends on hidden kernel state.
    if isinstance(output_column, (pd.Series, pd.DataFrame)):
        data_output = output_column
    else:
        data_output = data[output_column]

    data_input = data.drop(drop_column, axis=1)
    return data_input, data_output

In [11]:
data.head()

Unnamed: 0.1,Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,0,,2.0,2.0,1.0,24.0,2.0,2.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,1,120000.0,2.0,2.0,2.0,26.0,-1.0,2.0,,0.0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,,1
2,2,,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,3,50000.0,2.0,,1.0,37.0,0.0,,0.0,0.0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,,1069.0,1000.0,0
4,4,50000.0,,,1.0,57.0,-1.0,0.0,-1.0,0.0,...,20940.0,19146.0,19131.0,2000.0,,10000.0,9000.0,689.0,679.0,0


In [12]:
data_input, data_output = InputOutputSplit(data, data_output, data_to_drop)

In [13]:
data_input.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,,2.0,2.0,1.0,24.0,2.0,2.0,-1.0,-1.0,-2.0,...,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0
1,120000.0,2.0,2.0,2.0,26.0,-1.0,2.0,,0.0,0.0,...,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,
2,,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,...,,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0
3,50000.0,2.0,,1.0,37.0,0.0,,0.0,0.0,0.0,...,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,,1069.0,1000.0
4,50000.0,,,1.0,57.0,-1.0,0.0,-1.0,0.0,0.0,...,35835.0,20940.0,19146.0,19131.0,2000.0,,10000.0,9000.0,689.0,679.0


## Train Test Split

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
?train_test_split

In [16]:
# Hold out 25% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(data_input, data_output, test_size = 0.25, random_state = 123)

## Data Imputation

In [17]:
x_train.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'],
      dtype='object')

In [119]:
x_train.isnull().sum()

LIMIT_BAL    1819
SEX          1743
EDUCATION    1813
MARRIAGE     1727
AGE          1814
PAY_0        1856
PAY_2        1815
PAY_3        1775
PAY_4        1813
PAY_5        1801
PAY_6        1768
BILL_AMT1    1775
BILL_AMT2    1848
BILL_AMT3    1782
BILL_AMT4    1762
BILL_AMT5    1869
BILL_AMT6    1843
PAY_AMT1     1899
PAY_AMT2     1896
PAY_AMT3     1788
PAY_AMT4     1770
PAY_AMT5     1816
PAY_AMT6     1811
dtype: int64

In [18]:
# Continuous features: credit limit, age, and the six monthly bill / payment
# amounts (suffixes 1..6).
numerical_columns = (
    ['LIMIT_BAL', 'AGE']
    + ['BILL_AMT%d' % month for month in range(1, 7)]
    + ['PAY_AMT%d' % month for month in range(1, 7)]
)

# Coded features: demographics plus the repayment-status columns
# (PAY_0, then PAY_2..PAY_6 -- the dataset has no PAY_1).
categorical_columns = (
    ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0']
    + ['PAY_%d' % month for month in range(2, 7)]
)

### Numerical Imputation

In [19]:
x_train_numerical = x_train._get_numeric_data()

In [20]:
x_train_numerical.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
16095,140000.0,2.0,2.0,1.0,36.0,1.0,2.0,3.0,2.0,0.0,...,61459.0,59798.0,61287.0,8383.0,5200.0,0.0,0.0,3009.0,1000.0,94000.0
28548,210000.0,2.0,2.0,2.0,33.0,0.0,0.0,0.0,-2.0,-2.0,...,0.0,0.0,0.0,0.0,1000.0,0.0,0.0,0.0,0.0,0.0
25096,20000.0,1.0,3.0,2.0,53.0,,0.0,,-1.0,-1.0,...,390.0,18280.0,2880.0,1600.0,1105.0,390.0,18280.0,2880.0,1600.0,
12260,90000.0,2.0,2.0,2.0,,2.0,3.0,,3.0,3.0,...,37825.0,40299.0,39093.0,,2000.0,0.0,,0.0,0.0,1000.0
21549,,2.0,3.0,2.0,22.0,-2.0,-2.0,-2.0,,-2.0,...,1697.0,0.0,0.0,5000.0,0.0,1699.0,0.0,0.0,5000.0,0.0


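#### Cannot use `_get_numeric_data` to select the numerical features: the categorical columns are numerically coded, so they are returned as well (see the output above). Select the columns explicitly instead.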

In [21]:
x_train_numerical = x_train[numerical_columns]

In [22]:
x_train_numerical.isnull().sum()

LIMIT_BAL    1819
AGE          1814
BILL_AMT1    1775
BILL_AMT2    1848
BILL_AMT3    1782
BILL_AMT4    1762
BILL_AMT5    1869
BILL_AMT6    1843
PAY_AMT1     1899
PAY_AMT2     1896
PAY_AMT3     1788
PAY_AMT4     1770
PAY_AMT5     1816
PAY_AMT6     1811
dtype: int64

In [131]:
from sklearn.preprocessing import Imputer

In [132]:
# Median imputer for the numerical features: once fitted, NaN entries are
# replaced by the median of their column.
# NOTE(review): `sklearn.preprocessing.Imputer` was removed in scikit-learn
# 0.22; on newer versions the equivalent is
# `sklearn.impute.SimpleImputer(missing_values=np.nan, strategy='median')`.
imput = Imputer(missing_values='NaN', strategy='median')

In [133]:
# Fit the imputer on the training numerical columns, then rebuild a DataFrame
# around the transformed values so the original row index and column labels
# are kept.
imput.fit(x_train_numerical)
x_train_num_imputed = pd.DataFrame(
    imput.transform(x_train_numerical),
    index=x_train_numerical.index,
    columns=x_train_numerical.columns,
)

In [134]:
x_train_num_imputed.isnull().sum()

LIMIT_BAL    0
AGE          0
BILL_AMT1    0
BILL_AMT2    0
BILL_AMT3    0
BILL_AMT4    0
BILL_AMT5    0
BILL_AMT6    0
PAY_AMT1     0
PAY_AMT2     0
PAY_AMT3     0
PAY_AMT4     0
PAY_AMT5     0
PAY_AMT6     0
dtype: int64

In [139]:
def NumericalImputation(data, numerical, imputer=None):
    """Fit an imputer on the numerical columns and return the filled values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding (at least) the columns listed in ``numerical``.
    numerical : list-like of labels
        Names of the numerical columns to impute.
    imputer : object with ``fit`` and ``transform``, optional
        Imputer to fit. When omitted, the notebook-level ``imput`` object is
        used, which keeps existing two-argument calls working.

    Returns
    -------
    (pandas.DataFrame, imputer)
        The imputed columns, carrying the index of ``data``, and the fitted
        imputer so the same transformation can be applied to other data.
    """
    if imputer is None:
        # Backward-compatible fallback to the global created earlier in the
        # notebook; pass ``imputer`` explicitly to avoid relying on it.
        imputer = imput

    train_numerical_data = data[numerical]
    imputer.fit(train_numerical_data)

    data_imput = pd.DataFrame(imputer.transform(train_numerical_data))
    data_imput.columns = numerical
    data_imput.index = data.index
    return data_imput, imputer

In [141]:
x_train_numerical_imputed, imput_numerical = NumericalImputation(x_train, x_train_numerical.columns)

In [142]:
x_train_numerical_imputed.isnull().sum()

LIMIT_BAL    0
AGE          0
BILL_AMT1    0
BILL_AMT2    0
BILL_AMT3    0
BILL_AMT4    0
BILL_AMT5    0
BILL_AMT6    0
PAY_AMT1     0
PAY_AMT2     0
PAY_AMT3     0
PAY_AMT4     0
PAY_AMT5     0
PAY_AMT6     0
dtype: int64

### Categorical Imputation

In [143]:
x_train_categorical = x_train[categorical_columns]

In [144]:
x_train_categorical.isnull().sum()

SEX          1743
EDUCATION    1813
MARRIAGE     1727
PAY_0        1856
PAY_2        1815
PAY_3        1775
PAY_4        1813
PAY_5        1801
PAY_6        1768
dtype: int64

In [145]:
# Preview only: fillna returns a new frame, so x_train_categorical itself still
# contains its missing values. 'KOSONG' (Indonesian for "empty") is the
# placeholder category used for missing entries.
x_train_categorical.fillna(value='KOSONG').head()

Unnamed: 0,SEX,EDUCATION,MARRIAGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6
16095,2,2,1,1,2,3,2,0,0
28548,2,2,2,0,0,0,-2,-2,-2
25096,1,3,2,KOSONG,0,KOSONG,-1,-1,-1
12260,2,2,2,2,3,KOSONG,3,3,3
21549,2,3,2,-2,-2,-2,KOSONG,-2,-2


In [146]:
x_train_categorical.isnull().sum()

SEX          1743
EDUCATION    1813
MARRIAGE     1727
PAY_0        1856
PAY_2        1815
PAY_3        1775
PAY_4        1813
PAY_5        1801
PAY_6        1768
dtype: int64

In [147]:
def CategoricalImputation(data, cat_columns, replacement):
    """Return the categorical columns with missing values replaced.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; it is not modified.
    cat_columns : list of labels
        Columns to select from ``data``.
    replacement : scalar
        Value written wherever a selected column is missing.

    Returns
    -------
    pandas.DataFrame
        A new frame with only ``cat_columns`` and no missing entries.
    """
    selected = data[cat_columns]
    return selected.fillna(value=replacement)

In [148]:
x_train_categorical = CategoricalImputation(x_train, categorical_columns, 'KOSONG')

In [149]:
x_train_categorical.isnull().sum()

SEX          0
EDUCATION    0
MARRIAGE     0
PAY_0        0
PAY_2        0
PAY_3        0
PAY_4        0
PAY_5        0
PAY_6        0
dtype: int64

### Categorical Variables Preprocessing

#### we need to transform our categorical variables to numerical form.

In [150]:
categorical_dummies = pd.get_dummies(x_train_categorical)

In [151]:
categorical_dummies.head()

Unnamed: 0,SEX_1.0,SEX_2.0,SEX_KOSONG,EDUCATION_1.0,EDUCATION_2.0,EDUCATION_3.0,EDUCATION_4.0,EDUCATION_KOSONG,MARRIAGE_0.0,MARRIAGE_1.0,...,PAY_5_0.0,PAY_5_2.0,PAY_5_3.0,PAY_5_KOSONG,PAY_6_-2.0,PAY_6_-1.0,PAY_6_0.0,PAY_6_2.0,PAY_6_3.0,PAY_6_KOSONG
16095,0,1,0,0,1,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
28548,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
25096,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
12260,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
21549,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [152]:
def ExtractCategorical(data, categorical):
    """One-hot encode the categorical columns of ``data``.

    Missing values are first replaced by the placeholder category 'KOSONG'
    (via ``CategoricalImputation``), so they get their own dummy column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns listed in ``categorical``.
    categorical : list of labels
        Columns to impute and encode.

    Returns
    -------
    pandas.DataFrame
        Result of ``pd.get_dummies`` on the imputed categorical columns.
    """
    filled = CategoricalImputation(data, categorical, 'KOSONG')
    return pd.get_dummies(filled)

In [153]:
x_train_cat_dummies = ExtractCategorical(x_train, categorical_columns)

In [154]:
x_train_cat_dummies.head()

Unnamed: 0,SEX_1.0,SEX_2.0,SEX_KOSONG,EDUCATION_1.0,EDUCATION_2.0,EDUCATION_3.0,EDUCATION_4.0,EDUCATION_KOSONG,MARRIAGE_0.0,MARRIAGE_1.0,...,PAY_5_0.0,PAY_5_2.0,PAY_5_3.0,PAY_5_KOSONG,PAY_6_-2.0,PAY_6_-1.0,PAY_6_0.0,PAY_6_2.0,PAY_6_3.0,PAY_6_KOSONG
16095,0,1,0,0,1,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
28548,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
25096,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
12260,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
21549,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [155]:
dummies_columns = x_train_cat_dummies.columns

### Merging Numerical and Categorical Data

In [156]:
x_train_concat = pd.concat([x_train_numerical_imputed, x_train_cat_dummies], axis=1)

In [157]:
x_train_concat.head()

Unnamed: 0,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,...,PAY_5_0.0,PAY_5_2.0,PAY_5_3.0,PAY_5_KOSONG,PAY_6_-2.0,PAY_6_-1.0,PAY_6_0.0,PAY_6_2.0,PAY_6_3.0,PAY_6_KOSONG
16095,140000.0,36.0,59379.0,63007.0,61459.0,59798.0,61287.0,8383.0,5200.0,0.0,...,1,0,0,0,0,0,1,0,0,0
28548,210000.0,33.0,43047.0,9378.0,0.0,0.0,0.0,0.0,1000.0,0.0,...,0,0,0,0,1,0,0,0,0,0
25096,20000.0,53.0,13561.0,13050.0,390.0,18280.0,2880.0,1600.0,1105.0,390.0,...,0,0,0,0,0,1,0,0,0,0
12260,90000.0,34.0,37639.0,38743.0,37825.0,40299.0,39093.0,16809.0,2000.0,0.0,...,0,0,1,0,0,0,0,0,1,0
21549,140000.0,22.0,2498.0,20897.0,1697.0,0.0,0.0,5000.0,0.0,1699.0,...,0,0,0,0,1,0,0,0,0,0


In [158]:
x_train_concat.isnull().any()

LIMIT_BAL           False
AGE                 False
BILL_AMT1           False
BILL_AMT2           False
BILL_AMT3           False
BILL_AMT4           False
BILL_AMT5           False
BILL_AMT6           False
PAY_AMT1            False
PAY_AMT2            False
PAY_AMT3            False
PAY_AMT4            False
PAY_AMT5            False
PAY_AMT6            False
SEX_1.0             False
SEX_2.0             False
SEX_KOSONG          False
EDUCATION_1.0       False
EDUCATION_2.0       False
EDUCATION_3.0       False
EDUCATION_4.0       False
EDUCATION_KOSONG    False
MARRIAGE_0.0        False
MARRIAGE_1.0        False
MARRIAGE_2.0        False
MARRIAGE_3.0        False
MARRIAGE_KOSONG     False
PAY_0_-2.0          False
PAY_0_-1.0          False
PAY_0_0.0           False
                    ...  
PAY_2_-2.0          False
PAY_2_-1.0          False
PAY_2_0.0           False
PAY_2_2.0           False
PAY_2_3.0           False
PAY_2_KOSONG        False
PAY_3_-2.0          False
PAY_3_-1.0  

In [159]:
x_train_concat.shape

(22500, 64)

### Standardizing Variables

In [160]:
from sklearn.preprocessing import StandardScaler

In [161]:
def Standardize(data):
    """Scale every column of ``data`` with a freshly fitted StandardScaler.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame passed to ``StandardScaler.fit`` and ``transform``.

    Returns
    -------
    (pandas.DataFrame, StandardScaler)
        The scaled values with the column labels and index of ``data``, and
        the fitted scaler so it can be reused on other data.
    """
    scaler = StandardScaler()
    scaler.fit(data)

    scaled_frame = pd.DataFrame(
        scaler.transform(data),
        columns=data.columns,
        index=data.index,
    )
    return scaled_frame, scaler

In [162]:
x_train_clean, normalize = Standardize(x_train_concat)

In [163]:
x_train_clean.head()

Unnamed: 0,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,...,PAY_5_0.0,PAY_5_2.0,PAY_5_3.0,PAY_5_KOSONG,PAY_6_-2.0,PAY_6_-1.0,PAY_6_0.0,PAY_6_2.0,PAY_6_3.0,PAY_6_KOSONG
16095,-0.203809,0.07391,0.149569,0.23701,0.249116,0.2965,0.389746,-0.497209,-0.008807,-0.267788,...,0.962995,-0.293903,-0.104706,-0.294973,-0.421767,-0.464213,1.001512,-0.303,-0.09937,-0.292025
28548,0.356793,-0.265197,-0.081959,-0.543801,-0.675216,-0.666988,-0.654829,-0.643108,-0.272452,-0.267788,...,-1.038427,-0.293903,-0.104706,-0.294973,2.370979,-0.464213,-0.99849,-0.303,-0.09937,-0.292025
25096,-1.164841,1.995517,-0.499962,-0.490338,-0.669351,-0.372454,-0.605742,-0.615261,-0.265861,-0.249173,...,-1.038427,-0.293903,-0.104706,-0.294973,-0.421767,2.154185,-0.99849,-0.303,-0.09937,-0.292025
12260,-0.604239,-0.152161,-0.158624,-0.116261,-0.106335,-0.017675,0.011472,-0.350561,-0.209679,-0.267788,...,-1.038427,-0.293903,9.550556,-0.294973,-0.421767,-0.464213,-0.99849,-0.303,10.063435,-0.292025
21549,-0.203809,-1.508589,-0.656795,-0.37609,-0.649694,-0.666988,-0.654829,-0.556087,-0.335225,-0.186693,...,-1.038427,-0.293903,-0.104706,-0.294973,2.370979,-0.464213,-0.99849,-0.303,-0.09937,-0.292025


## Machine Learning Training
1. Choose score to optimize and hyperparameter space
2. cross-validation: random vs Grid-search CV
3. need to beat the benchmark


In [164]:
# Benchmark = proportion of the largest target class.
# In other words, the benchmark here is about 78% (class 0, see output below),
# which is the accuracy obtained by answering "no" for every observation.

y_train.value_counts(normalize=True)

0    0.777378
1    0.222622
Name: default.payment.next.month, dtype: float64

### Model#1: K-Nearest Neighbors

In [165]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(x_train_clean, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

### Model#2: Logistic Regression

In [166]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state=123)
logreg.fit(x_train_clean, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=123, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Model#3: Random Forest

In [167]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=123)
rf.fit(x_train_clean, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=123, verbose=0, warm_start=False)

In [189]:
random_forest_1 = RandomForestClassifier(random_state=123, n_estimators=50)
random_forest_1.fit(x_train_clean, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=123, verbose=0, warm_start=False)

## Prediction Result in training dataset

In [168]:
logreg_predict_train = pd.DataFrame(logreg.predict(x_train_clean))
logreg_predict_train.head()

Unnamed: 0,0
0,0
1,0
2,0
3,1
4,0


In [169]:
knn_predict_train = pd.DataFrame(knn.predict(x_train_clean))
knn_predict_train.head()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


In [191]:
rf_predict_train = pd.DataFrame(rf.predict(x_train_clean))
rf_predict_train.head()

Unnamed: 0,0
0,0
1,0
2,1
3,0
4,0


In [192]:
rf1_predict_train = pd.DataFrame(random_forest_1.predict(x_train_clean))
rf1_predict_train.head()

Unnamed: 0,0
0,0
1,0
2,1
3,0
4,0


## Test Model Performance in Training Dataset

In [171]:
# Benchmark accuracy: share of class 0 in the training target, i.e. the
# accuracy of a model that always predicts class 0.
benchmark = y_train.value_counts(normalize=True)[0]
benchmark

0.77737777777777772

In [172]:
logreg.score(x_train_clean, y_train)

0.81799999999999995

In [173]:
knn.score(x_train_clean, y_train)

0.84088888888888891

In [174]:
rf.score(x_train_clean, y_train)

0.97946666666666671

In [193]:
random_forest_1.score(x_train_clean, y_train)

0.99942222222222221

## Prediction Result in test dataset

In [179]:
def ExtractTest(numerical_columns, categorical_columns, dummies_columns, data, imput_numericals, standardizer):
    """Apply the already-fitted training preprocessing to a new dataset.

    Parameters
    ----------
    numerical_columns : list of labels
        Numerical feature columns of ``data``.
    categorical_columns : list of labels
        Categorical feature columns of ``data``.
    dummies_columns : list-like of labels
        Dummy (one-hot) column labels produced on the training data; the
        encoded test columns are aligned to exactly this set and order.
    data : pandas.DataFrame
        Raw features to transform.
    imput_numericals : fitted imputer
        Object whose ``transform`` fills the numerical columns.
    standardizer : fitted scaler
        Object whose ``transform`` scales the merged feature matrix.

    Returns
    -------
    pandas.DataFrame
        Transformed features, indexed like ``data``, with the numerical
        columns followed by ``dummies_columns``.
    """
    # Numerical features: reuse the fitted imputer (nothing is re-fitted here).
    numerical_data = pd.DataFrame(imput_numericals.transform(data[numerical_columns]))
    numerical_data.columns = numerical_columns
    numerical_data.index = data.index

    # Categorical features: same 'KOSONG' placeholder as in training, then one-hot.
    categorical_data = data[categorical_columns].fillna(value="KOSONG")
    categorical_dummies = pd.get_dummies(categorical_data)

    # reindex returns a new frame, so its result must be kept. This drops
    # categories unseen in training and adds training categories absent from
    # this data; fill_value=0 makes those added dummies zero instead of NaN.
    categorical_dummies = categorical_dummies.reindex(
        index=data.index, columns=dummies_columns, fill_value=0
    )

    x_test = pd.concat([numerical_data, categorical_dummies], axis=1)
    x_test_transform = pd.DataFrame(standardizer.transform(x_test))
    x_test_transform.columns = x_test.columns
    # Keep the original row labels, as Standardize does for the training data.
    x_test_transform.index = x_test.index

    return x_test_transform

In [183]:
x_test_clean = ExtractTest(numerical_columns=numerical_columns, 
                                categorical_columns=categorical_columns, 
                                dummies_columns = dummies_columns, data=x_test,
                                imput_numericals=imput, standardizer= normalize)

In [184]:
x_test_clean.shape

(7500, 64)

In [185]:
def testPrediction(x_test, y_test, classifier, compute_score):
    if compute_score == True:
        score = classifier.score(x_test, y_test)
        print ("Accuracy: ", score)
        
    test_predict = classifier.predict(x_test)
    
    return  test_predict, score

In [186]:
logreg_test_predict, logreg_score = testPrediction(x_test_clean, y_test, logreg, compute_score=True)

Accuracy:  0.818933333333


In [187]:
knn_test_predict, knn_score = testPrediction(x_test_clean, y_test, knn, compute_score=True)

Accuracy:  0.7928


In [188]:
rf_test_predict, rf_score = testPrediction(x_test_clean, y_test, rf, compute_score=True)

Accuracy:  0.804133333333


In [194]:
rf1_test_predict, rf1_score = testPrediction(x_test_clean, y_test, random_forest_1, compute_score=True)

Accuracy:  0.815733333333
