In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

#machine learning packages
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import roc_auc_score

#models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#export object
import joblib

# Import data 

In [2]:
df = pd.read_csv("cleaned_data.csv")

# balancing the data 

In [5]:
default_class = {}

for y in set(list(df["default payment next month"])):
    default_class[y] = df[df["default payment next month"] == y]

In [10]:
#resample from the defaulters to the amount of non-defaulters
expanded_defaulter = default_class[1].sample(n=len(default_class[0]),replace=True)

In [9]:
df = pd.concat([default_class[0], expanded_defaulter],axis="index")
df["default payment next month"].value_counts(normalize=True)

0    0.5
1    0.5
Name: default payment next month, dtype: float64

# one-hot encoding

In [11]:
#categorical variables
catorgical_variables = ["SEX", "MARRIAGE"]

for col in catorgical_variables:
    dummy = pd.get_dummies(df[col])
    df.drop(col,axis="columns", inplace=True)
    df = pd.concat([df,dummy], axis="columns")

# train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(df.drop("default payment next month", axis="columns"), df["default payment next month"], 
                                                    test_size = 0.2, random_state = 42)

# feature scaling

In [17]:
sc = StandardScaler()
numeric_columns = X_train.columns[0:3]

In [18]:
X_train[numeric_columns] = sc.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = sc.transform(X_test[numeric_columns])

In [19]:
joblib.dump(sc, "feature_scaler.pkl")

['feature_scaler.pkl']

In [20]:
compiled_y_test = pd.DataFrame(y_test.copy())

# model training

## logistic model (69.7%)

In [22]:
logistic = LogisticRegression(random_state=42)
logistic.fit(X_train, y_train)

LogisticRegression(random_state=42)

In [24]:
y_pred = pd.Series(logistic.predict(X_test), name="logisctic Prediction")
compiled_y_test = pd.concat([compiled_y_test.reset_index(drop=True), y_pred], axis = "columns")

In [25]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[3565 1138]
 [1698 2945]]


0.6965546757971325

In [27]:
#building the coefficient table to understand the impact of each variable and also to ensure no single variable is driving 
#the predictive result
intercept_table = pd.DataFrame(pd.Series(logistic.intercept_, name="Coefs"))
intercept_table[0] = "intercept"
intercept_table

coef_table = pd.DataFrame(list(X_train.columns)).copy()
coef_table.insert(len(coef_table.columns), "Coefs", logistic.coef_.transpose())

coef_table = pd.concat([intercept_table,coef_table], axis = 0)
coef_table = coef_table[[0,"Coefs"]]
coef_table.columns = ["Variables", "Coefficient"]

coef_table

Unnamed: 0,Variables,Coefficient
0,intercept,0.004953
0,EDUCATION,-0.023155
1,AGE,0.037494
2,max_backlog_month,0.869153
3,balance_to_credit,0.028645
4,female,-0.068449
5,male,0.073319
6,married,0.105657
7,others,-0.025613
8,single,-0.075174


## Decision tree (87.1%)

In [29]:
Tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 42)
Tree.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=42)

In [30]:
y_pred = pd.Series(Tree.predict(X_test), name="Tree Prediction")
compiled_y_test = pd.concat([compiled_y_test.reset_index(drop=True), y_pred], axis = "columns")

In [31]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[3721  982]
 [ 222 4421]]


0.8711748341536486

## Random Forest (88.3%)

In [33]:
forest = RandomForestClassifier(n_estimators = 201, criterion = 'entropy', random_state = 0)
forest.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=201, random_state=0)

In [34]:
y_pred = pd.Series(forest.predict(X_test), name="Forest Prediction")
compiled_y_test = pd.concat([compiled_y_test.reset_index(drop=True), y_pred], axis = "columns")

In [35]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[3802  901]
 [ 192 4451]]


0.8830515728653969

In [36]:
#export this model as it is chosen
joblib.dump(forest, "random_forest_predictor.pkl")

['random_forest_predictor.pkl']

# Emsembling previous 3 models (96.0%)

In [38]:
#combine all model and let them vote
ensembled_pred = compiled_y_test.drop("default payment next month",axis="columns")
compiled_y_test["ensembled prediction"] = ensembled_pred.mode(axis=1)
compiled_y_test

Unnamed: 0,default payment next month,logisctic Prediction,Tree Prediction,Forest Prediction,ensembled prediction
0,0,1,0,0,0
1,1,1,1,1,1
2,0,1,1,1,1
3,1,1,1,1,1
4,0,0,0,0,0
...,...,...,...,...,...
9341,0,0,0,0,0
9342,1,0,1,1,1
9343,0,0,0,0,0
9344,0,0,0,0,0


In [40]:
#accuracy is as good as random forest
cm = confusion_matrix(compiled_y_test["default payment next month"], compiled_y_test["ensembled prediction"] )
print(cm)
accuracy_score(compiled_y_test["ensembled prediction"] , y_pred)

[[3846  857]
 [ 188 4455]]


0.9595548897924245