In [4]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

#machine learning packages
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import roc_auc_score

#models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#export object
import joblib

# import cleaned dataset

In [5]:
df = pd.read_csv("cleaned_borrower_profile.csv")

In [6]:
df.dtypes

gender               object
maritalstatus        object
numofdependence       int64
education             int64
jobpos               object
age                   int64
household_income    float64
work_experience     float64
years_home_owned      int64
status                int64
dtype: object

In [7]:
df["status"].value_counts(normalize=True)

0    0.844345
1    0.155655
Name: status, dtype: float64

# balancing the data. Make the 2 class 50:50

In [8]:
default_class = {}

for y in set(list(df["status"])):
    default_class[y] = df[df["status"] == y]

In [9]:
expanded_defaulter = default_class[1].sample(n=len(default_class[0]),replace=True)

In [10]:
df = pd.concat([default_class[0], expanded_defaulter],axis="index")
df["status"].value_counts(normalize=True)

0    0.5
1    0.5
Name: status, dtype: float64

# one-hot encoding dependent variable

In [11]:
df.columns

Index(['gender', 'maritalstatus', 'numofdependence', 'education', 'jobpos',
       'age', 'household_income', 'work_experience', 'years_home_owned',
       'status'],
      dtype='object')

In [12]:
cat_varibles = ['gender', 'maritalstatus', "jobpos"]

In [13]:
for col in cat_varibles:
    dummy = pd.get_dummies(df[col])
    df.drop(col,axis="columns", inplace=True)
    df = pd.concat([df,dummy], axis="columns")

# train test split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df.drop("status", axis="columns"), df["status"], 
                                                    test_size = 0.2, random_state = 42)

# feature scaling

In [15]:
sc = StandardScaler()
numeric_columns = X_train.columns[0:6]


In [16]:
X_train[numeric_columns] = sc.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = sc.transform(X_test[numeric_columns])

In [17]:
joblib.dump(sc, "feature_scaler.pkl")

['feature_scaler.pkl']

In [18]:
compiled_y_test = pd.DataFrame(y_test.copy())

# Models Training

## Logistic regression (56.8% AUC)

In [19]:
logistic = LogisticRegression(random_state=42)
logistic.fit(X_train, y_train)

LogisticRegression(random_state=42)

In [20]:
y_pred = pd.Series(logistic.predict(X_test), name="logisctic Prediction")
compiled_y_test = pd.concat([compiled_y_test.reset_index(drop=True), y_pred], axis = "columns")

In [21]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[17204 16500]
 [12995 20645]]


0.5620248277500594

In [22]:
roc_auc_score(y_test, y_pred)

0.5620738940649419

In [23]:
#building the coefficient table to understand the impact of each variable and also to ensure no single variable is driving 
#the predictive result
intercept_table = pd.DataFrame(pd.Series(logistic.intercept_, name="Coefs"))
intercept_table[0] = "intercept"
intercept_table

coef_table = pd.DataFrame(list(X_train.columns)).copy()
coef_table.insert(len(coef_table.columns), "Coefs", logistic.coef_.transpose())

coef_table = pd.concat([intercept_table,coef_table], axis = 0)
coef_table = coef_table[[0,"Coefs"]]
coef_table.columns = ["Variables", "Coefficient"]

coef_table

Unnamed: 0,Variables,Coefficient
0,intercept,-0.111358
0,numofdependence,-0.031053
1,education,-0.147965
2,age,-0.091843
3,household_income,-0.092427
4,work_experience,0.021964
5,years_home_owned,0.035025
6,Female,-0.096832
7,Male,-0.017883
8,DIVORCE,0.20812


## Kernal Support Vector

In [24]:
# #too slow
# kernal = SVC(kernel = 'rbf', random_state = 42)
# kernal.fit(X_train, y_train)

In [25]:
# y_pred = pd.Series(Tree.predict(X_test), name="SVC Prediction")

In [26]:
# cm = confusion_matrix(y_test, y_pred)
# print(cm)
# accuracy_score(y_test, y_pred)

## Naive bayes model (51.8% AUC)

In [27]:
NB = GaussianNB()
NB.fit(X_train, y_train)

GaussianNB()

In [28]:
y_pred = pd.Series(NB.predict(X_test), name="NB Prediction")
# compiled_y_test = pd.concat([compiled_y_test.reset_index(drop=True), y_pred], axis = "columns")

In [29]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[ 5692 28012]
 [ 3561 30079]]


0.5311683297695414

In [30]:
roc_auc_score(y_test, y_pred)

0.5315129540720035

## Decision Tree (90% AUC)

In [31]:
Tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 42)
Tree.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=42)

In [32]:
y_pred = pd.Series(Tree.predict(X_test), name="Tree Prediction")
compiled_y_test = pd.concat([compiled_y_test.reset_index(drop=True), y_pred], axis = "columns")

In [33]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[27408  6296]
 [  529 33111]]


0.8986546685673557

In [34]:
roc_auc_score(y_test, y_pred)

0.8987359598129678

# Random Forest (94% AUC)

In [35]:
forest = RandomForestClassifier(n_estimators = 201, criterion = 'entropy', random_state = 0)
forest.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=201, random_state=0)

In [36]:
y_pred = pd.Series(forest.predict(X_test), name="Forest Prediction")
compiled_y_test = pd.concat([compiled_y_test.reset_index(drop=True), y_pred], axis = "columns")

In [37]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[30494  3210]
 [  547 33093]]


0.944211808030411

In [38]:
roc_auc_score(y_test, y_pred)

0.9442493373802225

In [39]:
#export this model as it is chosen
joblib.dump(forest, "random_forest_predictor.pkl")

['random_forest_predictor.pkl']

# Ensemble method of all trained models 
Conclusion
* performed 2% better than the best trained model: randomforest.
* However, there are a lot more false postive
* Coder's call to use random_forest as the result reduces the chance of rejecting profitable clients
* proceed to hypertune random forest

In [40]:
#combine all model and let them vote
ensembled_pred = compiled_y_test.drop('status',axis="columns")
compiled_y_test["ensembled prediction"] = ensembled_pred.mode(axis=1)
compiled_y_test

Unnamed: 0,status,logisctic Prediction,Tree Prediction,Forest Prediction,ensembled prediction
0,0,1,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,1,1,1,1,1
4,0,1,0,1,1
...,...,...,...,...,...
67339,0,1,0,0,0
67340,0,0,0,0,0
67341,0,1,1,0,1
67342,1,1,1,1,1


In [42]:
#accuracy is as good as random forest
cm = confusion_matrix(compiled_y_test["status"], compiled_y_test["ensembled prediction"] )
print(cm)
accuracy_score(compiled_y_test["ensembled prediction"] , y_pred)

[[28539  5165]
 [  525 33115]]


0.9611992159657876

# hypertuning random forest (201 trees is good enough) 

In [133]:
trees = [201,401,701]

for ntree in trees:
    print(ntree)
    forest = RandomForestClassifier(n_estimators = ntree, criterion = 'entropy', random_state = 0)
    forest.fit(X_train, y_train)

    y_pred = pd.Series(forest.predict(X_test), name="Forest Prediction")
    compiled_y_test = pd.concat([compiled_y_test.reset_index(drop=True), y_pred], axis = "columns")

    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print(accuracy_score(y_test, y_pred))

201
[[30537  3167]
 [  573 33067]]
0.944464243288192
401
[[30568  3136]
 [  571 33069]]
0.9449542646709432
701
[[30570  3134]
 [  574 33066]]
0.9449394155381325


In [159]:
X_test.to_csv("sample_format.csv", index=False)

# check if this part appears new