In [None]:
#Imports

from sklearn import tree
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the cleaned training data from the CSV file

train_data = pd.read_csv(filepath_or_buffer="dataset/train_data_cleaned.csv")
# Bare last expression so the notebook renders the loaded frame
train_data

In [None]:
#Drop the default column

# Feature frame: every column of train_data except the label column.
train_data_df = train_data.drop("credit_card_default", axis=1)

# Take the names from the feature frame, not from train_data: train_data still
# contains "credit_card_default", which would give one name too many and shift
# the name/importance pairing wherever these names are zipped with
# feature_importances_.
feature_names = train_data_df.columns
train_data_df

In [None]:
# Prediction target: the credit card default label

# Display labels for the two classes.
# NOTE(review): assumes the column encodes 0 = no default, 1 = default - confirm.
target_names = ["no default", "default"]
target = train_data["credit_card_default"]



In [None]:
# Split features and target into train and test partitions

from sklearn.model_selection import train_test_split

# No test_size is passed, so scikit-learn's default split ratio applies;
# the fixed random_state keeps the partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_data_df,
    target,
    random_state=42,
)

In [None]:
# Build the feature scaler

from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training partition only.
X_scaler = StandardScaler()
X_scaler = X_scaler.fit(X_train)


In [None]:
# Apply the fitted scaler to both partitions

X_train_scaled, X_test_scaled = (
    X_scaler.transform(X_train),
    X_scaler.transform(X_test),
)


<b>Decision Tree</b>
    

In [None]:
#Decision tree classifier

# Seed the estimator so the fitted tree, its score and the saved model are the
# same on every run; 42 is the same seed the train/test split uses.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train_scaled, y_train)
# Mean accuracy on the held-out partition (shown as the cell output).
clf.score(X_test_scaled, y_test)

In [None]:
#Create feature importances - decision tree

# Pair every importance with the column the model was fitted on.
# X_train.columns is used (as in the plotting cell below) because it holds
# exactly the fitted feature columns, whereas feature_names is built from the
# frame that still includes the target column and can be off by one.
# Sorted descending so the most important feature comes first.
sorted(zip(clf.feature_importances_, X_train.columns), reverse=True)

In [None]:
# Bar chart of the five largest decision tree feature importances

feat_importances = pd.Series(clf.feature_importances_, index=X_train.columns)
top_features = feat_importances.nlargest(5)
# One colour per bar, in the order the bars are drawn
ax = top_features.plot.bar(color=['purple', 'green', 'blue', 'orange', 'red'])
ax.set_title("Top 5 Feature Importances Decision Tree")
ax.set_xlabel("Features")
ax.set_ylabel("% of Importance")


In [None]:
# Classification report - decision tree
from sklearn.metrics import classification_report

# Class predictions of the decision tree for the scaled test features
predictions = clf.predict(X_test_scaled)
dt_report = classification_report(y_test, predictions, target_names=target_names)
print(dt_report)

In [None]:
# Confusion matrix heatmap - decision tree

import seaborn as sns
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted labels
dt_confusion = confusion_matrix(y_test, predictions)
# fmt="d" prints the cell counts as integers
sns.heatmap(dt_confusion, annot=True, fmt="d", cmap="Purples")

In [None]:
#Save the model - decision tree
import os

import joblib

filename= 'saved_models/decision_tree_trained.joblib'
# joblib.dump does not create missing parent folders, so make sure the output
# directory exists; exist_ok=True makes re-running this cell harmless.
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(clf, filename)

In [None]:
# Reload the saved decision tree and re-check its test accuracy
loaded_model = joblib.load(filename)
dt_test_accuracy = loaded_model.score(X_test_scaled, y_test)
print('Test Acc: %.3f' % dt_test_accuracy)

<b>Random Forest</b>


In [None]:
# Feature and class labels reused by the random forest cells and the tree graph

target_names1 = ["no default", "default"]
feature_names1 = train_data_df.columns

# Target series, re-derived from the full frame
target = train_data["credit_card_default"]


In [None]:
#Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
# Seed the forest so its fitted trees, the score below and the saved model are
# the same on every run; 42 is the same seed the train/test split uses.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train_scaled, y_train)
# Mean accuracy on the held-out partition (shown as the cell output).
rf.score(X_test_scaled, y_test)

In [None]:
# Create feature importances - random forest

# (importance, feature name) pairs, ordered from most to least important
rf_ranked = list(zip(rf.feature_importances_, feature_names1))
rf_ranked.sort(reverse=True)
rf_ranked

In [None]:
# Bar chart of the five largest random forest feature importances

feat_importances = pd.Series(rf.feature_importances_, index=X_train.columns)
top_features = feat_importances.nlargest(5)
# One colour per bar, in the order the bars are drawn
ax = top_features.plot.bar(color=['purple', 'green', 'blue', 'orange', 'red'])
ax.set_title("Top 5 Feature Importances Random Forest")
ax.set_xlabel("Features")
ax.set_ylabel("% of Importance")




In [None]:
# Classification report - random forest

from sklearn.metrics import classification_report

# Class predictions of the random forest for the scaled test features
predictions1 = rf.predict(X_test_scaled)
rf_report = classification_report(y_test, predictions1, target_names=target_names)
print(rf_report)

In [None]:
#Confusion matrix - random forest

import seaborn as sns
from sklearn.metrics import confusion_matrix
# Use the random forest predictions (predictions1). `predictions` holds the
# decision tree's output and would redraw the decision tree matrix here.
sns.heatmap(confusion_matrix(y_test, predictions1), cmap="Blues",
        annot = True, fmt="d")

In [None]:
#Save the model - random forest

import os

import joblib

filename1= 'saved_models/random_forest_trained.joblib'
# joblib.dump does not create missing parent folders, so make sure the output
# directory exists; exist_ok=True makes re-running this cell harmless.
os.makedirs(os.path.dirname(filename1), exist_ok=True)
joblib.dump(rf, filename1)

In [None]:
# Reload the saved random forest and re-check its test accuracy

loaded_model = joblib.load(filename1)
rf_test_accuracy = loaded_model.score(X_test_scaled, y_test)
print('Test Acc: %.3f' % rf_test_accuracy)

<b>Decision Tree Graph</b>

In [None]:
# Render the fitted decision tree as a graph

import graphviz
import pydotplus

# With out_file=None the DOT description is returned as a string
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=feature_names1,
    class_names=target_names1,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Write the tree to a PNG file through pydotplus
pydotplus.graph_from_dot_data(dot_data).write_png('credit_decision_tree.png')

# Wrap the same DOT text in a graphviz Source so the notebook shows it inline
graph = graphviz.Source(dot_data)
graph