In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read dataset files.
# The separate test CSV is not loaded in this notebook; the line is kept for reference:
# test_df = pd.read_csv('dataset/test.csv')
train_df_cleaned = pd.read_csv('dataset/train_data_cleaned.csv')
# Per the original author's note: the data cleanup and the get_dummies encoding
# were performed in credit-default-ml.ipynb, so this CSV is expected to be the
# output of that notebook.

## Training Data

In [None]:
# Bare expression: the notebook renders the loaded training frame as cell output.
train_df_cleaned

-------------

In [None]:
#Ricardo Start

In [None]:
# Labels are the default flag; features are every remaining column.
y = train_df_cleaned["credit_card_default"]
X = train_df_cleaned.drop(columns=["credit_card_default"])
# Display names for the two classes, used when reporting metrics
# (presumably 0 = no-default and 1 = default - confirm against the data).
target_names = ["no-default", "default"]
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for testing (scikit-learn's default split size).
# The fixed random_state keeps the partition identical from run to run.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training partition only, then apply
# that same fitted scaler to both partitions.
X_scaler = StandardScaler()
X_scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)


In [None]:
from sklearn.linear_model import LogisticRegression
# Unfitted logistic-regression estimator using scikit-learn's default settings.
classifier = LogisticRegression()
# Bare expression so the notebook shows the estimator's repr as cell output.
classifier

In [None]:
# Train the classifier on the scaled training partition.
# fit() returns the estimator itself, so lgR refers to the same fitted object
# as `classifier`.
classifier.fit(X_train_scaled, y_train)
lgR = classifier


In [None]:
# Compare the classifier's score() (mean accuracy) on the partition it was
# fitted on against the held-out partition; a large gap hints at overfitting.
print(f"Training Data Score: {lgR.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {lgR.score(X_test_scaled, y_test)}")

In [None]:
# Predict the held-out partition and eyeball the first 35 predictions against
# the corresponding true labels.
predictions = lgR.predict(X_test_scaled)
print(f"First 35 Predictions:   {predictions[:35]}")
# y_test still carries the shuffled integer row labels of the original frame,
# so use .iloc to make it explicit that we want the first 35 rows by position
# (matching the positional slice of `predictions` above), not by label.
print(f"First 35 Actual labels: {y_test.iloc[:35].tolist()}")

In [None]:
# Side-by-side table of predicted vs. true labels on a fresh 0..n-1 index.
# Passing the labels as a plain array drops y_test's shuffled index up front.
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test.to_numpy()})

In [None]:
# Bare expression: render the prediction/actual comparison table as cell output.
results

In [None]:
#Save the model
import os

import joblib

filename = 'saved_models/lgR_trained.joblib'
# joblib.dump does not create missing parent directories; make sure the target
# folder exists so this cell also works on a fresh checkout.
os.makedirs(os.path.dirname(filename), exist_ok=True)
# NOTE: only the classifier is persisted here. It was fitted on scaled inputs,
# so anything that reloads it must feed it data scaled the same way (X_scaler).
joblib.dump(lgR, filename)

In [None]:
#Load the model
# Round-trip check: reload the file written by the previous cell and confirm it
# scores on the held-out partition.
# NOTE: joblib files are pickle-based - only load files from a trusted source.
loaded_model = joblib.load(filename)
print('Test Acc: %.3f' % loaded_model.score(X_test_scaled, y_test))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the logistic-regression model on the
# held-out partition, labelled with the human-readable class names.
predictions = lgR.predict(X_test_scaled)
report = classification_report(y_test, predictions, target_names=target_names)
print(report)

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

# confusion_matrix puts the true classes on the rows and the predicted classes
# on the columns. Label both axes and the classes so the heatmap can be read
# without knowing that convention.
conf_matrix = confusion_matrix(y_test, predictions)
heatmap_ax = sns.heatmap(
    conf_matrix,
    cmap="BuPu",
    annot=True,
    fmt="d",
    # Same class-name ordering that the classification report uses.
    xticklabels=target_names,
    yticklabels=target_names,
)
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("Actual")
heatmap_ax.set_title("Logistic Regression - Confusion Matrix")
plt.show()

In [None]:
# One-column frames so the class distribution of the predictions and of the
# true labels can each be tallied.
predictions_df = pd.DataFrame(predictions, columns=["Prediction"])
actual_df = y_test.to_frame(name="Actual")

In [None]:
# How many test rows the model assigned to each class.
predictions_df.value_counts()


In [None]:
# How many test rows truly belong to each class (compare with the predicted tally).
actual_df.value_counts()

In [None]:

#Ricardo End

-------------

In [None]:
#Kelly Start

In [None]:
# Load the persisted decision-tree model and score it on the scaled held-out
# partition. Rebinds `filename`, which previously pointed at the logistic model.
# NOTE(review): this file is not written anywhere in this notebook; presumably
# the model was trained elsewhere on the same split and the same scaling -
# confirm, otherwise this score is not comparable (or is leaked).
# NOTE: joblib files are pickle-based - only load files from a trusted source.
filename = 'saved_models/decision_tree_trained.joblib'
decision_tree_model = joblib.load(filename)
print('Test Acc: %.3f' % decision_tree_model.score(X_test_scaled, y_test))

In [None]:
# Load the persisted random-forest model and score it on the scaled held-out
# partition.
# NOTE(review): the model file is produced outside this notebook; presumably it
# was trained on the same split and scaling - confirm.
# NOTE: joblib files are pickle-based - only load files from a trusted source.
filename = 'saved_models/random_forest_trained.joblib'
random_forest_model = joblib.load(filename)
print('Test Acc: %.3f' % random_forest_model.score(X_test_scaled, y_test))

In [None]:
#Kelly End

-------------

In [None]:
#Timmy Start

In [None]:
# Load the persisted KNN model and score it on the scaled held-out partition.
# NOTE(review): the model file is produced outside this notebook; presumably it
# was trained on the same split and scaling - confirm.
# NOTE: joblib files are pickle-based - only load files from a trusted source.
filename = 'saved_models/KNN_trained.joblib'
KNN_model = joblib.load(filename)
print('Test Acc: %.3f' % KNN_model.score(X_test_scaled, y_test))

In [None]:
#Timmy End

-------------

In [None]:
#Jumaan Start

In [None]:
# Load the persisted SVM model and score it on the scaled held-out partition.
# NOTE: this rebinds `loaded_model`, which earlier held the reloaded
# logistic-regression model; after this cell it refers to the SVM.
# NOTE(review): the model file is produced outside this notebook; presumably it
# was trained on the same split and scaling - confirm.
# NOTE: joblib files are pickle-based - only load files from a trusted source.
filename = 'saved_models/SVM_trained.joblib'
loaded_model = joblib.load(filename)
print('Test Acc: %.3f' % loaded_model.score(X_test_scaled, y_test))

In [None]:
#Jumaan End

-------------

In [None]:
#Feipeng Start

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encoding
# Convert the integer class labels of both partitions into one-hot matrices,
# the label format passed to the neural network's evaluate() call below.
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)
# Bare expression: show the encoded training labels as cell output.
y_train_categorical

In [None]:
# Load the model
# Restores a previously saved Keras model from its HDF5 file.
# NOTE(review): the file is produced outside this notebook; presumably it was
# trained on the same split and scaling as used here - confirm.
from tensorflow.keras.models import load_model

deep_trained = load_model("saved_models/neural_network_deep_trained.h5")

In [None]:
# Evaluate the loaded network on the scaled held-out partition with one-hot labels.
# NOTE(review): unpacking exactly two values assumes the model was compiled
# with a single metric (accuracy) in addition to the loss - confirm.
deep_model_loss, deep_model_accuracy = deep_trained.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Deep Learning Neural Network - Loss: {deep_model_loss}, Accuracy: {deep_model_accuracy}")

In [None]:
# Disabled: Keras models expose evaluate() (used above), not a scikit-learn style score() method.
# print('Test Acc: %.3f' % deep_trained.score(X_test_scaled, y_test_categorical))

In [None]:
#Feipeng end

## Analysis of the Models

In [None]:
import csv
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import sys
import numpy


In [None]:
# load dataset
# Path of the CSV that is read for the cross-validation comparison.
# NOTE(review): this is train_data_cleaned1.csv, whereas the column names are
# read from train_data_cleaned.csv in the next cell; presumably the "1" file is
# a header-less copy of the same data - confirm.
location = 'dataset/train_data_cleaned1.csv'

In [None]:
# Grab only the first CSV row (the header) to use as the list of column names.
with open("dataset/train_data_cleaned.csv") as csvFile:
    names_all = next(csv.reader(csvFile))

In [None]:
# Read the file at `location`, naming its columns with the header captured
# earlier, and work on the raw NumPy values from here on.
dataframe = pandas.read_csv(location, names=names_all, low_memory=False)
array = dataframe.values
# Positional split: columns 0-31 become the features, column 32 the target.
# NOTE(review): assumes the target is the column at index 32 - confirm against
# names_all. This also rebinds X, replacing the feature frame defined earlier.
X, Y = array[:, :32], array[:, 32]

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature using statistics computed over the full data set.
# NOTE: this rebinds X_scaler, replacing the train-only scaler fitted earlier
# in the notebook.
X_scaler = StandardScaler()
X_scaled = X_scaler.fit_transform(X)

In [None]:
# prepare configuration for cross validation test harness
# NOTE(review): the KFold splitter further down hard-codes random_state=1
# rather than reading this value - confirm which seed is intended.
seed = 7

In [None]:
# prepare models
models = []
models.append(('LR', LogisticRegression()))
models.append(('DTC', DecisionTreeClassifier()))
models.append(('RFC', RandomForestClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('SVM', SVC()))


In [None]:
# evaluate each model in turn
# Accumulators filled by the cross-validation loop: one array of per-fold
# scores per model, and the matching model labels (same order).
# NOTE: `results` previously held the prediction/actual table; it is rebound here.
results = []
names = []
# Metric name handed to cross_val_score.
scoring = 'accuracy'

In [None]:
from sklearn.pipeline import make_pipeline

# Start from empty accumulators so that re-running this cell does not append a
# second copy of every model's scores (which would break the boxplot labels).
results = []
names = []

# The splitter does not depend on the model: build it once so every model is
# scored on exactly the same 10 shuffled folds.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)

for name, model in models:
    # Scale inside the pipeline so the scaler is re-fitted on the training part
    # of each fold only. Scaling the whole data set up front lets statistics of
    # the validation fold leak into training and inflates the scores.
    pipeline = make_pipeline(StandardScaler(), model)
    cv_results = model_selection.cross_val_score(
        pipeline, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report "<label>: <mean accuracy> (<std across folds>)".
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Box plot of the per-fold scores, one box per model, for a visual comparison.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results, widths=0.6, patch_artist=False)
# Boxes are drawn in the order the scores were collected, so the collected
# model labels line up with them.
ax.set_xticklabels(names)

# Enlarge the figure, write it to disk, then render it in the notebook.
fig.set_size_inches(18.5, 10.5)
fig.savefig('test2png.png', dpi=100)

plt.show()