In [1]:
%load_ext lab_black

## Load Packages and Data

In [2]:
import pandas as pd

from sklearn.model_selection import cross_validate

from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

from joblib import dump

In [3]:
def cv_report(model, X, y):

    cv_dict = cross_validate(model, X, y, scoring=["accuracy", "precision", "recall"])

    cv_accuracy = cv_dict["test_accuracy"].mean()
    cv_precision = cv_dict["test_precision"].mean()
    cv_recall = cv_dict["test_recall"].mean()

    print(f"cv_accuracy: {cv_accuracy}")
    print(f"cv_precision: {cv_precision}")
    print(f"cv_recall: {cv_recall}")

In [4]:
df = pd.read_csv("clean_default.csv")

In [5]:
df

Unnamed: 0,default,student,balance,income
0,False,False,729.526495,44361.625074
1,False,True,817.180407,12106.134700
2,False,False,1073.549164,31767.138947
3,False,False,529.250605,35704.493935
4,False,False,785.655883,38463.495879
...,...,...,...,...
9995,False,False,711.555020,52992.378914
9996,False,False,757.962918,19660.721768
9997,False,False,845.411989,58636.156984
9998,False,False,1569.009053,36669.112365


## EDA

In [6]:
df["default"].value_counts()

False    9667
True      333
Name: default, dtype: int64

In [7]:
9667 / (9667 + 333)

0.9667

In [8]:
df.dtypes

default       bool
student       bool
balance    float64
income     float64
dtype: object

## Preprocess

In [9]:
df.columns

Index(['default', 'student', 'balance', 'income'], dtype='object')

In [10]:
X = df[["student", "balance", "income"]].values

In [11]:
y = df["default"].values

## Dummy Classifier

In [12]:
dc = DummyClassifier(strategy="most_frequent")

In [13]:
cv_report(dc, X, y)

cv_accuracy: 0.9667
cv_precision: 0.0
cv_recall: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


## KNN

In [14]:
knn = KNeighborsClassifier()

In [15]:
cv_report(knn, X, y)

cv_accuracy: 0.9672000000000001
cv_precision: 0.5270515754268613
cv_recall: 0.15002261420171867


## Decision Tree

In [16]:
dt = DecisionTreeClassifier()

In [17]:
cv_report(dt, X, y)

cv_accuracy: 0.9545999999999999
cv_precision: 0.32750659338258475
cv_recall: 0.34825870646766166


## Random Forest

In [18]:
rf = RandomForestClassifier()

In [19]:
cv_report(rf, X, y)

cv_accuracy: 0.9691000000000001
cv_precision: 0.5677848655409632
cv_recall: 0.3360922659430122


## Naive Bayes

In [20]:
nb = GaussianNB()

In [21]:
cv_report(nb, X, y)

cv_accuracy: 0.9705999999999999
cv_precision: 0.63525
cv_recall: 0.26105834464043415


## Logistic Regression

In [22]:
lr = LogisticRegression()

In [23]:
cv_report(lr, X, y)

cv_accuracy: 0.9688000000000001
cv_precision: 0.6082707509881423
cv_recall: 0.21338760741745819


## Train the Final Model

In [24]:
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [25]:
dump(rf, "clf.joblib")

['clf.joblib']