In [1]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

### Load in the Data
___

In [10]:
# Load in data: df
df = pd.read_csv('https://s3.amazonaws.com/clcarverloans/data/df_cleaned.csv')

### Clean the Data
___

In [12]:
# Remove usesless rows: df_select
cols = ['Borrower_Genders', 'Country Code', 'Loan Amount', 'Lender_Term',
       'Repayment_Interval', 'Distribution_Model', 'Sector', 'Activity','Delinquent']
df_select = train_set[cols]

# One_hot_encode categorical columns: df_dummies
df_dummies = pd.get_dummies(df_select)

# Select features and target: X, y
y = df_dummies.pop('Delinquent')
X = df_dummies

In [13]:
# Split data into train and test set: train_set, test_set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Test Simple Model

In [33]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Build function to fit and evaluate model: eval_model
def eval_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print ('Precision Score: ', precision_score(y_test, y_pred))
    print ('Recall Score: ', recall_score(y_test, y_pred))
    print ('F1 Score: ', f1_score(y_test, y_pred))
    print('Confusion Matrix: ', confusion_matrix(y_test, y_pred))

In [34]:
# Evaluate simple Decision Tree: dt
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=5)
eval_model(dt,  X_train, X_test, y_train, y_test)

Accuracy:  0.9834632328024399
Precision Score:  0.7680180180180181
Recall Score:  0.7074688796680498
F1 Score:  0.736501079913607
Confusion Matrix:  [[14170   103]
 [  141   341]]


In [35]:
feature_importance = sorted(list(zip(dt.feature_importances_, X_train.columns)), reverse=True)
feature_importance[0:10]

[(0.20238770356869887, 'Country Code_TG'),
 (0.1391011667832302, 'Borrower_Genders_group'),
 (0.12355291357294462, 'Loan Amount'),
 (0.11979989488254253, 'Lender_Term'),
 (0.05174771345812416, 'Country Code_LR'),
 (0.05081250865042287, 'Country Code_KH'),
 (0.0362987011365347, 'Country Code_TZ'),
 (0.03153361930472385, 'Country Code_AF'),
 (0.017000262462345185, 'Country Code_VN'),
 (0.014911591001821642, 'Country Code_PE')]

### Test Other Models

In [42]:
dt_2 = DecisionTreeClassifier(class_weight={0:1, 1:24}, random_state=42)
eval_model(dt_2,  X_train, X_test, y_train, y_test)

Accuracy:  0.9806167400881057
Precision Score:  0.6695501730103807
Recall Score:  0.8029045643153527
F1 Score:  0.7301886792452832
Confusion Matrix:  [[14082   191]
 [   95   387]]


In [43]:
from sklearn.linear_model.logistic import LogisticRegression
logreg = LogisticRegression(class_weight={0:1, 1:24}, random_state=42)
eval_model(logreg, X_train, X_test, y_train, y_test)

Accuracy:  0.9527617756692647
Precision Score:  0.40611353711790393
Recall Score:  0.9647302904564315
F1 Score:  0.5716041794714197
Confusion Matrix:  [[13593   680]
 [   17   465]]


In [45]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(class_weight={0:1, 1:24}, random_state=42)
eval_model(rf, X_train, X_test, y_train, y_test)

Accuracy:  0.983598780074551
Precision Score:  0.7586206896551724
Recall Score:  0.7302904564315352
F1 Score:  0.7441860465116279
Confusion Matrix:  [[14161   112]
 [  130   352]]


In [49]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(weights='distance')
eval_model(knn, X_train, X_test, y_train, y_test)

Accuracy:  0.9788546255506608
Precision Score:  0.785234899328859
Recall Score:  0.4854771784232365
F1 Score:  0.6
Confusion Matrix:  [[14209    64]
 [  248   234]]
