### Load and Clean Data

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in newer
# matplotlib releases and the old names were later removed; pick whichever
# name this installation actually provides.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white')

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [2]:
# Create function to convert colors: rgb_color
def rgb_color(r,g,b):
    """Divide each of the three channel values by 255 and return them as a tuple.

    Parameters
    ----------
    r, g, b : numbers
        Red, green and blue channel values.

    Returns
    -------
    tuple
        ``(r / 255, g / 255, b / 255)`` as floats.
    """
    return tuple(channel / 255. for channel in (r, g, b))

# Create colors: tableau_blue, tableau_grey, tableau_light
tableau_blue, tableau_grey, tableau_light = (
    rgb_color(*channels)
    for channels in ((31, 119, 180), (127, 127, 127), (162, 162, 162))
)

In [15]:
# Load the indicators CSV from its S3 URL: df
DATA_URL = 'https://s3.amazonaws.com/clcarverloans/data/df_indicators.csv'
df = pd.read_csv(DATA_URL)

In [16]:
# Drop the 'Unnamed: *' columns (presumably stray index columns from earlier
# CSV exports -- confirm against the data source): df
# Reassigning with errors='ignore' instead of inplace=True keeps this cell
# idempotent: re-running it no longer raises a KeyError for columns that
# were already removed.
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

### EDA

In [7]:
# Correlation of every numeric/boolean column with the target: corr_matrix
# Text columns are filtered out explicitly because DataFrame.corr() in
# pandas >= 2.0 no longer drops them silently and raises instead; selecting
# the dtypes up front behaves the same on old and new pandas.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()
corr_matrix['Delinquent'].sort_values(ascending=False)

Delinquent                       1.000000
Lender_Term                      0.026250
Loan Amount                      0.013149
contribution_living              0.005025
contribution_health              0.004648
Id                               0.003649
mpi_headcount_percent            0.001736
poverty_national                 0.001598
mpi_index                        0.000844
Cluster                         -0.000838
poverty_international ($1.90)   -0.004419
mpi_intensity                   -0.005834
contribution_education          -0.011012
Name: Delinquent, dtype: float64

### Model Testing

In [17]:
# Remove columns that are not used as features: df
# Reassigned with errors='ignore' (rather than inplace=True) so the cell can
# be re-run without a KeyError once the columns are already gone.
df = df.drop(columns=['Id', 'Funded Date', 'Country Code', 'Use'], errors='ignore')

# One-hot encode categorical columns: df_dummies
df_dummies = pd.get_dummies(df)

# Select features and target: X, y
# pop() removes 'Delinquent' from df_dummies, so X holds only the features.
y = df_dummies.pop('Delinquent')
X = df_dummies

# Split data into train and test set: X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Create a list of models to test: model_list
# Class 1 is weighted 24x relative to class 0 in every estimator that
# accepts class_weight; the dict is shared so the ratio is set in one place.
CLASS_WEIGHT = {0: 1, 1: 24}
# Fixed seed (same value as the train/test split) so the stochastic
# estimators give the same scores on every Restart & Run All.
RANDOM_STATE = 42
model_list = [
    DecisionTreeClassifier(class_weight=CLASS_WEIGHT, random_state=RANDOM_STATE),
    KNeighborsClassifier(),
    LogisticRegression(class_weight=CLASS_WEIGHT, random_state=RANDOM_STATE),
    RandomForestClassifier(class_weight=CLASS_WEIGHT, random_state=RANDOM_STATE),
    GradientBoostingClassifier(random_state=RANDOM_STATE),
]

In [19]:
# Build function to score each model: model_score
def _panel_axes(sub_plot, i):
    """Return panel ``i`` of a ``plt.subplots()`` result, else fall back to ``plt.subplot(2, 3, i+1)``."""
    try:
        candidate = np.ravel(sub_plot[1])[i]
    except (TypeError, IndexError, KeyError):
        candidate = None
    # Only trust the candidate if it looks like an Axes; any other value for
    # sub_plot keeps the previous pyplot state-machine behaviour.
    if hasattr(candidate, 'plot') and hasattr(candidate, 'set_title'):
        return candidate
    return plt.subplot(2, 3, i + 1)


def model_score(model, X_train, X_test, y_train, y_test, sub_plot, i):
    """Fit ``model`` and draw its ROC curve in panel ``i`` of a 2x3 grid.

    The model is fitted on the training data, its ROC curve on the test data
    is plotted, and the false positive rate and recall of its hard
    predictions are written onto the panel.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for the ROC curve and metrics.
    sub_plot
        The ``(figure, axes)`` pair returned by ``plt.subplots``. When it
        cannot be interpreted that way, the panel is obtained with
        ``plt.subplot(2, 3, i + 1)`` instead.
    i : int
        Zero-based index of the panel to draw on.

    Returns
    -------
    None
    """
    model.fit(X_train, y_train)
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    y_pred = model.predict(X_test)

    # Row 0 of the confusion matrix holds the true-negative class:
    # [0][0] = true negatives, [0][1] = false positives.
    CM = confusion_matrix(y_test, y_pred)
    TN = CM[0][0]
    FP = CM[0][1]
    negatives = FP + TN
    # Guard the ratio so a test set without negatives yields NaN instead of
    # a divide-by-zero warning.
    false_positive_rate = FP / negatives if negatives else float('nan')

    ax = _panel_axes(sub_plot, i)
    ax.figure.subplots_adjust(hspace=0.5, wspace=0.3)
    ax.plot(fpr, tpr, color=tableau_blue, linewidth=2)
    ax.set_title(model.__class__.__name__, fontsize=8)
    ax.text(0.2, 0.5, ('FPR : {}'.format(round(false_positive_rate, 3))), fontsize=8)
    ax.text(0.2, 0.3, ("Recall : {}".format(round(recall_score(y_test, y_pred), 3))), fontsize=8)

In [None]:
# Draw one ROC panel per model on a single 2x3 grid.
# figsize is passed to plt.subplots directly: a separate plt.figure() call
# opens an extra, empty figure and leaves the grid at the default size.
sub_plot = plt.subplots(2, 3, sharex=True, sharey=True, figsize=(8, 12))
for i, model in enumerate(model_list):
    model_score(model, X_train, X_test, y_train, y_test, sub_plot, i)
plt.show()