In [8]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

### Load in the Data
___

In [2]:
# Read the clustered loan data straight from its S3 CSV: df
df = pd.read_csv(
    filepath_or_buffer='https://s3.amazonaws.com/clcarverloans/data/df_cluster.csv',
)

In [3]:
# Preview the first five rows to sanity-check the loaded columns
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Id,Funded Date,Borrower_Genders,Country Code,Country,Town_Name,Loan Amount,Lender_Term,Repayment_Interval,Distribution_Model,Sector,Activity,Use,Delinquent,Cluster
0,0,155661,2010-01-04T14:29:24Z,female,PH,Philippines,"Dipolog-Piñan, Zamboanga del Norte",1075,7.0,monthly,field_partner,Housing,Personal Housing Expenses,To purchase construction materials and labor f...,False,0.0
1,1,155674,2010-01-01T19:15:17Z,male,AF,Afghanistan,Kabul Afghanistan,1075,14.0,monthly,field_partner,Construction,Construction,to buy construction materials,False,0.0
2,2,155677,2010-01-02T00:21:08Z,male,AF,Afghanistan,Kabul Afghanistan,1075,14.0,monthly,field_partner,Construction,Construction,to purchase construction materials,False,0.0
3,3,155698,2010-01-01T17:31:27Z,group,AF,Afghanistan,Kabul Afghanistan,425,10.0,monthly,field_partner,Services,Services,to expand and resupply three small businesses,False,5.0
4,4,155710,2010-01-01T18:16:31Z,group,AF,Afghanistan,Kabul Afghanistan,850,12.0,monthly,field_partner,Services,Services,to expand four small businesses,False,5.0


### Clean the Data
___

In [17]:
# Keep only the modelling columns plus the target: df_select
cols = ['Borrower_Genders', 'Country Code', 'Loan Amount', 'Lender_Term',
       'Repayment_Interval', 'Distribution_Model', 'Sector', 'Activity','Cluster','Delinquent']

# Drop every row that has a missing value in any selected column.
# dropna() returns a new frame, so nothing is mutated through the df[cols]
# selection (the previous inplace=True call raised SettingWithCopyWarning).
df_select = df[cols].dropna()

# One-hot encode the categorical columns: df_dummies
# NOTE(review): 'Cluster' is numeric in the preview, so it is left as a number
# here rather than encoded -- confirm that treating the cluster id as ordinal is intended.
df_dummies = pd.get_dummies(df_select)

# Select features and target: X, y
# pop() removes 'Delinquent' from df_dummies, so X (the same object) holds features only.
y = df_dummies.pop('Delinquent')
X = df_dummies

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [19]:
# Hold out 20% of the rows for testing (fixed seed): X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Test Simple Model

In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Fit a classifier and report its hold-out metrics: eval_model
def eval_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print its test-set metrics.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    X_train, y_train
        Features and labels passed to ``model.fit``.
    X_test, y_test
        Features passed to ``model.predict`` and the labels the predictions
        are scored against.

    Prints accuracy, precision, recall, F1 and the confusion matrix, in that
    order. Returns ``None``.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    # (label, scorer) pairs, listed in the order they are printed.
    reports = (
        ('Accuracy: ', accuracy_score),
        ('Precision Score: ', precision_score),
        ('Recall Score: ', recall_score),
        ('F1 Score: ', f1_score),
        ('Confusion Matrix: ', confusion_matrix),
    )
    for label, scorer in reports:
        print(label, scorer(y_test, predicted))

In [22]:
# Baseline: decision tree with no class weighting: dt
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=5)
eval_model(
    model=dt,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Accuracy:  0.9842758770265141
Precision Score:  0.7913533834586466
Recall Score:  0.7016666666666667
F1 Score:  0.7438162544169611
Confusion Matrix:  [[17732   111]
 [  179   421]]


In [23]:
# Pair each importance with its column name, largest first, and show the top ten
feature_importance = sorted(
    zip(dt.feature_importances_, X_train.columns),
    reverse=True,
)
feature_importance[:10]

[(0.20700005596435353, 'Country Code_TG'),
 (0.1368616077948382, 'Borrower_Genders_group'),
 (0.13120581695548703, 'Loan Amount'),
 (0.10610111676734349, 'Lender_Term'),
 (0.05227961188612153, 'Country Code_LR'),
 (0.04938424836962927, 'Country Code_KH'),
 (0.04316686329542834, 'Country Code_TZ'),
 (0.025748867140651777, 'Cluster'),
 (0.025382675583433833, 'Country Code_AF'),
 (0.01960882989542089, 'Country Code_VN')]

### Test Other Models

In [25]:
# Decision tree with class 1 weighted 24x relative to class 0: dt_2
dt_2 = DecisionTreeClassifier(
    random_state=42,
    class_weight={0: 1, 1: 24},
)
eval_model(
    model=dt_2,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Accuracy:  0.9808057257496069
Precision Score:  0.6830357142857143
Recall Score:  0.765
F1 Score:  0.7216981132075472
Confusion Matrix:  [[17630   213]
 [  141   459]]


In [26]:
# Class-weighted logistic regression: logreg
# Import from the public package path: the private
# ``sklearn.linear_model.logistic`` module was deprecated in scikit-learn 0.22
# and removed in 0.24, so the old import fails on current releases.
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(class_weight={0:1, 1:24}, random_state=42)
eval_model(logreg, X_train, X_test, y_train, y_test)

Accuracy:  0.9494117009163369
Precision Score:  0.3871186440677966
Recall Score:  0.9516666666666667
F1 Score:  0.5503614457831326
Confusion Matrix:  [[16939   904]
 [   29   571]]


In [40]:
# Pair each logistic-regression coefficient with its column name, largest first
feature_coeficients = sorted(
    zip(logreg.coef_[0], X_train.columns),
    reverse=True,
)
feature_coeficients

[(4.275999646610793, 'Activity_Printing'),
 (4.1741311919884145, 'Borrower_Genders_group'),
 (4.160797619943647, 'Country Code_UA'),
 (3.9660962941475293, 'Country Code_LR'),
 (3.9132803079049174, 'Country Code_US'),
 (3.190432658859434, 'Country Code_TG'),
 (2.7873708103623676, 'Activity_Child Care'),
 (2.7042538290425813, 'Country Code_TZ'),
 (2.6444715658611924, 'Activity_Vehicle'),
 (2.5984945471287, 'Country Code_QS'),
 (2.4197259116101897, 'Country Code_TJ'),
 (2.298495360555386, 'Country Code_CL'),
 (1.9182296331812143, 'Country Code_DO'),
 (1.9032132898069785, 'Country Code_AF'),
 (1.794909450626502, 'Activity_Plastics Sales'),
 (1.7703672152076244, 'Country Code_SV'),
 (1.7300826703714247, 'Country Code_GH'),
 (1.6664116042243502, 'Activity_Fishing'),
 (1.6627820204289252, 'Country Code_EC'),
 (1.6571607652354112, 'Country Code_CR'),
 (1.6569979898002996, 'Country Code_PS'),
 (1.6401649478014118, 'Activity_Farm Supplies'),
 (1.5254318709969983, 'Country Code_RW'),
 (1.32696374

In [42]:
# Class-weighted random forest: rf
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    random_state=42,
    class_weight={0: 1, 1: 24},
)
eval_model(
    model=rf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Accuracy:  0.9833541180935856
Precision Score:  0.7639639639639639
Recall Score:  0.7066666666666667
F1 Score:  0.7341991341991342
Confusion Matrix:  [[17712   131]
 [  176   424]]


In [49]:
# k-nearest neighbours with weights='distance': knn
# NOTE(review): the recorded confusion matrix for this cell sums to a different
# number of test rows than the other models -- re-run after Restart & Run All
# to confirm the numbers come from the same split.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(weights='distance')
eval_model(
    model=knn,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Accuracy:  0.9788546255506608
Precision Score:  0.785234899328859
Recall Score:  0.4854771784232365
F1 Score:  0.6
Confusion Matrix:  [[14209    64]
 [  248   234]]
