# Random Forest Classifier

#### Load the packages and import the data

In [1]:
import pandas as pd
import numpy as np

# Read the loan CSV from the local "Data Files" folder, then preview
# the first rows as the cell output.
loan_csv_path = "./Data Files/loan_data.csv"
data = pd.read_csv(loan_csv_path)
data.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [2]:
# List the column labels; the feature/target split below selects from these names.
data.columns

Index(['credit.policy', 'purpose', 'int.rate', 'installment', 'log.annual.inc',
       'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
       'inq.last.6mths', 'delinq.2yrs', 'pub.rec', 'not.fully.paid'],
      dtype='object')

#### Split data into an X DataFrame and y vector

In [3]:
# Column being predicted.
target_column = "not.fully.paid"

# Predictor columns, in the order they should appear in X.
feature_columns = [
    "credit.policy",
    "purpose",
    "int.rate",
    "installment",
    "log.annual.inc",
    "dti",
    "fico",
    "days.with.cr.line",
    "revol.bal",
    "revol.util",
    "inq.last.6mths",
    "delinq.2yrs",
    "pub.rec",
]

X = data[feature_columns]
y = data[target_column]

#### Encode the categorical `purpose` column as dummy variables

In [4]:
# One-hot encode the categorical "purpose" column and swap it for the
# resulting dummy columns. drop_first=True leaves out the first category,
# which then acts as the baseline level.
#
# The guard makes this cell safe to re-run: X is reassigned here, so after
# the first run "purpose" no longer exists and an unguarded X[["purpose"]]
# lookup would raise a KeyError.
if "purpose" in X.columns:
    X_dummies = pd.get_dummies(X[["purpose"]], drop_first = True)
    # Drop the raw column first, then append the dummies, so the column
    # order stays "remaining features, then purpose_* dummies".
    X = pd.concat([X.drop(["purpose"], axis = 1), X_dummies], axis = 1)

#### Split the data into a training set and a test set

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1111,
)

#### Fit the Random Forest Classifier Model

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameters for the forest, gathered in one place.
rfc_params = {
    "n_estimators": 200,     # number of trees; tunable
    "criterion": "entropy",  # split-quality measure
    "random_state": 1111,    # fixed seed for repeatable fits
}

rfc_model = RandomForestClassifier(**rfc_params)
# fit() is the last expression, so the fitted estimator is shown as the cell output.
rfc_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=1111,
            verbose=0, warm_start=False)

#### Predict on Test Set

In [7]:
# Predict the class of every test row.
y_pred = rfc_model.predict(X_test)

# Build a side-by-side view: test features, the actual label (stored under
# the target's own column name) and the predicted label. assign() returns a
# new frame, so X_test itself is left unchanged.
pred_summary = X_test.assign(**{y.name: y_test, "y_pred": y_pred})
pred_summary.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business,not.fully.paid,y_pred
3448,1,0.0737,46.57,10.915088,13.57,787,4469.958333,14105,18.8,0,0,0,0,0,0,1,0,0,0,0
2305,1,0.0737,60.54,10.545341,14.49,777,4950.041667,1595,5.1,0,0,0,0,0,0,0,0,0,0,0
6126,1,0.1114,164.02,11.163368,12.65,742,1950.041667,5238,73.4,0,0,0,1,0,0,0,0,0,0,0
4984,1,0.0894,158.86,10.545341,18.54,727,4020.0,13862,74.5,0,0,0,0,0,0,0,1,0,0,0
4315,1,0.1322,676.02,11.561716,24.89,707,5820.0,63861,65.1,0,0,0,0,1,0,0,0,0,1,0


#### Evaluate the Random Forest Model

In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Confusion matrix as a DataFrame: rows are the actual classes, columns the
# predicted classes.
rfc_conf_matrix = pd.DataFrame(confusion_matrix(y_test, y_pred))
# Overall share of correct predictions, rounded to three decimals.
rfc_accuracy = round(accuracy_score(y_test, y_pred), 3)

print("Random Forest Model", "\n")
print(rfc_conf_matrix, "      Accuracy:", rfc_accuracy, "\n")
# Per-class precision, recall, F1 and support.
print(classification_report(y_test, y_pred))

Random Forest Model 

      0   1
0  2376  11
1   480   7       Accuracy: 0.829 

             precision    recall  f1-score   support

          0       0.83      1.00      0.91      2387
          1       0.39      0.01      0.03       487

avg / total       0.76      0.83      0.76      2874

