In [1]:
# Use the ensemble module from the sklearn library

# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read the encoded loans dataset from the Resources folder and preview it
file_path = Path("../Resources/loans_data_encoded.csv")
df_loans = pd.read_csv(filepath_or_buffer=file_path)
df_loans.head(n=5)

Unnamed: 0,amount,term,age,bad,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,0,6,0,1,0,0,0,1
1,1000,30,50,0,7,1,0,0,0,1,0
2,1000,30,33,0,8,1,0,0,0,1,0
3,1000,15,27,0,9,0,0,0,1,0,1
4,1000,30,28,0,10,0,0,0,1,1,0


In [3]:
# Build the features (inputs) set: a new frame holding every column of
# df_loans except the target ("bad") and the encoded columns left out here
X = df_loans.drop(
    columns=[
        "bad",
        "gender_male",
        "gender_female",
        "education_Bachelor",
        "education_Master or Above",
    ]
)
X.head()

Unnamed: 0,amount,term,age,month_num,education_High School or Below,education_college
0,1000,30,45,6,1,0
1,1000,30,50,7,0,0
2,1000,30,33,8,0,0
3,1000,15,27,9,0,1
4,1000,30,28,10,0,1


In [4]:
# Define the target (outputs) set as a 1-D NumPy array of the "bad" labels.
# Series.ravel() is deprecated since pandas 2.2 (a Series is already 1-D);
# to_numpy() yields the same array without the deprecation warning.
y = df_loans["bad"].to_numpy()
y[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [5]:
# Partition the features and target into training and testing subsets,
# seeding the shuffle so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [6]:
# Build the scaler and learn its statistics from the training features only,
# so nothing about the test set influences the scaling
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both partitions
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [7]:
# Instantiate the random forest classifier: 500 trees, fixed seed
rf_model = RandomForestClassifier(
    random_state=78,
    n_estimators=500,
)

The RandomForestClassifier takes a variety of parameters, but for our purposes we only need the n_estimators and the random_state.

The `n_estimators` parameter sets the number of trees that the algorithm will create. Generally, a higher number makes the predictions stronger and more stable, but it can slow down the output because of the longer training time. A common rule of thumb is to use between 64 and 128 trees, though higher numbers are quite common despite the longer training time. For our purposes, we'll create 500 trees.

After we create the random forest instance, we need to fit the model with our training sets.

In [8]:
# Train the random forest on the scaled training features and their labels
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [9]:
# Predict a label for every row of the scaled test features
predictions = rf_model.predict(X=X_test_scaled)
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0], dtype=int64)

In [10]:
# Tabulate true test labels against the model's predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# Wrap the raw matrix in a labeled DataFrame:
# rows are the actual classes, columns are the predicted classes
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,54,30
Actual 1,28,13


In [11]:
# Fraction of test rows whose predicted label matches the true label
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

0.536

In [12]:
# Show all evaluation results together: the labeled confusion matrix,
# the accuracy score, and the per-class classification report
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}", "Classification Report", sep="\n")
print(classification_report(y_true=y_test, y_pred=predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,54,30
Actual 1,28,13


Accuracy Score : 0.536
Classification Report
              precision    recall  f1-score   support

           0       0.66      0.64      0.65        84
           1       0.30      0.32      0.31        41

    accuracy                           0.54       125
   macro avg       0.48      0.48      0.48       125
weighted avg       0.54      0.54      0.54       125



From the confusion matrix and classification report, the precision for the bad loan applications is low (0.30), indicating a large number of false positives and therefore an unreliable positive classification. The recall for the bad loan applications is also low (0.32), which is indicative of a large number of false negatives. The F1 score for the bad loan applications is also low (0.31).

In summary, this random forest model is not good at classifying bad loan applications because the model's accuracy, 0.536, and F1 score are low.

## Rank the Importance of Features
One nice byproduct of the random forest algorithm is that it ranks the features by their importance, which allows us to see which features have the most impact on the decision.

To calculate the feature importance, we can use the `feature_importances_` attribute with the following code:

In [13]:
# Read the per-feature importance scores from the fitted random forest.
# The array holds one value per column of X, in the same order as X.columns.
importances = rf_model.feature_importances_
importances

array([0.05527956, 0.083377  , 0.47627742, 0.31881675, 0.03075444,
       0.03549482])

In [14]:
# Pair each importance score with its feature name, then order the pairs
# from most to least important
feature_ranking = list(zip(rf_model.feature_importances_, X.columns))
feature_ranking.sort(reverse=True)
feature_ranking

[(0.4762774155433003, 'age'),
 (0.3188167541048824, 'month_num'),
 (0.08337700488414826, 'term'),
 (0.05527956048657813, 'amount'),
 (0.03549482315572748, 'education_college'),
 (0.030754441825363306, 'education_High School or Below')]