In [2]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


## Loading and Preprocessing Loans Encoded Data

Load the `sba_loans_encoded.csv` in a pandas DataFrame called `df_loans`

In [3]:
# Loading data
file_path = Path("../Resources/InspectionList2.csv")
df_loans = pd.read_csv(file_path)
df_loans.head()


Unnamed: 0,Seg_ID,UP_MH,DS_MH,"Dia., Inch","Length, Ft","MWL, %","DS, % depth",defects,Video Quality,Risk,Inspection Raning
0,1,36206.0,36207.0,42,200,95,0.0,1,0,1,3
1,2,36206.0,36207.0,60,200,95,0.0,1,0,1,3
2,3,36206.0,36207.0,60,200,95,0.0,1,0,1,1
3,4,36204.0,838.0,102,651,80,0.0,4,1,4,9
4,5,8197.0,36206.0,96,950,75,0.0,1,1,1,8


Define the features set, by copying the `df_loans` DataFrame and dropping the `Default` column.

In [4]:
# Define features set
X = df_loans.copy()
X.drop("Inspection Raning", axis=1, inplace=True)
X.head()


Unnamed: 0,Seg_ID,UP_MH,DS_MH,"Dia., Inch","Length, Ft","MWL, %","DS, % depth",defects,Video Quality,Risk
0,1,36206.0,36207.0,42,200,95,0.0,1,0,1
1,2,36206.0,36207.0,60,200,95,0.0,1,0,1
2,3,36206.0,36207.0,60,200,95,0.0,1,0,1
3,4,36204.0,838.0,102,651,80,0.0,4,1,4
4,5,8197.0,36206.0,96,950,75,0.0,1,1,1


Create the target vector by assigning the values of the `Default` column from the `df_loans` DataFrame.

In [5]:
# Define target vector
y = df_loans["Risk"].values.reshape(-1, 1)
y[:5]


array([[1],
       [1],
       [1],
       [4],
       [1]], dtype=int64)

Split the data into training and testing sets.

In [6]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

Use the `StandardScaler` to scale the features data, remember that only `X_train` and `X_testing` DataFrames should be scaled.

In [7]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [8]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [9]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Fitting the Random Forest Model

Once the data is scaled, create a random forest instance and train it with the training data (`X_train_scaled` and `y_train`), define `n_estimators=500` and `random_state=78`.

In [10]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [11]:
# Fit the model and use .ravel()on the "y_train" data. 
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

## Making Predictions Using the Random Forest Model

Validate the trained model by predicting loan defaults using the testing data (`X_test_scaled`).

In [12]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

## Model Evaluation

Evaluate model's results, by using `sklearn` to calculate the confusion matrix, the accuracy score and to generate the classification report.

In [17]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2", "Actual 3"], columns=["Predicted 0", "Predicted 1","Predicted 2", "Predicted 3"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [14]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3
Actual 0,9,0,0,0
Actual 1,0,36,0,0
Actual 2,0,0,5,0
Actual 3,0,0,0,5


Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        36
           3       1.00      1.00      1.00         5
           4       1.00      1.00      1.00         5

    accuracy                           1.00        55
   macro avg       1.00      1.00      1.00        55
weighted avg       1.00      1.00      1.00        55



## Feature Importance

In this section, you are asked to fetch the features' importance from the random forest model and display the top 10 most important features.

In [18]:
# Get the feature importance array
importances = rf_model.feature_importances_
# List the top 10 most important features
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]

[(0.4878673181685844, 'Risk'),
 (0.12135484982357712, 'defects'),
 (0.10443664553797649, 'DS, % depth'),
 (0.07580637386164031, 'MWL, %'),
 (0.07376438509319544, 'Seg_ID'),
 (0.04450521813211011, 'DS_MH'),
 (0.03150967991448792, 'UP_MH'),
 (0.03009755475461015, 'Dia., Inch'),
 (0.026860299178648125, 'Length, Ft'),
 (0.0037976755351698723, 'Video Quality')]

## Analysis Questions

Finally, analyze the model's evaluation results and answer the following questions.

* **Question 1:** Would you trust this model to detect if a loan will default? 

 * **Sample Answer:** Yes. The model's accuracy is good a predicting if a loan will default because of the high accuracy and F-1 and recall scores. 


* **Question 2:** What are your insights about the top 10 most important features?

 * **Sample Answer:** It seems that the "Bank" is not relevant for the model, so we can create a new random forest model by only taking the top 5 to 10 features. Also, for piloting this model in a business environment, we will only need to fetch new data about these features.