<a href="https://colab.research.google.com/github/digdigbear/Programming_Individual_Predictive_Models_Hyperparameter_Tuning.ipynb/blob/main/programming_individual_predictive_models_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report, make_scorer, ConfusionMatrixDisplay
import seaborn as sns
import itertools
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, train_test_split,RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
from plotly.subplots import make_subplots
import warnings
import plotly.graph_objects as go
from scipy.stats import randint
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

**DATA PREPARATION**

In [None]:
# Read the prepared loan dataset from the Colab working directory
df = pd.read_csv('/content/loan_standardized_with_debt_to_income.csv')

# The loan identifier is not a modelling input, so leave it out of the working frame
loan_df = df.drop(columns='loan_id')
loan_df.head()

Unnamed: 0,no_of_dependents,cibil_score,education_Graduate,self_employed_Yes,loan_status_Approved,debt_to_income,asset_combined
0,2,1.032792,1,0,1,-0.4412,1.918096
1,0,-1.061051,0,1,0,-0.15937,-1.279072
2,3,-0.54484,1,0,0,-0.68281,2.294615
3,3,-0.771045,1,0,0,0.081424,1.543931
4,5,-1.264055,0,1,0,-0.782394,1.547079


In [None]:
# Separate the target column from the predictors
y = loan_df['loan_status_Approved']
x = loan_df.drop(columns=['loan_status_Approved'])

In [None]:
# Shuffled 5-fold splitter with a fixed seed.
# NOTE(review): the random-forest searches below pass cv=5 rather than this
# object -- confirm whether `kf` is meant to be used there.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

**RANDOM FOREST**

*Developing base model*

In [None]:
# Single seed shared by the train/test split and every random forest in this notebook
random_state_value = 50

# Hold out 30% of the rows for testing; the remaining 70% are used for training
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=random_state_value
)

# Baseline forest with library-default hyperparameters.
# It is seeded like the tuned forests below so the reported accuracy is
# reproducible on a fresh "Restart & Run All".
rf = RandomForestClassifier(random_state=random_state_value)
rf.fit(x_train, y_train)

# Score the baseline on the held-out rows
y_pred_rf = rf.predict(x_test)
accuracy = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", accuracy)

Accuracy: 0.994535519125683


*Hyperparameter tuning*

Group work

In [None]:
# Group search space: number of trees and maximum tree depth, both drawn at random
param_dist_group = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(1, 20),
}

# Seeded forest whose hyperparameters are being tuned
rf = RandomForestClassifier(random_state=random_state_value)

# Randomized search: 5 sampled configurations, each scored with cv=5
rand_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist_group,
    n_iter=5,
    cv=5,
    random_state=random_state_value,
)

# Run the search on the training split
rand_search.fit(x_train, y_train)

In [None]:
# Report the winning hyperparameters of the group search
print('Best hyperparameters:',  rand_search.best_params_)

# Keep a handle on the best estimator found by the search
best_rf = rand_search.best_estimator_


Best hyperparameters: {'max_depth': 17, 'n_estimators': 445}


In [None]:
# Tabulate every sampled configuration next to its mean test score
print(pd.DataFrame(rand_search.cv_results_).loc[:, ['params', 'mean_test_score']])

                                   params  mean_test_score
0  {'max_depth': 17, 'n_estimators': 445}         0.993642
1  {'max_depth': 14, 'n_estimators': 339}         0.993307
2   {'max_depth': 5, 'n_estimators': 120}         0.985272
3   {'max_depth': 6, 'n_estimators': 120}         0.989959
4  {'max_depth': 14, 'n_estimators': 183}         0.992973


Individual work: mtry (`max_features`)

In [None]:
# mtry method: in addition to forest size and depth, tune how many features
# each split is allowed to consider (`max_features`).
param_dist_mtry = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(1, 20),
    # The data preview shows fewer than 10 predictor columns, so an integer
    # candidate of 10 asks for more features than exist. It appears to behave
    # like None (identical CV scores were recorded for both), so it added no
    # distinct candidate. The fraction 0.5 (half of the features) stays valid
    # for any feature count and is a genuinely different setting.
    'max_features': ['sqrt', 'log2', None, 0.5],
}

# Create a seeded random forest classifier to tune
rf = RandomForestClassifier(random_state=random_state_value)

# Use random search to find the best hyperparameters:
# 5 sampled configurations, each scored with cv=5
rand_search = RandomizedSearchCV(rf,
                                 param_distributions = param_dist_mtry,
                                 n_iter=5,
                                 cv=5,random_state=random_state_value)

# Fit the random search object to the training data
rand_search.fit(x_train, y_train)

In [None]:
# Show which sampled configuration won the mtry search
print('Best hyperparameters:',  rand_search.best_params_)

# Store the best estimator from this search for later use
best_rf = rand_search.best_estimator_

Best hyperparameters: {'max_depth': 17, 'max_features': 'sqrt', 'n_estimators': 445}


In [None]:

# Collect the search's cross-validation results into a DataFrame
results_df = pd.DataFrame(data=rand_search.cv_results_)

# Show each sampled parameter set alongside its mean test score
print(results_df.loc[:, ['params', 'mean_test_score']])


                                              params  mean_test_score
0  {'max_depth': 17, 'max_features': 'sqrt', 'n_e...         0.993642
1  {'max_depth': 14, 'max_features': 'log2', 'n_e...         0.993307
2  {'max_depth': 7, 'max_features': 'log2', 'n_es...         0.992972
3  {'max_depth': 14, 'max_features': 'log2', 'n_e...         0.992972
4  {'max_depth': 8, 'max_features': 10, 'n_estima...         0.993307


In [None]:
# Flatten each sampled-parameter dict so every hyperparameter gets its own column
params_df = pd.json_normalize(results_df['params'])

# Place the mean test score beside the flattened hyperparameters
full_results_df = pd.concat(
    [params_df, results_df.loc[:, ['mean_test_score']]],
    axis='columns',
)

# Print one row per sampled configuration
print(full_results_df)


   max_depth max_features  n_estimators  mean_test_score
0         17         sqrt           445         0.993642
1         14         log2           182         0.993307
2          7         log2           120         0.992972
3         14         log2           308         0.992972
4          8           10           213         0.993307


In [None]:
# Compare `max_features` settings with the forest size and depth held fixed
# at the values reported as best by the earlier search.
#
# The loop's model gets its own name: rebinding `best_rf` here would replace
# the tuned estimator from the search with an unfitted classifier.
#
# The last candidate is the fraction 0.5 rather than the integer 10. The data
# preview shows fewer than 10 predictor columns, so 10 asks for more features
# than exist and appears to duplicate None (identical CV scores were recorded).
for max_feature in ['sqrt', 'log2', None, 0.5]:
    candidate_rf = RandomForestClassifier(
        n_estimators=445,  # best value reported by the search
        max_depth=17,      # best value reported by the search
        max_features=max_feature,
        random_state=random_state_value
    )
    # Mean accuracy over 5 cross-validation folds of the training split
    scores = cross_val_score(candidate_rf, x_train, y_train, cv=5)
    print(f"max_features={max_feature}, Mean CV Score: {scores.mean()}")


max_features=sqrt, Mean CV Score: 0.993641563447113
max_features=log2, Mean CV Score: 0.993641563447113
max_features=None, Mean CV Score: 0.9933071152865779
max_features=10, Mean CV Score: 0.9933071152865779


Individual work: node sizes (`min_samples_split`, `min_samples_leaf`)

In [None]:
# Node-size method: tune the split/leaf size limits together with forest size and depth
param_dist_node_sizes = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(1, 20),
    # smallest number of samples a node must hold before it may be split
    'min_samples_split': [2, 5, 10],
    # smallest number of samples allowed in a leaf
    'min_samples_leaf': [1, 2, 4],
}

# Seeded forest whose hyperparameters are being tuned
rf = RandomForestClassifier(random_state=random_state_value)

# Randomized search: 5 sampled configurations, each scored with cv=5
rand_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist_node_sizes,
    n_iter=5,
    cv=5,
    random_state=random_state_value,
)

# Run the search on the training split
rand_search.fit(x_train, y_train)



In [None]:
# Show which sampled configuration won the node-size search
print('Best hyperparameters:',  rand_search.best_params_)

# Store the best estimator from this search for later use
best_rf = rand_search.best_estimator_

Best hyperparameters: {'max_depth': 17, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 339}


In [None]:
# Collect the node-size search's cross-validation results into a DataFrame
results_df = pd.DataFrame(data=rand_search.cv_results_)

# Show each sampled parameter set alongside its mean test score
print(results_df.loc[:, ['params', 'mean_test_score']])

                                              params  mean_test_score
0  {'max_depth': 17, 'min_samples_leaf': 1, 'min_...         0.994145
1  {'max_depth': 5, 'min_samples_leaf': 4, 'min_s...         0.988875
2  {'max_depth': 14, 'min_samples_leaf': 2, 'min_...         0.993559
3  {'max_depth': 8, 'min_samples_leaf': 1, 'min_s...         0.993559
4  {'max_depth': 7, 'min_samples_leaf': 1, 'min_s...         0.993559


In [None]:
# Flatten each sampled-parameter dict so every hyperparameter gets its own column
params_df = pd.json_normalize(results_df['params'])

# Place the mean test score beside the flattened hyperparameters
full_results_df = pd.concat(
    [params_df, results_df.loc[:, ['mean_test_score']]],
    axis='columns',
)

# Print one row per sampled configuration
print(full_results_df)

   max_depth  min_samples_leaf  min_samples_split  n_estimators  \
0         17                 1                  5           339   
1          5                 4                  5           120   
2         14                 2                 10           145   
3          8                 1                 10           213   
4          7                 1                  5           391   

   mean_test_score  
0         0.994145  
1         0.988875  
2         0.993559  
3         0.993559  
4         0.993559  
