In [5]:
# Import necessary libraries

import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV


from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Load and split the diabetes dataset

# Source: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html

# Load the diabetes dataset bundled with scikit-learn
diabetes = load_diabetes()

# Wrap the feature matrix in a DataFrame (keeping the feature names as column
# labels) and the target vector in a named Series
X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = pd.Series(diabetes.target, name='target')

# Hold out 20% of the rows for testing. The seed is fixed so the split -- and
# every metric computed from it -- is identical on each Restart & Run All;
# with random_state=None a different split is drawn on every execution.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# Initialize a Random Forest Regressor.
# The estimator itself is seeded: RandomizedSearchCV's random_state only
# controls which candidates are sampled, not the bootstrap/feature sampling
# inside each forest, so an unseeded forest makes the CV scores, the selected
# hyperparameters and the refit best model change from run to run.
rf = RandomForestRegressor(random_state=42)

# Define hyperparameter grid for Randomized Search CV
param_grid = {
    'n_estimators': [50, 100],   # Number of trees in the forest
    'max_depth': [10, 20], # Maximum depth of each tree
    'max_features': ['sqrt', 'log2'], # Number of features to consider when looking for the best split
}

# Perform Randomized Search CV to find best hyperparameters.
# The grid has 2 x 2 x 2 = 8 combinations and n_iter=8, so every combination
# is evaluated once (5-fold CV each -> 40 fits).
random_search = RandomizedSearchCV(estimator=rf, 
                                   param_distributions=param_grid,
                                   n_iter=8, cv=5, 
                                   verbose=2, random_state=42, 
                                   n_jobs=-1)

random_search.fit(X_train, y_train)

# Print the best parameters found by Randomized Search CV
print("Best parameters found:", random_search.best_params_)

# Assign the best model found (refit on the full training set) to a variable
best_model = random_search.best_estimator_

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters found: {'n_estimators': 100, 'max_features': 'sqrt', 'max_depth': 20}


In [8]:
def evaluate_performance(model, X_train, y_train, X_test, y_test):
    """
    Evaluate the performance of a fitted regression model on training and testing sets.

    Parameters:
        model: Fitted regression model exposing ``score`` and ``predict``.
        X_train: Features of the training set.
        y_train: Target variable of the training set.
        X_test: Features of the testing set.
        y_test: Target variable of the testing set.

    Returns:
        result_str: Comma-separated labels naming the metrics, in the same
            order as ``result_list``. The "val" labels refer to the
            ``X_test`` / ``y_test`` split passed in.
        result_list: ``[train_rsq, test_rsq, train_rmse, test_rmse]``.
    """
    # Calculate R-squared for training and testing sets
    # (scikit-learn regressors return R^2 from ``score``)
    train_rsq = model.score(X_train, y_train)
    test_rsq = model.score(X_test, y_test)

    # Calculate root mean squared error (RMSE) for training and testing sets.
    # The square root is taken explicitly instead of passing squared=False:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
    # where it raises a TypeError. sqrt(MSE) works on every version.
    train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
    test_rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

    # Return the metric labels together with the metric values
    return "train rsq, val rsq, train rmse, val rmse", [train_rsq, test_rsq, train_rmse, test_rmse]

# Score the tuned random forest on both the training and the held-out split
result_str, result_list = evaluate_performance(
    best_model,
    X_train, y_train,
    X_test, y_test,
)

# Show the metric labels on one line and the matching values on the next
print(result_str, result_list, sep="\n")


train rsq, val rsq, train rmse, val rmse
[0.9242278505516258, 0.48155455659613755, 20.924778471344865, 57.224427245380326]
