# Hyperparameters Tuning for ML Models

The overall goal of the notebook is to **build and compare various machine learning models for regression tasks** using Python's scikit-learn library.

The specific models implemented and compared include Random Forest Regression , Support Vector Regression, and XGBoost regression. The notebook provides functions to perform hyperparameter tuning and cross-validation to assess model performance. Additionally, the notebook provides a function to load data from a CSV file and separate the features and target columns, and function for split the data into train and test, making it easier to use the implemented models with new datasets.

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [2]:
def load_data(csv_path, feature_col_start, feature_col_end, target_col):
    """
    Load a CSV file into a Pandas DataFrame,drop Nan, and separate the feature and target columns.

    Parameters:
        csv_path (str): Path to the CSV file to load.
        feature_col_start, feature_col_end, (ints): Range of column indices to use as features.
        target_col (str or int): Name or index of the column to use as target.

    Returns:
        new_df: A df containing the features + labels DataFrame.
    """
    # Load CSV into a Pandas DataFrame
    df = pd.read_csv(csv_path)

    # drop nan
    df = df.dropna()

    # Extract the feature and target columns
    new_df = df[df.columns[feature_col_start: feature_col_end]]
    new_df[target_col] = df[target_col]

    return new_df

In [3]:
def split_data(df, target_col, test_size=0.3, random_state=42):
    """
    Splits the input DataFrame into training and testing sets.
    
    Parameters:
    -----------
    df (pandas DataFrame): The input DataFrame containing the features and target variable.
    target_col (str): The name of the target column in the DataFrame.
    test_size (float, optional): The proportion of the data to use for testing (default=0.3).
    random_state (int, optional): The random seed to use for the train-test split (default=42).
        
    Returns:
    --------
    X_train (pandas DataFrame): The training set features.     
    X_test (pandas DataFrame): The testing set features.        
    y_train (pandas Series): The training set target variable.
    y_test (pandas Series): The testing set target variable.
    """
    # Extract the features and target variable from the DataFrame
    X = df.drop(columns=[target_col])
    y = df[target_col]
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    # Return the training and testing sets
    return X_train, X_test, y_train, y_test

Hyperparameters search


In [4]:
def rf_regression(x_train, y_train):

  # Define the hyperparameters to search over
  param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['auto', 'sqrt', 'log2'],
        'bootstrap': [True, False],
        'oob_score': [True, False],
        'warm_start': [True, False],
        'ccp_alpha': [0, 0.1, 0.5, 1],
        'max_samples': [None, 0.5, 0.7, 0.9]
  }
    
  # Create the random forest regression model
  rf = RandomForestRegressor()
  
  # Perform a grid search over the hyperparameters using cross-validation
  grid_search = GridSearchCV(rf, param_grid, cv=5,verbose=2)
  grid_search.fit(x_train, y_train)
  
  # Print the best hyperparameters and their corresponding score
  print(f"Best hyperparameters: {grid_search.best_params_}")
  print(f"Best score: {grid_search.best_score_}")

  # Use cross-validation to calculate MSE, R2, and MAE
  scoring = ['neg_mean_squared_error', 'r2', 'neg_mean_absolute_error']
  cv_results = cross_validate(grid_search.best_estimator_, x_train, y_train, cv=3, scoring=scoring)
  
  # Print the mean and standard deviation of each metric across the 5 folds
  mse_mean = -1 * cv_results['test_neg_mean_squared_error'].mean()
  mse_std = cv_results['test_neg_mean_squared_error'].std()
  r2_mean = cv_results['test_r2'].mean()
  r2_std = cv_results['test_r2'].std()
  mae_mean = -1 * cv_results['test_neg_mean_absolute_error'].mean()
  mae_std = cv_results['test_neg_mean_absolute_error'].std()
  
  print(f"MSE: {mse_mean:.2f} +/- {mse_std:.2f}")
  print(f"R2: {r2_mean:.2f} +/- {r2_std:.2f}")
  print(f"MAE: {mae_mean:.2f} +/- {mae_std:.2f}")
  
  # Return a ready-to-use model with the best hyperparameters
  best_model = grid_search.best_estimator_
  return best_model


In [5]:
def svm_regression(x_train, y_train):
    # Define the hyperparameters to search over
    param_grid = {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto'] + [0.1, 1, 10],
        'epsilon': [0.1, 0.01, 0.001],
        'degree': [2, 3, 4],
        'coef0': [-1, 0, 1]
    }
    
    # Create the SVM regression model
    svm = SVR()
    
    # Perform a grid search over the hyperparameters using cross-validation
    grid_search = GridSearchCV(svm, param_grid, cv=5,verbose=2)
    grid_search.fit(x_train, y_train)
    
    # Print the best hyperparameters and their corresponding score
    print(f"Best hyperparameters: {grid_search.best_params_}")
    print(f"Best score: {grid_search.best_score_}")

        # Use cross-validation to calculate MSE, R2, and MAE
    scoring = ['neg_mean_squared_error', 'r2', 'neg_mean_absolute_error']
    cv_results = cross_validate(grid_search.best_estimator_, x_train, y_train, cv=3, scoring=scoring)
    
    # Print the mean and standard deviation of each metric across the 5 folds
    mse_mean = -1 * cv_results['test_neg_mean_squared_error'].mean()
    mse_std = cv_results['test_neg_mean_squared_error'].std()
    r2_mean = cv_results['test_r2'].mean()
    r2_std = cv_results['test_r2'].std()
    mae_mean = -1 * cv_results['test_neg_mean_absolute_error'].mean()
    mae_std = cv_results['test_neg_mean_absolute_error'].std()
    
    print(f"MSE: {mse_mean:.2f} +/- {mse_std:.2f}")
    print(f"R2: {r2_mean:.2f} +/- {r2_std:.2f}")
    print(f"MAE: {mae_mean:.2f} +/- {mae_std:.2f}")
    
    # Return a ready-to-use model with the best hyperparameters
    best_model = grid_search.best_estimator_
    return best_model

In [6]:
def xgb_regression(x_train, y_train):
    # Define the hyperparameters to search over
    param_grid = {
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'gamma': [0, 0.1, 0.5]}
    
    # Create the XGB regression model
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

    # Perform a grid search over the hyperparameters using cross-validation
    grid_search = GridSearchCV(xgb_model, param_grid, cv=3,verbose=2)
    grid_search.fit(x_train, y_train)

    # Print the best hyperparameters and their corresponding score
    print(f"Best hyperparameters: {grid_search.best_params_}")
    print(f"Best score: {grid_search.best_score_}")

    # Use cross-validation to calculate MSE, R2, and MAE
    scoring = ['neg_mean_squared_error', 'r2', 'neg_mean_absolute_error']
    cv_results = cross_validate(grid_search.best_estimator_, x_train, y_train, cv=3, scoring=scoring)

    # Print the mean and standard deviation of each metric across the 5 folds
    mse_mean = -1 * cv_results['test_neg_mean_squared_error'].mean()
    mse_std = cv_results['test_neg_mean_squared_error'].std()
    r2_mean = cv_results['test_r2'].mean()
    r2_std = cv_results['test_r2'].std()
    mae_mean = -1 * cv_results['test_neg_mean_absolute_error'].mean()
    mae_std = cv_results['test_neg_mean_absolute_error'].std()

    print(f"MSE: {mse_mean:.2f} +/- {mse_std:.2f}")
    print(f"R2: {r2_mean:.2f} +/- {r2_std:.2f}")
    print(f"MAE: {mae_mean:.2f} +/- {mae_std:.2f}")

    # Return a ready-to-use model with the best hyperparameters
    best_model = grid_search.best_estimator_
    return best_model


## Example

In [7]:
# Define input parameters
csv_path = '/content/data.csv'
feature_idx_i,feature_idx_f = 16,-2 # columns index of features
target_col = 'A' # labael column (regression)

In [8]:
# Load data
data = load_data(csv_path, feature_idx_i,feature_idx_f, target_col)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[target_col] = df[target_col]


Unnamed: 0,397.32,400.2,403.09,405.97,408.85,411.74,414.63,417.52,420.4,423.29,...,978.88,981.96,985.05,988.13,991.22,994.31,997.4,1000.49,1003.58,A
0,0.179808,0.152106,0.129191,0.115715,0.107613,0.102074,0.101501,0.099727,0.096248,0.096929,...,0.458213,0.464172,0.45852,0.462214,0.467727,0.467549,0.466043,0.471523,0.447471,2.01727
1,0.221156,0.186298,0.160032,0.146194,0.136323,0.128331,0.124891,0.12185,0.116359,0.114495,...,0.71797,0.717748,0.722268,0.726763,0.738159,0.741649,0.739217,0.762054,0.622104,1.872474
2,0.221893,0.185626,0.164002,0.154074,0.146511,0.137888,0.133002,0.13092,0.128935,0.126446,...,0.670528,0.675308,0.669332,0.689363,0.685825,0.698885,0.689815,0.705207,0.580815,2.043818
3,0.162126,0.129779,0.104428,0.089685,0.080833,0.075142,0.068085,0.063978,0.058188,0.054447,...,0.57067,0.574177,0.580435,0.579218,0.582644,0.592902,0.597743,0.609343,0.480618,2.123489
4,0.206857,0.164631,0.137415,0.118823,0.102912,0.09785,0.090029,0.084146,0.07765,0.072445,...,0.602451,0.609186,0.624415,0.62275,0.633371,0.64097,0.649146,0.659158,0.5361,2.122085


In [9]:
X_train, X_test, y_train, y_test = split_data(data, target_col, test_size=0.3, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((429, 204), (184, 204), (429,), (184,))

In [None]:
svr = svm_regression(X_train, y_train)