# Project: Iris Dataset 🌸

- **Project Name:** Iris Classification Project
- **Project Type:** Multi-class Classification
- **Author:** Dr. Saad Laouadi

### Project Overview:
This project leverages the famous **Iris Dataset** for **multi-class classification**, focusing on identifying the species of iris flowers based on their petal and sepal measurements.

### Key Features:
- **Classification Task**: Predict the species of iris flowers (Setosa, Versicolor, Virginica)
- **Algorithms Used**: [Specify any algorithms or models you've implemented]
- **Evaluation Metrics**: [List metrics like accuracy, precision, F1-score, etc.]

---

**Copyright © Dr. Saad Laouadi**  
**All Rights Reserved** 🛡️

In [1]:
# Import necessary modules
import os
from pathlib import Path
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import (
    GradientBoostingClassifier, 
    RandomForestClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier, 
    BaggingClassifier
)
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

from sklearn.exceptions import ConvergenceWarning


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Data paths
DATA_URL = "https://raw.githubusercontent.com/dr-saad-la/ML-Datasets/refs/heads/main/benchmark-ml-datasets/iris.csv"

# Retrieve the base path from the environment variable
BASE_LOCAL_PATH = Path(os.getenv('DATA_PATH'))

# Ensure the environment variable is set, then create the full local path using pathlib
if BASE_LOCAL_PATH:
    LOCAL_PATH = BASE_LOCAL_PATH.joinpath("ML-Datasets/benchmark-ml-datasets/iris.csv")
else:
    print("no environment variable is found")    

# Suppress ConvergenceWarning
warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [2]:
def load_dataset(data_source: str, data_url: str, local_path: Path, base_path: Path = None) -> pd.DataFrame:
    """
    Load a dataset from either a URL or a local file, based on the selected data source.

    Parameters:
    - data_source (str): Choose 'url' for online dataset or 'local' for local file.
    - data_url (str): The URL to load the dataset from if 'url' is selected.
    - local_path (Path): The local file path to load the dataset from if 'local' is selected.
    - base_path (Path): The base directory to strip when printing the local path (optional).

    Returns:
    - pd.DataFrame: The loaded dataset as a pandas DataFrame.
    """
    if data_source == 'url':
        print("Loading data from URL...")
        return pd.read_csv(data_url)
    elif local_path.exists():  
        if base_path:
            relative_path = local_path.relative_to(base_path)
        else:
            relative_path = local_path
        print(f"Loading data from local path: /{relative_path}")
        return pd.read_csv(local_path)
    else:
        print(f"Error: The local file was not found at /{relative_path}")
        return None


def process_data(df):
    # Step 1: Encode the 'class' column (categorical) into numerical labels
    label_encoder = LabelEncoder()
    df['class'] = label_encoder.fit_transform(df['class'])
    
    # Step 2: Separate features (X) and target (y)
    X = df.drop('class', axis=1)
    y = df['class']
    
    # Step 3: Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    
    # Step 4: Normalize the feature data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    return X_train_scaled, X_test_scaled, y_train, y_test


def train_knn(X_train, X_test, y_train, y_test, n_neighbors=5):
    # Step 1: Initialize the KNN model
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    
    # Step 2: Train the model on the training data
    knn.fit(X_train, y_train)
    
    # Step 3: Make predictions on the test data
    y_pred = knn.predict(X_test)
    
    # Step 4: Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"KNN Accuracy: {accuracy:.4f}")
    
    # Step 5: Print a classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))


def train_logistic_regression(X_train, X_test, y_train, y_test):
    # Step 1: Initialize the Logistic Regression model
    logistic_regression = LogisticRegression(max_iter=1000)  
    # Step 2: Train the model on the training data
    logistic_regression.fit(X_train, y_train)
    
    # Step 3: Make predictions on the test data
    y_pred = logistic_regression.predict(X_test)
    
    # Step 4: Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Logistic Regression Accuracy: {accuracy:.4f}")
    
    # Step 5: Print a classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))


# Function for hyperparameter tuning of Logistic Regression
def tune_logistic_regression(X_train, y_train):
    # Step 1: Define the Logistic Regression model
    logistic_regression = LogisticRegression(max_iter=1000)
    
    # Step 2: Set the hyperparameter grid
    param_grid = {
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga'],  # Different solvers for optimization
        'C': [0.01, 0.1, 1.0, 10.0, 100.0],                     # Regularization strength (inverse)
        'penalty': ['l2']                                       # L2 regularization (for solvers supporting it)
    }
    
    # Step 3: Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=logistic_regression,
                               param_grid=param_grid,
                               cv=5,       # 5-fold cross-validation
                               verbose=1,  # Show progress
                               n_jobs=-1)  # Use all available cores
    
    # Step 4: Fit GridSearchCV
    grid_search.fit(X_train, y_train)
    
    # Step 5: Print the best parameters and best score
    print(f"Best Hyperparameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")
    
    return grid_search.best_estimator_

# Function to train and evaluate the SVM model
def train_svm(X_train, X_test, y_train, y_test, kernel='linear', C=1.0):
    # Step 1: Initialize the SVM model
    svm_model = SVC(kernel=kernel, C=C)  # 'kernel' specifies the type of SVM, 'C' is the regularization parameter
    
    # Step 2: Train the model on the training data
    svm_model.fit(X_train, y_train)
    
    # Step 3: Make predictions on the test data
    y_pred = svm_model.predict(X_test)
    
    # Step 4: Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"SVM Accuracy: {accuracy:.4f}")
    
    # Step 5: Print a classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Function for hyperparameter tuning of SVM
def tune_svm(X_train, y_train):
    # Step 1: Define the SVC model
    svm_model = SVC()

    # Step 2: Set the hyperparameter grid
    param_grid = {
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Different kernels for the SVM
        'C': [0.1, 1.0, 10, 100],  # Regularization parameter
        'gamma': ['scale', 'auto'],  # Kernel coefficient for 'rbf', 'poly', 'sigmoid'
        'degree': [2, 3, 4]  # Only for 'poly' kernel
    }
    
    # Step 3: Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=svm_model,
                               param_grid=param_grid,
                               cv=5,  # 5-fold cross-validation
                               verbose=1,  # Print progress
                               n_jobs=-1)  # Use all available cores
    
    # Step 4: Fit GridSearchCV
    grid_search.fit(X_train, y_train)
    
    # Step 5: Print the best parameters and best score
    print(f"Best Hyperparameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")
    
    return grid_search.best_estimator_

# Function to train and evaluate GradientBoostingClassifier
def train_gradient_boosting(X_train, X_test, y_train, y_test, n_estimators=100, learning_rate=0.1, max_depth=3):
    """
    Train and evaluate a GradientBoostingClassifier.
    
    Parameters:
    - X_train, X_test: Scaled training and testing features
    - y_train, y_test: Training and testing labels
    - n_estimators: The number of boosting stages to be run
    - learning_rate: Shrinks the contribution of each tree by this factor
    - max_depth: Maximum depth of each tree

    Returns:
    - Trained GradientBoostingClassifier
    """
    # Step 1: Initialize the Gradient Boosting model
    gb_model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    
    # Step 2: Train the model on the training data
    gb_model.fit(X_train, y_train)
    
    # Step 3: Make predictions on the test data
    y_pred = gb_model.predict(X_test)
    
    # Step 4: Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Gradient Boosting Accuracy: {accuracy:.4f}")
    
    # Step 5: Print a classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    return gb_model

# Function for hyperparameter tuning of GradientBoostingClassifier
def tune_gradient_boosting(X_train, y_train):
    """
    Perform hyperparameter tuning for GradientBoostingClassifier using GridSearchCV.
    
    Parameters:
    - X_train: Scaled training features
    - y_train: Training labels
    
    Returns:
    - Best estimator after tuning
    """
    # Step 1: Define the Gradient Boosting model
    gb_model = GradientBoostingClassifier()

    # Step 2: Set the hyperparameter grid
    param_grid = {
        'n_estimators': [50, 100, 200],  # Number of boosting stages
        'learning_rate': [0.01, 0.1, 0.2],  # Learning rate for shrinking the contribution of each tree
        'max_depth': [3, 4, 5],  # Maximum depth of each tree
        'subsample': [0.8, 1.0],  # Fraction of samples used for fitting individual base learners
        'min_samples_split': [2, 5, 10]  # Minimum number of samples required to split an internal node
    }
    
    # Step 3: Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=gb_model,
                               param_grid=param_grid,
                               cv=5,  # 5-fold cross-validation
                               verbose=1,  # Show progress
                               n_jobs=-1)  # Use all available cores
    
    # Step 4: Fit GridSearchCV
    grid_search.fit(X_train, y_train)
    
    # Step 5: Print the best parameters and best score
    print(f"Best Hyperparameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")
    
    return grid_search.best_estimator_

# Function to train and evaluate RandomForestClassifier
def train_random_forest(X_train, X_test, y_train, y_test, n_estimators=100, max_depth=None, min_samples_split=2):
    """
    Train and evaluate a RandomForestClassifier.
    
    Parameters:
    - X_train, X_test: Scaled training and testing features
    - y_train, y_test: Training and testing labels
    - n_estimators: The number of trees in the forest
    - max_depth: Maximum depth of the trees (None for no restriction)
    - min_samples_split: Minimum number of samples required to split a node

    Returns:
    - Trained RandomForestClassifier
    """
    # Step 1: Initialize the Random Forest model
    rf_model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, random_state=42)
    
    # Step 2: Train the model on the training data
    rf_model.fit(X_train, y_train)
    
    # Step 3: Make predictions on the test data
    y_pred = rf_model.predict(X_test)
    
    # Step 4: Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Random Forest Accuracy: {accuracy:.4f}")
    
    # Step 5: Print a classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    return rf_model

# Function for hyperparameter tuning of RandomForestClassifier
def tune_random_forest(X_train, y_train):
    """
    Perform hyperparameter tuning for RandomForestClassifier using GridSearchCV.
    
    Parameters:
    - X_train: Scaled training features
    - y_train: Training labels
    
    Returns:
    - Best estimator after tuning
    """
    # Step 1: Define the Random Forest model
    rf_model = RandomForestClassifier(random_state=42)

    # Step 2: Set the hyperparameter grid
    param_grid = {
        'n_estimators': [50, 100, 200],  # Number of trees in the forest
        'max_depth': [None, 10, 20, 30],  # Maximum depth of each tree
        'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
        'min_samples_leaf': [1, 2, 4],  # Minimum samples required at a leaf node
        'bootstrap': [True, False]  # Whether to use bootstrapping
    }
    
    # Step 3: Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=rf_model,
                               param_grid=param_grid,
                               cv=5,  # 5-fold cross-validation
                               verbose=1,  # Show progress
                               n_jobs=-1)  # Use all available cores
    
    # Step 4: Fit GridSearchCV
    grid_search.fit(X_train, y_train)
    
    # Step 5: Print the best parameters and best score
    print(f"Best Hyperparameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")
    
    return grid_search.best_estimator_

# Function to train and evaluate AdaBoostClassifier
def train_adaboost(X_train, X_test, y_train, y_test, n_estimators=50, learning_rate=1.0):
    """
    Train and evaluate an AdaBoostClassifier.
    
    Parameters:
    - X_train, X_test: Scaled training and testing features
    - y_train, y_test: Training and testing labels
    - n_estimators: The number of weak learners (default is 50)
    - learning_rate: Shrinks the contribution of each classifier by this factor (default is 1.0)

    Returns:
    - Trained AdaBoostClassifier
    """
    # Step 1: Initialize the AdaBoost model
    adaboost_model = AdaBoostClassifier(n_estimators=n_estimators, 
                                        learning_rate=learning_rate, 
                                        random_state=0,
                                       algorithm="SAMME")
    
    # Step 2: Train the model on the training data
    adaboost_model.fit(X_train, y_train)
    
    # Step 3: Make predictions on the test data
    y_pred = adaboost_model.predict(X_test)
    
    # Step 4: Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"AdaBoost Accuracy: {accuracy:.4f}")
    
    # Step 5: Print a classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    return adaboost_model

# Function for hyperparameter tuning of AdaBoostClassifier
def tune_adaboost(X_train, y_train):
    """
    Perform hyperparameter tuning for AdaBoostClassifier using GridSearchCV.
    
    Parameters:
    - X_train: Scaled training features
    - y_train: Training labels
    
    Returns:
    - Best estimator after tuning
    """
    # Step 1: Define the AdaBoost model
    adaboost_model = AdaBoostClassifier(random_state=0, algorithm = "SAMME")

    # Step 2: Set the hyperparameter grid
    param_grid = {
        'n_estimators': [50, 100, 200, 300, 500, 750, 1000, 1500],  # Number of weak learners (trees)
        'learning_rate': [0.01, 0.1, 0.5, 1.0]  # Shrinkage of weak learners' contribution
    }
    
    # Step 3: Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=adaboost_model,
                               param_grid=param_grid,
                               cv=5,  # 5-fold cross-validation
                               verbose=1,  # Show progress
                               n_jobs=-1)  # Use all available cores
    
    # Step 4: Fit GridSearchCV
    grid_search.fit(X_train, y_train)
    
    # Step 5: Print the best parameters and best score
    print(f"Best Hyperparameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")
    
    return grid_search.best_estimator_

# Function to train and evaluate ExtraTreesClassifier
def train_extra_trees(X_train, X_test, y_train, y_test, n_estimators=100, max_depth=None, min_samples_split=2):
    """
    Train and evaluate an ExtraTreesClassifier.
    
    Parameters:
    - X_train, X_test: Scaled training and testing features
    - y_train, y_test: Training and testing labels
    - n_estimators: The number of trees in the forest (default is 100)
    - max_depth: Maximum depth of the trees (None for no restriction)
    - min_samples_split: Minimum number of samples required to split a node (default is 2)

    Returns:
    - Trained ExtraTreesClassifier
    """
    # Step 1: Initialize the Extra Trees model
    et_model = ExtraTreesClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, random_state=42)
    
    # Step 2: Train the model on the training data
    et_model.fit(X_train, y_train)
    
    # Step 3: Make predictions on the test data
    y_pred = et_model.predict(X_test)
    
    # Step 4: Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Extra Trees Accuracy: {accuracy:.4f}")
    
    # Step 5: Print a classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    return et_model

# Function for hyperparameter tuning of ExtraTreesClassifier
def tune_extra_trees(X_train, y_train):
    """
    Perform hyperparameter tuning for ExtraTreesClassifier using GridSearchCV.
    
    Parameters:
    - X_train: Scaled training features
    - y_train: Training labels
    
    Returns:
    - Best estimator after tuning
    """
    # Step 1: Define the Extra Trees model
    et_model = ExtraTreesClassifier(random_state=42)

    # Step 2: Set the hyperparameter grid
    param_grid = {
        'n_estimators': [50, 100, 200],  # Number of trees in the forest
        'max_depth': [None, 10, 20, 30],  # Maximum depth of each tree
        'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
        'min_samples_leaf': [1, 2, 4],  # Minimum samples required at a leaf node
        'bootstrap': [True, False]  # Whether to use bootstrapping
    }
    
    # Step 3: Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=et_model,
                               param_grid=param_grid,
                               cv=5,  # 5-fold cross-validation
                               verbose=1,  # Show progress
                               n_jobs=-1)  # Use all available cores
    
    # Step 4: Fit GridSearchCV
    grid_search.fit(X_train, y_train)
    
    # Step 5: Print the best parameters and best score
    print(f"Best Hyperparameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")
    
    return grid_search.best_estimator_

# Function to train and evaluate MLPClassifier
def train_mlp(X_train, X_test, y_train, y_test, hidden_layer_sizes=(100,), max_iter=200, learning_rate_init=0.001):
    """
    Train and evaluate an MLPClassifier.
    
    Parameters:
    - X_train, X_test: Scaled training and testing features
    - y_train, y_test: Training and testing labels
    - hidden_layer_sizes: The number of neurons in the hidden layers (default is (100,))
    - max_iter: Maximum number of iterations for training (default is 200)
    - learning_rate_init: The initial learning rate for weight updates (default is 0.001)

    Returns:
    - Trained MLPClassifier
    """
    # Step 1: Initialize the MLP model
    mlp_model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, max_iter=max_iter, learning_rate_init=learning_rate_init, random_state=42)
    
    # Step 2: Train the model on the training data
    mlp_model.fit(X_train, y_train)
    
    # Step 3: Make predictions on the test data
    y_pred = mlp_model.predict(X_test)
    
    # Step 4: Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"MLP Accuracy: {accuracy:.4f}")
    
    # Step 5: Print a classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    return mlp_model

# Function for hyperparameter tuning of MLPClassifier
def tune_mlp(X_train, y_train):
    """
    Perform hyperparameter tuning for MLPClassifier using GridSearchCV.
    
    Parameters:
    - X_train: Scaled training features
    - y_train: Training labels
    
    Returns:
    - Best estimator after tuning
    """
    # Step 1: Define the MLP model
    mlp_model = MLPClassifier(random_state=42)

    # Step 2: Set the hyperparameter grid
    param_grid = {
        'hidden_layer_sizes': [(100,), (100, 50), (50, 50)],  # Number of neurons in hidden layers
        'max_iter': [1000, 1500],                             # Maximum number of iterations
        'learning_rate_init': [0.001, 0.01, 0.1]              # Initial learning rate
    }
    
    # Step 3: Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=mlp_model,
                               param_grid=param_grid,
                               cv=5,  # 5-fold cross-validation
                               verbose=1,  # Show progress
                               n_jobs=-1)  # Use all available cores
    
    # Step 4: Fit GridSearchCV
    grid_search.fit(X_train, y_train)
    
    # Step 5: Print the best parameters and best score
    print(f"Best Hyperparameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")
    
    return grid_search.best_estimator_

# Function to train and evaluate BaggingClassifier
def train_bagging(X_train, X_test, y_train, y_test, base_estimator=None, n_estimators=10):
    """
    Train and evaluate a BaggingClassifier.
    
    Parameters:
    - X_train, X_test: Scaled training and testing features
    - y_train, y_test: Training and testing labels
    - base_estimator: The base model to use for bagging (default is DecisionTreeClassifier)
    - n_estimators: The number of base estimators (default is 10)
    
    Returns:
    - Trained BaggingClassifier
    """
    # Step 1: Initialize the base estimator (default is DecisionTreeClassifier)
    if base_estimator is None:
        base_estimator = DecisionTreeClassifier()
    
    # Step 2: Initialize the Bagging model
    bagging_model = BaggingClassifier(estimator=base_estimator, n_estimators=n_estimators, random_state=42)
    
    # Step 3: Train the model on the training data
    bagging_model.fit(X_train, y_train)
    
    # Step 4: Make predictions on the test data
    y_pred = bagging_model.predict(X_test)
    
    # Step 5: Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Bagging Classifier Accuracy: {accuracy:.4f}")
    
    # Step 6: Print a classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    return bagging_model

# Function for hyperparameter tuning of BaggingClassifier
def tune_bagging(X_train, y_train):
    """
    Perform hyperparameter tuning for BaggingClassifier using GridSearchCV.
    
    Parameters:
    - X_train: Scaled training features
    - y_train: Training labels
    
    Returns:
    - Best estimator after tuning
    """
    # Step 1: Define the Bagging model
    base_estimator = DecisionTreeClassifier()
    bagging_model = BaggingClassifier(estimator=base_estimator, random_state=42)

    # Step 2: Set the hyperparameter grid
    param_grid = {
        'n_estimators': [10, 50, 100],  # Number of base estimators
        'max_samples': [0.5, 0.7, 1.0],  # Proportion of samples used for training
        'max_features': [0.5, 0.7, 1.0],  # Proportion of features used for training
        'bootstrap': [True, False],  # Whether to use bootstrapping for sampling
    }
    
    # Step 3: Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=bagging_model,
                               param_grid=param_grid,
                               cv=5,  # 5-fold cross-validation
                               verbose=1,  # Show progress
                               n_jobs=-1)  # Use all available cores
    
    # Step 4: Fit GridSearchCV
    grid_search.fit(X_train, y_train)
    
    # Step 5: Print the best parameters and best score
    print(f"Best Hyperparameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")
    
    return grid_search.best_estimator_

In [3]:
# Choose data source: 'url' for online dataset, 'local' for local file
data_source = 'local'  # CHANGE TO 'url' IF YOU YOU LOAD FROM THE `URL`

# Load the datasets
data = load_dataset(data_source, DATA_URL, LOCAL_PATH, BASE_LOCAL_PATH)

# Display the first few rows of the dataset (if loaded successfully)
if data is not None:
    print(data.head())

Loading data from local path: /ML-Datasets/benchmark-ml-datasets/iris.csv
   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   class         150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [5]:
# Process the data
X_train_scaled, X_test_scaled, y_train, y_test = process_data(data)

In [6]:
# Check the first few rows of the processed data
print(f"X_train_scaled shape: {X_train_scaled.shape}")
print(f"y_train shape: {y_train.shape}")

X_train_scaled shape: (105, 4)
y_train shape: (105,)


In [7]:
# Train the KNN model
train_knn(X_train_scaled, X_test_scaled, y_train, y_test)

KNN Accuracy: 0.9778
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [8]:
# Train the Logistic Regression model
train_logistic_regression(X_train_scaled, X_test_scaled, y_train, y_test)

Logistic Regression Accuracy: 0.9778
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [9]:
# Example usage: Perform hyperparameter tuning
best_logistic_regression = tune_logistic_regression(X_train_scaled, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Hyperparameters: {'C': 100.0, 'penalty': 'l2', 'solver': 'newton-cg'}
Best Cross-Validation Accuracy: 0.9714


In [10]:
# Example: Use the best model to make predictions and evaluate
y_pred = best_logistic_regression.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.9778
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [11]:
# Train the SVM model
train_svm(X_train_scaled, X_test_scaled, y_train, y_test)

SVM Accuracy: 0.9778
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [12]:
# find the best svm classifier hyperparameters
best_svm = tune_svm(X_train_scaled, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best Hyperparameters: {'C': 100, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}
Best Cross-Validation Accuracy: 0.9619


In [13]:
y_pred = best_svm.predict(X_test_scaled)
print(f"SVM Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

SVM Accuracy: 0.9778
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [14]:
# Train and evaluate the Gradient Boosting model
gb_model = train_gradient_boosting(X_train_scaled, X_test_scaled, y_train, y_test)

Gradient Boosting Accuracy: 0.9778
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [15]:
# Perform hyperparameter tuning for GradientBoostingClassifier
best_gb_model = tune_gradient_boosting(X_train_scaled, y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_split': 10, 'n_estimators': 50, 'subsample': 0.8}
Best Cross-Validation Accuracy: 0.9619


In [16]:
# Use the best model to make predictions and evaluate
y_pred = best_gb_model.predict(X_test_scaled)
print(f"Gradient Boosting Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

Gradient Boosting Accuracy: 0.9778
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [17]:
# Train and evaluate the Random Forest model
rf_model = train_random_forest(X_train_scaled, X_test_scaled, y_train, y_test)

Random Forest Accuracy: 0.9778
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [18]:
# Perform hyperparameter tuning for RandomForestClassifier
best_rf_model = tune_random_forest(X_train_scaled, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Hyperparameters: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Cross-Validation Accuracy: 0.9429


In [19]:
#  Use the best model to make predictions and evaluate
y_pred = best_rf_model.predict(X_test_scaled)
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

Random Forest Accuracy: 0.9778
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [20]:
# Train and evaluate the AdaBoost model
adaboost_model = train_adaboost(X_train_scaled, X_test_scaled, y_train, y_test)

AdaBoost Accuracy: 0.9333
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       0.89      0.94      0.92        18
           2       0.90      0.82      0.86        11

    accuracy                           0.93        45
   macro avg       0.93      0.92      0.93        45
weighted avg       0.93      0.93      0.93        45



In [21]:
# Perform hyperparameter tuning for AdaBoostClassifier
best_adaboost_model = tune_adaboost(X_train_scaled, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best Hyperparameters: {'learning_rate': 0.01, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.9619


In [22]:
# Use the best model to make predictions and evaluate
y_pred = best_adaboost_model.predict(X_test_scaled)
print(f"AdaBoost Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

AdaBoost Accuracy: 0.9111
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       0.85      0.94      0.89        18
           2       0.89      0.73      0.80        11

    accuracy                           0.91        45
   macro avg       0.91      0.89      0.90        45
weighted avg       0.91      0.91      0.91        45



In [23]:
# Example usage: Train and evaluate the Extra Trees model
et_model = train_extra_trees(X_train_scaled, X_test_scaled, y_train, y_test)

Extra Trees Accuracy: 0.9778
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [24]:
# Perform hyperparameter tuning for ExtraTreesClassifier
best_et_model = tune_extra_trees(X_train_scaled, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Hyperparameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
Best Cross-Validation Accuracy: 0.9524


In [25]:
# Use the best model to make predictions and evaluate
y_pred = best_et_model.predict(X_test_scaled)
print(f"Extra Trees Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

Extra Trees Accuracy: 0.9778
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [26]:
# Train and evaluate the MLP model
mlp_model = train_mlp(X_train_scaled, X_test_scaled, y_train, y_test)

MLP Accuracy: 0.9778
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [27]:
# Perform hyperparameter tuning for MLPClassifier
best_mlp_model = tune_mlp(X_train_scaled, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Hyperparameters: {'hidden_layer_sizes': (100,), 'learning_rate_init': 0.1, 'max_iter': 1000}
Best Cross-Validation Accuracy: 0.9429


In [28]:
# Use the best model to make predictions and evaluate
y_pred = best_mlp_model.predict(X_test_scaled)
print(f"MLP Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

MLP Accuracy: 0.9778
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [29]:
# Train and evaluate the Bagging model
bagging_model = train_bagging(X_train_scaled, X_test_scaled, y_train, y_test)

Bagging Classifier Accuracy: 0.9778
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [30]:
# Perform hyperparameter tuning for BaggingClassifier
best_bagging_model = tune_bagging(X_train_scaled, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Hyperparameters: {'bootstrap': True, 'max_features': 0.5, 'max_samples': 0.7, 'n_estimators': 50}
Best Cross-Validation Accuracy: 0.9524


In [31]:
# Use the best model to make predictions and evaluate
y_pred = best_bagging_model.predict(X_test_scaled)
print(f"Bagging Classifier Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

Bagging Classifier Accuracy: 0.9778
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [32]:
# Traing CatBoost Classifier
from catboost import CatBoostClassifier

In [33]:
def train_catboost(X_train, X_test, y_train, y_test, iterations=1000, learning_rate=0.1, depth=6):
    """
    Train and evaluate a CatBoostClassifier.
    
    Parameters:
    - X_train, X_test: Scaled training and testing features
    - y_train, y_test: Training and testing labels
    - iterations: The number of boosting iterations (default is 1000)
    - learning_rate: Learning rate for the model (default is 0.1)
    - depth: Depth of the trees (default is 6)
    
    Returns:
    - Trained CatBoostClassifier
    """
    # Step 1: Initialize the CatBoost model
    catboost_model = CatBoostClassifier(iterations=iterations, 
                                        learning_rate=learning_rate, 
                                        depth=depth,
                                        verbose=100, 
                                        random_state=0)
    
    # Step 2: Train the model on the training data
    catboost_model.fit(X_train, y_train)
    
    # Step 3: Make predictions on the test data
    y_pred = catboost_model.predict(X_test)
    
    # Step 4: Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"CatBoost Accuracy: {accuracy:.4f}")
    
    # Step 5: Print a classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    return catboost_model

# Function for hyperparameter tuning of CatBoostClassifier
def tune_catboost(X_train, y_train):
    """
    Perform hyperparameter tuning for CatBoostClassifier using GridSearchCV.
    
    Parameters:
    - X_train: Scaled training features
    - y_train: Training labels
    
    Returns:
    - Best estimator after tuning
    """
    # Step 1: Define the CatBoost model
    catboost_model = CatBoostClassifier(verbose=0, random_state=42)

    # Step 2: Set the hyperparameter grid
    param_grid = {
        'iterations': [500, 1000],  # Number of boosting iterations
        'learning_rate': [0.01, 0.05, 0.1],  # Learning rate
        'depth': [4, 6, 8],  # Depth of the trees
    }
    
    # Step 3: Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=catboost_model,
                               param_grid=param_grid,
                               cv=5,  # 5-fold cross-validation
                               verbose=1,  # Show progress
                               n_jobs=-1)  # Use all available cores
    
    # Step 4: Fit GridSearchCV
    grid_search.fit(X_train, y_train)
    
    # Step 5: Print the best parameters and best score
    print(f"Best Hyperparameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")
    
    return grid_search.best_estimator_

In [34]:
# Train and evaluate the CatBoost model
catboost_model = train_catboost(X_train_scaled, X_test_scaled, y_train, y_test)

0:	learn: 0.9827943	total: 56.8ms	remaining: 56.7s
100:	learn: 0.0472036	total: 82.4ms	remaining: 733ms
200:	learn: 0.0197420	total: 107ms	remaining: 424ms
300:	learn: 0.0122758	total: 132ms	remaining: 306ms
400:	learn: 0.0089466	total: 157ms	remaining: 235ms
500:	learn: 0.0070530	total: 185ms	remaining: 184ms
600:	learn: 0.0057753	total: 210ms	remaining: 139ms
700:	learn: 0.0048400	total: 234ms	remaining: 100ms
800:	learn: 0.0041801	total: 260ms	remaining: 64.5ms
900:	learn: 0.0036865	total: 287ms	remaining: 31.5ms
999:	learn: 0.0032939	total: 311ms	remaining: 0us
CatBoost Accuracy: 0.9778
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

In [35]:
# Perform hyperparameter tuning for CatBoostClassifier
best_catboost_model = tune_catboost(X_train_scaled, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Hyperparameters: {'depth': 4, 'iterations': 500, 'learning_rate': 0.01}
Best Cross-Validation Accuracy: 0.9429


In [36]:
# Use the best model to make predictions and evaluate
y_pred = best_catboost_model.predict(X_test_scaled)
print(f"CatBoost Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

CatBoost Accuracy: 0.9778
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [37]:
# Train XGBoost Classifier
from xgboost import XGBClassifier

def train_xgboost(X_train, X_test, y_train, y_test, learning_rate=0.1, n_estimators=100, max_depth=3, subsample=1.0, colsample_bytree=1.0):
    """
    Train an XGBoostClassifier with specified hyperparameters.
    
    Parameters:
    - X_train, X_test: Training and testing features (numpy arrays or DataFrames)
    - y_train, y_test: Training and testing labels (numpy arrays or Series)
    - learning_rate: Step size shrinkage used to prevent overfitting (default: 0.1)
    - n_estimators: Number of boosting rounds (default: 100)
    - max_depth: Maximum depth of the tree (default: 3)
    - subsample: Subsample ratio of the training data (default: 1.0)
    - colsample_bytree: Subsample ratio of columns when constructing trees (default: 1.0)
    
    Returns:
    - model: Trained XGBoostClassifier model
    """
    # Initialize the XGBoost model with the specified hyperparameters
    xgb_model = XGBClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        eval_metric='mlogloss',
        random_state=42
    )
    
    # Train the model
    xgb_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = xgb_model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"XGBoost Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    return xgb_model

def tune_xgboost(X_train, y_train):
    """
    Perform hyperparameter tuning for XGBoostClassifier using GridSearchCV.
    
    Parameters:
    - X_train: Training features (numpy arrays or DataFrames)
    - y_train: Training labels (numpy arrays or Series)
    
    Returns:
    - best_model: XGBoostClassifier with the best hyperparameters
    """
    # Initialize the base model
    xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
    
    # Define the hyperparameter grid to search
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [100, 200, 500],
        'max_depth': [3, 5, 7],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    }
    
    # Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=xgb_model,
                               param_grid=param_grid,
                               scoring='accuracy',
                               cv=5,  # 5-fold cross-validation
                               verbose=1,
                               n_jobs=-1)  # Use all available cores
    
    # Fit GridSearchCV on the training data
    grid_search.fit(X_train, y_train)
    
    # Print the best parameters and score
    print(f"Best Hyperparameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")
    
    # Return the best model
    return grid_search.best_estimator_



def train_and_tune_xgboost(X_train, X_test, y_train, y_test, param_grid=None):
    """
    Train and fine-tune an XGBoostClassifier using GridSearchCV.

    Parameters:
    - X_train, X_test: Features for training and testing (numpy arrays or DataFrames)
    - y_train, y_test: Labels for training and testing (numpy arrays or Series)
    - param_grid: Dictionary of hyperparameters to search for the best model (default is None, which uses basic hyperparameters)

    Returns:
    - best_model: The XGBoostClassifier with the best hyperparameters
    """
    
    # Step 1: Initialize the base XGBoostClassifier model
    xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
    
    # Step 2: If no param_grid is provided, define a default one
    if param_grid is None:
        param_grid = {
            'learning_rate': [0.01, 0.05, 0.1],   # Learning rate
            'n_estimators': [100, 200, 500],      # Number of boosting rounds
            'max_depth': [3, 5, 7],               # Maximum depth of a tree
            'subsample': [0.6, 0.8, 1.0],         # Subsampling ratio of the training data
            'colsample_bytree': [0.6, 0.8, 1.0]   # Subsampling ratio of columns when constructing trees
        }
    
    # Step 3: Initialize GridSearchCV with 5-fold cross-validation
    grid_search = GridSearchCV(estimator=xgb_model,
                               param_grid=param_grid,
                               scoring='accuracy',  # Evaluate using accuracy
                               cv=5,                # 5-fold cross-validation
                               verbose=1,           # Show progress
                               n_jobs=-1)           # Use all available cores
    
    # Step 4: Fit GridSearchCV on the training data
    grid_search.fit(X_train, y_train)
    
    # Step 5: Retrieve the best model
    best_model = grid_search.best_estimator_
    
    # Step 6: Make predictions on the test data using the best model
    y_pred = best_model.predict(X_test)
    
    # Step 7: Evaluate the model using accuracy score and classification report
    accuracy = accuracy_score(y_test, y_pred)
    print(f"XGBoost Accuracy: {accuracy:.4f}")
    
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    # Step 8: Return the best model
    return best_model

In [38]:
#  Train and fine-tune XGBoost
best_xgboost_model = train_xgboost(X_train_scaled, X_test_scaled, y_train, y_test)

XGBoost Accuracy: 0.9778
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [39]:
# Perform hyperparameter tuning for XGBoostClassifier
best_xgb_model = tune_xgboost(X_train_scaled, y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Hyperparameters: {'colsample_bytree': 0.6, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.6}
Best Cross-Validation Accuracy: 0.9524


In [40]:
# Use the best model to make predictions and evaluate
y_pred = best_xgb_model.predict(X_test_scaled)
print(f"XGBoost Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

CatBoost Accuracy: 0.9778
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [41]:
xgb_classifier = train_and_tune_xgboost(X_train_scaled, X_test_scaled, y_train, y_test)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
XGBoost Accuracy: 0.9778
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [42]:
# Train LGBMClassifier
from lightgbm import LGBMClassifier

In [90]:
def train_lgbm(X_train, X_test, y_train, y_test, learning_rate=0.1, n_estimators=100, max_depth=-1, subsample=1.0, colsample_bytree=1.0):
    """
    Train an LGBMClassifier with specified hyperparameters.
    
    Parameters:
    - X_train, X_test: Training and testing features (numpy arrays or DataFrames)
    - y_train, y_test: Training and testing labels (numpy arrays or Series)
    - learning_rate: Step size shrinkage used to prevent overfitting (default: 0.1)
    - n_estimators: Number of boosting rounds (default: 100)
    - max_depth: Maximum depth of the tree (default: -1, which means no limit)
    - subsample: Subsample ratio of the training data (default: 1.0)
    - colsample_bytree: Subsample ratio of columns when constructing trees (default: 1.0)
    
    Returns:
    - model: Trained LGBMClassifier model
    """
    warnings.filterwarnings('ignore')
    # Initialize the LGBM model with the specified hyperparameters
    lgbm_model = LGBMClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=0,
        verbose = -1
    )
    
    # Train the model
    lgbm_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = lgbm_model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"LGBM Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    return lgbm_model


def tune_lgbm_optimized(X_train, y_train):
    """
    Perform optimized hyperparameter tuning for LGBMClassifier using RandomizedSearchCV
    with reduced search space and early stopping.
    
    Parameters:
    - X_train: Training features (numpy arrays or DataFrames)
    - y_train: Training labels (numpy arrays or Series)
    
    Returns:
    - best_model: LGBMClassifier with the best hyperparameters
    """
    # Suppress warnings and output
    warnings.filterwarnings('ignore')
    
    # Initialize the base LGBMClassifier with a reasonable starting configuration
    lgbm_model = LGBMClassifier(random_state=42, verbose=-1)
    
    # Define a reduced hyperparameter grid to search
    param_grid = {
        'learning_rate': [0.05, 0.1],   # Reduced options
        'n_estimators': [50, 100],      # Fewer boosting rounds for faster training
        'max_depth': [3, 5],            # Restrict tree depth to prevent overfitting
        'subsample': [0.8],             # Fixed subsample ratio
        'colsample_bytree': [0.8]       # Fixed column subsample ratio
    }
    
    # Initialize RandomizedSearchCV with reduced iterations and cross-validation folds
    randomized_search = RandomizedSearchCV(estimator=lgbm_model,
                                           param_distributions=param_grid,
                                           n_iter=5,  # Try only 5 random combinations
                                           scoring='accuracy',
                                           cv=2,  # Use 2-fold cross-validation
                                           verbose=0,
                                           n_jobs=-1)  # Use all available cores
    
    # Fit RandomizedSearchCV on the training data
    randomized_search.fit(X_train, y_train)
    
    # Print the best parameters and cross-validation score
    print(f"Best Hyperparameters: {randomized_search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {randomized_search.best_score_:.4f}")
    
    # Return the best estimator
    return randomized_search.best_estimator_



from sklearn.model_selection import RandomizedSearchCV


def tune_lgbm_optimized(X_train, y_train):
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [100, 200],
        'max_depth': [5, 10],
        'subsample': [0.8],
        'colsample_bytree': [0.8]
    }
    
    # Initialize the base LGBM model
    lgbm_model = LGBMClassifier(random_state=42, verbose=-1)
    
    # Initialize RandomizedSearchCV with 10 iterations and 3-fold CV
    randomized_search = RandomizedSearchCV(estimator=lgbm_model,
                                           param_distributions=param_grid,
                                           n_iter=10,  # Randomly sample 10 combinations
                                           scoring='accuracy',
                                           cv=3,  # Use 3-fold cross-validation
                                           verbose=1,
                                           n_jobs=-1)  # Use all available cores
    
    randomized_search.fit(X_train, y_train)
    
    print(f"Best Hyperparameters: {randomized_search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {randomized_search.best_score_:.4f}")
    
    return randomized_search.best_estimator_

In [86]:
# Train LBGClassifier
lgbm_model = train_lgbm(X_train_scaled, X_test_scaled, y_train, y_test)

LGBM Accuracy: 0.9556
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       0.94      0.94      0.94        18
           2       0.91      0.91      0.91        11

    accuracy                           0.96        45
   macro avg       0.95      0.95      0.95        45
weighted avg       0.96      0.96      0.96        45



In [87]:
# Example usage:
best_lgbm_model = tune_lgbm_optimized(X_train_scaled, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Hyperparameters: {'subsample': 0.8, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
Best Cross-Validation Accuracy: 0.9429


In [88]:
# Use the best model to make predictions and evaluate
y_pred = best_lgbm_model.predict(X_test_scaled)
print(f"LGBM Classifier Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

LGBM Classifier Accuracy: 0.9556
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       0.94      0.94      0.94        18
           2       0.91      0.91      0.91        11

    accuracy                           0.96        45
   macro avg       0.95      0.95      0.95        45
weighted avg       0.96      0.96      0.96        45



In [91]:
# Tuning and Evaluating the lgbm classifier
best_lgbm_model = tune_lgbm_optimized(X_train_scaled, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Hyperparameters: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
Best Cross-Validation Accuracy: 0.9429


In [92]:
# Use the best model to make predictions and evaluate
y_pred = best_lgbm_model.predict(X_test_scaled)
print(f"LGBM Classifier Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

LGBM Classifier Accuracy: 0.9778
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

