In [139]:
from pathlib import Path
import tempfile

from loguru import logger
import numpy as np
import pandas as pd
import sklearn

# Load initial dataset
Explore the Boston housing dataset and generate the intermediate processing dataset.

In [140]:
def load_data():
    """
    Load the raw Boston housing CSV from the local ``./datas`` folder.

    Returns:
        pd.DataFrame | None: the loaded dataframe, or None when the file is
        missing or cannot be read (the failure is logged, not raised).
    """
    file_name_without_extension = "bostonhousing-693729dedb019653836667"
    original_file_path = Path("./datas") / f"{file_name_without_extension}.csv"

    try:
        full_pd_dataset = pd.read_csv(original_file_path)
    except FileNotFoundError:
        logger.error(f"{original_file_path.absolute()} was not found")
        return None
    except Exception as e:
        # Best-effort loader: any other read/parse problem is reported and
        # signalled to the caller through an explicit None.
        logger.error(f"Something went wrong loading data: {e}")
        return None

    logger.info(f"Successfully loaded data from {original_file_path}")
    logger.info(f"Data shape: {full_pd_dataset.shape}")
    return full_pd_dataset

## iqr outlier detection
IQR (interquartile range): any value outside the interval `[Q1 - f * IQR, Q3 + f * IQR]` is flagged as an outlier, where `f` is a factor (standard value 1.5) and IQR = Q3 - Q1.

In [141]:
def iqr_detection(df, column, factor=1.5):
    """
    Flag the outliers of one column with the interquartile-range rule.

    A value is an outlier when it lies strictly below ``Q1 - factor * IQR``
    or strictly above ``Q3 + factor * IQR``.

    Args:
        df (pd.DataFrame): Input dataframe
        column (str): name of the column to analyze
        factor (float): multiplier applied to the IQR (standard : 1.5)
    Returns:
        pd.Series : boolean mask (True = outlier, False = non outlier)
    """
    values = df[column]

    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    margin = factor * (third_quartile - first_quartile)

    too_low = values < first_quartile - margin
    too_high = values > third_quartile + margin

    return too_low | too_high

## z-score outlier detection
The z-score counts how many standard deviations separate an observation from the mean. The standard threshold is +/- 3 standard deviations.

In [142]:
def zscore_detection(df, column, threshold=3):
    """
    Flag the outliers of one column using the absolute Z-score.

    Args:
        df (pd.DataFrame): Input dataframe
        column (str): Column to analyze
        threshold (float): Z-score threshold (standard = 3.0).

    Returns:
        pd.Series: boolean mask (True = outlier, False = non-outlier).
    """
    series = df[column]

    # Distance of every observation to the mean, in standard deviations
    standardized = (series - series.mean()) / series.std()

    # Outlier when the absolute Z-score is strictly above the threshold
    return standardized.abs() > threshold

## Isolation forest outlier detection
Isolation Forest is an unsupervised learning algorithm that isolates abnormal observations. Requires scikit-learn.

In [143]:
from sklearn.ensemble import IsolationForest

def isolation_forest_detection(df, column, contamination='auto', random_state=42):
    """
    Flag the outliers of one column with an Isolation Forest.

    NOTE: Isolation Forest is a multi-dimensional algorithm. It is fitted here
          on a single column so that it shares the same signature as the other
          detection functions; it usually performs better when given several
          relevant columns.

    Args:
        df (pd.DataFrame): Input dataframe
        column (str): Column to test
        contamination (str/float): forwarded unchanged to ``IsolationForest``
            ('auto', or the expected proportion of outliers per the
            scikit-learn documentation)
        random_state (int): seed forwarded to the model for reproducibility

    Returns:
        pd.Series: boolean mask (True = outlier, False = non-outlier),
        aligned on ``df.index``.
    """
    # Double brackets keep a 2D (N, 1) array, the shape the estimator expects
    features = df[[column]].values

    forest = IsolationForest(
        n_jobs=-1,  # Use all cores
        random_state=random_state,
        contamination=contamination,
    )

    # fit() returns the estimator itself; predict() yields 1 (inlier) or -1 (outlier)
    labels = forest.fit(features).predict(features)

    return pd.Series(labels == -1, index=df.index)

In [144]:
def outlier_detector(df, column, algorithm, **kwargs):
    """
    Apply one of the outlier detection functions and return the cleaned dataset.

    Args:
        df (pd.DataFrame): Input dataframe.
        column (str): Column name to analyze.
        algorithm (callable): Detection function (e.g., iqr_detection), called
            as ``algorithm(df, column, **kwargs)`` and expected to return a
            boolean mask where True marks an outlier.
        **kwargs: Extra arguments forwarded to the detection function
            (e.g., factor, threshold).

    Returns:
        pd.DataFrame: copy of ``df`` without the rows flagged as outliers.
        None is returned when ``df`` is None, and the original ``df`` is
        returned unchanged when the detection function fails.
    """
    if df is None:
        logger.error("No dataframe. Detection aborted")
        return None

    # functools.partial objects and callable instances carry no __name__,
    # so fall back to the wrapped function's name, then to the type name.
    raw_name = (
        getattr(algorithm, "__name__", None)
        or getattr(getattr(algorithm, "func", None), "__name__", None)
        or type(algorithm).__name__
    )
    algo_name = raw_name.replace('_detection', '').upper()
    logger.info(f"Running detection using : {algo_name}")

    try:
        # 1. Get outliers mask
        outlier_mask = algorithm(df, column, **kwargs)

        # 2. Count outliers (guard the ratio against an empty dataframe)
        num_outliers = int(outlier_mask.sum())
        total_rows = len(df)
        outlier_ratio = num_outliers / total_rows if total_rows else 0.0
        logger.info(f"{num_outliers} outliers were detected in '{column}' ({outlier_ratio:.2%})")

        # 3. Build cleaned dataframe (mask False = non-outlier).
        #    .copy() avoids SettingWithCopyWarning on later writes.
        cleaned_df = df[~outlier_mask].copy()

        logger.success(f"Outliers cleaning done. DataFrame reduced from {total_rows} to {len(cleaned_df)} rows.")

        return cleaned_df

    except Exception as e:
        logger.error(f"Error using {algo_name}: {e}")
        return df  # Return the original dataframe to avoid breaking the pipeline

## Normalization step
Starting from the cleaned dataframe, normalize the feature columns (all columns except the target and the binary ones) using MinMax scaling.

$$X_{norm} = \frac{X - X_{min}}{X_{max} - X_{min}}$$

With this method, all features end up in the same range ([0, 1]) and the shape of each distribution is preserved; extreme values are not an issue because outliers have already been removed.

On the Boston data, MinMax is preferable to MaxAbs because the values are already positive. MaxAbs is better suited to sparse data or to data centered on 0.

In [145]:
from sklearn.preprocessing import MinMaxScaler


def minmax_normalize(df, target_column='medv', binary_columns=['chas']):
    """
    Normalize feature columns using MinMax scaling (0 to 1).

    The input dataframe is left untouched: scaling is applied to a copy.

    Args:
        df (pd.DataFrame): Input dataframe.
        target_column (str): Target column (not normalized).
        binary_columns (list): Binary columns (not normalized). The default
            list is only read, never modified.

    Returns:
        pd.DataFrame: normalized copy of the dataframe.
        MinMaxScaler: fitted scaler, useful for a later denormalization.
        ``(None, None)`` is returned when ``df`` is None.
    """
    if df is None:
        logger.error("No dataframe. Normalization aborted.")
        return None, None

    # 1. Identify all columns to normalize (everything except target and binaries)
    columns_to_scale = [
        col for col in df.columns
        if col != target_column and col not in binary_columns
    ]

    logger.info(f"Columns to normalize ({len(columns_to_scale)}): {columns_to_scale}")

    # 2. Work on a copy so the caller's dataframe is not silently rescaled
    #    (re-running the calling cell then always starts from the same input).
    normalized_df = df.copy()

    # 3. Create the scaler, fit it on the feature columns only and write the
    #    scaled values back into the copy.
    scaler = MinMaxScaler()
    normalized_df[columns_to_scale] = scaler.fit_transform(normalized_df[columns_to_scale])

    logger.success("MinMax normalization successfully applyed on features.")

    return normalized_df, scaler

## Standardization using z-score

Depending on the regression model being trained, this function should be applied to the features.

In [146]:
# StandardScaler is part of sklearn.preprocessing's public API;
# sklearn.discriminant_analysis only happens to import it internally.
from sklearn.preprocessing import StandardScaler


def zscore_standardize(df, target_column='medv', binary_columns=['chas']):
    """
    Z-Score standardization of the feature columns
    (mean = 0, standard deviation = 1).

    The input dataframe is left untouched: scaling is applied to a copy.

    Args:
        df (pd.DataFrame): Input dataframe.
        target_column (str): Target column (not standardized).
        binary_columns (list): Binary columns (not standardized). The default
            list is only read, never modified.

    Returns:
        pd.DataFrame: standardized copy of the dataframe.
        StandardScaler: fitted scaler, useful to undo the standardization.
        ``(None, None)`` is returned when ``df`` is None.
    """
    if df is None:
        logger.error("No dataframe. Standardization aborted.")
        return None, None

    # 1. Identify columns to process (everything except target and binaries)
    columns_to_scale = [
        col for col in df.columns
        if col != target_column and col not in binary_columns
    ]

    logger.info(f"Columns to process ({len(columns_to_scale)}): {columns_to_scale}")

    # 2. Work on a copy so the caller's dataframe is not silently rescaled
    standardized_df = df.copy()

    # 3. Create the scaler and fit it on the feature columns only
    scaler = StandardScaler()
    standardized_df[columns_to_scale] = scaler.fit_transform(standardized_df[columns_to_scale])

    logger.success("Z-Score successfully applied on features")

    return standardized_df, scaler

## NaN detection and imputation using one of the methods: median, mean or constant

Missing values (NaN) can bias the results or break model training, so we impute them.

In [147]:
from typing import Literal


def impute_missing_values(
    df: pd.DataFrame, 
    strategy: Literal['median', 'mean', 'constant'], 
    constant_value=None
) -> pd.DataFrame:
    """
    Detect and impute missing values (NaN) in the numerical columns.

    Args:
        df (pd.DataFrame): Input dataframe (not modified).
        strategy (Literal['median', 'mean', 'constant']): Imputation method to use
        constant_value (float, optional): Value used to fill NaN with the
            'constant' strategy. When it is None, columns are left as they are
            and a warning is logged.

    Returns:
        pd.DataFrame: copy of the dataframe after imputation
        (None when ``df`` is None).

    Raises:
        ValueError: if ``strategy`` is not one of 'median', 'mean', 'constant'.
    """
    if df is None:
        logger.error("No dataframe. Imputation aborted.")
        return None

    # Fail early with a clear message instead of an obscure fillna(None) error
    valid_strategies = ('median', 'mean', 'constant')
    if strategy not in valid_strategies:
        raise ValueError(
            f"Unknown imputation strategy {strategy!r}; expected one of {valid_strategies}"
        )

    df_imputed = df.copy()

    # Statistical imputation only makes sense for numerical columns
    # ('object'/string columns are excluded)
    numeric_cols = df_imputed.select_dtypes(include=['number']).columns

    logger.info(f"Start imputation using : {strategy.upper()} strategy")

    for col in numeric_cols:
        # 1. NaN detection
        nan_count = int(df_imputed[col].isnull().sum())

        if nan_count == 0:
            continue

        # 2. Imputation value determination according to the strategy
        if strategy == 'median':
            impute_val = df_imputed[col].median()
        elif strategy == 'mean':
            impute_val = df_imputed[col].mean()
        else:  # 'constant'
            impute_val = constant_value
            if impute_val is None:
                logger.warning(f"'constant' strategy used for '{col}' but constant_value is None. Ignored.")
                continue

        # 3. Imputation. Assign the filled column back: an in-place fillna on
        #    df_imputed[col] is a chained assignment and does not update the
        #    dataframe when pandas Copy-on-Write is active.
        df_imputed[col] = df_imputed[col].fillna(impute_val)
        logger.debug(f"Column '{col}': {nan_count} NaN values imputed with {impute_val:.4f} ({strategy}).")

    # Final check
    if df_imputed.isnull().sum().sum() == 0:
        logger.success("Imputation done : No NaN detected.")
    else:
        logger.warning("Warning : Some NaN detected (probably non numerical columns).")

    return df_imputed

## Artifacts : graphs

- residual,
- predictions

In [148]:
import os
import matplotlib.pyplot as plt
import mlflow

def save_and_log_regression_plots(model, X_train, y_train, run_id):
    """
    Generate the residual and predicted-vs-real plots and log them to MLflow.

    The figures are rendered into a temporary directory and handed to
    ``mlflow.log_artifact`` (artifact sub-folder "plots"), so nothing is
    written by hand inside the ``mlruns`` tracking folder.

    Args:
        model: fitted estimator exposing ``predict``.
        X_train: features the predictions are computed on.
        y_train: real target values matching ``X_train``.
        run_id (str): identifier of the current run; only used to label the
            temporary working directory. Artifacts are logged through
            ``mlflow.log_artifact`` without an explicit run id.
    """
    # Predict on training dataset for plots
    y_pred = model.predict(X_train)
    residuals = y_train - y_pred

    # The PNG files only need to exist until MLflow has copied them
    with tempfile.TemporaryDirectory(prefix=f"plots_{run_id}_") as tmp_dir:

        # --- Plot #1 : residual ---
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.scatter(y_pred, residuals, alpha=0.5)
        ax.hlines(y=0, xmin=y_pred.min(), xmax=y_pred.max(), colors='red', linestyles='--')
        ax.set_title('Residual graph')
        ax.set_xlabel('Predicted values')
        ax.set_ylabel('Residual (Real - Predicted)')
        resid_plot_path = os.path.join(tmp_dir, "residuals_plot.png")
        fig.savefig(resid_plot_path)
        plt.close(fig)  # Free memory
        mlflow.log_artifact(resid_plot_path, "plots")
        logger.debug("Residual graph stored")

        # --- Plot #2 : Predicted vs Real ---
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.scatter(y_train, y_pred, alpha=0.5)
        # Diagonal = perfect prediction
        ax.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'r--')
        ax.set_title('Predicted vs. Real values')
        ax.set_xlabel('Real values')
        ax.set_ylabel('Predicted values')
        pred_plot_path = os.path.join(tmp_dir, "predictions_vs_actual.png")
        fig.savefig(pred_plot_path)
        plt.close(fig)
        mlflow.log_artifact(pred_plot_path, "plots")
        logger.debug("Predicted plot stored")

## Training and logging model using MLFlow
Training will be based on :

- CV (Cross validation),
- RMSE (Root Mean Squared Error) as scoring metric
- R2 scoring metric

In [149]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import KFold, cross_validate

def train_and_log_model(
    model, 
    X_train, 
    y_train, 
    model_name, 
    n_splits=5,
    random_state=42
):
    """
    Cross-validate a model, train it on the whole training set and log
    everything (metrics, plots, hyperparameters, model) in one MLflow run.

    Args:
        model: scikit-learn compatible regressor (must expose ``fit``,
            ``predict`` and ``get_params``).
        X_train: training features.
        y_train: training target.
        model_name (str): name given to the MLflow run.
        n_splits (int): number of K-Fold splits used for cross validation.
        random_state (int): seed of the shuffled K-Fold splitter.

    Returns:
        The model fitted on the whole training set.
    """
    logger.info(f"MLFlow training experience for: {model_name}")

    # Metrics settings
    cv_scoring_metrics = {
        'neg_mean_squared_error': 'neg_mean_squared_error',
        'r2': 'r2'
    }

    # Start MLFlow run
    with mlflow.start_run(run_name=model_name) as run:

        run_id = run.info.run_id

        # 1. Cross Validation (CV) with a shuffled K-Fold
        cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

        # The MSE scorer is negated ("neg_mean_squared_error"), hence the
        # sign flip below before taking the square root.
        cv_results = cross_validate(
            model, 
            X_train, 
            y_train, 
            scoring=cv_scoring_metrics, 
            cv=cv, 
            n_jobs=-1
        )

        # RMSE : convert the negated MSE of each fold to a positive RMSE
        rmse_scores = np.sqrt(-cv_results['test_neg_mean_squared_error'])
        mlflow.log_metric("cv_mean_rmse", rmse_scores.mean())
        mlflow.log_metric("cv_std_rmse", rmse_scores.std())
        logger.success(f"CV Mean RMSE for {model_name}: {rmse_scores.mean():.3f}")

        # R2
        r2_scores = cv_results['test_r2']
        mlflow.log_metric("cv_mean_r2", r2_scores.mean())
        mlflow.log_metric("cv_std_r2", r2_scores.std())
        logger.info(f"CV mean R2 for {model_name}: {r2_scores.mean():.3f}")

        # 2. Final training (on the whole training dataset)
        model.fit(X_train, y_train)

        # Plots and store results
        save_and_log_regression_plots(model, X_train, y_train, run_id)

        # 3. MLFlow logging

        # Hyperparameters storage (logged once; CV metrics are already
        # stored above through log_metric)
        mlflow.log_params(model.get_params())

        # Save final trained model
        mlflow.sklearn.log_model(model, "model")

    return model

## main : Handle all processes

In [150]:
# Load dataset
df = load_data()

# Every later cell needs 'df': stop here with a clear message instead of
# failing further down on a None object.
if df is None:
    logger.error("No data was loaded")
    raise RuntimeError("Dataset could not be loaded; check the log messages above.")

logger.info("Datas was sucessfully loaded and stored in 'df'")

[32m2025-12-09 15:55:22.448[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_data[0m:[36m8[0m - [1mSuccessfully loaded data from datas/bostonhousing-693729dedb019653836667.csv[0m
[32m2025-12-09 15:55:22.449[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_data[0m:[36m9[0m - [1mData shape: (506, 14)[0m
[32m2025-12-09 15:55:22.450[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mDatas was sucessfully loaded and stored in 'df'[0m


In [151]:
# Preview the first five rows of the loaded dataset
df.head(n=5)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


## Outlier detection
Run outlier detection using one of the algorithms: IQR, z-score or Isolation Forest.

In [152]:
# Apply outliers detection

TARGET_COLUMN = "medv"
# Informational only: this cell runs the three detectors below regardless.
ALGORITHM = "iqr_detection" # Can be : isolation_forest_detection | zscore_detection

df_iqr_cleaned = outlier_detector(
    df,
    column=TARGET_COLUMN,
    algorithm=iqr_detection,
    factor=1.5
)

df_zscore_cleaned = outlier_detector(
    df,
    column=TARGET_COLUMN,
    algorithm=zscore_detection,
    threshold=3.0
)

df_if_cleaned = outlier_detector(
    df=df,
    column=TARGET_COLUMN,
    algorithm=isolation_forest_detection,
    contamination=0.05 # Considering about 5% of outliers
)

# Only the last expression of a cell is rendered, so a single preview is kept
df_if_cleaned.head()

[32m2025-12-09 15:55:22.475[0m | [1mINFO    [0m | [36m__main__[0m:[36moutlier_detector[0m:[36m19[0m - [1mRunning detection using : IQR[0m
[32m2025-12-09 15:55:22.479[0m | [1mINFO    [0m | [36m__main__[0m:[36moutlier_detector[0m:[36m28[0m - [1m40 outliers was detected in 'medv' (7.91%)[0m
[32m2025-12-09 15:55:22.483[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36moutlier_detector[0m:[36m33[0m - [32m[1mOutliers cleaning done. DataFrame reduce to 506 à 466 rows.[0m
[32m2025-12-09 15:55:22.486[0m | [1mINFO    [0m | [36m__main__[0m:[36moutlier_detector[0m:[36m19[0m - [1mRunning detection using : ZSCORE[0m
[32m2025-12-09 15:55:22.488[0m | [1mINFO    [0m | [36m__main__[0m:[36moutlier_detector[0m:[36m28[0m - [1m0 outliers was detected in 'medv' (0.00%)[0m
[32m2025-12-09 15:55:22.492[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36moutlier_detector[0m:[36m33[0m - [32m[1mOutliers cleaning done. DataFrame reduce to 506 à 506 ro

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


## Apply normalization using MinMax algorithm

In [153]:
# The IQR-cleaned dataframe is the one carried forward in the pipeline
df_normalized, scaler = minmax_normalize(
    df=df_iqr_cleaned,
    binary_columns=["chas"],
    target_column=TARGET_COLUMN,
)
df_normalized.head(n=5)

[32m2025-12-09 15:55:22.572[0m | [1mINFO    [0m | [36m__main__[0m:[36mminmax_normalize[0m:[36m27[0m - [1mColumns to normalize (12): ['crim', 'zn', 'indus', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat'][0m
[32m2025-12-09 15:55:22.575[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mminmax_normalize[0m:[36m36[0m - [32m[1mMinMax normalization successfully applyed on features.[0m


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.0,0.18,0.058148,0,0.314815,0.577505,0.641607,0.268711,0.0,0.208015,0.287234,1.0,0.083356,24.0
1,0.000236,0.0,0.234444,0,0.17284,0.547998,0.782698,0.348524,0.043478,0.104962,0.553191,1.0,0.198944,21.6
2,0.000236,0.0,0.234444,0,0.17284,0.694386,0.599382,0.348524,0.043478,0.104962,0.553191,0.989737,0.05696,34.7
3,0.000293,0.0,0.053333,0,0.150206,0.658555,0.441813,0.448173,0.086957,0.066794,0.648936,0.994276,0.026674,33.4
4,0.000705,0.0,0.053333,0,0.150206,0.687105,0.528321,0.448173,0.086957,0.066794,0.648936,1.0,0.093081,36.2


## Apply imputation using the "mean" strategy
The mean strategy is best suited to data that is normalized, close to a normal distribution, and already cleaned of outliers.

In [154]:
# Fill the remaining NaN of the normalized dataframe with each column's mean
df_imputed_mean = impute_missing_values(df=df_normalized, strategy="mean")
df_imputed_mean.head(n=5)

[32m2025-12-09 15:55:22.583[0m | [1mINFO    [0m | [36m__main__[0m:[36mimpute_missing_values[0m:[36m30[0m - [1mStart imputation using : MEAN strategy[0m
[32m2025-12-09 15:55:22.584[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mimpute_missing_values[0m:[36m58[0m - [32m[1mImputation done : No NaN detected.[0m


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.0,0.18,0.058148,0,0.314815,0.577505,0.641607,0.268711,0.0,0.208015,0.287234,1.0,0.083356,24.0
1,0.000236,0.0,0.234444,0,0.17284,0.547998,0.782698,0.348524,0.043478,0.104962,0.553191,1.0,0.198944,21.6
2,0.000236,0.0,0.234444,0,0.17284,0.694386,0.599382,0.348524,0.043478,0.104962,0.553191,0.989737,0.05696,34.7
3,0.000293,0.0,0.053333,0,0.150206,0.658555,0.441813,0.448173,0.086957,0.066794,0.648936,0.994276,0.026674,33.4
4,0.000705,0.0,0.053333,0,0.150206,0.687105,0.528321,0.448173,0.086957,0.066794,0.648936,1.0,0.093081,36.2


In [155]:
from sklearn.model_selection import train_test_split

# Features / target separation, from df_imputed_mean (last clean dataset)
y = df_imputed_mean[TARGET_COLUMN]
X = df_imputed_mean.drop(columns=[TARGET_COLUMN])

# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
# NOTE(review): X_test / y_test are not used by the cells visible in this
# review; confirm the models are evaluated on them further down.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## Concrete training using multiple strategies

1. LinearRegression : good start to establish base performance

In [156]:
from sklearn.linear_model import LinearRegression


# Baseline: plain linear regression with default settings
lr_model = LinearRegression()
trained_lr = train_and_log_model(lr_model, X_train, y_train, "Linear_Regression_Baseline")

[32m2025-12-09 15:55:22.597[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_log_model[0m:[36m15[0m - [1mMLFlow training experience for: Linear_Regression_Baseline[0m
[32m2025-12-09 15:55:22.657[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mtrain_and_log_model[0m:[36m47[0m - [32m[1mCV Mean RMSE for Linear_Regression_Baseline: 3.327[0m
[32m2025-12-09 15:55:22.662[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_log_model[0m:[36m53[0m - [1mCV mean R2 for Linear_Regression_Baseline: 0.722[0m
[32m2025-12-09 15:55:23.057[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36msave_and_log_regression_plots[0m:[36m28[0m - [34m[1mResidual graph stored[0m
[32m2025-12-09 15:55:23.097[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36msave_and_log_regression_plots[0m:[36m41[0m - [34m[1mPredicted plot stored[0m
[32m2025-12-09 15:55:23.101[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mtrain_and_log_model[0m:[36m69[0m - [32m[1mMea

## Concrete training using multiple strategies

2. Random Forest: robust to feature-scaling issues

In [157]:
from sklearn.ensemble import RandomForestRegressor


# 100 trees, fixed seed, all cores
rf_model = RandomForestRegressor(
    n_jobs=-1,
    random_state=42,
    n_estimators=100,
)
trained_rf = train_and_log_model(rf_model, X_train, y_train, "Random_Forest_Default")

[32m2025-12-09 15:55:24.665[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_log_model[0m:[36m15[0m - [1mMLFlow training experience for: Random_Forest_Default[0m
[32m2025-12-09 15:55:25.690[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mtrain_and_log_model[0m:[36m47[0m - [32m[1mCV Mean RMSE for Random_Forest_Default: 2.738[0m
[32m2025-12-09 15:55:25.695[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_log_model[0m:[36m53[0m - [1mCV mean R2 for Random_Forest_Default: 0.813[0m
[32m2025-12-09 15:55:25.796[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36msave_and_log_regression_plots[0m:[36m28[0m - [34m[1mResidual graph stored[0m
[32m2025-12-09 15:55:25.828[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36msave_and_log_regression_plots[0m:[36m41[0m - [34m[1mPredicted plot stored[0m
[32m2025-12-09 15:55:25.833[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mtrain_and_log_model[0m:[36m69[0m - [32m[1mMean CV RMSE for R

## Concrete training using multiple strategies

3. LightGBM (`lgbm_model`): accurate algorithm, especially for tabular data, often faster than XGBoost

In [158]:
import lightgbm as lgb

# LightGBM regressor with default hyperparameters, fixed seed, all cores
lgbm_model = lgb.LGBMRegressor(n_jobs=-1, random_state=42)
trained_lgbm = train_and_log_model(lgbm_model, X_train, y_train, "LightGBM_Default")

[32m2025-12-09 15:55:27.150[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_log_model[0m:[36m15[0m - [1mMLFlow training experience for: LightGBM_Default[0m


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000217 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 748
[LightGBM] [Info] Number of data points in the train set: 297, number of used features: 12
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000674 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000675 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 760
[LightGBM] [Info] Total Bins 762
[LightGBM] [Info] Number of data points in the train set: 298, number of used features: 13
[LightGBM] [Info] Number of data points in the train set: 298, number of used features: 13
[LightGBM] [In

[32m2025-12-09 15:55:28.327[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mtrain_and_log_model[0m:[36m47[0m - [32m[1mCV Mean RMSE for LightGBM_Default: 2.649[0m
[32m2025-12-09 15:55:28.331[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_log_model[0m:[36m53[0m - [1mCV mean R2 for LightGBM_Default: 0.825[0m


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000128 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 740
[LightGBM] [Info] Number of data points in the train set: 297, number of used features: 12
[LightGBM] [Info] Start training from score 20.825589
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000279 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 957
[LightGBM] [Info] Number of data points in the train set: 372, number of used features: 13
[LightGBM] [Info] Start training from score 20.776613


[32m2025-12-09 15:55:28.509[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36msave_and_log_regression_plots[0m:[36m28[0m - [34m[1mResidual graph stored[0m
[32m2025-12-09 15:55:28.541[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36msave_and_log_regression_plots[0m:[36m41[0m - [34m[1mPredicted plot stored[0m
[32m2025-12-09 15:55:28.549[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mtrain_and_log_model[0m:[36m69[0m - [32m[1mMean CV RMSE for LightGBM_Default: 2.649[0m




## Concrete training using multiple strategies

4. K-Nearest Neighbors: distance-based models can be accurate on normalized and standardized data

In [159]:
from sklearn.neighbors import KNeighborsRegressor

# k = 5 neighbors is a good default value
knn_model = KNeighborsRegressor(n_neighbors=5)
trained_knn = train_and_log_model(knn_model, X_train, y_train, "KNN_Distance_Based")

[32m2025-12-09 15:55:29.953[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_log_model[0m:[36m15[0m - [1mMLFlow training experience for: KNN_Distance_Based[0m
[32m2025-12-09 15:55:30.806[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mtrain_and_log_model[0m:[36m47[0m - [32m[1mCV Mean RMSE for KNN_Distance_Based: 3.316[0m
[32m2025-12-09 15:55:30.810[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_and_log_model[0m:[36m53[0m - [1mCV mean R2 for KNN_Distance_Based: 0.726[0m
[32m2025-12-09 15:55:30.853[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36msave_and_log_regression_plots[0m:[36m28[0m - [34m[1mResidual graph stored[0m
[32m2025-12-09 15:55:30.886[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36msave_and_log_regression_plots[0m:[36m41[0m - [34m[1mPredicted plot stored[0m
[32m2025-12-09 15:55:30.890[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mtrain_and_log_model[0m:[36m69[0m - [32m[1mMean CV RMSE for KNN_Distan