In [144]:
import pandas as pd
import numpy as np
import math
import logging

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Scikit-learn imports
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error, mean_squared_log_error

# Statsmodels import for VIF calculation
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import mlflow
import mlflow.sklearn 
from mlflow.models.signature import infer_signature

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV


# ### 1. Data Loading & Basic Inspection

In [145]:

def read_data(filename):
    """
    Reads the CSV file and returns a pandas DataFrame.
    """
    try:
        df = pd.read_csv(filename)
        if df.empty:
            raise ValueError("🚨 Data loaded but is empty.")
        return df
    except FileNotFoundError:
        raise FileNotFoundError("❌ Data file not found at specified path.")
    except Exception as e:
        raise RuntimeError(f"❌ Unexpected error while loading data: {e}")

In [146]:
def print_infos(df):
    """
    Prints general information and description of the DataFrame.
    """
    print("Data info : \n")
    display(df.info())

    print("\n\nData description: \n")
    display(df.describe())


In [147]:
def check_null(df):
    """
    Check and display the number of null values in each column.
    """
    print(df.isnull().sum().sort_values(ascending=False))

# ### 1. Clean data columns

In [148]:
def intial_clean_data(df):
    """
    Drops unnecessary columns from the DataFrame.
    """
    cols_to_drop = ['instant', 'dteday', 'registered', 'casual']
    df = df.drop(columns=cols_to_drop)
    return df

# ### 1. Convert Data type and dummies

In [149]:
# pipline funnction
def convert_to_category_type(df, columns):
    """
    Converts specified columns to categorical type.
    """
    for col in columns:
        if col in df.columns:
            df[col] = df[col].astype('category')
        else:
            raise KeyError(f"Column '{col}' not found in DataFrame.")
    return df

In [150]:
def convert_dummies(df):
    """
    Converts categorical columns to dummy/indicator variables.
    """
    df = pd.get_dummies(df)
    return df

In [151]:
def get_numeric_columns(df):
    """
    Returns the numeric columns in the DataFrame.
    """
    return df.select_dtypes(include=['number']).columns.tolist()

In [152]:
def convert_bool_float(df):
    """
    Converts all boolean columns in a DataFrame to float (0.0 and 1.0).

    Parameters:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: A new DataFrame with boolean columns converted to float.
    """
    df = df.copy()
    for col in df.select_dtypes(include='bool').columns:
        df[col] = df[col].astype(float)
    return df
    
    

In [153]:
def convert_int_float(df):
    df = df.copy()
    for col in df.select_dtypes(include='int').columns:
        df[col] = df[col].astype(float)
    return df


# ### 1. Display Data

In [154]:
def dispayl_histplot(df):
    """
    Displays histograms for all numeric columns in the DataFrame.
    """
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
    
    n_cols = 4  
    n_rows = math.ceil(len(numeric_columns) / n_cols)
    
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows))
    
    axes = axes.flatten()
    
    for i, column in enumerate(numeric_columns):
        sns.histplot(df[column], kde=True, ax=axes[i])
        axes[i].set_title(f'Distribution of {column}')
        
    for i in range(len(numeric_columns), len(axes)):
        axes[i].axis('off')
    
    plt.tight_layout()
    plt.show()

In [155]:
def display_barplot(df, column):
    """
    Displays barplot for a specific column.
    """
    plt.figure(figsize = (10,4))
    plt.subplot(1, 2, 1)
    sns.barplot(x=column, y='cnt', data=df)


In [156]:
def display_violinplot(df):
    """
    Displays violin plot for multiple features against the 'cnt' column.
    """
    plt.figure(figsize=(20, 15))
    columns_to_plot = ['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 
                       'weathersit', 'temp', 'atemp', 'hum', 'windspeed']
    
    for idx, col in enumerate(columns_to_plot):
        plt.subplot(4, 3, idx + 1)
        if df[col].nunique() < 10:
            sns.violinplot(x=col, y='cnt', data=df)
        else:
            sns.scatterplot(x=col, y='cnt', data=df, alpha=0.5)
        plt.title(f'{col} vs cnt')
    
    plt.tight_layout()
    plt.show()

In [157]:
def display_corelation(df):
    """
    Displays the correlation heatmap between numerical features.
    """
    corr_matrix = df.corr(numeric_only=True)
    plt.figure(figsize=(25, 20))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation Between Features')
    plt.tight_layout()
    plt.show()

In [178]:
def data_split(df, y, test_size=0.15, val_size=0.15, random_state=42):
    """
    Splits the DataFrame into train, validation, and test sets.
    """

    X_temp, X_test, y_temp, y_test = train_test_split(df, y, test_size=test_size, random_state=random_state)

    val_relative_size = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=val_relative_size, random_state=random_state)

    return X_train, X_val, X_test, y_train, y_val, y_test

# ### 1. Feature selection

In [159]:
def feature_selection_rfe(X_train_scaled, y_train_scaled, n_features=15):
    """
    Selects features using Recursive Feature Elimination (RFE).
    """
    lr_rfe = LinearRegression()
    lr_rfe.fit(X_train_scaled, y_train_scaled)

    rfe = RFE(estimator=lr_rfe, n_features_to_select=n_features)
    rfe = rfe.fit(X_train_scaled, y_train_scaled)
    
    return pd.DataFrame(
        rfe.transform(X_train_scaled),
        columns=X_train_scaled.columns[rfe.support_],
        index=X_train_scaled.index
    )

In [160]:
def calculate_vif(X_train_rfe):
    """
    Calculates Variance Inflation Factor (VIF) for each feature.
    """
    vif = pd.DataFrame()
    vif['Features'] = X_train_rfe.columns
    vif['VIF'] = [variance_inflation_factor(X_train_rfe.values, i) for i in range(X_train_rfe.shape[1])]
    vif['VIF'] = round(vif['VIF'], 2)
    return vif.sort_values(by="VIF", ascending=False)

In [161]:
def transform_rfe(rfe, df):
       return pd.DataFrame(
        rfe.transform(df),
        columns=df.columns[rfe.support_],
        index=df.index
    )

# ### 1. Scale data

In [162]:
def scale_features(X):
    feature_scaler = MinMaxScaler()  
    X_scaled = feature_scaler.fit_transform(X) 
    return feature_scaler, X_scaled

In [163]:
def scale_target(y):
    target_scaler = MinMaxScaler()  
    y_scaled = target_scaler.fit_transform(y.values.reshape(-1, 1)) 
    return target_scaler, y_scaled

In [164]:
def unscale_features(scaler, X_scaled):
    X_unscaled = scaler.inverse_transform(X_scaled)
    return X_unscaled

In [165]:
def unscale_target(scaler, y_scaled):
    y_unscaled = scaler.inverse_transform(y_scaled)
    return y_unscaled

In [166]:
def save_model(model, filename):
    """
    Saves the trained model to a file.
    """
    import joblib
    joblib.dump(model, filename)

In [167]:
def load_model(filename):
    """
    Loads a saved model from a file.
    """
    import joblib
    return joblib.load(filename)

# ### 1. Model Training

In [168]:
def train_linear_model(X_train_scaled, y_train_scaled):
    """
    Trains a Linear Regression model and evaluates it.
    """
    model = LinearRegression()
    model.fit(X_train_scaled, y_train_scaled)
    return model

In [169]:
def evaluate_model(model, X_val, y_val, target_scaler, model_name="model"):
    """
    Evaluate the model using RMSLE, RMSE, NRMSE, and Relative Error.

    Parameters:
    - model: Trained regression model with a predict method
    - X_val: Validation features
    - y_val: True target values (scaled)
    - target_scaler: Scaler used to inverse transform target values
    - model_name: Optional name for the model

    Returns:
    - Dictionary with evaluation metrics
    """

    # Predict and reshape
    y_pred = model.predict(X_val)
    y_pred_array = np.array(y_pred).reshape(-1, 1)
    y_val_array = np.array(y_val).reshape(-1, 1)

    # Inverse transform predictions and targets
    y_pred_original = target_scaler.inverse_transform(y_pred_array)
    y_test_original = target_scaler.inverse_transform(y_val_array)

    # Clip predictions to avoid negative values
    y_pred_clipped = np.clip(y_pred_original, 0, None)

    # Calculate metrics
    rmsle = np.sqrt(mean_squared_log_error(y_test_original, y_pred_clipped))
    rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_clipped))
    nrmse = rmse / (y_test_original.max() - y_test_original.min())
    relative_error = rmse / y_test_original.mean()

    # Print or log
    print(f"📊 Evaluation Results for {model_name}:")
    print(f" - RMSLE: {rmsle:.4f}")
    print(f" - RMSE: {rmse:.4f}")
    print(f" - Normalized RMSE: {nrmse:.2%}")
    print(f" - Relative Error: {relative_error:.2%}")

    return {
        "rmsle": rmsle,
        "rmse": rmse,
        "nrmse": nrmse,
        "relative_error": relative_error
    }


# ### 1. Run script

In [222]:
if __name__ == "__main__":

    
    # Load and clean data
    df = read_data("./data/hour.csv")
    df_cleaned = intial_clean_data(df)

    # Visualize data
    # dispayl_histplot(df_cleaned)
    # display_violinplot(df_cleaned)
    # display_corelation(df_cleaned)

    # Convert to category and one-hot encode
    df_cleaned = convert_to_category_type(df_cleaned, ['weekday', 'weathersit', 'mnth', 'season'])
    df_cleaned = convert_dummies(df_cleaned)
    df_cleaned = convert_bool_float(df_cleaned)
    df_cleaned = convert_int_float(df_cleaned)

    X = df_cleaned.drop(columns=['cnt'])
    y = df_cleaned['cnt']


    models ={
        "LinearRegression": LinearRegression(),
        "RandomForest": RandomForestRegressor(n_estimators=100),
        "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1)
    }
    
    for rfe_selection in range(10, 10):
        for model_name, model in models.items():
            with mlflow.start_run(run_name=model_name + "_" +   str(rfe_selection)):
                X_train, X_val, X_test, y_train, y_val, y_test = data_split(df_ref, y)
                df_ref = feature_selection_rfe(X, y, rfe_selection)
    
                feature_scaler, X_train_scaled = scale_features(X_train)
                target_scaler, y_train_scaled = scale_target(y_train)
                
                model.fit(X_train, y_train)
                # Predict
                y_pred = model.predict(X_val)
        
                # Evaluate
                metrics = evaluate_model(model, X_val, y_val, target_scaler, model_name)
                
                # Log parameters and metrics
                mlflow.set_tag("rfe_" + model_name)
                mlflow.log_param("rfe selection", rfe_selection)
                mlflow.log_param("model_name", model_name)
                mlflow.log_metrics(metrics)
                
                signature = infer_signature(X_val, y_pred)
                input_example = X_train[:5] 
                # Log the model
                
                mlflow.sklearn.log_model(model, "model", signature=signature, input_example=input_example)
        


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.3858
 - RMSE: 140636.3063
 - Normalized RMSE: 15.66%
 - Relative Error: 77.64%
📊 Evaluation Results for RandomForest:
 - RMSLE: 1.2883
 - RMSE: 145263.1795
 - Normalized RMSE: 16.18%
 - Relative Error: 80.20%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.3142
 - RMSE: 135813.2999
 - Normalized RMSE: 15.12%
 - Relative Error: 74.98%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.3858
 - RMSE: 140636.3063
 - Normalized RMSE: 15.66%
 - Relative Error: 77.64%
📊 Evaluation Results for RandomForest:
 - RMSLE: 1.2917
 - RMSE: 145425.1191
 - Normalized RMSE: 16.19%
 - Relative Error: 80.29%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.3155
 - RMSE: 135966.7520
 - Normalized RMSE: 15.14%
 - Relative Error: 75.07%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.3525
 - RMSE: 140449.2344
 - Normalized RMSE: 15.64%
 - Relative Error: 77.54%
📊 Evaluation Results for RandomForest:
 - RMSLE: 1.2988
 - RMSE: 145711.1081
 - Normalized RMSE: 16.23%
 - Relative Error: 80.45%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.3141
 - RMSE: 135798.4840
 - Normalized RMSE: 15.12%
 - Relative Error: 74.97%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.3445
 - RMSE: 140335.2203
 - Normalized RMSE: 15.63%
 - Relative Error: 77.48%
📊 Evaluation Results for RandomForest:
 - RMSLE: 1.2947
 - RMSE: 145671.7367
 - Normalized RMSE: 16.22%
 - Relative Error: 80.42%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.3117
 - RMSE: 135604.2818
 - Normalized RMSE: 15.10%
 - Relative Error: 74.87%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.3693
 - RMSE: 140150.9049
 - Normalized RMSE: 15.61%
 - Relative Error: 77.38%
📊 Evaluation Results for RandomForest:
 - RMSLE: 1.2845
 - RMSE: 145206.1111
 - Normalized RMSE: 16.17%
 - Relative Error: 80.17%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.3085
 - RMSE: 135473.4639
 - Normalized RMSE: 15.09%
 - Relative Error: 74.79%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.3532
 - RMSE: 139796.8291
 - Normalized RMSE: 15.57%
 - Relative Error: 77.18%
📊 Evaluation Results for RandomForest:
 - RMSLE: 1.2832
 - RMSE: 145547.8534
 - Normalized RMSE: 16.21%
 - Relative Error: 80.36%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.3104
 - RMSE: 135384.1924
 - Normalized RMSE: 15.08%
 - Relative Error: 74.74%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.3664
 - RMSE: 139560.5415
 - Normalized RMSE: 15.54%
 - Relative Error: 77.05%
📊 Evaluation Results for RandomForest:
 - RMSLE: 1.2871
 - RMSE: 146073.5776
 - Normalized RMSE: 16.27%
 - Relative Error: 80.65%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.3083
 - RMSE: 134719.3983
 - Normalized RMSE: 15.00%
 - Relative Error: 74.38%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.4329
 - RMSE: 139529.8204
 - Normalized RMSE: 15.54%
 - Relative Error: 77.03%
📊 Evaluation Results for RandomForest:
 - RMSLE: 1.2692
 - RMSE: 140291.4188
 - Normalized RMSE: 15.62%
 - Relative Error: 77.45%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.3309
 - RMSE: 133952.4347
 - Normalized RMSE: 14.92%
 - Relative Error: 73.95%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.4449
 - RMSE: 139521.7631
 - Normalized RMSE: 15.54%
 - Relative Error: 77.03%
📊 Evaluation Results for RandomForest:
 - RMSLE: 1.2674
 - RMSE: 139592.6669
 - Normalized RMSE: 15.55%
 - Relative Error: 77.07%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.3293
 - RMSE: 133702.1039
 - Normalized RMSE: 14.89%
 - Relative Error: 73.82%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.4555
 - RMSE: 139438.9038
 - Normalized RMSE: 15.53%
 - Relative Error: 76.98%
📊 Evaluation Results for RandomForest:
 - RMSLE: 1.2667
 - RMSE: 139208.7423
 - Normalized RMSE: 15.50%
 - Relative Error: 76.86%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.3290
 - RMSE: 133198.8105
 - Normalized RMSE: 14.83%
 - Relative Error: 73.54%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.4111
 - RMSE: 139426.8969
 - Normalized RMSE: 15.53%
 - Relative Error: 76.98%
📊 Evaluation Results for RandomForest:
 - RMSLE: 1.2703
 - RMSE: 139512.8602
 - Normalized RMSE: 15.54%
 - Relative Error: 77.02%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.3432
 - RMSE: 133239.3956
 - Normalized RMSE: 14.84%
 - Relative Error: 73.56%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.4366
 - RMSE: 139373.0555
 - Normalized RMSE: 15.52%
 - Relative Error: 76.95%
📊 Evaluation Results for RandomForest:
 - RMSLE: 1.2710
 - RMSE: 140109.0808
 - Normalized RMSE: 15.60%
 - Relative Error: 77.35%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.3305
 - RMSE: 133661.2618
 - Normalized RMSE: 14.88%
 - Relative Error: 73.79%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.4366
 - RMSE: 139373.0555
 - Normalized RMSE: 15.52%
 - Relative Error: 76.95%
📊 Evaluation Results for RandomForest:
 - RMSLE: 1.2606
 - RMSE: 139131.5165
 - Normalized RMSE: 15.49%
 - Relative Error: 76.81%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.3035
 - RMSE: 133212.1449
 - Normalized RMSE: 14.83%
 - Relative Error: 73.55%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.4526
 - RMSE: 139316.6526
 - Normalized RMSE: 15.51%
 - Relative Error: 76.92%
📊 Evaluation Results for RandomForest:
 - RMSLE: 1.2494
 - RMSE: 136534.8511
 - Normalized RMSE: 15.20%
 - Relative Error: 75.38%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.3161
 - RMSE: 132556.1578
 - Normalized RMSE: 14.76%
 - Relative Error: 73.18%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.4560
 - RMSE: 139337.6990
 - Normalized RMSE: 15.52%
 - Relative Error: 76.93%
📊 Evaluation Results for RandomForest:
 - RMSLE: 1.2374
 - RMSE: 134892.1114
 - Normalized RMSE: 15.02%
 - Relative Error: 74.47%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.2977
 - RMSE: 131803.2810
 - Normalized RMSE: 14.68%
 - Relative Error: 72.77%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.4460
 - RMSE: 139355.1283
 - Normalized RMSE: 15.52%
 - Relative Error: 76.94%
📊 Evaluation Results for RandomForest:
 - RMSLE: 0.3979
 - RMSE: 43290.9093
 - Normalized RMSE: 4.82%
 - Relative Error: 23.90%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.0257
 - RMSE: 42217.0408
 - Normalized RMSE: 4.70%
 - Relative Error: 23.31%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.4968
 - RMSE: 132235.1421
 - Normalized RMSE: 14.73%
 - Relative Error: 73.01%
📊 Evaluation Results for RandomForest:
 - RMSLE: 0.3982
 - RMSE: 43270.8826
 - Normalized RMSE: 4.82%
 - Relative Error: 23.89%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.1041
 - RMSE: 42671.5474
 - Normalized RMSE: 4.75%
 - Relative Error: 23.56%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.5174
 - RMSE: 132244.3191
 - Normalized RMSE: 14.73%
 - Relative Error: 73.01%
📊 Evaluation Results for RandomForest:
 - RMSLE: 0.3959
 - RMSE: 43023.4221
 - Normalized RMSE: 4.79%
 - Relative Error: 23.75%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.0945
 - RMSE: 42550.2615
 - Normalized RMSE: 4.74%
 - Relative Error: 23.49%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.5172
 - RMSE: 132227.7758
 - Normalized RMSE: 14.73%
 - Relative Error: 73.00%
📊 Evaluation Results for RandomForest:
 - RMSLE: 0.3824
 - RMSE: 41595.4442
 - Normalized RMSE: 4.63%
 - Relative Error: 22.96%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.0655
 - RMSE: 41052.6482
 - Normalized RMSE: 4.57%
 - Relative Error: 22.66%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.5185
 - RMSE: 132212.5649
 - Normalized RMSE: 14.72%
 - Relative Error: 72.99%
📊 Evaluation Results for RandomForest:
 - RMSLE: 0.3815
 - RMSE: 41662.4280
 - Normalized RMSE: 4.64%
 - Relative Error: 23.00%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.0890
 - RMSE: 41639.3232
 - Normalized RMSE: 4.64%
 - Relative Error: 22.99%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.5185
 - RMSE: 132212.5649
 - Normalized RMSE: 14.72%
 - Relative Error: 72.99%
📊 Evaluation Results for RandomForest:
 - RMSLE: 0.3821
 - RMSE: 41481.1955
 - Normalized RMSE: 4.62%
 - Relative Error: 22.90%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.1010
 - RMSE: 41949.6350
 - Normalized RMSE: 4.67%
 - Relative Error: 23.16%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.5351
 - RMSE: 132215.8662
 - Normalized RMSE: 14.72%
 - Relative Error: 73.00%
📊 Evaluation Results for RandomForest:
 - RMSLE: 0.3831
 - RMSE: 41184.3289
 - Normalized RMSE: 4.59%
 - Relative Error: 22.74%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.1086
 - RMSE: 41653.3370
 - Normalized RMSE: 4.64%
 - Relative Error: 23.00%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.5250
 - RMSE: 132243.7635
 - Normalized RMSE: 14.73%
 - Relative Error: 73.01%
📊 Evaluation Results for RandomForest:
 - RMSLE: 0.3815
 - RMSE: 41197.0118
 - Normalized RMSE: 4.59%
 - Relative Error: 22.74%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.0750
 - RMSE: 41805.7023
 - Normalized RMSE: 4.66%
 - Relative Error: 23.08%


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.5298
 - RMSE: 132243.6623
 - Normalized RMSE: 14.73%
 - Relative Error: 73.01%
📊 Evaluation Results for RandomForest:
 - RMSLE: 0.3825
 - RMSE: 41135.1582
 - Normalized RMSE: 4.58%
 - Relative Error: 22.71%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.1260
 - RMSE: 41239.8377
 - Normalized RMSE: 4.59%
 - Relative Error: 22.77%


In [263]:
models = {
    "RandomForestRegressor": GridSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_grid={
            "n_estimators": [100, 200],
            "max_depth": [None, 10],
            "min_samples_split": [2, 5],
        },
        cv=5,
        scoring="neg_mean_squared_error",
        n_jobs=-1
    ),
    "XGBRegressor": GridSearchCV(
        estimator=XGBRegressor(objective="reg:squarederror", random_state=42),
        param_grid={
            "n_estimators": [100, 200],
            "learning_rate": [0.05, 0.1],
            "max_depth": [3, 6]
        },
        cv=5,
        scoring="neg_mean_squared_error",
        n_jobs=-1
    )
}
for rfe_selection in range(10, 34):
    df_ref = feature_selection_rfe(X, y, rfe_selection)

    for model_name, model in models.items():
            with mlflow.start_run(run_name=model_name):

                X_train, X_val, X_test, y_train, y_val, y_test = data_split(df_ref, y)

                feature_scaler, X_train_scaled = scale_features(X_train)
                target_scaler, y_train_scaled = scale_target(y_train)
                
                model.fit(X_train, y_train)
                best_model = model.best_estimator_
                y_pred = best_model.predict(X_val)
        
                # Evaluate (your custom function)
                metrics = evaluate_model(best_model, X_val, y_val, target_scaler, model_name)
        
                # Log best hyperparameters and metrics
                mlflow.set_tag("model index" , "rfe_Grd_" + model_name)
                mlflow.log_param("rfe selection", rfe_selection)
                mlflow.log_param("model_name", model_name)
                mlflow.log_params(model.best_params_)  # ← logs hyperparameters from GridSearchCV
                mlflow.log_metrics(metrics)
        
                # Log model with signature and input example
                signature = infer_signature(X_val, y_pred)
                input_example = X_val[:5]
                mlflow.sklearn.log_model(best_model, "model", signature=signature, input_example=input_example)
        
                print(f"✅ {model_name} logged to MLflow with best params.")

📊 Evaluation Results for RandomForestRegressor:
 - RMSLE: 1.3058
 - RMSE: 135926.2640
 - Normalized RMSE: 15.14%
 - Relative Error: 75.04%
✅ RandomForestRegressor logged to MLflow with best params.
📊 Evaluation Results for XGBRegressor:
 - RMSLE: 1.3210
 - RMSE: 135376.4499
 - Normalized RMSE: 15.08%
 - Relative Error: 74.74%
✅ XGBRegressor logged to MLflow with best params.
📊 Evaluation Results for RandomForestRegressor:
 - RMSLE: 1.3058
 - RMSE: 135905.8571
 - Normalized RMSE: 15.13%
 - Relative Error: 75.03%
✅ RandomForestRegressor logged to MLflow with best params.
📊 Evaluation Results for XGBRegressor:
 - RMSLE: 1.3223
 - RMSE: 135491.7073
 - Normalized RMSE: 15.09%
 - Relative Error: 74.80%
✅ XGBRegressor logged to MLflow with best params.
📊 Evaluation Results for RandomForestRegressor:
 - RMSLE: 1.3064
 - RMSE: 135912.5296
 - Normalized RMSE: 15.14%
 - Relative Error: 75.04%
✅ RandomForestRegressor logged to MLflow with best params.
📊 Evaluation Results for XGBRegressor:
 - RMSL

In [271]:
mlflow.set_experiment("Validat model")
experiment = mlflow.get_experiment_by_name("final test with validation data")

# You can filter by specific model type if desired, e.g., model_name = 'RandomForest'
query = "metrics.rmsle IS NOT NULL"  # Optionally add: " and params.model_name = 'RandomForest'"

# Search with the query and sort
runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.rmsle ASC"],  # This orders runs by lowest RMSLE
    max_results=1
)

# Get the best run ID
best_run_id = runs.iloc[0].run_id
print(f"Best run ID with lowest RMSLE: {best_run_id}")

# Load the model
model_uri = f"runs:/{best_run_id}/model"
best_model = mlflow.sklearn.load_model(model_uri)

X_test_scaled = feature_selection_rfe(X_test, X_test, 10)
feature_scaler, X_test_scaled = scale_features(X_test)
target_scaler, y_test_scaled = scale_target(y_test)


metrics = evaluate_model(best_model, X_test_scaled, y_test_scaled, target_scaler, model_name)



Best run ID with lowest RMSLE: f85c615eb66946eebfddb1ab42c82f48
📊 Evaluation Results for RandomForestRegressor:
 - RMSLE: 6.0655
 - RMSE: 56228.9054
 - Normalized RMSE: 5761.16%
 - Relative Error: 30403.02%


  intercept_ = y_offset - X_offset @ coef_.T
  intercept_ = y_offset - X_offset @ coef_.T
  intercept_ = y_offset - X_offset @ coef_.T
  intercept_ = y_offset - X_offset @ coef_.T
  intercept_ = y_offset - X_offset @ coef_.T
  intercept_ = y_offset - X_offset @ coef_.T
  intercept_ = y_offset - X_offset @ coef_.T
  intercept_ = y_offset - X_offset @ coef_.T
  intercept_ = y_offset - X_offset @ coef_.T


In [257]:
df_ref_val

Unnamed: 0,yr,hr,holiday,workingday,temp,atemp,hum,windspeed,season_1,season_2,...,mnth_12,weekday_0,weekday_1,weekday_2,weekday_4,weekday_5,weekday_6,weathersit_1,weathersit_3,weathersit_4
12830,1.0,19.0,0.0,0.0,0.80,0.6970,0.27,0.1940,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
8688,1.0,20.0,1.0,0.0,0.24,0.2273,0.41,0.2239,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7091,0.0,2.0,0.0,1.0,0.32,0.3030,0.66,0.2836,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
12230,1.0,19.0,0.0,1.0,0.78,0.7121,0.52,0.3582,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
431,0.0,0.0,0.0,1.0,0.26,0.2273,0.56,0.3881,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6497,0.0,7.0,0.0,1.0,0.36,0.3485,0.71,0.1343,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1722,0.0,10.0,0.0,1.0,0.44,0.4394,0.54,0.3284,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
14359,1.0,12.0,0.0,0.0,0.72,0.6970,0.74,0.3582,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1061,0.0,23.0,0.0,1.0,0.38,0.3939,0.40,0.2239,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [320]:
import mlflow
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import RFE
import pandas as pd
import numpy as np

# Custom RFE transformer that remembers the feature names
class RFEFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, n_features, estimator=None):
        self.n_features = n_features
        self.estimator = estimator
        self.selected_features = None
        self.selector = None
        
    def fit(self, X, y=None):
        # Use your existing feature_selection_rfe function internally
        if self.estimator is None:
            # If no estimator provided, create a random forest for feature importance
            from sklearn.ensemble import RandomForestRegressor
            self.estimator = RandomForestRegressor(n_estimators=10, random_state=42)
        
        # Create and fit RFE selector
        self.selector = RFE(estimator=self.estimator, n_features_to_select=self.n_features)
        self.selector.fit(X, y)
        
        # Store selected feature names
        self.selected_features = X.columns[self.selector.support_].tolist()
        return self
        
    def transform(self, X):
        # Ensure X is a DataFrame
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
            
        # Apply the selector
        if set(self.selected_features).issubset(set(X.columns)):
            # If all selected features are in X, use just those
            return X[self.selected_features]
        else:
            # Apply the transformation using the RFE mask
            return self.selector.transform(X)
    
    def get_feature_names_out(self):
        return np.array(self.selected_features)

# # Your existing models dictionary
# models = {
#     "RandomForestRegressor": GridSearchCV(
#         estimator=RandomForestRegressor(random_state=42),
#         param_grid={
#             "n_estimators": [100, 200],
#             "max_depth": [None, 10],
#             "min_samples_split": [2, 5],
#         },
#         cv=5,
#         scoring="neg_mean_squared_error",
#         n_jobs=-1
#     ),
#     "XGBRegressor": GridSearchCV(
#         estimator=XGBRegressor(objective="reg:squarederror", random_state=42),
#         param_grid={
#             "n_estimators": [100, 200],
#             "learning_rate": [0.05, 0.1],
#             "max_depth": [3, 6]
#         },
#         cv=5,
#         scoring="neg_mean_squared_error",
#         n_jobs=-1
#     )
# }

# # Modified training loop
# for rfe_selection in range(10, 11):
#     for model_name, model in models.items():
#         with mlflow.start_run(run_name=model_name):
#             # Create feature selector (to be saved with the model)
#             feature_selector = RFEFeatureSelector(n_features=rfe_selection)
            
#             feature_selector.fit(X, y)
#             df_ref =  feature_selector.transform(X)
            
#             X_train, X_val, X_test, y_train, y_val, y_test = data_split(df_ref, y)
            
#             # Fit the selector on training data
#              # Fit on full data to match your current approach
            
#             model.fit(X_train, y_train)
#             best_model = model.best_estimator_
            
#             # Create a pipeline with feature selection and the best model
#             pipeline = Pipeline([
#                 ('feature_selector', feature_selector),
#                 ('estimator', best_model)
#             ])
            
#             # Get predictions using the pipeline
#             X_val_original = X_val.copy()# Keep a copy with original column names
#             y_pred = pipeline.predict(X_val_original)
            
#             # Evaluate the model
#             metrics = evaluate_model(best_model, X_val, y_val, target_scaler, model_name)
            
#             # Log parameters and metrics
#             mlflow.set_tag("model_index", "rfe_Grd_" + model_name)
#             mlflow.log_param("rfe_selection", rfe_selection)
#             mlflow.log_param("model_name", model_name)
#             mlflow.log_params(model.best_params_)
#             mlflow.log_metrics(metrics)
            
#             # Log the pipeline instead of just the model
#             signature = infer_signature(X_val_original, y_pred)
#             input_example = X_val_original[:5]
#             mlflow.sklearn.log_model(pipeline, "model", signature=signature, input_example=input_example)
            
#             print(f"✅ {model_name} with RFE pipeline logged to MLflow with best params.")

In [292]:
y_pred = pipeline.predict(X_val_original)


In [291]:
y_pred.min()

np.float32(-58.32868)

In [293]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import numpy as np

# Define base models without transformations
base_models = {
    "RandomForest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(objective="reg:squarederror", random_state=42)
}

# Wrap with log-transform
models = {
    name: GridSearchCV(
        estimator=TransformedTargetRegressor(
            regressor=model,
            func=np.log1p,        # Use log(1 + x) to handle zeros
            inverse_func=np.expm1 # Use exp(x) - 1 to reverse
        ),
        param_grid=param_grid,
        cv=5,
        scoring="neg_mean_squared_error",
        n_jobs=-1
    )
    for name, model, param_grid in [
        ("RandomForest", base_models["RandomForest"], {
            "regressor__n_estimators": [100, 200],
            "regressor__max_depth": [None, 10],
            "regressor__min_samples_split": [2, 5]
        }),
        ("XGBoost", base_models["XGBoost"], {
            "regressor__n_estimators": [100, 200],
            "regressor__learning_rate": [0.05, 0.1],
            "regressor__max_depth": [3, 6]
        })
    ]
}

In [324]:
def evaluate_model_1(pipeline, X, y, target_scaler, model_name):
    # Get predictions (already in original scale)
    y_pred = pipeline.predict(X)
    
    # Ensure no negative predictions
    y_pred = np.maximum(y_pred, 0)  # Add safety clamp
    
    # Calculate metrics
    return {
        "mse": mean_squared_error(y, y_pred),
        "mae": mean_absolute_error(y, y_pred),
        "r2": r2_score(y, y_pred),
        "msle": mean_squared_log_error(y, y_pred)
    }

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# 1. Data Cleaning Class
class DataCleaner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # Learn cleaning parameters (e.g., mean for missing values)
        return self
        
    def transform(self, X):
        # Implement cleaning logic (handle NaNs, outliers, etc.)
        X_clean = X.copy()
        # ... your cleaning code ...
        return X_clean

# 2. Feature Engineering Class  
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # Learn engineering parameters (e.g., PCA components)
        return self
        
    def transform(self, X):
        # Create new features/transformations
        X_eng = X.copy()
        X_eng['new_feature'] = X_eng['a'] / X_eng['b']
        # ... other feature engineering ...
        return X_eng

In [304]:
X

Unnamed: 0,yr,hr,holiday,workingday,temp,atemp,hum,windspeed,season_1,season_2,...,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weathersit_1,weathersit_2,weathersit_3,weathersit_4
0,0.0,0.0,0.0,0.0,0.24,0.2879,0.81,0.0000,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.22,0.2727,0.80,0.0000,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,0.0,2.0,0.0,0.0,0.22,0.2727,0.80,0.0000,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,0.0,3.0,0.0,0.0,0.24,0.2879,0.75,0.0000,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0.0,4.0,0.0,0.0,0.24,0.2879,0.75,0.0000,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,1.0,19.0,0.0,1.0,0.26,0.2576,0.60,0.1642,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
17375,1.0,20.0,0.0,1.0,0.26,0.2576,0.60,0.1642,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
17376,1.0,21.0,0.0,1.0,0.26,0.2576,0.60,0.1642,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
17377,1.0,22.0,0.0,1.0,0.26,0.2727,0.56,0.1343,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [302]:
df_ref

Unnamed: 0,yr,hr,workingday,temp,atemp,hum,windspeed,season_1,season_4,weathersit_3
0,0.0,0.0,0.0,0.24,0.2879,0.81,0.0000,1.0,0.0,0.0
1,0.0,1.0,0.0,0.22,0.2727,0.80,0.0000,1.0,0.0,0.0
2,0.0,2.0,0.0,0.22,0.2727,0.80,0.0000,1.0,0.0,0.0
3,0.0,3.0,0.0,0.24,0.2879,0.75,0.0000,1.0,0.0,0.0
4,0.0,4.0,0.0,0.24,0.2879,0.75,0.0000,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
17374,1.0,19.0,1.0,0.26,0.2576,0.60,0.1642,1.0,0.0,0.0
17375,1.0,20.0,1.0,0.26,0.2576,0.60,0.1642,1.0,0.0,0.0
17376,1.0,21.0,1.0,0.26,0.2576,0.60,0.1642,1.0,0.0,0.0
17377,1.0,22.0,1.0,0.26,0.2727,0.56,0.1343,1.0,0.0,0.0


In [322]:
X = df_cleaned.drop(columns=['cnt'])
y = df_cleaned['cnt']
X_train, X_val, X_test, y_train, y_val, y_test = data_split(X, y)

In [323]:
for rfe_selection in range(10, 11):
    for model_name, model in models.items():
        with mlflow.start_run(run_name=model_name):
            # Data splitting (use original target)
            
            # Create full pipeline
            pipeline = Pipeline([
                ('feature_selector', RFEFeatureSelector(n_features=rfe_selection)),
                ('model', model)
            ])
            
            # Fit on training data
            pipeline.fit(X_train, y_train)
            
            metrics = evaluate_model_1(pipeline, X_val, y_val, None, model_name)
            
            # Logging remains the same
            mlflow.log_param("rfe_selection", rfe_selection)
            mlflow.log_params(pipeline.named_steps['model'].best_params_)
            mlflow.log_metrics(metrics)
            
            # Log the full pipeline
            mlflow.sklearn.log_model(pipeline, "model")



In [None]:
run_id = "d6a962ca898e4bbaa2e3de95d0e3cbe1"  # Copy from MLflow experiment page
model_uri = f"runs:/{run_id}/model"
loaded_pipeline = mlflow.sklearn.load_model(model_uri)

In [None]:
assert isinstance(X_test, pd.DataFrame), "Test data must be a DataFrame with original columns"

In [309]:
X_test

Unnamed: 0,yr,hr,holiday,workingday,temp,atemp,hum,windspeed,season_1,season_2,...,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weathersit_1,weathersit_2,weathersit_3,weathersit_4
12830,1.0,19.0,0.0,0.0,0.80,0.6970,0.27,0.1940,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
8688,1.0,20.0,1.0,0.0,0.24,0.2273,0.41,0.2239,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7091,0.0,2.0,0.0,1.0,0.32,0.3030,0.66,0.2836,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
12230,1.0,19.0,0.0,1.0,0.78,0.7121,0.52,0.3582,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
431,0.0,0.0,0.0,1.0,0.26,0.2273,0.56,0.3881,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6497,0.0,7.0,0.0,1.0,0.36,0.3485,0.71,0.1343,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1722,0.0,10.0,0.0,1.0,0.44,0.4394,0.54,0.3284,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
14359,1.0,12.0,0.0,0.0,0.72,0.6970,0.74,0.3582,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1061,0.0,23.0,0.0,1.0,0.38,0.3939,0.40,0.2239,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [311]:
y_test_predict = pipeline.(X_test)

In [315]:
evaluate_model_1(pipeline, X_test, y_test, None, model_name)

{'mse': 2387.285255283001,
 'mae': 29.87999994694432,
 'r2': 0.9246203930348473,
 'msle': 0.11877472944497335}

In [313]:
y_test_predict

array([431.70923 ,  77.08661 ,   6.153729, ..., 318.7845  ,  47.983093,
       113.09479 ], dtype=float32)