In [21]:
import pandas as pd
import numpy as np
import math
import logging

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error, mean_squared_log_error

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import mlflow
import mlflow.sklearn 
from mlflow.models.signature import infer_signature

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import TransformedTargetRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error


In [8]:
def read_data(filename):
    """
    Reads the CSV file and returns a pandas DataFrame.
    """
    try:
        df = pd.read_csv(filename)
        if df.empty:
            raise ValueError("🚨 Data loaded but is empty.")
        return df
    except FileNotFoundError:
        raise FileNotFoundError("❌ Data file not found at specified path.")
    except Exception as e:
        raise RuntimeError(f"❌ Unexpected error while loading data: {e}")

In [13]:
def data_split(df, y, test_size=0.15, val_size=0.15, random_state=42):
    """
    Splits the DataFrame into train, validation, and test sets.
    """

    X_temp, X_test, y_temp, y_test = train_test_split(df, y, test_size=test_size, random_state=random_state)

    val_relative_size = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=val_relative_size, random_state=random_state)

    return X_train, X_val, X_test, y_train, y_val, y_test

In [9]:

class DataCleaner(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.cols_to_drop = ['instant', 'dteday', 'registered', 'casual']
        
    def fit(self, X, y=None):
      
        return self
        
    def transform(self, X):
        return self.initial_clean_data(X)
    
    def initial_clean_data(self, df):
        df_clean = df.copy()
        return df_clean.drop(columns=self.cols_to_drop, errors='ignore')

In [10]:
class RFEFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, n_features, estimator=None):
        self.n_features = n_features
        self.estimator = estimator
        self.selected_features = None
        self.selector = None
        
    def fit(self, X, y=None):
        if self.estimator is None:
            from sklearn.ensemble import RandomForestRegressor
            self.estimator = RandomForestRegressor(n_estimators=10, random_state=42)
        
        self.selector = RFE(estimator=self.estimator, n_features_to_select=self.n_features)
        self.selector.fit(X, y)
        
        self.selected_features = X.columns[self.selector.support_].tolist()
        return self
        
    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
            
        if set(self.selected_features).issubset(set(X.columns)):
            return X[self.selected_features]
        else:
            return self.selector.transform(X)
    
    def get_feature_names_out(self):
        return np.array(self.selected_features)


In [26]:
class CategoricalConverter(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y=None):
        # Nothing to learn here
        return self
        
    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            if col not in X.columns:
                raise ValueError(f"Column '{col}' not found in DataFrame")
            X[col] = X[col].astype('category')
        return X


In [27]:
class DummyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, drop_first=False):
        self.drop_first = drop_first
        self.dummy_columns = None  # To store column names from training data
        
    def fit(self, X, y=None):
        dummy_df = pd.get_dummies(X, drop_first=self.drop_first)
        self.dummy_columns = dummy_df.columns.tolist()
        return self
        
    def transform(self, X):
        dummy_df = pd.get_dummies(X, drop_first=self.drop_first)
        
        for col in self.dummy_columns:
            if col not in dummy_df.columns:
                dummy_df[col] = 0
                
        return dummy_df.reindex(columns=self.dummy_columns, fill_value=0)

In [None]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        X_eng = X.copy()
        X_eng['new_feature'] = X_eng['a'] / X_eng['b']
        return X_eng

In [19]:
def evaluate_model(pipeline, X, y, target_scaler, model_name):
    y_pred = pipeline.predict(X)
    
    y_pred = np.maximum(y_pred, 0)  # Add safety clamp
    
    return {
        "mse": mean_squared_error(y, y_pred),
        "mae": mean_absolute_error(y, y_pred),
        "r2": r2_score(y, y_pred),
        "msle": mean_squared_log_error(y, y_pred)
    }

In [35]:
if __name__ == "__main__":
    df = read_data("./data/hour.csv")
    X = df.drop(columns=['cnt'])
    y = df['cnt']
    X_train, X_val, X_test, y_train, y_val, y_test = data_split(X, y)
    mlflow.set_experiment("Pipeline experiment")
    
    base_models = {
        "RandomForest": RandomForestRegressor(random_state=42),
        "XGBoost": XGBRegressor(objective="reg:squarederror", random_state=42)
    }
    
    # Define model configurations
    models = {
        name: GridSearchCV(
            estimator=TransformedTargetRegressor(
                regressor=model,
                func=np.log1p,
                inverse_func=np.expm1
            ),
            param_grid=param_grid,
            cv=5,
            scoring="neg_mean_squared_error",
            n_jobs=-1
        )
        for name, model, param_grid in [
            ("RandomForest", base_models["RandomForest"], {
                "regressor__n_estimators": [100, 200],
                "regressor__max_depth": [None, 10],
                "regressor__min_samples_split": [2, 5]
            }),
            ("XGBoost", base_models["XGBoost"], {
                "regressor__n_estimators": [100, 200],
                "regressor__learning_rate": [0.05, 0.1],
                "regressor__max_depth": [3, 6]
            })
        ]
    }
    
    pipeline_configs = {
        "full_pipeline": [
            ('cleaner', DataCleaner()),
            ('category_converter', CategoricalConverter(columns=['weekday', 'weathersit', 'mnth', 'season'])),
            ('dummy_encoder', DummyEncoder()),
            ('feature_selector', RFEFeatureSelector(n_features=10)),
            ('model', None)
        ]
        # "no_categorical": [
        #     ('cleaner', DataCleaner()),
        #     ('feature_selector', RFEFeatureSelector(n_features=10)),
        #     ('model', None)
        # ]
    }
    
    # Loop through pipeline configurations and models
    for pipeline_name, pipeline_steps in pipeline_configs.items():
        for model_name, model in models.items():
            with mlflow.start_run(run_name=f"{pipeline_name}_{model_name}"):
                # Create a copy of the pipeline steps and fill in the model
                current_steps = pipeline_steps.copy()
                current_steps[-1] = ('model', model)  # Set the model in the pipeline
                
                # Create the pipeline
                pipeline = Pipeline(current_steps)
                
                # Fit the pipeline
                pipeline.fit(X_train, y_train)
                
                # Make predictions
                y_pred = pipeline.predict(X_val)
                
                # Evaluate
                metrics = evaluate_model(pipeline, X_val, y_val, None, model_name)
                
                # Log parameters
                mlflow.log_param("pipeline_config", pipeline_name)
                mlflow.log_param("model_name", model_name)
                if 'feature_selector' in dict(pipeline.named_steps):
                    feature_selector = pipeline.named_steps['feature_selector']
                    mlflow.log_param("rfe_selection", feature_selector.n_features)
                mlflow.log_params(pipeline.named_steps['model'].best_params_)
                mlflow.log_metrics(metrics)
                
                # Create signature and log model
                signature = infer_signature(X_val, y_pred)
                input_example = X_train[:5]
                
                mlflow.sklearn.log_model(
                    pipeline, 
                    "model", 
                    signature=signature, 
                    input_example=input_example
                )
                
                print(f"✅ {pipeline_name} with {model_name} logged to MLflow")
    
    mlflow.end_run()



✅ full_pipeline with RandomForest logged to MLflow




✅ full_pipeline with XGBoost logged to MLflow


In [33]:
241

array([431.70923 ,  77.08661 ,   6.153729, ..., 318.7845  ,  47.983093,
       113.09479 ], dtype=float32)

In [37]:
import mlflow
from mlflow.tracking import MlflowClient
import mlflow.sklearn

experiment_name = "Pipeline experiment"
experiment = mlflow.get_experiment_by_name(experiment_name)

if experiment is None:
    raise ValueError(f"Experiment '{experiment_name}' not found!")

runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.msle ASC"],
    max_results=1
)

if runs.empty:
    raise ValueError("No runs found with non-null msle metric.")

best_run_id = runs.iloc[0].run_id
print(f"🔍 Best run ID based on lowest MSLE: {best_run_id}")

model_uri = f"runs:/{best_run_id}/model"
best_pipeline = mlflow.sklearn.load_model(model_uri)

client = MlflowClient()
run_data = client.get_run(best_run_id).data

print("📦 Loaded Model Parameters:")
for key, value in run_data.params.items():
    print(f" - {key}: {value}")


🔍 Best run ID based on lowest MSLE: bc2c3a21e8314e6e96b22982e5fec961
📦 Loaded Model Parameters:
 - rfe_selection: 10
 - regressor__n_estimators: 200
 - model_name: XGBoost
 - pipeline_config: no_categorical
 - regressor__max_depth: 6
 - regressor__learning_rate: 0.1


In [40]:
y_pred = best_pipeline.predict(X_test)
y_pred

array([398.66428 ,  88.99932 ,   9.816879, ..., 411.9976  ,  36.117226,
       102.66221 ], dtype=float32)

In [43]:
evaluate_model(best_pipeline, X_test, y_pred, None, "best model")


{'mse': 0.0, 'mae': 0.0, 'r2': 1.0, 'msle': 0.0}