In [144]:
import pandas as pd
import numpy as np
import math
import logging

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Scikit-learn imports
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error, mean_squared_log_error

# Statsmodels import for VIF calculation
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import mlflow
import mlflow.sklearn 
from mlflow.models.signature import infer_signature

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV


# ### 1. Data Loading & Basic Inspection

In [145]:

def read_data(filename):
    """
    Reads the CSV file and returns a pandas DataFrame.
    """
    try:
        df = pd.read_csv(filename)
        if df.empty:
            raise ValueError("🚨 Data loaded but is empty.")
        return df
    except FileNotFoundError:
        raise FileNotFoundError("❌ Data file not found at specified path.")
    except Exception as e:
        raise RuntimeError(f"❌ Unexpected error while loading data: {e}")

In [146]:
def print_infos(df):
    """
    Prints general information and description of the DataFrame.
    """
    print("Data info : \n")
    display(df.info())

    print("\n\nData description: \n")
    display(df.describe())


In [147]:
def check_null(df):
    """
    Check and display the number of null values in each column.
    """
    print(df.isnull().sum().sort_values(ascending=False))

# ### 1. Clean data columns

In [148]:
def intial_clean_data(df):
    """
    Drops unnecessary columns from the DataFrame.
    """
    cols_to_drop = ['instant', 'dteday', 'registered', 'casual']
    df = df.drop(columns=cols_to_drop)
    return df

# ### 1. Convert Data type and dummies

In [149]:
def convert_to_category_type(df, columns):
    """
    Converts specified columns to categorical type.
    """
    for col in columns:
        if col in df.columns:
            df[col] = df[col].astype('category')
        else:
            raise KeyError(f"Column '{col}' not found in DataFrame.")
    return df

In [150]:
def convert_dummies(df):
    """
    Converts categorical columns to dummy/indicator variables.
    """
    df = pd.get_dummies(df)
    return df

In [151]:
def get_numeric_columns(df):
    """
    Returns the numeric columns in the DataFrame.
    """
    return df.select_dtypes(include=['number']).columns.tolist()

In [152]:
def convert_bool_float(df):
    """
    Converts all boolean columns in a DataFrame to float (0.0 and 1.0).

    Parameters:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: A new DataFrame with boolean columns converted to float.
    """
    df = df.copy()
    for col in df.select_dtypes(include='bool').columns:
        df[col] = df[col].astype(float)
    return df
    
    

In [153]:
def convert_int_float(df):
    df = df.copy()
    for col in df.select_dtypes(include='int').columns:
        df[col] = df[col].astype(float)
    return df


# ### 1. Display Data

In [154]:
def dispayl_histplot(df):
    """
    Displays histograms for all numeric columns in the DataFrame.
    """
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
    
    n_cols = 4  
    n_rows = math.ceil(len(numeric_columns) / n_cols)
    
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows))
    
    axes = axes.flatten()
    
    for i, column in enumerate(numeric_columns):
        sns.histplot(df[column], kde=True, ax=axes[i])
        axes[i].set_title(f'Distribution of {column}')
        
    for i in range(len(numeric_columns), len(axes)):
        axes[i].axis('off')
    
    plt.tight_layout()
    plt.show()

In [155]:
def display_barplot(df, column):
    """
    Displays barplot for a specific column.
    """
    plt.figure(figsize = (10,4))
    plt.subplot(1, 2, 1)
    sns.barplot(x=column, y='cnt', data=df)


In [156]:
def display_violinplot(df):
    """
    Displays violin plot for multiple features against the 'cnt' column.
    """
    plt.figure(figsize=(20, 15))
    columns_to_plot = ['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 
                       'weathersit', 'temp', 'atemp', 'hum', 'windspeed']
    
    for idx, col in enumerate(columns_to_plot):
        plt.subplot(4, 3, idx + 1)
        if df[col].nunique() < 10:
            sns.violinplot(x=col, y='cnt', data=df)
        else:
            sns.scatterplot(x=col, y='cnt', data=df, alpha=0.5)
        plt.title(f'{col} vs cnt')
    
    plt.tight_layout()
    plt.show()

In [157]:
def display_corelation(df):
    """
    Displays the correlation heatmap between numerical features.
    """
    corr_matrix = df.corr(numeric_only=True)
    plt.figure(figsize=(25, 20))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation Between Features')
    plt.tight_layout()
    plt.show()

In [178]:
def data_split(df, y, test_size=0.15, val_size=0.15, random_state=42):
    """
    Splits the DataFrame into train, validation, and test sets.
    """

    X_temp, X_test, y_temp, y_test = train_test_split(df, y, test_size=test_size, random_state=random_state)

    val_relative_size = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=val_relative_size, random_state=random_state)

    return X_train, X_val, X_test, y_train, y_val, y_test

# ### 1. Feature selection

In [159]:
def feature_selection_rfe(X_train_scaled, y_train_scaled, n_features=15):
    """
    Selects features using Recursive Feature Elimination (RFE).
    """
    lr_rfe = LinearRegression()
    lr_rfe.fit(X_train_scaled, y_train_scaled)

    rfe = RFE(estimator=lr_rfe, n_features_to_select=n_features)
    rfe = rfe.fit(X_train_scaled, y_train_scaled)
    
    return pd.DataFrame(
        rfe.transform(X_train_scaled),
        columns=X_train_scaled.columns[rfe.support_],
        index=X_train_scaled.index
    )

In [160]:
def calculate_vif(X_train_rfe):
    """
    Calculates Variance Inflation Factor (VIF) for each feature.
    """
    vif = pd.DataFrame()
    vif['Features'] = X_train_rfe.columns
    vif['VIF'] = [variance_inflation_factor(X_train_rfe.values, i) for i in range(X_train_rfe.shape[1])]
    vif['VIF'] = round(vif['VIF'], 2)
    return vif.sort_values(by="VIF", ascending=False)

In [161]:
def transform_rfe(rfe, df):
       return pd.DataFrame(
        rfe.transform(df),
        columns=df.columns[rfe.support_],
        index=df.index
    )

# ### 1. Scale data

In [162]:
def scale_features(X):
    feature_scaler = MinMaxScaler()  
    X_scaled = feature_scaler.fit_transform(X) 
    return feature_scaler, X_scaled

In [163]:
def scale_target(y):
    target_scaler = MinMaxScaler()  
    y_scaled = target_scaler.fit_transform(y.values.reshape(-1, 1)) 
    return target_scaler, y_scaled

In [164]:
def unscale_features(scaler, X_scaled):
    X_unscaled = scaler.inverse_transform(X_scaled)
    return X_unscaled

In [165]:
def unscale_target(scaler, y_scaled):
    y_unscaled = scaler.inverse_transform(y_scaled)
    return y_unscaled

In [166]:
def save_model(model, filename):
    """
    Saves the trained model to a file.
    """
    import joblib
    joblib.dump(model, filename)

In [167]:
def load_model(filename):
    """
    Loads a saved model from a file.
    """
    import joblib
    return joblib.load(filename)

# ### 1. Model Training

In [168]:
def train_linear_model(X_train_scaled, y_train_scaled):
    """
    Trains a Linear Regression model and evaluates it.
    """
    model = LinearRegression()
    model.fit(X_train_scaled, y_train_scaled)
    return model

In [169]:
def evaluate_model(model, X_val, y_val, target_scaler, model_name="model"):
    """
    Evaluate the model using RMSLE, RMSE, NRMSE, and Relative Error.

    Parameters:
    - model: Trained regression model with a predict method
    - X_val: Validation features
    - y_val: True target values (scaled)
    - target_scaler: Scaler used to inverse transform target values
    - model_name: Optional name for the model

    Returns:
    - Dictionary with evaluation metrics
    """

    # Predict and reshape
    y_pred = model.predict(X_val)
    y_pred_array = np.array(y_pred).reshape(-1, 1)
    y_val_array = np.array(y_val).reshape(-1, 1)

    # Inverse transform predictions and targets
    y_pred_original = target_scaler.inverse_transform(y_pred_array)
    y_test_original = target_scaler.inverse_transform(y_val_array)

    # Clip predictions to avoid negative values
    y_pred_clipped = np.clip(y_pred_original, 0, None)

    # Calculate metrics
    rmsle = np.sqrt(mean_squared_log_error(y_test_original, y_pred_clipped))
    rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_clipped))
    nrmse = rmse / (y_test_original.max() - y_test_original.min())
    relative_error = rmse / y_test_original.mean()

    # Print or log
    print(f"📊 Evaluation Results for {model_name}:")
    print(f" - RMSLE: {rmsle:.4f}")
    print(f" - RMSE: {rmse:.4f}")
    print(f" - Normalized RMSE: {nrmse:.2%}")
    print(f" - Relative Error: {relative_error:.2%}")

    return {
        "rmsle": rmsle,
        "rmse": rmse,
        "nrmse": nrmse,
        "relative_error": relative_error
    }


# ### 1. Run script

In [211]:
if __name__ == "__main__":

    
    # Load and clean data
    df = read_data("./data/hour.csv")
    df_cleaned = intial_clean_data(df)

    # Visualize data
    # dispayl_histplot(df_cleaned)
    # display_violinplot(df_cleaned)
    # display_corelation(df_cleaned)

    # Convert to category and one-hot encode
    df_cleaned = convert_to_category_type(df_cleaned, ['weekday', 'weathersit', 'mnth', 'season'])
    df_cleaned = convert_dummies(df_cleaned)
    df_cleaned = convert_bool_float(df_cleaned)
    df_cleaned = convert_int_float(df_cleaned)

    X = df_cleaned.drop(columns=['cnt'])
    y = df_cleaned['cnt']

    df_ref = feature_selection_rfe(X, y, 32)
    
    # Split data into train, validation, and test sets
    X_train, X_val, X_test, y_train, y_val, y_test = data_split(df_ref, y)

    
    # Scale features and target
    feature_scaler, X_train_scaled = scale_features(X_train)
    target_scaler, y_train_scaled = scale_target(y_train)
    
    # Train model
    model = train_linear_model(X_train_scaled, y_train_scaled)

In [214]:
metrics = evaluate_model(model, X_val, y_val, target_scaler, model_name="linearRegression")


📊 Evaluation Results for linearRegression:
 - RMSLE: 4.3519
 - RMSE: 246416.4211
 - Normalized RMSE: 27.44%
 - Relative Error: 136.04%


  return X @ coef_.T + self.intercept_
  return X @ coef_.T + self.intercept_
  return X @ coef_.T + self.intercept_


In [215]:
mlflow.set_experiment("Bike sharing")  # Name your experiment


<Experiment: artifact_location='file:///Users/level3/mlops42/mlruns/389502738792437896', creation_time=1746704143441, experiment_id='389502738792437896', last_update_time=1746704143441, lifecycle_stage='active', name='Bike sharing', tags={}>

In [216]:
models ={
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=100),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1)
}

In [217]:

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Train the model
        model.fit(X_train, y_train)
        # Predict
        y_pred = model.predict(X_val)

        # Evaluate
        metrics = evaluate_model(model, X_val, y_val, target_scaler, model_name)
        
        # Log parameters and metrics
        mlflow.log_param("model_name", model_name)
        mlflow.log_metrics(metrics)
        
        signature = infer_signature(X_val, y_pred)
        input_example = X_val[:5] 
        # Log the model
        
        mlflow.sklearn.log_model(model, "model", signature=signature, input_example=input_example)

  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


📊 Evaluation Results for LinearRegression:
 - RMSLE: 2.5298
 - RMSE: 132243.6623
 - Normalized RMSE: 14.73%
 - Relative Error: 73.01%
📊 Evaluation Results for RandomForest:
 - RMSLE: 0.3836
 - RMSE: 41535.4810
 - Normalized RMSE: 4.63%
 - Relative Error: 22.93%
📊 Evaluation Results for XGBoost:
 - RMSLE: 1.0750
 - RMSE: 41805.7023
 - Normalized RMSE: 4.66%
 - Relative Error: 23.08%


In [218]:
models = {
    "RandomForestRegressor": GridSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_grid={
            "n_estimators": [100, 200],
            "max_depth": [None, 10],
            "min_samples_split": [2, 5],
        },
        cv=5,
        scoring="neg_mean_squared_error",
        n_jobs=-1
    ),
    "XGBRegressor": GridSearchCV(
        estimator=XGBRegressor(objective="reg:squarederror", random_state=42),
        param_grid={
            "n_estimators": [100, 200],
            "learning_rate": [0.05, 0.1],
            "max_depth": [3, 6]
        },
        cv=5,
        scoring="neg_mean_squared_error",
        n_jobs=-1
    )
}

for model_name, model in models.items():
    for rfe_selection in range(10, 34):
        
        with mlflow.start_run(run_name=model_name):
            model.fit(X_train, y_train)
            best_model = model.best_estimator_
            y_pred = best_model.predict(X_val)
    
            # Evaluate (your custom function)
            metrics = evaluate_model(best_model, X_val, y_val, target_scaler, model_name)
    
            # Log best hyperparameters and metrics
            mlflow.log_param("model_name", model_name)
            mlflow.log_params(model.best_params_)  # ← logs hyperparameters from GridSearchCV
            mlflow.log_metrics(metrics)
    
            # Log model with signature and input example
            signature = infer_signature(X_val, y_pred)
            input_example = X_val[:5]
            mlflow.sklearn.log_model(best_model, "model", signature=signature, input_example=input_example)
    
            print(f"✅ {model_name} logged to MLflow with best params.")

📊 Evaluation Results for RandomForestRegressor:
 - RMSLE: 0.3800
 - RMSE: 41066.4408
 - Normalized RMSE: 4.57%
 - Relative Error: 22.67%
✅ RandomForestRegressor logged to MLflow with best params.
📊 Evaluation Results for XGBRegressor:
 - RMSLE: 1.1361
 - RMSE: 40188.8780
 - Normalized RMSE: 4.48%
 - Relative Error: 22.19%
✅ XGBRegressor logged to MLflow with best params.
