# Implement Bayesian Ridge Regression
Below is a Python implementation that mimics the Bayesian Ridge Regression workflow used by PyCaret, using libraries like pandas, scikit-learn, and matplotlib for preprocessing, training, testing, and visualization.

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score

In [11]:
# Load and preprocess data
def load_and_preprocess_data(source: str, target_col: str) -> pd.DataFrame:
    df = pd.read_csv(source, parse_dates=['Date'])
    df.rename(columns={'Date': 'date'}, inplace=True)
    df = df.sort_values('date')
    
    # Drop target variable NaN values
    df.dropna(subset=[target_col], inplace=True)

    # Drop mortgage rate column (if using diff as target)
    if target_col == 'MORTGAGE30US_diff':
        df.drop(columns=['MORTGAGE30US'], inplace=True, errors='ignore')
    elif target_col == 'MORTGAGE30US':
        df.drop(columns=['MORTGAGE30US_diff'], inplace=True, errors='ignore')
    
    return df

# Feature engineering
def create_features(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['weekofyear'] = df['date'].dt.isocalendar().week
    df['rolling_mean_3'] = df[target_col].rolling(window=3).mean()
    df['rolling_std_3'] = df[target_col].rolling(window=3).std()

    # drop date column
    df.drop(columns=['date'], inplace=True)

    # Convert everything to float32
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].astype('category')
        else:
            df[col] = df[col].astype('float32')
    return df

In [12]:
# Create a pipeline for preprocessing and modeling
def create_pipeline(model):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    return pipeline

In [13]:
# Train and evaluate models with K-Fold CV
def train_and_evaluate_with_kfold(df: pd.DataFrame, target_col: str, n_splits: int = 5):
    # Split data into features and target
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Define models
    bayesian_model = create_pipeline(BayesianRidge())
    linear_model = create_pipeline(LinearRegression())

    models = {
        "Bayesian Ridge Regression": bayesian_model,
        "Linear Regression": linear_model
    }

    # Perform K-Fold Cross-Validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    results = {}

    for model_name, model in models.items():
        cv_scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
        rmse_scores = np.sqrt(-cv_scores)
        results[model_name] = {
            "Mean RMSE": rmse_scores.mean(),
            "Std RMSE": rmse_scores.std()
        }
        print(f"{model_name} - Mean RMSE: {rmse_scores.mean():.4f}, Std RMSE: {rmse_scores.std():.4f}")

    # Train final models on the full dataset
    for model_name, model in models.items():
        model.fit(X, y)
        joblib.dump(model, f'../models/{model_name.lower().replace(" ", "_")}_pipeline.pkl')

    return models, results

In [14]:
target_col = 'MORTGAGE30US_diff'
source = '../data/full_mortgage_dataset.csv'

# Load and preprocess data
df = load_and_preprocess_data(source, target_col)
df = create_features(df, target_col)
df.fillna(0, inplace=True)  # Fill NaN values created by rolling features

# Train and evaluate models
models, results = train_and_evaluate_with_kfold(df, target_col, n_splits=5)

Bayesian Ridge Regression - Mean RMSE: 0.0883, Std RMSE: 0.0338
Linear Regression - Mean RMSE: 0.0892, Std RMSE: 0.0363
