# Modular MLflow Pipeline for California Housing
In this notebook you can configure your models and hyperparameter search at the top.

In [8]:
# === Configuration ===
# List your models here (sklearn estimator instances):
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Search settings
# (defined first so the seed below can be reused when building the estimators)
n_iter_random = 10  # only used if search_method == "random"
cv_folds = 5
random_state = 42  # shared seed: train/test split, random search, and stochastic estimators

# Candidate estimators, keyed by the name used for run names and MLflow tags.
# The tree-based estimators are randomized internally; seeding them makes a
# "Restart Kernel -> Run All" reproduce the same fitted models and metrics.
models = {
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(random_state=random_state),
    "RandomForest": RandomForestRegressor(random_state=random_state)
}

# Choose search method: "grid" or "random"
search_method = "grid"

# Define hyperparameter grid or distributions for each model.
# Keys must match the keys of `models`; an empty dict means "fit with defaults".
param_grids = {
    "LinearRegression": {},
    "DecisionTree": {
        "max_depth": [5, 10, 20, None],
        "min_samples_split": [2, 5, 10]
    },
    "RandomForest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [5, 10, 20, None]
    }
}


In [9]:
# === Imports ===
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error, r2_score

import mlflow
import mlflow.sklearn

# Utility to create run name
def make_run_name(model_name):
    """Return an MLflow run name of the form ``YYYY_MM_DD_HH_MM_<model_name>``.

    The timestamp is taken from the local clock at call time and has minute
    resolution.
    """
    timestamp = f"{datetime.now():%Y_%m_%d_%H_%M}"
    return "{}_{}".format(timestamp, model_name)

# Load the California Housing data as a labelled feature frame and a named target series
housing = fetch_california_housing()
y = pd.Series(housing.target, name="MedHouseVal")
X = pd.DataFrame(housing.data, columns=housing.feature_names)

# Hold out 20% of the rows for the final evaluation (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=random_state,
    test_size=0.2,
)

# Learn the scaling statistics from the training split only, then apply them to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [11]:
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient

def log_model_to_mlflow(
    model, 
    model_name: str,
    X_train, 
    y_train, 
    best_params: dict,
    input_example=None,
    search_method: str = "grid",
    registered_model_name: str = "CaliforniaHousingModel"
):
    """
    Log a fitted model to the active MLflow run, register it, and annotate the new version.

    The model is logged under the artifact path "model" together with an
    inferred signature and an input example. The model version created by the
    registration is then tagged with the model type and search method and given
    a description containing the best hyperparameters.

    Parameters:
        model: Trained sklearn model (must support ``predict``)
        model_name: Logical name of the model (e.g., "RandomForest")
        X_train: Training features; used to infer the input schema and, via a
            one-row prediction, the output schema
        y_train: Training labels. Currently not used by this function; kept so
            existing callers keep working
        best_params: Dictionary of best hyperparameters (written into the description)
        input_example: Optional input example (default: first row of X_train)
        search_method: "grid" or "random" (stored as a tag)
        registered_model_name: Name under which the model should be registered
    """
    if input_example is None:
        input_example = X_train[:1]

    # 1. Signature: inputs from the training features, outputs from a sample prediction
    signature = infer_signature(X_train, model.predict(X_train[:1]))

    # 2. Log and register model
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        signature=signature,
        input_example=input_example,
        registered_model_name=registered_model_name
    )

    # 3. Resolve the version that THIS call created.
    # Prefer the version reported by log_model itself: it cannot be confused with
    # a version registered concurrently by someone else and avoids the deprecated
    # stage-based lookup.
    client = MlflowClient()
    version = getattr(model_info, "registered_model_version", None)
    if version is None:
        # Fallback for MLflow releases whose ModelInfo does not expose the version.
        version = client.get_latest_versions(registered_model_name, stages=["None"])[0].version
    # The registry client APIs are called with the version as a string, as before.
    version = str(version)

    # Set tags
    client.set_model_version_tag(
        name=registered_model_name,
        version=version,
        key="model_type",
        value=model_name
    )
    client.set_model_version_tag(
        name=registered_model_name,
        version=version,
        key="search_method",
        value=search_method
    )

    # Set description
    description = f"Modelltyp: {model_name}\nHyperparameter: {best_params}"
    client.update_model_version(
        name=registered_model_name,
        version=version,
        description=description
    )

    print(f"✅ Modell {model_name} registriert als Version {version}")


In [None]:
# === Training Loop with MLflow ===
def _build_searcher(estimator, param_space, method):
    """Create the unfitted hyperparameter search for one estimator.

    Parameters:
        estimator: sklearn estimator instance to tune
        param_space: dict mapping parameter names to candidate lists (or, for
            random search, scipy distributions)
        method: "grid" for GridSearchCV, "random" for RandomizedSearchCV

    Raises:
        ValueError: if ``method`` is neither "grid" nor "random"
    """
    if method == "grid":
        return GridSearchCV(estimator, param_space, cv=cv_folds, n_jobs=-1)
    if method == "random":
        # Candidate lists are passed through unchanged: RandomizedSearchCV samples
        # uniformly from a list. Converting them to randint(min, max + 1) would
        # fail on lists containing None and would search values that were never
        # listed as candidates.
        return RandomizedSearchCV(
            estimator,
            param_space,
            n_iter=n_iter_random,
            cv=cv_folds,
            random_state=random_state,
            n_jobs=-1,
        )
    # Fail fast instead of silently running a random search on a typo.
    raise ValueError(f'search_method must be "grid" or "random", got {method!r}')


mlflow.set_experiment("California_Housing_Modular_Pipeline")

# Autologging is a global switch, so enable it once rather than on every iteration.
mlflow.sklearn.autolog(log_input_examples=True, log_model_signatures=True)

results = []

for name, model in models.items():
    # Models without an entry in param_grids are fitted with their defaults.
    searcher = _build_searcher(model, param_grids.get(name, {}), search_method)

    run_name = make_run_name(name)
    with mlflow.start_run(run_name=run_name):
        # Tag model type
        mlflow.set_tag("model_type", name)

        # Fit and search (cross-validation on the training split only)
        searcher.fit(X_train_scaled, y_train.values)
        best = searcher.best_estimator_

        # Evaluate the refitted best estimator on the held-out test split
        preds = best.predict(X_test_scaled)
        rmse = root_mean_squared_error(y_test, preds)
        r2 = r2_score(y_test, preds)

        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)

        # Log best parameters
        mlflow.log_params(searcher.best_params_)

        # Log model explicitly as versioned under same name
        # NOTE(review): autologging may already store a model artifact for this
        # run; confirm the explicit "model" artifact does not clash with it.
        log_model_to_mlflow(
            model=best,
            model_name=name,
            X_train=X_train_scaled,
            y_train=y_train,
            best_params=searcher.best_params_,
            search_method=search_method
        )

        print(f"{run_name}: RMSE={rmse:.4f}, R2={r2:.4f}")
        results.append((name, rmse, r2))

# Summarize: one row per model, best (lowest RMSE) first
res_df = pd.DataFrame(results, columns=["model", "rmse", "r2"]).sort_values("rmse")
res_df


2025/05/22 09:32:30 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.
Registered model 'CaliforniaHousingModel' already exists. Creating a new version of this model...
Created version '2' of model 'CaliforniaHousingModel'.


2025_05_22_09_32_LinearRegression: RMSE=0.7456, R2=0.5758


2025/05/22 09:33:00 INFO mlflow.sklearn.utils: Logging the 5 best runs, 7 runs will be omitted.
Registered model 'CaliforniaHousingModel' already exists. Creating a new version of this model...
Created version '3' of model 'CaliforniaHousingModel'.


2025_05_22_09_32_DecisionTree: RMSE=0.6410, R2=0.6865


2025/05/22 09:36:22 INFO mlflow.sklearn.utils: Logging the 5 best runs, 7 runs will be omitted.
Registered model 'CaliforniaHousingModel' already exists. Creating a new version of this model...
Created version '4' of model 'CaliforniaHousingModel'.


2025_05_22_09_33_RandomForest: RMSE=0.5023, R2=0.8074


Unnamed: 0,model,rmse,r2
2,RandomForest,0.502341,0.807429
1,DecisionTree,0.640966,0.686481
0,LinearRegression,0.745581,0.575788
