# Modular MLflow Pipeline for California Housing

In [None]:
# === Configuration ===
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

# Import für Gradient Boosting Modelle
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from catboost import CatBoostRegressor

models = {
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "KNN": KNeighborsRegressor(),
    "XGBoost": XGBRegressor(random_state=42, verbosity=0),
    "LightGBM": LGBMRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
    "HistGB": HistGradientBoostingRegressor(random_state=42),
    "SVR": SVR(),
    "MLPRegressor": MLPRegressor(random_state=42, max_iter=500)
}

# grid or random search
search_method = "grid"

# hyperparameter grid or distributions:
param_grids = {
    "LinearRegression": {},
    "DecisionTree": {
        "max_depth": [5, 10, 20, None],
        "min_samples_split": [2, 5, 10]
    },
    "RandomForest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [5, 10, 20, None]
    },
    "KNN": {
        "n_neighbors": [3, 5, 10, 20],
        "weights": ['uniform', 'distance']
    },
    "XGBoost": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 10],
        "learning_rate": [0.01, 0.1, 0.2]
    },
    "LightGBM": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 10],
        "learning_rate": [0.01, 0.1, 0.2]
    },
    "CatBoost": {
        "iterations": [50, 100, 200],
        "depth": [3, 5, 10],
        "learning_rate": [0.01, 0.1, 0.2]
    },
    "HistGB": {
        "max_iter": [50, 100, 200],
        "max_depth": [3, 5, 10],
        "learning_rate": [0.01, 0.1, 0.2]
    },
    "SVR": {
        "kernel": ['linear', 'poly', 'rbf'],
        "C": [0.1, 1, 10],
        "epsilon": [0.01, 0.1, 0.2]
    },
    "MLPRegressor": {
        "hidden_layer_sizes": [(50,), (100,), (50, 50)],
        "activation": ['relu', 'tanh'],
        "alpha": [0.0001, 0.001, 0.01]
    }
}

# Search settings
n_iter_random = 10  # only for random
cv_folds = 5
random_state = 42


In [3]:
# === Imports ===
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error, r2_score

import mlflow
import mlflow.sklearn

# run name
def make_run_name(model_name):
    ts = datetime.now().strftime("%Y_%m_%d_%H_%M")
    return f"{ts}_{model_name}"

# Load and preprocess data
housing = fetch_california_housing()
X = pd.DataFrame(housing.data, columns=housing.feature_names)
y = pd.Series(housing.target, name="MedHouseVal")

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [4]:
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient

def log_model_to_mlflow(
    model, 
    model_name: str,
    X_train, 
    y_train, 
    best_params: dict,
    input_example=None,
    search_method: str = "grid",
    registered_model_name: str = "CaliforniaHousingModel"
):
    
    if input_example is None:
        input_example = X_train[:1]

    # 1. Signature
    signature = infer_signature(X_train, model.predict(X_train[:1]))

    # 2. Log and register model
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        signature=signature,
        input_example=input_example,
        registered_model_name=registered_model_name
    )

    # 3. Tag and describe registered model version
    client = MlflowClient()
    latest_version = client.get_latest_versions(registered_model_name, stages=["None"])[0]

    # Set tags
    client.set_model_version_tag(
        name=registered_model_name,
        version=latest_version.version,
        key="model_type",
        value=model_name
    )
    client.set_model_version_tag(
        name=registered_model_name,
        version=latest_version.version,
        key="search_method",
        value=search_method
    )

    # Set description
    description = f"Model type: {model_name}\nHyperparameter: {best_params}"
    client.update_model_version(
        name=registered_model_name,
        version=latest_version.version,
        description=description
    )

    print(f"Modell {model_name} registered as Version {latest_version.version}")


In [5]:
# === Training Loop with MLflow ===
mlflow.set_experiment("California_Housing_Modular_Pipeline")

results = []

for name, model in models.items():
    # Select parameter grid/distribution
    params = param_grids.get(name, {})

    if search_method == "grid":
        searcher = GridSearchCV(model, params, cv=cv_folds, n_jobs=-1)
    else:
        from scipy.stats import randint
        # Convert lists to distributions for RandomizedSearchCV
        dists = {k: (randint(min(v), max(v)+1) if isinstance(v, list) else v) 
                 for k, v in params.items()}
        searcher = RandomizedSearchCV(model, dists, 
                                      n_iter=n_iter_random, cv=cv_folds,
                                      random_state=random_state, n_jobs=-1)

    run_name = make_run_name(name)
    with mlflow.start_run(run_name=run_name):
        # Tag model type
        mlflow.set_tag("model_type", name)

        # Autolog
        mlflow.sklearn.autolog(log_input_examples=True, log_model_signatures=True)

        # Fit and search
        searcher.fit(X_train_scaled, y_train.values)
        best = searcher.best_estimator_

        preds = best.predict(X_test_scaled)
        rmse = root_mean_squared_error(y_test, preds)
        r2 = r2_score(y_test, preds)

        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)

        # Log best parameters
        mlflow.log_params(searcher.best_params_)

        # Log model explicitly as versioned under same name
        log_model_to_mlflow(
            model=best,
            model_name=name,
            X_train=X_train_scaled,
            y_train=y_train,
            best_params=searcher.best_params_,
            search_method=search_method
        )

        # print(f"{run_name}: RMSE={rmse:.4f}, R2={r2:.4f}")
        # results.append((name, rmse, r2))

# Print results
import pandas as pd
res_df = pd.DataFrame(results, columns=["model", "rmse", "r2"]).sort_values("rmse")
res_df


2025/05/27 10:24:02 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.
Registered model 'CaliforniaHousingModel' already exists. Creating a new version of this model...
Created version '11' of model 'CaliforniaHousingModel'.
  latest_version = client.get_latest_versions(registered_model_name, stages=["None"])[0]


Modell LinearRegression registered as Version 11


2025/05/27 10:24:46 INFO mlflow.sklearn.utils: Logging the 5 best runs, 7 runs will be omitted.
Registered model 'CaliforniaHousingModel' already exists. Creating a new version of this model...
Created version '12' of model 'CaliforniaHousingModel'.
  latest_version = client.get_latest_versions(registered_model_name, stages=["None"])[0]


Modell DecisionTree registered as Version 12


2025/05/27 10:28:14 INFO mlflow.sklearn.utils: Logging the 5 best runs, 7 runs will be omitted.
Registered model 'CaliforniaHousingModel' already exists. Creating a new version of this model...
Created version '13' of model 'CaliforniaHousingModel'.
  latest_version = client.get_latest_versions(registered_model_name, stages=["None"])[0]


Modell RandomForest registered as Version 13


2025/05/27 10:28:59 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.
Registered model 'CaliforniaHousingModel' already exists. Creating a new version of this model...
Created version '14' of model 'CaliforniaHousingModel'.
  latest_version = client.get_latest_versions(registered_model_name, stages=["None"])[0]


Modell KNN registered as Version 14


2025/05/27 10:30:04 INFO mlflow.sklearn.utils: Logging the 5 best runs, 22 runs will be omitted.
Registered model 'CaliforniaHousingModel' already exists. Creating a new version of this model...
Created version '15' of model 'CaliforniaHousingModel'.
  latest_version = client.get_latest_versions(registered_model_name, stages=["None"])[0]


Modell XGBoost registered as Version 15
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000878 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 8
[LightGBM] [Info] Start training from score 2.071947


2025/05/27 10:31:06 INFO mlflow.sklearn.utils: Logging the 5 best runs, 22 runs will be omitted.
Registered model 'CaliforniaHousingModel' already exists. Creating a new version of this model...
Created version '16' of model 'CaliforniaHousingModel'.
  latest_version = client.get_latest_versions(registered_model_name, stages=["None"])[0]


Modell LightGBM registered as Version 16


Unnamed: 0,model,rmse,r2
