# Modular MLflow Pipeline for California Housing

In [None]:
# === Configuration ===
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

# Import für Gradient Boosting Modelle
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from catboost import CatBoostRegressor

models = {
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    # "KNN": KNeighborsRegressor(),
    # "XGBoost": XGBRegressor(random_state=42, verbosity=0),
    # "LightGBM": LGBMRegressor(random_state=42),
    # "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
    # "HistGB": HistGradientBoostingRegressor(random_state=42),
    # "SVR": SVR(),
    # "MLPRegressor": MLPRegressor(random_state=42, max_iter=500)
}

# grid or random search
search_method = "grid"

# hyperparameter grid or distributions:
param_grids = {
    "LinearRegression": {},
    "DecisionTree": {
        "max_depth": [5, 10, 20, None],
        "min_samples_split": [2, 5, 10]
    },
    "RandomForest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [5, 10, 20, None]
    },
    "KNN": {
        "n_neighbors": [3, 5, 10, 20],
        "weights": ['uniform', 'distance']
    },
    "XGBoost": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 10],
        "learning_rate": [0.01, 0.1, 0.2]
    },
    "LightGBM": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 10],
        "learning_rate": [0.01, 0.1, 0.2]
    },
    "CatBoost": {
        "iterations": [50, 100, 200],
        "depth": [3, 5, 10],
        "learning_rate": [0.01, 0.1, 0.2]
    },
    "HistGB": {
        "max_iter": [50, 100, 200],
        "max_depth": [3, 5, 10],
        "learning_rate": [0.01, 0.1, 0.2]
    },
    "SVR": {
        "kernel": ['linear', 'poly', 'rbf'],
        "C": [0.1, 1, 10],
        "epsilon": [0.01, 0.1, 0.2]
    },
    "MLPRegressor": {
        "hidden_layer_sizes": [(50,), (100,), (50, 50)],
        "activation": ['relu', 'tanh'],
        "alpha": [0.0001, 0.001, 0.01]
    }
}

# Search settings
n_iter_random = 10  # only for random
cv_folds = 5
random_state = 42


In [24]:
# === Imports ===
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error, r2_score

import mlflow
import mlflow.sklearn

# run name
def make_run_name(model_name, param_set=None):
    ts = datetime.now().strftime("%Y_%m_%d_%H_%M")
    run_name = f"{ts}_{model_name}"

    if param_set:
        # Create a short param summary, e.g., max_depth=5_n_estimators=100
        param_str = "_".join(f"{k}={v}" for k, v in param_set.items())
        run_name += f"_{param_str}"

    return run_name

# Load and preprocess data from CSV
housing = pd.read_csv('data/housing_processed.csv')

# Separate features and target
X = housing.drop('median_house_value', axis=1)
y = housing['median_house_value']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [25]:
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient

def log_model_to_mlflow(
    model, 
    model_name: str,
    X_train, 
    y_train, 
    best_params: dict,
    input_example=None,
    search_method: str = "grid",
    registered_model_name: str = "CaliforniaHousingModel"
):
    
    if input_example is None:
        input_example = X_train[:1]

    # 1. Signature
    signature = infer_signature(X_train, model.predict(X_train[:1]))

    # 2. Log and register model
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        signature=signature,
        input_example=input_example,
        registered_model_name=registered_model_name
    )

    # 3. Tag and describe registered model version
    client = MlflowClient()
    latest_version = client.get_latest_versions(registered_model_name, stages=["None"])[0]

    # Set tags
    client.set_model_version_tag(
        name=registered_model_name,
        version=latest_version.version,
        key="model_type",
        value=model_name
    )
    client.set_model_version_tag(
        name=registered_model_name,
        version=latest_version.version,
        key="search_method",
        value=search_method
    )

    # Set description
    description = f"Model type: {model_name}\nHyperparameter: {best_params}"
    client.update_model_version(
        name=registered_model_name,
        version=latest_version.version,
        description=description
    )

    print(f"Modell {model_name} registered as Version {latest_version.version}")


In [21]:
# === Training Loop with MLflow ===
mlflow.set_experiment("California_Housing_Modular_Pipeline")

results = []

for name, model in models.items():
    # Select parameter grid/distribution
    params = param_grids.get(name, {})

    if search_method == "grid":
        searcher = GridSearchCV(model, params, cv=cv_folds, n_jobs=-1)
    else:
        from scipy.stats import randint
        # Convert lists to distributions for RandomizedSearchCV
        dists = {k: (randint(min(v), max(v)+1) if isinstance(v, list) else v) 
                 for k, v in params.items()}
        searcher = RandomizedSearchCV(model, dists, 
                                      n_iter=n_iter_random, cv=cv_folds,
                                      random_state=random_state, n_jobs=-1)

    run_name = make_run_name(name)
    with mlflow.start_run(run_name=run_name):
        # Tag model type
        mlflow.set_tag("model_type", name)

        # Autolog
        mlflow.sklearn.autolog(log_input_examples=True, log_model_signatures=True)

        # Fit and search
        searcher.fit(X_train_scaled, y_train.values)
        best = searcher.best_estimator_

        preds = best.predict(X_test_scaled)
        rmse = root_mean_squared_error(y_test, preds)
        r2 = r2_score(y_test, preds)

        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)

        # Log best parameters
        mlflow.log_params(searcher.best_params_)

        # Log model explicitly as versioned under same name
        log_model_to_mlflow(
            model=best,
            model_name=name,
            X_train=X_train_scaled,
            y_train=y_train,
            best_params=searcher.best_params_,
            search_method=search_method
        )

        # print(f"{run_name}: RMSE={rmse:.4f}, R2={r2:.4f}")
        # results.append((name, rmse, r2))

# Print results
import pandas as pd
res_df = pd.DataFrame(results, columns=["model", "rmse", "r2"]).sort_values("rmse")
res_df


ValueError: 
All the 60 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\felix\OneDrive - TH Köln\25_25 SoSe\MLWR\labs\6 – Track the development of the model with MLflow\model\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\felix\OneDrive - TH Köln\25_25 SoSe\MLWR\labs\6 – Track the development of the model with MLflow\model\.venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\felix\OneDrive - TH Köln\25_25 SoSe\MLWR\labs\6 – Track the development of the model with MLflow\model\.venv\Lib\site-packages\sklearn\ensemble\_forest.py", line 360, in fit
    X, y = validate_data(
           ~~~~~~~~~~~~~^
        self,
        ^^^^^
    ...<5 lines>...
        ensure_all_finite=False,
        ^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\felix\OneDrive - TH Köln\25_25 SoSe\MLWR\labs\6 – Track the development of the model with MLflow\model\.venv\Lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\felix\OneDrive - TH Köln\25_25 SoSe\MLWR\labs\6 – Track the development of the model with MLflow\model\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1387, in check_X_y
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
  File "c:\Users\felix\OneDrive - TH Köln\25_25 SoSe\MLWR\labs\6 – Track the development of the model with MLflow\model\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1397, in _check_y
    y = check_array(
        y,
    ...<5 lines>...
        estimator=estimator,
    )
  File "c:\Users\felix\OneDrive - TH Köln\25_25 SoSe\MLWR\labs\6 – Track the development of the model with MLflow\model\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1107, in check_array
    _assert_all_finite(
    ~~~~~~~~~~~~~~~~~~^
        array,
        ^^^^^^
    ...<2 lines>...
        allow_nan=ensure_all_finite == "allow-nan",
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\felix\OneDrive - TH Köln\25_25 SoSe\MLWR\labs\6 – Track the development of the model with MLflow\model\.venv\Lib\site-packages\sklearn\utils\validation.py", line 120, in _assert_all_finite
    _assert_all_finite_element_wise(
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
        X,
        ^^
    ...<4 lines>...
        input_name=input_name,
        ^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\felix\OneDrive - TH Köln\25_25 SoSe\MLWR\labs\6 – Track the development of the model with MLflow\model\.venv\Lib\site-packages\sklearn\utils\validation.py", line 169, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input y contains NaN.


In [26]:
import itertools

mlflow.set_experiment("California_Housing_Modular_Pipeline")

results = []

for name, model in models.items():
    params = param_grids.get(name, {})

    # Erzeuge alle Kombinationen aus dem Parametergrid
    keys, values = zip(*params.items()) if params else ([], [])
    param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)] if keys else [{}]

    for param_set in param_combinations:
        # Erstelle Run-Name basierend auf Modellname + Parametern
        run_name = make_run_name(name, param_set)

        with mlflow.start_run(run_name=run_name):
            mlflow.set_tag("model_type", name)
            mlflow.sklearn.autolog(log_input_examples=True, log_model_signatures=True)

            # Setze Parameter manuell im Modell
            model.set_params(**param_set)
            model.fit(X_train_scaled, y_train.values)

            preds = model.predict(X_test_scaled)
            rmse = root_mean_squared_error(y_test, preds)
            r2 = r2_score(y_test, preds)

            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("r2", r2)
            mlflow.log_params(param_set)

            log_model_to_mlflow(
                model=model,
                model_name=name,
                X_train=X_train_scaled,
                y_train=y_train,
                best_params=param_set,
                search_method="none"  # oder wie auch immer du kennzeichnen willst
            )

            results.append((name, rmse, r2))

import pandas as pd
res_df = pd.DataFrame(results, columns=["model", "rmse", "r2"]).sort_values("rmse")
res_df


KeyboardInterrupt: 