In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Fetch the California housing dataset as pandas objects (as_frame=True).
cali_housing = fetch_california_housing(as_frame=True)
housing_features, housing_target = cali_housing.data, cali_housing.target

# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    housing_features,
    housing_target,
    test_size=0.2,
    random_state=123,
)

In [2]:
# Display the feature frame; the rendered table below is elided to its first and last rows.
cali_housing.data

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [3]:
from domino_data.training_sets import client, model
import pandas as pd

# Assemble the training frame: the feature columns of X_train plus one target
# column, named after the dataset's first target name and filled from y_train
# (aligned on the row index).
training_target_column = cali_housing.target_names[0]
training_df = X_train.reindex(columns=cali_housing.feature_names).assign(
    **{training_target_column: y_train}
)

# No categorical, timestamp or ordinal columns are declared for monitoring.
tsv_monitoring_meta = model.MonitoringMeta(
    categorical_columns=[],
    timestamp_columns=[],
    ordinal_columns=[],
)

# Create a version of the named training set from the assembled frame.
tsv = client.create_training_set_version(
    training_set_name="cali_house_price_prediction_regression",
    df=training_df,
    key_columns=[],
    target_columns=cali_housing.target_names,
    exclude_columns=[],
    meta={"experiment_id": "0.1"},
    monitoring_meta=tsv_monitoring_meta,
)

print(f"TrainingSetVersion {tsv.training_set_name}:{tsv.number}")

TrainingSetVersion cali_house_price_prediction_regression:17


In [4]:
from sklearn.linear_model import LinearRegression
from domino_data_capture.data_capture_client import DataCaptureClient
import uuid
import datetime
import pandas as pd

# Fit a linear regression on the training partition.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Predict on the training rows. The original note labels this step
# "infer model signature", but no signature is built from `predictions`
# in the cells shown here.
predictions = lin_reg.predict(X_train)

In [5]:
# Write the held-out feature rows to a CSV file, omitting the index column (index=False).
X_test.to_csv("data-to-be-scored.csv", index=False)

In [6]:
import mlflow
# Select the MLflow experiment by name.
experiment_name = "House Price Prediction Expt"
mlflow.set_experiment(experiment_name=experiment_name)

# Notebook-level capture client, constructed with the dataset's feature and
# target names; HousingModel.predict refers to it by this global name.
data_capture_client = DataCaptureClient(
    cali_housing.feature_names,
    cali_housing.target_names,
)

class HousingModel(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper that predicts with a wrapped model and captures each prediction.

    Every call to ``predict`` forwards the input to the wrapped model and then
    reports one capture event per predicted row through the notebook-level
    ``data_capture_client``.
    """

    def __init__(self, model):
        # Wrapped estimator; only its ``predict`` method is used by this class.
        self.model = model

    def predict(self, context, model_input, params=None):
        """Return the wrapped model's predictions and capture each one.

        Args:
            context: Not used by this method.
            model_input: Rows to score. A pandas DataFrame is converted to a
                list of row lists before capture; any other input is indexed
                by row position as-is (the original note assumes a list of
                lists).
            params: Not used by this method.

        Returns:
            Whatever ``self.model.predict(model_input)`` returned.
        """
        # A single UTC timestamp is shared by every event captured in this call.
        captured_at = datetime.datetime.now(datetime.timezone.utc).isoformat()
        prediction = self.model.predict(model_input)

        # Rows handed to the capture client: plain lists for DataFrame input,
        # the caller's own object otherwise.
        if isinstance(model_input, pd.DataFrame):
            rows = model_input.values.tolist()
        else:
            rows = model_input

        for row_idx in range(len(prediction)):
            # Each event gets a fresh random id and carries its prediction
            # wrapped in a one-element list.
            data_capture_client.capturePrediction(
                rows[row_idx],
                [prediction[row_idx]],
                event_id=uuid.uuid4(),
                timestamp=captured_at,
            )
        return prediction

# Wrap the fitted regressor. NOTE: this rebinds the notebook-level name `model`,
# which an earlier cell imported from domino_data.training_sets.
model = HousingModel(lin_reg)

In [7]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the held-out partition with the fitted regressor and report the fit
# coefficients plus three error metrics, each rounded to two decimals.
y_pred = lin_reg.predict(X_test)

# The coefficients
print("Coefficients: \n", lin_reg.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))
# The mean absolute error (this line was previously mislabelled "Mean squared error")
print("Mean absolute error: %.2f" % mean_absolute_error(y_test, y_pred))

Coefficients: 
 [ 4.39988248e-01  9.15770358e-03 -1.11827735e-01  6.47857908e-01
 -6.55068105e-06 -3.92330215e-03 -4.17033805e-01 -4.27676550e-01]
Mean squared error: 0.52
Coefficient of determination: 0.61
Mean squared error: 0.53


In [8]:
# Record the estimator's hyperparameters, the held-out metrics and the wrapped
# model in a single MLflow run, registering the model under a fixed name.
with mlflow.start_run() as run:
    run_params = lin_reg.get_params()

    # Log every hyperparameter in one batch. The truthiness check skips both
    # None and an empty mapping, replacing the former `== None` comparison.
    if run_params:
        mlflow.log_params(run_params)

    mlflow.log_metric("r2_score", r2_score(y_test, y_pred))
    mlflow.log_metric("mean_squared_error", mean_squared_error(y_test, y_pred))
    mlflow.log_metric("mean_absolute_error", mean_absolute_error(y_test, y_pred))

    # NOTE: `model` must be the HousingModel instance created in an earlier
    # cell. The same name is also imported from domino_data.training_sets
    # further up, so run the cells in order.
    model_info = mlflow.pyfunc.log_model(
        registered_model_name="cali_house_price_prediction_regression",
        python_model=model,
        artifact_path="sklearn-model",
    )

print(model_info)

Registered model 'cali_house_price_prediction_regression' already exists. Creating a new version of this model...
2025/08/13 22:56:27 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: cali_house_price_prediction_regression, version 11


<mlflow.models.model.ModelInfo object at 0x72c198e7a1a0>


Created version '11' of model 'cali_house_price_prediction_regression'.
