In [0]:
# Pull the four feature columns and the price label from the training table.
# The adjacent literals are concatenated into a single query string.
sql_df = spark.sql(
    "SELECT square_feet_float, num_rooms_float, age_float, distance_to_city_km_float, price_float "
    "FROM kaggle_ml_demo.house_prediction.train_data_house_prediction_float"
)
display(sql_df)

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Pack the four numeric inputs into the single 'features' vector column that
# the regressor below is configured to read.
feature_columns = [
    'square_feet_float',
    'num_rooms_float',
    'age_float',
    'distance_to_city_km_float',
]
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
)
data = assembler.transform(sql_df)

# 80/20 train/test split; the seed is fixed so the sampler is seeded the same
# way on every run.
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Fit a linear regression that predicts price_float from the assembled vector.
lr = LinearRegression(
    featuresCol='features',
    labelCol='price_float',
)
lr_model = lr.fit(train_data)

# Score the held-out rows and measure root-mean-squared error against the label.
predictions = lr_model.transform(test_data)
evaluator = RegressionEvaluator(
    labelCol='price_float',
    predictionCol='prediction',
    metricName='rmse',
)
rmse = evaluator.evaluate(predictions)

# Wrap the scalar in a one-row DataFrame so it renders through display().
display(spark.createDataFrame([(rmse,)], ['RMSE']))

In [0]:
import mlflow
from mlflow.models.signature import infer_signature
from pyspark.ml import PipelineModel

# Set the registry URI to Unity Catalog
mlflow.set_registry_uri("databricks-uc")

# Define the catalog, schema, and model name
catalog_name = "kaggle_ml_demo"
schema_name = "house_prediction"
model_name = "ml_model_house_prediction"
full_model_name = f"{catalog_name}.{schema_name}.{model_name}"

# Bundle the fitted VectorAssembler with the regression model. On its own,
# lr_model only accepts the assembled 'features' vector column, so logging it
# with a signature that lists the raw feature columns would advertise an input
# the model cannot score. The pipeline accepts the raw columns directly, which
# makes the signature below truthful.
pipeline_model = PipelineModel(stages=[assembler, lr_model])

# Infer the signature from a handful of rows: only column names and dtypes
# matter here, so there is no need to collect the whole training set to the
# driver. train_data already carries a 'features' column, so select just the
# raw inputs before running them through the pipeline (the assembler would
# otherwise refuse to overwrite its output column).
signature_input = train_data.select(feature_columns).limit(5)
signature = infer_signature(
    signature_input.toPandas(),
    pipeline_model.transform(signature_input).select("prediction").toPandas(),
)

# Log the pipeline with the signature
with mlflow.start_run() as run:
    mlflow.spark.log_model(
        pipeline_model,
        "model",
        signature=signature
    )

    # Register the model to Unity Catalog
    model_uri = f"runs:/{run.info.run_id}/model"
    mlflow.register_model(model_uri=model_uri, name=full_model_name)