### Create Train/Validation/Test splits

In [0]:
%pip install mlflow

In [0]:
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from delta.tables import DeltaTable

# Resolve the source table from the notebook's "city" widget (default "antwerpen").
dbutils.widgets.text("city", "antwerpen")
city = dbutils.widgets.get("city")
table_name = f"workspace.default.{city}_airbnb_dataset"  # Update with actual catalog & schema

# Look up the newest Delta commit FIRST and then read exactly that version, so
# the dataset_version logged to MLflow always describes the rows used for
# training (a write landing between the two calls can no longer cause a
# mismatch). history(1) returns only the latest commit.
delta_table = DeltaTable.forName(spark, table_name)
dataset_version = delta_table.history(1).select("version").first()["version"]
spark_df = spark.read.option("versionAsOf", dataset_version).table(table_name)
df = spark_df.toPandas()

# Separate features from the regression target.
# NOTE(review): assumes every non-target column is already numeric — confirm.
target_column = "price"
x = df.drop(columns=[target_column]).values  # All feature columns as a NumPy array
y = df[target_column].values  # Target variable as a NumPy array

# Split into Train (80%) and Temp (20%)
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.2, random_state=42)

# Split Temp into Validation (10%) and Test (10%)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

# Print shapes to verify
print(f"Train: x_train={x_train.shape}, y_train={y_train.shape}")
print(f"Validation: x_val={x_val.shape}, y_val={y_val.shape}")
print(f"Test: x_test={x_test.shape}, y_test={y_test.shape}")

### Train Linear Regression Model

In [0]:
# Configure MLflow for Databricks tracking and select the experiment.
# The first assignment replaces a private MLflow helper with a stub that always
# returns "databricks-uc".
# NOTE(review): presumably a workaround so the registry URI resolves without
# querying the Spark session — confirm it is still required.
mlflow.tracking._model_registry.utils._get_registry_uri_from_spark_session = lambda: "databricks-uc"
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Users/vanthiel.erwin@gmail.com/linear-regression")

with mlflow.start_run():
    # Record which Delta table version the data came from.
    mlflow.log_param("dataset_version", dataset_version)

    # Fit an ordinary least-squares model on the training split.
    model = LinearRegression().fit(x_train, y_train)

    # Predict on each split (train, validation, test — in that order).
    y_train_pred, y_val_pred, y_test_pred = (
        model.predict(features) for features in (x_train, x_val, x_test)
    )

    # Mean squared error per split.
    train_mse, val_mse, test_mse = (
        mean_squared_error(actual, predicted)
        for actual, predicted in (
            (y_train, y_train_pred),
            (y_val, y_val_pred),
            (y_test, y_test_pred),
        )
    )

    # Send all three metrics to MLflow.
    mlflow.log_metrics(
        {
            "train_mse": train_mse,
            "val_mse": val_mse,
            "test_mse": test_mse,
        }
    )

    # Store the fitted model as a run artifact, named after the city.
    mlflow.sklearn.log_model(model, f"linear_regression_model-{city}")

### Train Neural Network

In [0]:
%pip install torch

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import mlflow
import mlflow.pytorch
from torch.utils.data import DataLoader, TensorDataset

# Convert the NumPy splits to float32 PyTorch tensors.
# torch.as_tensor (rather than torch.tensor) keeps this cell re-runnable: on a
# second execution the names already hold float32 tensors, which as_tensor
# returns unchanged instead of emitting a copy-construct warning and copying.
x_train, y_train, x_val, y_val, x_test, y_test = (
    torch.as_tensor(split, dtype=torch.float32)
    for split in (x_train, y_train, x_val, y_val, x_test, y_test)
)

# Create PyTorch DataLoaders
batch_size = 32  # Adjust batch size as needed
train_dataset = TensorDataset(x_train, y_train)
val_dataset = TensorDataset(x_val, y_val)
test_dataset = TensorDataset(x_test, y_test)

# Only the training data is shuffled; evaluation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

class SimpleNN(nn.Module):
    """Feed-forward network with a single hidden layer: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of the hidden layer (default 128).
        output_size: Number of output values per sample (default 1).
    """

    def __init__(self, input_size, hidden_size=128, output_size=1):
        super().__init__()
        # Layers are registered in the order they are applied in forward().
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return fc2(relu(fc1(x))) for the input batch ``x``."""
        return self.fc2(self.relu(self.fc1(x)))

# Hyperparameters for the network trained (and logged to MLflow) below.
learning_rate = 0.001
num_epochs = 10
hidden_size = 128
output_size = 1  # For regression, output is a single value
input_size = x.shape[1]  # One input per column of the feature matrix x

# Network, loss function and optimiser.
model = SimpleNN(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

def _mean_loss(net, loader, loss_fn, device):
    """Return the per-sample mean of ``loss_fn`` over all batches in ``loader``.

    Each batch loss is weighted by its batch size, so a shorter final batch
    does not skew the result (a plain mean of batch means would over-weight
    it). Assumes ``loss_fn`` reduces each batch with a mean, as nn.MSELoss()
    does by default.

    Args:
        net: Model to evaluate; it is switched to eval mode.
        loader: Iterable yielding ``(features, labels)`` tensor batches.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The sample-weighted mean loss as a Python float.
    """
    net.eval()
    loss_sum = 0.0
    sample_count = 0
    with torch.no_grad():
        for features, labels in loader:
            features = features.to(device)
            labels = labels.to(device).view(-1, 1)  # column vector, like the model output
            batch_len = labels.size(0)
            loss_sum += loss_fn(net(features), labels).item() * batch_len
            sample_count += batch_len
    return loss_sum / sample_count


# Replace a private MLflow helper with a stub that always returns "databricks-uc".
# NOTE(review): presumably a workaround so the registry URI resolves without
# querying the Spark session — confirm it is still required.
mlflow.tracking._model_registry.utils._get_registry_uri_from_spark_session = lambda: "databricks-uc"
mlflow.set_tracking_uri("databricks")  # Use Databricks' MLflow tracking
mlflow.set_experiment("/Users/vanthiel.erwin@gmail.com/simple-nn")  # Set MLflow experiment
with mlflow.start_run():
    # Record the data version and every hyperparameter that shapes training.
    mlflow.log_param("dataset_version", dataset_version)
    mlflow.log_param("input_size", input_size)
    mlflow.log_param("hidden_size", hidden_size)
    mlflow.log_param("output_size", output_size)
    mlflow.log_param("learning_rate", learning_rate)
    mlflow.log_param("num_epochs", num_epochs)
    mlflow.log_param("batch_size", batch_size)

    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Training Loop
    for epoch in range(num_epochs):
        model.train()
        loss_sum = 0.0
        sample_count = 0

        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device).view(-1, 1)
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Weight by batch size so the epoch figure is a true per-sample mean.
            batch_len = labels.size(0)
            loss_sum += loss.item() * batch_len
            sample_count += batch_len

        train_loss = loss_sum / sample_count
        mlflow.log_metric("train_loss", train_loss, step=epoch)

        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}")

    # Evaluate Model on Validation Set
    val_loss = _mean_loss(model, val_loader, criterion, device)
    mlflow.log_metric("val_loss", val_loss)
    print(f"Validation Loss: {val_loss:.4f}")

    # Evaluate Model on Test Set
    test_loss = _mean_loss(model, test_loader, criterion, device)
    mlflow.log_metric("test_loss", test_loss)
    print(f"Test Loss: {test_loss:.4f}")

    # Log Model in MLflow
    mlflow.pytorch.log_model(model, f"simple_nn_model-{city}")
