# NYC Taxi Fare Prediction Model

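This notebook trains a Random Forest regression model to predict NYC taxi fare amounts from a features table stored in Unity Catalog, logs the run to MLflow, and registers the resulting model in Unity Catalog under a configurable alias.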

In [0]:
import mlflow
import mlflow.sklearn
from mlflow.tracking.client import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
import numpy as np
from pyspark.sql import functions as F
from datetime import datetime
import warnings
# Suppress every Python warning for the rest of the session (no category or
# module restriction is given, so the filter applies to all of them).
warnings.filterwarnings(action="ignore")

# Record the MLflow version in the notebook output for reproducibility.
print(f"MLflow version: {mlflow.__version__}")

In [0]:
# Notebook input widgets, declared as (name, default value, label) triples and
# created in this order. The defaults appear to target a personal dev
# environment; override them when running elsewhere.
_widget_specs = (
    ("catalog", "mlflow3demo_dev", "Catalog"),
    ("schema", "dev_connor_brown_mlflow3_demo", "Schema"),
    ("features_table", "main.default.features", "Features Table Name"),
    ("model_name", "dev_connor_brown_mlflow3_demo", "Model Name for UC"),
    (
        "experiment_name",
        "/Workspace/Users/connor.brown@databricks.com/experiments/[dev connor_brown] mlflow3_demo",
        "Experiment Name",
    ),
    ("model_alias", "challenger", "Model Alias"),
)

for _widget_name, _widget_default, _widget_label in _widget_specs:
    dbutils.widgets.text(_widget_name, _widget_default, _widget_label)

In [0]:
# Read all widget values in one pass; the key order below matches the order of
# the assignment targets.
(
    catalog,
    schema,
    features_table,
    model_name,
    experiment_name,
    model_alias,
) = (
    dbutils.widgets.get(widget_key)
    for widget_key in (
        "catalog",
        "schema",
        "features_table",
        "model_name",
        "experiment_name",
        "model_alias",
    )
)

# Three-level name (catalog.schema.model) under which the model is registered.
# Note: the features table is used exactly as entered in its widget; the
# catalog/schema widgets are NOT applied to it.
full_model_name = ".".join((catalog, schema, model_name))

print(f"Source table: {features_table}")
print(f"Model will be registered as: {full_model_name}")

In [0]:
# MLflow client whose model-registry calls go to Unity Catalog.
client = MlflowClient(
    registry_uri="databricks-uc",
)

In [0]:
# Open the table named by the `features_table` widget as a Spark DataFrame.
print(f"Loading data from {features_table}...")
df = spark.table(tableName=features_table)

In [0]:
# Feature Engineering and Data Preparation
#
# Projects the raw trip columns into model-ready features, then drops rows
# with non-positive or implausibly large fare, distance or duration.
print("Creating features for the regression model...")

# Exclusive upper bounds used to discard outliers.
MAX_FARE_AMOUNT = 1000
MAX_TRIP_DISTANCE = 100
MAX_TRIP_DURATION_MINUTES = 300  # 5 hours

pickup_ts = F.col("tpep_pickup_datetime")
dropoff_ts = F.col("tpep_dropoff_datetime")

df_features = df.select(
    # Target variable
    F.col("fare_amount").alias("target"),

    # Distance feature
    F.col("trip_distance"),

    # Time-based features derived from the pickup timestamp
    F.hour(pickup_ts).alias("pickup_hour"),
    F.dayofweek(pickup_ts).alias("pickup_day_of_week"),
    F.month(pickup_ts).alias("pickup_month"),

    # Trip duration in minutes (timestamps are differenced in seconds)
    ((F.unix_timestamp(dropoff_ts) - F.unix_timestamp(pickup_ts)) / 60).alias(
        "trip_duration_minutes"
    ),

    # Location columns. These must exist in the source table (the select
    # fails otherwise); they are kept for inspection only.
    F.col("pickup_zip").cast("string").alias("pickup_zip"),
    F.col("dropoff_zip").cast("string").alias("dropoff_zip"),
).filter(
    # Predicates use the projected column names (`target`, not `fare_amount`)
    # so the filter does not depend on Spark looking up a column that the
    # select above renamed away.
    (F.col("target") > 0)
    & (F.col("target") < MAX_FARE_AMOUNT)
    & (F.col("trip_distance") > 0)
    & (F.col("trip_distance") < MAX_TRIP_DISTANCE)
    & (F.col("trip_duration_minutes") > 0)
    & (F.col("trip_duration_minutes") < MAX_TRIP_DURATION_MINUTES)
)

print(f"After filtering: {df_features.count():,} rows")
display(df_features.limit(5))


In [0]:
# Convert to Pandas for scikit-learn (sample data for faster training)
print("Converting to Pandas DataFrame...")

# Model inputs. Declared before the conversion so that only these columns
# (plus the target) are collected to the driver.
feature_columns = ['trip_distance', 'pickup_hour', 'pickup_day_of_week',
                   'pickup_month', 'trip_duration_minutes']
model_columns = feature_columns + ['target']

# Sample data for faster training (adjust sample_fraction as needed)
sample_fraction = 0.1  # Use 10% of data
df_sample = df_features.sample(fraction=sample_fraction, seed=42)

# toPandas() materialises every selected column on the driver, so project
# down to the model columns first instead of also transferring the unused
# zip-code strings.
pdf = df_sample.select(*model_columns).toPandas()

print(f"Working with {len(pdf):,} rows (sample of {sample_fraction*100}%)")
print(f"Target variable stats:")
print(pdf['target'].describe())

# Drop rows with a missing feature or target value.
pdf_clean = pdf[model_columns].dropna()

# Fail here with a clear message rather than later inside train_test_split.
if pdf_clean.empty:
    raise ValueError(
        "No rows left after sampling and dropping missing values; "
        "increase sample_fraction or check the source table and filters."
    )

X = pdf_clean[feature_columns]
y = pdf_clean['target']

print(f"\nFinal dataset shape: {X.shape}")
print(f"Features: {feature_columns}")
print(f"Target variable range: ${y.min():.2f} - ${y.max():.2f}")


In [0]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
print("Splitting data into train/test sets...")

split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set: {X_train.shape[0]:,} samples")
print(f"Test set: {X_test.shape[0]:,} samples")

# Fit a Random Forest regressor on the training split.
print("\nTraining Random Forest Regressor...")

forest_options = {
    "n_estimators": 100,
    "max_depth": 10,
    "random_state": 42,
    "n_jobs": -1,  # use all available cores
}
# fit() returns the estimator itself, so construction and training can chain.
rf_model = RandomForestRegressor(**forest_options).fit(X_train, y_train)

print("Model training completed!")

# Predict on both splits so train and test performance can be compared.
y_pred_train, y_pred_test = (
    rf_model.predict(split_features) for split_features in (X_train, X_test)
)

print(f"Training predictions range: ${y_pred_train.min():.2f} - ${y_pred_train.max():.2f}")
print(f"Test predictions range: ${y_pred_test.min():.2f} - ${y_pred_test.max():.2f}")


In [0]:
# Score the fitted model on both splits and summarise feature importances.
print("Evaluating model performance...")

# Both splits are scored with the same metrics, in (MSE, MAE, R²) order.
scorers = (mean_squared_error, mean_absolute_error, r2_score)

train_mse, train_mae, train_r2 = (
    scorer(y_train, y_pred_train) for scorer in scorers
)
test_mse, test_mae, test_r2 = (
    scorer(y_test, y_pred_test) for scorer in scorers
)

# Assemble the report line by line and emit it in a single print; the empty
# strings produce the blank separator lines.
report_lines = [
    "Model Performance Metrics:",
    "=" * 50,
    "Training Set:",
    f"  - MSE: {train_mse:.2f}",
    f"  - MAE: ${train_mae:.2f}",
    f"  - R²: {train_r2:.3f}",
    "",
    "Test Set:",
    f"  - MSE: {test_mse:.2f}",
    f"  - MAE: ${test_mae:.2f}",
    f"  - R²: {test_r2:.3f}",
    "",
]
print("\n".join(report_lines))

# Pair each feature with the forest's importance score, most important first.
importance_table = pd.DataFrame(
    {'feature': feature_columns, 'importance': rf_model.feature_importances_}
)
feature_importance = importance_table.sort_values('importance', ascending=False)

print("🔍 Feature Importance:")
print(feature_importance)

# Collected under stable keys so they can be logged to MLflow in one call.
metrics = dict(
    train_mse=train_mse,
    train_mae=train_mae,
    train_r2=train_r2,
    test_mse=test_mse,
    test_mae=test_mae,
    test_r2=test_r2,
)


In [0]:
# Point MLflow's model registry at Unity Catalog and select the experiment
# (taken from the `experiment_name` widget) that will receive the run.
print("Logging model to Unity Catalog...")

mlflow.set_registry_uri(uri="databricks-uc")
mlflow.set_experiment(experiment_name=experiment_name)

In [0]:
# Log parameters, metrics, the model and a feature-importance chart in a
# single MLflow run, registering the model in Unity Catalog as it is logged.

# matplotlib is only needed for the chart, so it is imported in this cell.
import matplotlib.pyplot as plt

# Timestamped run name so repeated executions are easy to tell apart.
run_name = f"taxi_fare_model_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

with mlflow.start_run(run_name=run_name) as run:

    # Hyperparameters are read back from the fitted estimator so the logged
    # values cannot drift from what was actually trained.
    mlflow.log_params({
        "model_type": type(rf_model).__name__,
        "n_estimators": rf_model.n_estimators,
        "max_depth": rf_model.max_depth,
        "sample_fraction": sample_fraction,
        "features": feature_columns,
        "source_table": features_table,
    })

    # Log metrics
    mlflow.log_metrics(metrics)

    # First 5 test rows serve as the model's input example.
    input_example = X_test.iloc[:5]

    # Passing registered_model_name registers the logged model as a new
    # version of `full_model_name`.
    # NOTE(review): newer MLflow releases reportedly prefer `name=` over
    # `artifact_path=` - confirm against the target MLflow version before
    # switching.
    logged_model = mlflow.sklearn.log_model(
        sk_model=rf_model,
        artifact_path="model",
        registered_model_name=full_model_name,
        input_example=input_example,
        signature=mlflow.models.infer_signature(X_train, y_pred_train),
    )

    # Feature-importance chart. log_figure uploads the figure directly, so
    # nothing is written to (or left behind in) the working directory.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.barh(feature_importance['feature'], feature_importance['importance'])
        ax.set_title('Feature Importance')
        ax.set_xlabel('Importance')
        fig.tight_layout()
        mlflow.log_figure(fig, "feature_importance.png")
    finally:
        # Always release the figure, even if logging fails.
        plt.close(fig)

    run_id = run.info.run_id
print(f"Model logged successfully!")
print(f"Run ID: {run_id}")
print(f"Model registered as: {full_model_name}")
print(f"You can view the model in the Databricks UI under Models -> {full_model_name}")

In [0]:
# Point the configured alias at the model version registered by the run above.
registered_version = logged_model.registered_model_version
client.set_registered_model_alias(
    name=full_model_name,
    alias=model_alias,
    version=registered_version,
)