In [5]:
import sagemaker
import boto3
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost
from sklearn.model_selection import train_test_split

# Configuration
# `role` is the execution role handed to the training jobs below; `session`
# is the SageMaker SDK session used for S3 uploads and job submission.
role = get_execution_role()
session = sagemaker.Session()
region = boto3.Session().region_name
bucket = "c732-sfu-parking-data-lake"
# Key prefix under which this script stores its training inputs and outputs.
prefix = "sagemaker/sfu-parking"

# input path
training_data_s3 = f"s3://{bucket}/processed/training_features/"

print(f"Region: {region}")
print(f"Role: {role}")
print(f"Reading data from: {training_data_s3}")

# load data
df = pd.read_parquet(training_data_s3)
print(f"Data Loaded. Shape: {df.shape}")

# One-hot encode the categorical identifiers (lot_id, campus) so they can be
# used as numeric features.
# The dtype is pinned explicitly: pandas >= 2.0 defaults the indicator columns
# to bool, which DataFrame.to_csv writes as "True"/"False" instead of 1/0.
# The training CSV is meant to be purely numeric, so keep the pre-2.0 uint8
# representation regardless of the installed pandas version.
# NOTE(review): how the XGBoost container parses "True"/"False" tokens was not
# verified here - confirm if this script ever ran with bool columns.
df_encoded = pd.get_dummies(df, columns=['lot_id', 'campus'], dtype=np.uint8)

# Define features (X) and labels (Y)
# Drop both target columns and the timestamp/date identifiers from X.
# errors='ignore' tolerates any of these columns being absent from the frame.
features = df_encoded.drop(
    ['occupancy_plus_15m', 'departures_in_15m', 'timestamp', 'date'],
    axis=1,
    errors='ignore',
)

# targets (taken from the un-encoded frame; it shares its index with `features`)
label_occupancy = df['occupancy_plus_15m']
label_departure = df['departures_in_15m']

# split Data (80% training, 20% validation)
# Both label series go through the same call so the features and the two
# targets are partitioned into identical row subsets.
# NOTE(review): the split is a random shuffle; if rows are time-ordered a
# chronological split may give a more realistic validation score - confirm.
X_train, X_val, y_occ_train, y_occ_val, y_dep_train, y_dep_val = train_test_split(
    features, label_occupancy, label_departure, test_size=0.2, random_state=42
)

# Model training using XGBOOST 
# XGBoost in SageMaker expects CSV data in S3 with no headers
# first column = target label, remaining columns = features
def upload_to_s3(x, y, name):
    """Write a label-first, headerless CSV and upload it to the project bucket.

    Args:
        x: Feature DataFrame.
        y: Label Series (or single-column frame) sharing ``x``'s index; it
            becomes the first CSV column.
        name: Dataset name. Used as the CSV file stem and as the last
            component of the S3 key prefix (``{prefix}/input/{name}``).

    Returns:
        Whatever ``session.upload_data`` returns for the uploaded file
        (used by the caller as the S3 URI of the training channel).
    """
    # Local imports keep this cell self-contained without touching the
    # file-level import block.
    import os
    import tempfile

    # combine label + features (label must be the first column)
    dataset = pd.concat([y, x], axis=1)

    # Stage the CSV in a temporary directory instead of the working directory
    # so the (potentially large) local copy is removed once the upload is done.
    # The file name is kept as "{name}.csv" so the local basename - and with
    # it the file name under the S3 prefix - is unchanged.
    with tempfile.TemporaryDirectory() as staging_dir:
        local_path = os.path.join(staging_dir, f"{name}.csv")
        dataset.to_csv(local_path, header=False, index=False)
        return session.upload_data(
            local_path, bucket=bucket, key_prefix=f"{prefix}/input/{name}"
        )

# Occupancy target: upload the training split, then the validation split.
train_occ_uri, val_occ_uri = (
    upload_to_s3(X_train, y_occ_train, 'train_occupancy'),
    upload_to_s3(X_val, y_occ_val, 'val_occupancy'),
)

# Departure target: same feature splits, paired with the departure labels.
train_dep_uri, val_dep_uri = (
    upload_to_s3(X_train, y_dep_train, 'train_departure'),
    upload_to_s3(X_val, y_dep_val, 'val_departure'),
)

print("Data uploaded to S3 for training.")

# Training 2 models
# Look up the XGBoost container image (version 1.5-1) for this region;
# both estimators below train with this image.
xgboost_container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.5-1",
)

# Hyperparameters shared by both models; only the objective differs per model.
base_hyperparams = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.8",
    num_round="100",
)

# Occupancy model: squared-error regression (continuous-like target).
hyperparams_occ = {**base_hyperparams, "objective": "reg:squarederror"}

# Departure model: Poisson regression (count target).
hyperparams_dep = {**base_hyperparams, "objective": "count:poisson"}

# Model A - occupancy predictor.
# Single ml.m5.large training instance; output is written under the
# "output/occupancy" prefix of the project bucket.
print("Training Occupancy Model...")
occ_output_path = f"s3://{bucket}/{prefix}/output/occupancy"
xgb_occ = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=occ_output_path,
    sagemaker_session=session,
)
xgb_occ.set_hyperparameters(**hyperparams_occ)
occ_channels = {
    'train': TrainingInput(train_occ_uri, content_type='csv'),
    'validation': TrainingInput(val_occ_uri, content_type='csv'),
}
xgb_occ.fit(occ_channels)

# Model B - departure predictor.
# Same instance configuration as model A, with the Poisson-objective
# hyperparameters and its own output prefix.
print("Training Departure Model...")
dep_output_path = f"s3://{bucket}/{prefix}/output/departure"
xgb_dep = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=dep_output_path,
    sagemaker_session=session,
)
xgb_dep.set_hyperparameters(**hyperparams_dep)
dep_channels = {
    'train': TrainingInput(train_dep_uri, content_type='csv'),
    'validation': TrainingInput(val_dep_uri, content_type='csv'),
}
xgb_dep.fit(dep_channels)

print("Deploying Endpoints...")

# Both endpoints use the same hosting configuration: one ml.t2.medium instance.
hosting_config = {
    "initial_instance_count": 1,
    "instance_type": "ml.t2.medium",
}

# Occupancy model endpoint
predictor_occ = xgb_occ.deploy(
    endpoint_name="sfu-occupancy-predictor",
    **hosting_config,
)

# Departure model endpoint
predictor_dep = xgb_dep.deploy(
    endpoint_name="sfu-departure-predictor",
    **hosting_config,
)

# Summary of the endpoint names created above.
print("\n--- DEPLOYMENT COMPLETE ---")
print("Endpoint 1: sfu-occupancy-predictor")
print("Endpoint 2: sfu-departure-predictor")

Region: us-west-2
Role: arn:aws:iam::718465053795:role/AmazonSageMakerExecutionRole
Reading data from: s3://c732-sfu-parking-data-lake/processed/training_features/
Data Loaded. Shape: (947898, 11)


INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-11-24-22-29-52-432


Data uploaded to S3 for training.
Training Occupancy Model...
2025-11-24 22:29:54 Starting - Starting the training job...
2025-11-24 22:30:10 Starting - Preparing the instances for training...
2025-11-24 22:30:32 Downloading - Downloading input data...
2025-11-24 22:31:17 Downloading - Downloading the training image......
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-11-24 22:32:14.816 ip-10-0-167-33.us-west-2.compute.internal:8 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-11-24 22:32:14.840 ip-10-0-167-33.us-west-2.compute.internal:8 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-11-24:22:32:15:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-11-24:22:32:15:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2025-11-24:22:32:15:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-11-24:22:32:15:INFO] Ru

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-11-24-22-33-39-976


Training seconds: 175
Billable seconds: 175
Training Departure Model...
2025-11-24 22:33:42 Starting - Starting the training job...
2025-11-24 22:33:57 Starting - Preparing the instances for training...
2025-11-24 22:34:18 Downloading - Downloading input data...
2025-11-24 22:35:08 Downloading - Downloading the training image......
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-11-24 22:36:01.089 ip-10-0-155-188.us-west-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-11-24 22:36:01.111 ip-10-0-155-188.us-west-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-11-24:22:36:01:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-11-24:22:36:01:INFO] Failed to parse hyperparameter objective value count:poisson to Json.[0m
[34mReturning the value itself[0m
[34m[2025-11-24:22:36:01:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-11-24:22:36:01

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-11-24-22-37-58-180


Training seconds: 180
Billable seconds: 180
Deploying Endpoints...


INFO:sagemaker:Creating endpoint-config with name sfu-occupancy-predictor
INFO:sagemaker:Creating endpoint with name sfu-occupancy-predictor


----------!

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-11-24-22-43-30-558
INFO:sagemaker:Creating endpoint-config with name sfu-departure-predictor
INFO:sagemaker:Creating endpoint with name sfu-departure-predictor


----------!
--- DEPLOYMENT COMPLETE ---
Endpoint 1: sfu-occupancy-predictor
Endpoint 2: sfu-departure-predictor
