In [2]:
import sagemaker
import boto3
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost
from sklearn.model_selection import train_test_split

# CONFIGURATION
role = get_execution_role()
session = sagemaker.Session()
region = boto3.Session().region_name
bucket = "c732-sfu-parking-data-lake"
prefix = "sagemaker/sfu-parking"

# Input path: processed feature table read by pd.read_parquet below
training_data_s3 = f"s3://{bucket}/processed/training_features/"

print(f"Region: {region}")
print(f"Role: {role}")
print(f"Reading data from: {training_data_s3}")

# Load data
df = pd.read_parquet(training_data_s3)
print(f"Data Loaded. Shape: {df.shape}")

# One-hot encode the categorical identifiers (lot_id / campus are kept as
# features, just in encoded form). dtype=int forces 0/1 dummy columns: newer
# pandas versions default to bool dummies, which to_csv would write as
# True/False into the header-less CSV handed to XGBoost.
df_encoded = pd.get_dummies(df, columns=['lot_id', 'campus'], dtype=int)

# Targets (Y): one label series per model
label_occupancy = df['occupancy_plus_15m']
label_departure = df['departures_in_15m']

# Features (X): everything except the label columns and raw time identifiers.
# Every column that is used as a label MUST be listed here, otherwise the
# label is fed to the model as an input (target leakage).
# NOTE(review): the departure label is read from 'departures_in_15m', while
# 'departures_in_next_15m' is a second spelling kept in the drop list in case
# the table carries it - confirm which name the feature table really uses.
non_feature_columns = [
    'occupancy_plus_15m',
    'departures_in_15m',
    'departures_in_next_15m',
    'timestamp',
    'date',
]
# errors='ignore' because not every listed name is guaranteed to be present.
features = df_encoded.drop(columns=non_feature_columns, errors='ignore')

# Split data (80% training, 20% validation). A single call keeps the feature
# rows and both label series aligned; random_state makes the split repeatable.
X_train, X_val, y_occ_train, y_occ_val, y_dep_train, y_dep_val = train_test_split(
    features, label_occupancy, label_departure, test_size=0.2, random_state=42
)

# Model training using XGBOOST 
# XGBoost in SageMaker expects CSV data in S3 with no headers
# first column = target label, remaining columns = features

def upload_to_s3(x, y, name):
    """Write ``y`` and ``x`` side by side to ``<name>.csv`` and upload it to S3.

    The CSV is written to the current working directory with the label
    column(s) from ``y`` first, followed by the columns of ``x``, without a
    header row and without the index.

    Args:
        x: Feature columns (pandas object accepted by ``pd.concat``).
        y: Label column, placed before the features.
        name: Dataset name; used for the local file name and as the last
            component of the S3 key prefix.

    Returns:
        Whatever ``session.upload_data`` returns for the uploaded file
        (used by the callers as the S3 location of the dataset).
    """
    local_csv = f"{name}.csv"
    s3_key_prefix = f"{prefix}/input/{name}"

    # Label first, then features, joined column-wise
    labelled = pd.concat([y, x], axis=1)
    labelled.to_csv(local_csv, header=False, index=False)

    s3_location = session.upload_data(
        local_csv,
        bucket=bucket,
        key_prefix=s3_key_prefix,
    )
    return s3_location

# Occupancy datasets: training split first, then validation split
train_occ_uri = upload_to_s3(x=X_train, y=y_occ_train, name='train_occupancy')
val_occ_uri = upload_to_s3(x=X_val, y=y_occ_val, name='val_occupancy')

# Departure datasets: same feature splits, paired with the departure labels
train_dep_uri = upload_to_s3(x=X_train, y=y_dep_train, name='train_departure')
val_dep_uri = upload_to_s3(x=X_val, y=y_dep_val, name='val_departure')

print("Data uploaded to S3 for training.")

# Training 2 models
# Look up the XGBoost container image (version 1.5-1) for the current region
xgboost_container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.5-1",
)

# Hyperparameters shared by both models (values are passed as strings)
hyperparams = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.8",
    objective="reg:squarederror",  # both models are trained as regressors
    num_round="100",
)

def _new_xgb_estimator(output_subdir):
    """Build an estimator whose artifacts go to .../output/<output_subdir>."""
    return sagemaker.estimator.Estimator(
        image_uri=xgboost_container,
        role=role,
        instance_count=1,
        instance_type="ml.m5.large",
        output_path=f"s3://{bucket}/{prefix}/output/{output_subdir}",
        sagemaker_session=session,
    )


def _csv_channels(train_uri, val_uri):
    """Return the 'train' / 'validation' CSV input channels for ``fit``."""
    return {
        'train': TrainingInput(train_uri, content_type='csv'),
        'validation': TrainingInput(val_uri, content_type='csv'),
    }


# Model A: occupancy predictor
print("Training Occupancy Model...")
xgb_occ = _new_xgb_estimator("occupancy")
xgb_occ.set_hyperparameters(**hyperparams)
xgb_occ.fit(_csv_channels(train_occ_uri, val_occ_uri))

# Model B: departure predictor (same settings, different labels and output path)
print("Training Departure Model...")
xgb_dep = _new_xgb_estimator("departure")
xgb_dep.set_hyperparameters(**hyperparams)
xgb_dep.fit(_csv_channels(train_dep_uri, val_dep_uri))

# Deploy endpoints
print("Deploying Endpoints...")

# Hosting settings shared by both endpoints
hosting_settings = {
    "initial_instance_count": 1,
    "instance_type": "ml.t2.medium",
}

# Occupancy model endpoint
predictor_occ = xgb_occ.deploy(
    endpoint_name="sfu-occupancy-predictor",
    **hosting_settings,
)

# Departure model endpoint
predictor_dep = xgb_dep.deploy(
    endpoint_name="sfu-departure-predictor",
    **hosting_settings,
)

print("\n--- DEPLOYMENT COMPLETE ---")
print("Endpoint 1: sfu-occupancy-predictor")
print("Endpoint 2: sfu-departure-predictor")

Region: us-west-2
Role: arn:aws:iam::718465053795:role/AmazonSageMakerExecutionRole
Reading data from: s3://c732-sfu-parking-data-lake/processed/training_features/
Data Loaded. Shape: (947898, 11)
Data uploaded to S3 for training.
Training Occupancy Model...


INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-11-20-21-42-19-040


2025-11-20 21:42:22 Starting - Starting the training job...
2025-11-20 21:42:38 Starting - Preparing the instances for training...
2025-11-20 21:42:59 Downloading - Downloading input data...
2025-11-20 21:43:44 Downloading - Downloading the training image......
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-11-20 21:44:48.561 ip-10-0-243-235.us-west-2.compute.internal:6 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-11-20 21:44:48.594 ip-10-0-243-235.us-west-2.compute.internal:6 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-11-20:21:44:49:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-11-20:21:44:49:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2025-11-20:21:44:49:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-11-20:21:44:49:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2025-11-

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-11-20-21-47-01-560


2025-11-20 21:47:06 Starting - Starting the training job...
2025-11-20 21:47:21 Starting - Preparing the instances for training...
2025-11-20 21:47:43 Downloading - Downloading input data...
2025-11-20 21:48:34 Downloading - Downloading the training image......
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-11-20 21:49:37.222 ip-10-0-104-37.us-west-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-11-20 21:49:37.244 ip-10-0-104-37.us-west-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-11-20:21:49:37:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-11-20:21:49:37:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2025-11-20:21:49:37:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-11-20:21:49:37:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2025-11-20

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-11-20-21-51-13-641


Training seconds: 180
Billable seconds: 180
Deploying Endpoints...


INFO:sagemaker:Creating endpoint-config with name sfu-occupancy-predictor
INFO:sagemaker:Creating endpoint with name sfu-occupancy-predictor


------!

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-11-20-21-54-45-872
INFO:sagemaker:Creating endpoint-config with name sfu-departure-predictor
INFO:sagemaker:Creating endpoint with name sfu-departure-predictor


---------!
--- DEPLOYMENT COMPLETE ---
Endpoint 1: sfu-occupancy-predictor
Endpoint 2: sfu-departure-predictor
