In [1]:
'''
This notebook demonstrates how to set up model monitoring using SageMaker Model Monitor
Although in real life we would capture real inference data for baseline,
Here we will use that 20% validation data
'''

'\nThis notebook demonstrates how to set up model monitoring using SageMaker Model Monitor. (In a production system you would capture real inference data and set up baselining.)\n'

In [2]:
!pip install awswrangler



In [3]:
import boto3
import time
import datetime
import awswrangler as wr
import pandas as pd
import sagemaker
from sagemaker.model_monitor import DataCaptureConfig, ModelMonitor
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.model_monitor import DataCaptureConfig, ModelMonitor, CronExpressionGenerator
from sagemaker import get_execution_role, image_uris, Session
from sagemaker.s3 import S3Uploader

bucket_name = "arxiv-project-bucket"
role = "arn:aws:iam::221082214706:role/MYLabRole"
region = "us-east-1"

sess = sagemaker.Session(boto_session=boto3.Session(region_name=region))

# Assume the deployed endpoint is  our"arxiv-clustering-endpoint"
endpoint_name = "arxiv-clustering-endpoint"

# Set prefixes for monitoring outputs
prefix = "sagemaker/Custom-ModelMonitor-Example"
data_capture_prefix = f"{prefix}/datacapture"
s3_capture_upload_path = f"s3://{bucket_name}/{data_capture_prefix}"
ground_truth_upload_path = f"s3://{bucket_name}/{prefix}/ground_truth_data/{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}"
reports_prefix = f"{prefix}/reports"
s3_report_path = f"s3://{bucket_name}/{reports_prefix}"

# For monitoring output where our monitoring job will write its reports
monitoring_output_uri = f"s3://{bucket_name}/{prefix}/schedule/output"

print("Data capture path:", s3_capture_upload_path)
print("Monitoring output path:", monitoring_output_uri)



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


Data capture path: s3://arxiv-project-bucket/sagemaker/Custom-ModelMonitor-Example/datacapture
Monitoring output path: s3://arxiv-project-bucket/sagemaker/Custom-ModelMonitor-Example/schedule/output


In [4]:
# Double check to see edits
print(sagemaker.__version__)

2.239.3


In [5]:
!pip install --upgrade pip




In [6]:
!pip install --upgrade sagemaker




In [7]:
print(sagemaker.__version__)

2.239.3


In [8]:
'''
In this cell, we sample a representative subset of the processed training data 
without loading all of it at once. This baseline dataset will be used to compute 
unsupervised quality metrics (basically our silhouette score)
'''

# Define S3 folder where processed training data is stored
train_path = f"s3://{bucket_name}/processed/train/"
# Define baseline dataset output path (NOTE: this is a dataset folder, important since we chunked)
baseline_path = f"s3://{bucket_name}/model-monitoring/baseline/arxiv_baseline.parquet"

# List all Parquet files in the training folder
parquet_files = wr.s3.list_objects(path=train_path)
print("Found the following parquet files:")
print(parquet_files)

Found the following parquet files:
['s3://arxiv-project-bucket/processed/train/arxiv_train_chunk_0.parquet', 's3://arxiv-project-bucket/processed/train/arxiv_train_chunk_0.parquet/e41408732a4345918e42cc0098224655.snappy.parquet', 's3://arxiv-project-bucket/processed/train/arxiv_train_chunk_1.parquet', 's3://arxiv-project-bucket/processed/train/arxiv_train_chunk_1.parquet/32766f83f8ae4c8e8e97920dc5f3896b.snappy.parquet', 's3://arxiv-project-bucket/processed/train/arxiv_train_chunk_10.parquet/1550e13a2c3541569be48303db148a69.snappy.parquet', 's3://arxiv-project-bucket/processed/train/arxiv_train_chunk_11.parquet/7dc98e4f34cb4a0fad60602745bc2fc3.snappy.parquet', 's3://arxiv-project-bucket/processed/train/arxiv_train_chunk_12.parquet/711248b7fc004157ac71d43712d6b108.snappy.parquet', 's3://arxiv-project-bucket/processed/train/arxiv_train_chunk_13.parquet/adf58ace0472479194064be95361104c.snappy.parquet', 's3://arxiv-project-bucket/processed/train/arxiv_train_chunk_14.parquet/f9b5f16acb4d4e74

In [9]:
# Sample from each file (i.e. 1000 rows per file) and then sample down to a target of 10,000 rows
# Probably should reduce to like 100
sample_list = []
target_sample_size = 10000
for key in parquet_files:
    try:
        df_temp = wr.s3.read_parquet(path=key)
    except Exception as e:
        print(f"Error reading {key}: {e}")
        continue
    if len(df_temp) > 0:
        n_rows = min(1000, len(df_temp))
        sample_df = df_temp.sample(n=n_rows, random_state=39)
        sample_list.append(sample_df)
        
if sample_list:
    baseline_df = pd.concat(sample_list, ignore_index=True)
    if len(baseline_df) > target_sample_size:
        baseline_df = baseline_df.sample(n=target_sample_size, random_state=39)
    print("Baseline dataset shape after sampling:", baseline_df.shape)
else:
    raise ValueError("No data samples could be extracted from the processed training files.")

# Need to drop problematic columns ("versions") that cause issues with pyarrow
# Likely important to not include as baseline feature in kmeans cluster
if "versions" in baseline_df.columns:
    baseline_df = baseline_df.drop(columns=["versions"])
    print("Dropped 'versions' column from baseline dataset.")

# Write the baseline dataset to S3 as a dataset (folder of Parquet files)
wr.s3.to_parquet(
    df=baseline_df,
    path=baseline_path,
    dataset=True,
    mode="overwrite",
    compression="snappy"
)
print("Baseline dataset created and uploaded to:", baseline_path)


Baseline dataset shape after sampling: (10000, 61)
Dropped 'versions' column from baseline dataset.


Baseline dataset created and uploaded to: s3://arxiv-project-bucket/model-monitoring/baseline/arxiv_baseline.parquet


In [10]:
# Configure Data Capture for the Endpoint
data_capture_config = DataCaptureConfig(
    enable_capture=True,
    sampling_percentage=100,  # Capture all inference data
    destination_s3_uri=s3_capture_upload_path
)
print("Data capture configuration set.")


Data capture configuration set.


In [11]:
'''
This cell runs a processing job that computes baseline metrics using the custom_baseline.py script.
The custom_baseline.py is expected to read the baseline dataset (folder) and compute metrics (silhouette score).
As a result we need to pass the folder path as input.
'''
baseline_job_name = "arxiv-model-baseline-custom-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")

script_processor = ScriptProcessor(
    image_uri=image_uris.retrieve("sklearn", region, version="0.23-1", py_version="py3"),
    command=["python3"],
    instance_type="ml.m5.xlarge",
    instance_count=1,
    role=role,
    sagemaker_session=sess,
)

script_processor.run(
    inputs=[
        ProcessingInput(
            source=baseline_path,  
            destination="/opt/ml/processing/input_data",
            s3_data_type="S3Prefix"
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name="baseline_output",
            source="/opt/ml/processing/output"
        )
    ],
    code="custom_baseline.py",
    arguments=[
        "--input_data", "/opt/ml/processing/input_data",
        "--output_dir", "/opt/ml/processing/output",
        "--n_clusters", "5"
    ],
    job_name=baseline_job_name,
    wait=True
)
print("Custom baseline processing job completed.")

.............[34mBaseline metrics written to /opt/ml/processing/output/baseline_metrics.json[0m

Custom baseline processing job completed.


In [None]:
# Below few cells are highly experimental and repeated

In [13]:
'''
This is where we run into issues, creating a Model Monitor Schedule to eval endpoint
For our unsupervised problem, our custom baseline metrics (silhouette score) will serve as our baseline
We should also enable cloudwatch for this. 
'''
monitor = ModelMonitor(
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size_in_gb=20,
    max_runtime_in_seconds=1800,
    sagemaker_session=sess
)

monitor_schedule_name = "arxiv-model-monitor-schedule-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")

# Create the schedule
# Important note that we no longer use output_s3_uri, but we use "output" as a dictionary
output_config = {
    "MonitoringOutputs": [
         {
             "S3Output": {
                  "S3Uri": monitoring_output_uri,
                  "LocalPath": "/opt/ml/processing/output",
                  "S3UploadMode": "Continuous"
             }
         }
    ]
}

environment = {} # Unsure if we need environment variables

monitor_schedule = monitor.create_monitoring_schedule(
    monitor_schedule_name=monitor_schedule_name,
    endpoint_input=endpoint_name,
    output=output_config,
    schedule_cron_expression=CronExpressionGenerator.hourly(),
)
print("Monitoring schedule created:", monitor_schedule_name)

In [14]:
# Define a dummy output configuration class
class DummyMonitoringOutputConfig:
    def __init__(self, s3_uri, local_path="/opt/ml/processing/output", s3_upload_mode="Continuous"):
        self.destination = s3_uri
        self.local_path = local_path
        self.s3_upload_mode = s3_upload_mode

    def _to_request_dict(self):
        return {
            "MonitoringOutputs": [
                {
                    "S3Output": {
                        "S3Uri": self.destination,
                        "LocalPath": self.local_path,
                        "S3UploadMode": self.s3_upload_mode
                    }
                }
            ]
        }

# Create an instance of the dummy output configuration
dummy_output_config = DummyMonitoringOutputConfig(monitoring_output_uri)

# Create an EndpointInput object for your endpoint
from sagemaker.model_monitor import EndpointInput

endpoint_input = EndpointInput(
    endpoint_name=endpoint_name,
    destination="/opt/ml/processing/input/endpoint"
)

# Create the monitoring schedule
monitor_schedule = monitor.create_monitoring_schedule(
    monitor_schedule_name=monitor_schedule_name,
    endpoint_input=endpoint_input,
    output=dummy_output_config,
    schedule_cron_expression=CronExpressionGenerator.hourly()
)
print("Monitoring schedule created:", monitor_schedule_name)


In [None]:
# Check Monitoring Execution Status
print("Waiting for the first monitoring execution...")
time.sleep(60)  # Wait one minute

executions = monitor.list_executions()
if executions:
    latest_execution = executions[-1]
    print("Latest monitoring execution details:")
    print(latest_execution.describe())
else:
    print("No monitoring executions found yet.")
