In [None]:
# Enable data capture on the existing endpoint by registering a new endpoint
# config that carries a DataCaptureConfig, then switching the endpoint over to it.

# Captured payloads are written under this prefix in `bucket`.
capture_s3_prefix = f"{auto_ml_job_name}/datacapture"
capture_destination_uri = f"s3://{bucket}/{capture_s3_prefix}/"
data_capture_config = dict(
    EnableCapture=True,
    InitialSamplingPercentage=100,  # sample every request
    DestinationS3Uri=capture_destination_uri,
    # Record both the request body and the model response.
    CaptureOptions=[{"CaptureMode": mode} for mode in ("Input", "Output")],
    CaptureContentTypeHeader={
        "CsvContentTypes": ["text/csv"],
        "JsonContentTypes": ["application/json"],
    },
)

# New endpoint config: a single "AllTraffic" variant serving `model_name`,
# plus the capture settings built above.
endpoint_config_name_v2 = f"{auto_ml_job_name}-config-capture"
capture_variant = {
    "VariantName": "AllTraffic",
    "ModelName": model_name,
    "InstanceType": instance_type,
    "InitialInstanceCount": 1,
}
sm.create_endpoint_config(
    EndpointConfigName=endpoint_config_name_v2,
    ProductionVariants=[capture_variant],
    DataCaptureConfig=data_capture_config,
)

# Point the endpoint at the capture-enabled config and block until the waiter returns.
sm.update_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name_v2,
)
waiter.wait(EndpointName=endpoint_name)
print(f"Data capture enabled for endpoint: {endpoint_name}")

# Send one more CSV request to the endpoint so that a data-capture record is generated.
# Nothing is sent when `payload` is empty/falsy.
if payload:
    _ = runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="text/csv",
        Body=payload,
    )
    print("Invoked endpoint to generate data capture.")

# Set up a Model Monitor baseline using the training dataset
from sagemaker import get_execution_role
from sagemaker.model_monitor import DefaultModelMonitor, DatasetFormat

# The monitor must be given an IAM *role* ARN. sts.get_caller_identity()["Arn"]
# returns the caller's identity ARN (an assumed-role session ARN or a user ARN),
# which is not a valid role ARN for the jobs the monitor launches, so resolve the
# execution role through the SageMaker SDK instead.
my_monitor = DefaultModelMonitor(
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.xlarge",  # instance for monitoring jobs
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600,
)

# Baseline artifacts (statistics and constraints) are written under this S3 URI.
baseline_results_uri = f"s3://{bucket}/{auto_ml_job_name}/baseline-results"

# Runs a processing job that computes data statistics and suggested constraints;
# wait=True blocks until that job finishes. The training CSV is read with a header row.
my_monitor.suggest_baseline(
    baseline_dataset=s3_train_path,
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri=baseline_results_uri,
    wait=True,
)
print("Baseline constraints and statistics generated.")

# Schedule a daily data-quality monitoring job against the endpoint
from sagemaker.model_monitor import CronExpressionGenerator, MonitoringDatasetFormat, EndpointInput

monitoring_schedule_name = f"{auto_ml_job_name}-dataquality-schedule"
my_monitor.create_monitoring_schedule(
    monitor_schedule_name=monitoring_schedule_name,
    # Captured endpoint data is mounted at this path inside the monitoring container.
    endpoint_input=EndpointInput(
        endpoint_name=endpoint_name,
        destination="/opt/ml/processing/input/endpoint",
    ),
    output_s3_uri=f"s3://{bucket}/{auto_ml_job_name}/monitoring-output",
    # Compare captured traffic against the baseline produced by suggest_baseline().
    statistics=my_monitor.baseline_statistics(),
    constraints=my_monitor.suggested_constraints(),
    schedule_cron_expression=CronExpressionGenerator.daily(),  # run daily (you can use .hourly() for hourly)
    enable_cloudwatch_metrics=True,
)

# describe_schedule() returns the DescribeMonitoringSchedule response. The cron
# expression is nested under "MonitoringScheduleConfig" -> "ScheduleConfig";
# there is no top-level "ScheduleConfig" key, so reading it directly raises KeyError.
desc = my_monitor.describe_schedule()
schedule_expression = desc["MonitoringScheduleConfig"]["ScheduleConfig"]["ScheduleExpression"]
print(f"Monitoring schedule status: {desc['MonitoringScheduleStatus']}")
print(f"Monitoring schedule configured to run at: {schedule_expression}")
