### Setting up the notebook session

In [None]:
import boto3
import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost

# Shared session, execution role and endpoint name used by the cells below.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()  # Use the appropriate IAM role
endpoint_name = "lab-sagemaker-endpoint"

# Pick the first bucket whose name carries the lab prefix.
# bucket_name is None when no such bucket is returned by list_buckets().
s3_buckets = boto3.client("s3").list_buckets()["Buckets"]
bucket_name = next(
    filter(
        lambda name: name.startswith("lab-sagemaker-"),
        (entry["Name"] for entry in s3_buckets),
    ),
    None,
)


### Sending Normal Data to the Endpoint

In [None]:
from sklearn.datasets import make_classification
import pandas as pd

# Synthetic "normal" traffic: 20 samples, 10 features (8 informative,
# 2 redundant), binary target, fixed seed for reproducibility.
# NOTE(review): the original comments state these settings mirror the
# training data; the training code is not visible here - confirm.
X_normal, y_normal = make_classification(
    n_samples=20,
    n_features=10,
    n_informative=8,
    n_redundant=2,
    n_classes=2,
    random_state=42,
)

# Wrap the features in a DataFrame (feature_1 .. feature_10) and append the
# labels as a trailing 'target' column.
df_normal = pd.DataFrame(
    X_normal,
    columns=[f'feature_{idx}' for idx in range(1, 11)],
).assign(target=y_normal)

# Keep a copy on disk (no index column).
df_normal.to_csv('synthetic_normal_data.csv', index=False)

In [None]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# Feature rows only (the 'target' column is dropped), as a list of lists so
# each row can be serialized to one CSV line.
input_data = df_normal.drop(columns=['target']).values.tolist()

# Client for the deployed endpoint: requests are sent as CSV and responses
# are parsed as JSON. Reuses the session created in the setup cell.
predictor = Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer(),  # fixed typo: was `JSOeDeserializer` (NameError)
)

# Send each row to the endpoint individually and show the prediction.
for row in input_data:
    response = predictor.predict(row)
    print(response)

# Set up model monitoring

The baseline processing job typically takes several minutes to complete; the cell below waits (`wait=True`) until it has finished.

In [None]:
from sagemaker.model_monitor import DefaultModelMonitor

# Data-quality monitor: one ml.m5.large instance with a 20 GB volume and a
# one-hour runtime cap, using the role and session from the setup cell.
monitor = DefaultModelMonitor(
    role=role,
    sagemaker_session=sagemaker_session,
    instance_type="ml.m5.large",
    instance_count=1,
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600,
)

s3_key = "data-capture"

# NOTE(review): the baseline is computed from the "data-capture" prefix and
# parsed as CSV with a header row - confirm that prefix really holds a
# header-bearing CSV dataset rather than raw endpoint capture files.
baseline_dataset_uri = f"s3://{bucket_name}/{s3_key}"
baseline_output_uri = f"s3://{bucket_name}/baseline_output"

# Ask the monitor to suggest baseline statistics and constraints.
baseline_job = monitor.suggest_baseline(
    baseline_dataset=baseline_dataset_uri,
    dataset_format={"csv": {"header": True}},
    output_s3_uri=baseline_output_uri,
    wait=True,
)

In [None]:
from sagemaker.model_monitor import CronExpressionGenerator

endpoint_name = "lab-sagemaker-endpoint"

# Hourly monitoring schedule for the endpoint, checked against the statistics
# and constraints produced by the baseline job; results go under
# monitoring_output/ in the lab bucket and CloudWatch metrics are enabled.
schedule_settings = {
    "endpoint_input": endpoint_name,
    "output_s3_uri": f"s3://{bucket_name}/monitoring_output",
    "statistics": baseline_job.baseline_statistics(),
    "constraints": baseline_job.suggested_constraints(),
    "schedule_cron_expression": CronExpressionGenerator.hourly(),
    "enable_cloudwatch_metrics": True,
}
monitor.create_monitoring_schedule(**schedule_settings)


# Simulating Data Drift and Quality Issues

In [None]:
from sklearn.datasets import make_classification
import pandas as pd

# Column names for the ten features. Defined here because `feature_names`
# was referenced below without ever being assigned (NameError); the naming
# matches the feature_1 .. feature_10 columns used for the normal data and
# by the cell that injects missing values and outliers.
feature_names = [f'feature_{i}' for i in range(1, 11)]

# Generate new synthetic data with drift
X_drifted, y_drifted = make_classification(
    n_samples=1000,
    n_features=len(feature_names),
    n_informative=4,    # Reduced informative features
    n_redundant=6,      # Increased redundant features
    n_classes=2,
    random_state=99     # Different seed for variation
)

# Create DataFrame
df_drifted = pd.DataFrame(X_drifted, columns=feature_names)
df_drifted['target'] = y_drifted

# Save to CSV. This snapshot is written before the following cell injects
# missing values and outliers, so the file holds the drifted data only.
df_drifted.to_csv('synthetic_drifted_data.csv', index=False)


In [None]:
import numpy as np

# Inject data-quality problems into df_drifted in place. The samples are
# seeded so the same rows are affected on every fresh run (previously the
# rows changed each time the notebook was executed).
# Caution: this cell mutates df_drifted, so re-running it without re-creating
# the frame scales the outlier rows by 10 again.

# Introduce missing values in 10% of the rows of feature_1
missing_rows = df_drifted.sample(frac=0.1, random_state=0).index
df_drifted.loc[missing_rows, 'feature_1'] = np.nan

# Introduce outliers by scaling 5% of the rows of feature_2 by 10
outlier_rows = df_drifted.sample(frac=0.05, random_state=1).index
df_drifted.loc[outlier_rows, 'feature_2'] *= 10

# Sending new data to the endpoint

In [None]:
import json

# Prepare data for inference: feature columns only, as a 2D array
inference_data = df_drifted.drop(columns=['target']).values

# Low-level runtime client used to invoke the endpoint directly
runtime = boto3.client('sagemaker-runtime')

# Send one CSV-encoded row per request. The endpoint is addressed through
# `endpoint_name` from the setup cell rather than `predictor.endpoint_name`,
# so this cell does not depend on a predictor object having been created.
for row in inference_data:
    payload = ','.join(map(str, row))
    response = runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='text/csv',
        Body=payload
    )
    result = response['Body'].read()
    # Optionally, process the result


In [None]:
# Build the endpoint client again: a new SageMaker session, CSV for the
# request payload and JSON for the response.
predictor_options = {
    "endpoint_name": endpoint_name,
    "sagemaker_session": sagemaker.Session(),
    "serializer": CSVSerializer(),
    "deserializer": JSONDeserializer(),
}
predictor = Predictor(**predictor_options)