# In [None]:
# This is a conceptual Jupyter Notebook for orchestrating SageMaker Training Jobs
# and deploying test endpoints for your anomaly detection model.
# You would run this on your SageMaker Notebook Instance.

# --- Section 1: Setup ---
import sagemaker
from sagemaker.sklearn.estimator import SKLearn # For scikit-learn training
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer, JSONSerializer
from sagemaker.deserializers import CSVDeserializer, JSONDeserializer
import boto3
import pandas as pd
import os
import json

# Create the SageMaker session, resolve the notebook's execution role, and
# pick the bucket used for data and artifacts (the session default here;
# swap in your specific processed-data bucket if it differs).
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

# S3 locations. Keep these aligned with your Terraform outputs and with the
# outputs of the data processing step.
_s3_root = f"s3://{bucket}"
# Written by feature_engineering.py
processed_features_s3_path = f"{_s3_root}/processed_egress_costs/processed_features/"
# Destination prefix for the trained model artifacts (model.joblib)
model_artifacts_s3_path_prefix = f"{_s3_root}/ml-model-artifacts/anomaly_detection/"

# --- Section 2: Prepare Training Data ---
# In a real scenario, your feature engineering processing job would output
# the 'processed_features.parquet' file.
# For this notebook, we'll simulate or assume this file exists.
# Ensure your processed data is correctly formatted for the training script.

# Example: If you have a local processed_features.parquet for testing
# local_data_path = 'path/to/your/local/processed_features.parquet'
# training_data_s3_uri = sagemaker_session.upload_data(
#     path=local_data_path,
#     bucket=bucket,
#     key_prefix='training_data_for_model'
# )
# print(f"Training data uploaded to: {training_data_s3_uri}")

# In the actual pipeline the training job reads straight from the S3 prefix
# written by the processing job, so that prefix is used as the input channel.
training_data_s3_uri = processed_features_s3_path
print(f"Using training data from: {training_data_s3_uri}")

# --- Section 3: Configure and Run SageMaker Training Job ---
# Build the SKLearn estimator that runs 'training_script.py'.
# 'source_dir' must be the directory holding training_script.py plus any
# supporting files (e.g., requirements.txt for custom libraries); SageMaker
# uploads that directory to S3.
training_hyperparameters = {
    'n_estimators': 100,
    'contamination': 0.01,
    'random_state': 42,
}

sklearn_estimator = SKLearn(
    # What to run
    entry_point='training_script.py',
    source_dir='../',  # Expected to be 'ml_models/anomaly_detection/', home of training_script.py
    framework_version='0.23-1',  # scikit-learn version used in your script
    py_version='py3',  # Python version
    hyperparameters=training_hyperparameters,
    # Where and how to run it
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',  # Matches your Terraform variable for training
    # Job naming and artifact destination
    base_job_name='egress-anomaly-detector-training',
    output_path=model_artifacts_s3_path_prefix,  # Model artifacts are saved under this S3 prefix
)

# Launch the training job, feeding training_data_s3_uri in as the 'train' channel.
print("Starting SageMaker training job...")
sklearn_estimator.fit({'train': training_data_s3_uri})
print("Training job completed.")

# The trained model (model.joblib) is packaged into a model.tar.gz archive at:
# s3://<your-bucket>/ml-model-artifacts/anomaly_detection/egress-anomaly-detector-training-<job-timestamp>/output/model.tar.gz
# You would extract model.joblib from this tar.gz for direct S3 upload or endpoint creation.

# --- Section 4: Deploy Test Endpoint (Optional, for quick validation) ---
# Stand up a temporary endpoint so 'inference_script.py' can be exercised.
# Production deployments should use the SageMaker Model and Endpoint
# resources defined in Terraform instead.
# NOTE: choose a PROJECT_NAME that makes this test endpoint name unique.
project_prefix = os.environ.get('PROJECT_NAME', 'test')
test_endpoint_name = f"{project_prefix}-anomaly-ep-test"

print(f"Deploying test endpoint: {test_endpoint_name}...")
predictor = sklearn_estimator.deploy(
    endpoint_name=test_endpoint_name,
    initial_instance_count=1,
    instance_type='ml.t2.medium',  # Matches your Terraform variable for inference
    # Request/response formats: assumes inference_script.py speaks JSON both ways.
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)
print(f"Test endpoint deployed: {predictor.endpoint_name}")

# --- Section 5: Testing Inference ---
# Prepare sample data for inference. This should mimic the structure of data
# that inference_script.py expects after feature engineering.
# Example: a single data point with numerical features.
# In a real scenario, you'd feed actual recent data.
sample_inference_data = pd.DataFrame({
    'daily_egress_cost_usd': [1500.0],
    'daily_egress_usage_amount': [1500000.0],
    # Add other numerical features that your preprocessor outputs.
    # For categorical features, you'd provide the raw categorical values
    # and expect the preprocessor (if saved and loaded in inference_script)
    # to handle the one-hot encoding.
    # For this simple example, ensure the input matches the numerical features
    # expected by IsolationForest.
})

print("\nSending sample data for inference:")
print(sample_inference_data)

# The predictor was deployed with JSONSerializer, which JSON-encodes whatever
# object it is handed. Passing an already-encoded JSON string would therefore
# be encoded a second time and reach the endpoint as a JSON *string literal*
# instead of an array of records. Send plain Python records instead; the
# to_json/loads round trip guarantees JSON-native types (no numpy scalars).
inference_payload = json.loads(sample_inference_data.to_json(orient='records'))

try:
    predictions = predictor.predict(inference_payload)
except Exception as e:
    # Deliberately broad: this is a best-effort smoke test, and the endpoint
    # cleanup step that follows must still run if the invocation fails.
    print(f"Error during inference: {e}")
else:
    print("\nSample inference result:")
    print(predictions)
    # Because the predictor uses JSONDeserializer, the result is already parsed
    # into Python objects (not a raw JSON string). Assuming inference_script.py
    # returns records, it would look like:
    # [{"daily_egress_cost_usd": 1500.0, ..., "is_anomaly": 1, "anomaly_score": -0.123}]

# --- Section 6: Cleanup (IMPORTANT: Delete test endpoint to avoid charges) ---
# Tear down the test endpoint; capture its name first for the log message.
endpoint_to_delete = predictor.endpoint_name
print(f"\nDeleting test endpoint: {endpoint_to_delete}...")
predictor.delete_endpoint()
print("Test endpoint deleted.")