## 1. Set Up Your Environment


In [3]:
import boto3
import sagemaker

# SageMaker context used by the rest of the notebook:
# the AWS region, the execution role, and the session's default S3 bucket.
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.session.Session()
default_bucket = sagemaker_session.default_bucket()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


## 2. Read in a dataset config

You can access and download the template configuration files from the [Gretel MLOps GitHub repository](https://github.com/gretelai/gretel-mlops/tree/main/src/gretel_mlops/aws/sagemaker/configs/).


In [4]:
import requests
import yaml
import sys

# URL of the raw YAML file
config_url = "https://raw.githubusercontent.com/gretelai/gretel-mlops/main/src/gretel_mlops/aws/sagemaker/configs/config_stroke.yaml"

# Download the template config. Fail fast on a stalled connection or an HTTP
# error: without the status check, an error response body would be handed to
# the YAML parser and only surface later as a confusing failure when the
# config is indexed below.
response = requests.get(config_url, timeout=30)
response.raise_for_status()
config = yaml.safe_load(response.text)

# Note uncomment below lines for Gretel Hybrid usage
# config['gretel']['mode'] = 'hybrid'
# config['gretel']['sink_bucket'] = 'gretel-hybrid-sandbox-sink' # your sink bucket name

# Override the template's generate_factor for this run.
config["gretel"]["generate_factor"] = 1.0

# Echo the effective config (keeping the file's key order) so the run is self-documenting.
yaml.dump(config, sys.stdout, default_flow_style=False, sort_keys=False)

dataset:
  name: healthcare-stroke-data
  train_path: s3://gretel-datasets/ml_ops/stroke/train.csv
  validation_path: null
  test_path: null
  target_column: stroke
  drop_columns: id
ML:
  ml_task: classification
  objective: binary:logistic
  objective_type: Maximize
  ml_eval_metric: f1
  ml_metric_threshold: 0.0
gretel:
  strategy: balance
  generate_factor: 1.0
  target_balance: 1.0
  mode: cloud
  sink_bucket: null


## 3. ML pipeline

Install the [gretel-mlops](https://github.com/gretelai/gretel-mlops) library.

In [8]:
!pip install -Uqq git+https://github.com/gretelai/gretel-mlops.git

### 3.1 Initiate the pipeline

In [6]:
from gretel_mlops.aws.sagemaker.pipeline import get_pipeline

# Resource names are derived from the dataset name in the config.
model_package_group_name = f"GretelModelPackageGroup-{config['dataset']['name']}"
pipeline_name = f"GretelPipeline-{config['dataset']['name']}"

# NOTE(review): presumably the name of a stored secret holding the Gretel API
# key, not the key itself — confirm against get_pipeline.
gretel_secret_name = "prod/Gretel/ApiKey"

print(f"Initiating {pipeline_name}")

# Collect the arguments first so they are easy to inspect before building the pipeline.
pipeline_kwargs = {
    "region": region,
    "role": role,
    "default_bucket": default_bucket,
    "model_package_group_name": model_package_group_name,
    "pipeline_name": pipeline_name,
    "gretel_secret": gretel_secret_name,
    "config": config,
}
pipeline = get_pipeline(**pipeline_kwargs)

Initiating GretelPipeline-healthcare-stroke-data
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml




Using provided s3_resource




### 3.2 Start the pipeline

In [None]:
# Register the pipeline definition: creates it, or updates it if it already exists.
pipeline.upsert(role_arn=role)

# Kick off an execution and block until it has completed.
train_execution = pipeline.start()
train_execution.wait()

## 4. Inspect results

In [None]:
import json
from urllib.parse import urlparse

s3_client = boto3.client("s3")

# NOTE(review): index 3 is presumed to be the evaluation step of the pipeline
# built by get_pipeline, and its S3Uri is assumed to be a plain string —
# verify against the pipeline definition.
s3_path_report = f"{pipeline.steps[3].arguments['ProcessingOutputConfig']['Outputs'][0]['S3Output']['S3Uri']}/evaluation.json"

# Split the s3://bucket/key URI once with the standard-library parser instead
# of stripping the scheme by hand (which would also remove any later "s3://"
# occurrence inside the key).
parsed_report_uri = urlparse(s3_path_report)
bucket_name = parsed_report_uri.netloc
file_key = parsed_report_uri.path.lstrip("/")

# Fetch the file from S3. A distinct name is used so the HTTP `response`
# from the config-loading cell is not silently rebound to a different type.
s3_response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
content = s3_response["Body"].read()

# Parse the JSON content
data = json.loads(content)

# Pretty print the JSON data
pretty_json = json.dumps(data, indent=4)
print(pretty_json)

## 5. Clean up the pipeline

In [None]:
# Remove the pipeline created above, identified by its name.
sagemaker_client = boto3.client("sagemaker")
sagemaker_client.delete_pipeline(PipelineName=pipeline_name)