In [1]:
import boto3
import pandas as pd
import numpy as np
import sagemaker

# Default Sagemaker bucket
session = sagemaker.Session()
default_bucket = session.default_bucket()
print("Default bucket:", default_bucket)

# Upload file from SageMaker Studio's working directory
local_file_path = "sample_dataset.xlsx" 
s3_key = "sample_dataset.xlsx"

s3 = boto3.client('s3')
s3.upload_file(local_file_path, default_bucket, s3_key)

## Preprocess the dataset
def preprocess_data(file_path):  
    df = pd.read_excel(file_path)

    df["firstorder"] = pd.to_datetime(df["firstorder"], errors='coerce')
    df["lastorder"] = pd.to_datetime(df["lastorder"], errors='coerce')
    df = df.dropna()
    df["first_last_days_diff"] = (df['lastorder'] - df['firstorder']).dt.days
    df['created'] = pd.to_datetime(df['created'])
    df['created_first_days_diff'] = (df['created'] - df['firstorder']).dt.days
    df.drop(['custid', 'created', 'firstorder', 'lastorder'], axis=1, inplace=True)
    df = pd.get_dummies(df, prefix=['favday', 'city'], columns=['favday', 'city'])

    return df

## Set the required configurations
model_name = "churn_model"
env = "dev"

storedata = preprocess_data(f"s3://{default_bucket}/{s3_key}")



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Default bucket: sagemaker-us-west-1-277707136136


  warn(msg)


In [2]:
storedata.head()

Unnamed: 0,retained,esent,eopenrate,eclickrate,avgorder,ordfreq,paperless,refill,doorstep,first_last_days_diff,...,favday_Monday,favday_Saturday,favday_Sunday,favday_Thursday,favday_Tuesday,favday_Wednesday,city_BLR,city_BOM,city_DEL,city_MAA
0,0,29,100.0,3.448276,14.52,0.0,0,0,0,0,...,True,False,False,False,False,False,False,False,True,False
1,1,95,92.631579,10.526316,83.69,0.181641,1,1,1,1024,...,False,False,False,False,False,False,False,False,True,False
2,0,0,0.0,0.0,33.58,0.059908,0,0,0,217,...,False,False,False,False,False,True,False,False,True,False
3,0,0,0.0,0.0,54.96,0.0,0,0,0,0,...,False,False,False,True,False,False,False,True,False,False
4,1,30,90.0,13.333333,111.91,0.00885,0,0,0,791,...,True,False,False,False,False,False,False,True,False,False


In [3]:
def split_datasets(df):
    y=df.pop("retained")
    X_pre = df
    y_pre = y.to_numpy().reshape(len(y),1)
    feature_names = list(X_pre.columns)
    X= np.concatenate((y_pre,X_pre),axis=1)
    np.random.shuffle(X)
    train,validation,test=np.split(X,[int(.7*len(X)),int(.85*len(X))])
    return feature_names,train,validation,test

# Split dataset
feature_names,train,validation,test = split_datasets(storedata)

# Save datasets in Amazon S3
pd.DataFrame(train).to_csv(f"s3://{default_bucket}/data/train/train.csv",header=False,index=False)
pd.DataFrame(validation).to_csv(f"s3://{default_bucket}/data/validation/validation.csv",header=False,index=False)
pd.DataFrame(test).to_csv(f"s3://{default_bucket}/data/test/test.csv",header=False,index=False)

In [10]:
from sagemaker.inputs import TrainingInput
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
from sagemaker.tuner import (
    HyperparameterTuner,
    ContinuousParameter,
    IntegerParameter
)

# Training and Validation Input for SageMaker Training job
s3_input_train = TrainingInput(
    s3_data=f"s3://{default_bucket}/data/train/",content_type="csv")
s3_input_validation = TrainingInput(
    s3_data=f"s3://{default_bucket}/data/validation/",content_type="csv")

# Hyperparameter used
fixed_hyperparameters = {
    "eval_metric":"auc",
    "objective":"binary:logistic",
    "num_round":"100",
    "rate_drop":"0.3",
    "tweedie_variance_power":"1.4"
}

# Use the built-in SageMaker algorithm

sess = sagemaker.Session()
container = sagemaker.image_uris.retrieve("xgboost",region,"0.90-2")

estimator = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    hyperparameters=fixed_hyperparameters,
    instance_type="ml.m5.large",
    output_path="s3://{}/output".format(default_bucket),
    sagemaker_session=sagemaker_session
)

hyperparameter_ranges = {
    "eta": ContinuousParameter(0, 1),
    "min_child_weight": ContinuousParameter(1, 10),
    "alpha": ContinuousParameter(0, 2),
    "max_depth": IntegerParameter(1, 10),
}
objective_metric_name = "validation:auc"
tuner = HyperparameterTuner(
estimator, objective_metric_name,
hyperparameter_ranges,max_jobs=10,max_parallel_jobs=2)

# Tune
tuner.fit({
    "train":s3_input_train,
    "validation":s3_input_validation
    },include_cls_metadata=False)

## Explore the best model generated
tuning_job_result = boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
)

job_count = tuning_job_result["TrainingJobStatusCounters"]["Completed"]
print("%d training jobs have completed" %job_count)
## 10 training jobs have completed

## Get the best training job

from pprint import pprint
if tuning_job_result.get("BestTrainingJob",None):
    print("Best Model found so far:")
    pprint(tuning_job_result["BestTrainingJob"])
else:
    print("No training jobs have reported results yet.")

......................................................................................!
10 training jobs have completed
Best Model found so far:
{'CreationTime': datetime.datetime(2025, 3, 31, 11, 17, 47, tzinfo=tzlocal()),
 'FinalHyperParameterTuningJobObjectiveMetric': {'MetricName': 'validation:auc',
                                                 'Value': 0.9830520153045654},
 'ObjectiveStatus': 'Succeeded',
 'TrainingEndTime': datetime.datetime(2025, 3, 31, 11, 18, 52, tzinfo=tzlocal()),
 'TrainingJobArn': 'arn:aws:sagemaker:us-west-1:277707136136:training-job/sagemaker-xgboost-250331-1111-010-fd85eaae',
 'TrainingJobName': 'sagemaker-xgboost-250331-1111-010-fd85eaae',
 'TrainingJobStatus': 'Completed',
 'TrainingStartTime': datetime.datetime(2025, 3, 31, 11, 18, 3, tzinfo=tzlocal()),
 'TunedHyperParameters': {'alpha': '1.659326563588221',
                          'eta': '0.15923534097561928',
                          'max_depth': '5',
                          'min_child_weigh

In [11]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

processing_instance_type = "ml.m5.large"
processing_instance_count = 1

df = pd.read_excel("sample_dataset.xlsx")
df.to_csv("sample_dataset.csv", index=False)

s3.upload_file("sample_dataset.csv", default_bucket, "input/sample_dataset.csv")

input_data = f"s3://{default_bucket}/input/sample_dataset.csv"

# Processing step for feature engineering
sklearn_processor = SKLearnProcessor(
    framework_version="0.23-1",
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    sagemaker_session=sagemaker_session,
    role=role,
)

step_process = ProcessingStep(
    name="ChurnModelProcess",
    processor=sklearn_processor,
    inputs=[
        ProcessingInput(
            source=input_data,
            destination="/opt/ml/processing/input"
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name="train",
            source="/opt/ml/processing/train",
            destination=f"s3://{default_bucket}/output/train"
        ),
        ProcessingOutput(
            output_name="validation",
            source="/opt/ml/processing/validation",
            destination=f"s3://{default_bucket}/output/validation"
        ),
        ProcessingOutput(
            output_name="test",
            source="/opt/ml/processing/test",
            destination=f"s3://{default_bucket}/output/test"
        ),
    ],
    code=f"s3://{default_bucket}/input/code/preprocess.py",
)

  warn(msg)


In [12]:
from sagemaker.estimator import Estimator
from sagemaker.tuner import ContinuousParameter, IntegerParameter

# Training step for generating model artifacts
model_path = f"s3://{default_bucket}/output"

training_instance_type = "ml.t3.medium"

image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.0-1",
    py_version="py3",
    instance_type=training_instance_type,
)

fixed_hyperparameters = {
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "num_round": "100",
    "rate_drop": "0.3",
    "tweedie_variance_power": "1.4"
}

xgb_train = Estimator(
    image_uri=image_uri,
    instance_type=training_instance_type,
    instance_count=1,
    hyperparameters=fixed_hyperparameters,
    output_path=model_path,
    base_job_name="churn-train",
    sagemaker_session=sagemaker_session,
    role=role,
)

hyperparameter_ranges = {
    "eta": ContinuousParameter(0, 1),
    "min_child_weight": ContinuousParameter(1, 10),
    "alpha": ContinuousParameter(0, 2),
    "max_depth": IntegerParameter(1, 10),
}

objective_metric_name = "validation:auc"

In [13]:
from sagemaker.workflow.steps import TuningStep
from sagemaker.inputs import TrainingInput

# Tuning step (connects preprocessing to training & HPO)
step_tuning = TuningStep(
    name="ChurnHyperParameterTuning",
    tuner=HyperparameterTuner(
        estimator=xgb_train,
        objective_metric_name=objective_metric_name,
        hyperparameter_ranges=hyperparameter_ranges,
        max_jobs=2,
        max_parallel_jobs=2
    ),
    inputs={
        "train": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri,
            content_type="text/csv",
        ),
        "validation": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs["validation"].S3Output.S3Uri,
            content_type="text/csv",
        ),
    }
)

In [17]:
from sagemaker.workflow.step_collections import RegisterModel

model_package_group_name = "ChurnModelGroup"

step_register = RegisterModel(
        name="RegisterChurnModelV2",
        estimator=xgb_train,
        model_data=step_tuning.get_top_model_s3_uri(top_k=0,s3_bucket=default_bucket,prefix="output"),
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.m5.large", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name
    )

In [20]:
from sagemaker.clarify import SageMakerClarifyProcessor
from sagemaker.workflow.steps import ProcessingStep

clarify_instance_type = "ml.m5.large"

clarify_processor = SageMakerClarifyProcessor(
    role=role,
    instance_count=1,
    instance_type=clarify_instance_type,
    sagemaker_session=sagemaker_session
)

In [21]:
bias_report_output_path = f's3://{default_bucket}/clarify-output'
clarify_instance_type = "ml.m5.xlarge"

model_name = step_register.properties.ModelPackageArn

In [22]:
from sagemaker.clarify import DataConfig, ModelConfig, ModelPredictedLabelConfig, BiasConfig

data_config = DataConfig(
    s3_data_input_path=f's3://{default_bucket}/output/train/train.csv',
    s3_output_path=bias_report_output_path,
    label=0, 
    headers=[
        'retained','esent','eopenrate','eclickrate','avgorder','ordfreq',
        'paperless','refill','doorstep','first_last_days_diff',
        'created_first_days_diff','favday_Friday','favday_Monday',
        'favday_Saturday','favday_Sunday','favday_Thursday','favday_Tuesday',
        'favday_Wednesday','city_BLR','city_BOM','city_DEL','city_MAA'
    ],
    dataset_type="text/csv",
)

model_config = ModelConfig(
    model_name=model_name,
    instance_type=clarify_instance_type,
    instance_count=1,
    accept_type="text/csv",
)

model_predicted_label_config = ModelPredictedLabelConfig(
    probability_threshold=0.5
)

bias_config = BiasConfig(
    label_values_or_threshold=[1],  
    facet_name="doorstep",         
    facet_values_or_threshold=[0],
)

In [23]:
# Raw values used in the configs
dataset_type = "text/csv"
headers = [
    'retained','esent','eopenrate','eclickrate','avgorder','ordfreq',
    'paperless','refill','doorstep','first_last_days_diff',
    'created_first_days_diff','favday_Friday','favday_Monday',
    'favday_Saturday','favday_Sunday','favday_Thursday','favday_Tuesday',
    'favday_Wednesday','city_BLR','city_BOM','city_DEL','city_MAA'
]
label = 0
output_path = bias_report_output_path
model_name = "churn-model" 
instance_type = clarify_instance_type
probability_threshold = 0.5
facet_name = "doorstep"
facet_values = [0]

In [24]:
analysis_config = {
    "dataset_type": dataset_type,
    "headers": headers,
    "label": label,
    "methods": {
        "shap": {
            "baseline": [0] * len(headers)
        }
    },
    "output_path": output_path,
    "model_config": {
        "model_name": model_name,
        "instance_type": instance_type,
        "instance_count": 1,
        "accept_type": "text/csv",
        "content_type": "text/csv"
    },
    "predicted_label_config": {
        "probability_threshold": probability_threshold
    },
    "bias_config": {
        "label_values_or_threshold": [1],
        "facet": {
            "name": facet_name,
            "value_or_threshold": facet_values
        }
    }
}

In [25]:
import json

with open("analysis_config.json", "w") as f:
    json.dump(analysis_config, f, indent=4)

In [26]:
import boto3

s3 = boto3.client("s3")
s3_key = "clarify-config/analysis_config.json"

s3.upload_file("analysis_config.json", default_bucket, s3_key)

print(f"Uploaded to s3://{default_bucket}/{s3_key}")

Uploaded to s3://sagemaker-us-west-1-277707136136/clarify-config/analysis_config.json


In [27]:
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.processing import ProcessingInput, ProcessingOutput

step_clarify = ProcessingStep(
    name="ClarifyExplainability",
    processor=clarify_processor,
    inputs=[
        ProcessingInput(
            source=f"s3://{default_bucket}/clarify-config/analysis_config.json",
            destination="/opt/ml/processing/input/analysis_config.json"
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name="clarify_output",
            source="/opt/ml/processing/output"
        )
    ],
    depends_on=[step_register.name] 
)

In [28]:
from sagemaker.workflow.properties import PropertyFile

evaluation_report = PropertyFile(
    name="EvaluationReport",
    output_name="evaluation",
    path="evaluation.json"
)

In [29]:
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor

eval_processor = ScriptProcessor(
    image_uri=image_uri,  # same as used in training
    command=["python3"],
    instance_type=clarify_instance_type,  # or your defined instance type
    instance_count=1,
    role=role,
    sagemaker_session=sagemaker_session
)

step_eval = ProcessingStep(
    name="EvaluateModel",
    processor=eval_processor,
    inputs=[
        ProcessingInput(
            source=step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=default_bucket),
            destination="/opt/ml/processing/model"
        ),
        ProcessingInput(
            source=step_process.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
            destination="/opt/ml/processing/test"
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name="evaluation",
            source="/opt/ml/processing/evaluation"
        )
    ],
    code=f"s3://{default_bucket}/input/code/evaluate.py",
    property_files=[evaluation_report]
)

In [34]:
from sagemaker.workflow.model_step import CreateModelStep
from sagemaker.model import Model

# Define the model object
xgb_model = Model(
    image_uri=image_uri,
    model_data=step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=default_bucket, prefix="output"),
    sagemaker_session=sagemaker_session,
    role=role
)

# Create the pipeline step
step_create_model = CreateModelStep(
    name="CreateChurnModel",
    model=xgb_model,
)

In [35]:
from sagemaker.workflow.pipeline import Pipeline

pipeline = Pipeline(
    name="churn-pipeline",
    steps=[
        step_process,
        step_tuning,
        step_eval,
        step_register,
        step_create_model,
        step_clarify
    ],
    sagemaker_session=sagemaker_session
)


In [36]:
pipeline.upsert(role_arn=role)

{'PipelineArn': 'arn:aws:sagemaker:us-west-1:277707136136:pipeline/churn-pipeline',
 'ResponseMetadata': {'RequestId': '121f583e-a72b-48e1-b512-2489a76165f3',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '121f583e-a72b-48e1-b512-2489a76165f3',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '82',
   'date': 'Mon, 31 Mar 2025 11:23:23 GMT'},
  'RetryAttempts': 0}}

In [37]:
execution = pipeline.start()

In [38]:
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-west-1:277707136136:pipeline/churn-pipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-west-1:277707136136:pipeline/churn-pipeline/execution/d73henwir8h0',
 'PipelineExecutionDisplayName': 'execution-1743420208695',
 'PipelineExecutionStatus': 'Executing',
 'PipelineExperimentConfig': {'ExperimentName': 'churn-pipeline',
  'TrialName': 'd73henwir8h0'},
 'CreationTime': datetime.datetime(2025, 3, 31, 11, 23, 28, 602000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2025, 3, 31, 11, 23, 28, 602000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-west-1:277707136136:user-profile/d-4kv8wvmg9ve4/ElenaM',
  'UserProfileName': 'ElenaM',
  'DomainId': 'd-4kv8wvmg9ve4',
  'IamIdentity': {'Arn': 'arn:aws:sts::277707136136:assumed-role/AmazonSageMaker-ExecutionRole-20250328T135409/SageMaker',
   'PrincipalId': 'AROAUBKFCTSEPMHBFANAR:SageMaker'}},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-west-1:277