## Initial Imports in Sagemaker

In [None]:
import sagemaker
import boto3
from sagemaker import get_execution_role
import pandas as pd

## Creating s3 Bucket, region selection, set IAM role for Sagemaker - AutoPilot

In [None]:
region = boto3.Session().region_name

session = sagemaker.Session()
bucket = session.default_bucket()
prefix = "sagemaker/autopilot-credit_fraud"

role = get_execution_role()

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

Download transactions data from s3 bucket

In [None]:
!apt-get install unzip
!wget -N https://sagemaker-us-east-1-629722484321.s3.amazonaws.com/creditcard.csv.zip
!unzip -o creditcard.csv.zip

local_data_path = "./creditcard.csv"

In [3]:
cc_data_df = pd.read_csv(local_data_path)
cc_data_df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.551600,-0.617801,-0.991390,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.524980,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.119670,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,-1.593105,2.711941,-0.689256,4.626942,-0.924459,1.107641,1.991691,0.510632,-0.682920,1.475829,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,-0.150189,0.915802,1.214756,-0.675143,1.164931,-0.711757,-0.025693,-1.221179,-1.545556,0.059616,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,0.411614,0.063119,-0.183699,-0.510602,1.329284,0.140716,0.313502,0.395652,-0.577252,0.001396,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,-1.933849,-0.962886,-1.042082,0.449624,1.962563,-0.608577,0.509928,1.113981,2.897849,0.127434,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


Splitting the data into training and testing sets using the Class column as the target
AutoPilot will use the training set in creating machine learning models 
We'll use the test set for our best performing models


In [4]:
train_data = cc_data_df.sample(frac=0.8, random_state=200)

test_data = cc_data_df.drop(train_data.index)

test_data_no_target = test_data.drop(columns=["Class"])
test_data_no_target.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
9,9.0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,-0.366846,1.017614,0.83639,1.006844,-0.443523,0.150219,0.739453,-0.54098,0.476677,0.451773,0.203711,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68
10,10.0,1.449044,-1.176339,0.91386,-1.375667,-1.971383,-0.629152,-1.423236,0.048456,-1.720408,1.626659,1.199644,-0.67144,-0.513947,-0.095045,0.23093,0.031967,0.253415,0.854344,-0.221365,-0.387226,-0.009302,0.313894,0.02774,0.500512,0.251367,-0.129478,0.04285,0.016253,7.8
13,11.0,1.069374,0.287722,0.828613,2.71252,-0.178398,0.337544,-0.096717,0.115982,-0.221083,0.46023,-0.773657,0.323387,-0.011076,-0.178485,-0.655564,-0.199925,0.124005,-0.980496,-0.982916,-0.153197,-0.036876,0.074412,-0.071407,0.104744,0.548265,0.104094,0.021491,0.021293,27.5


In [5]:
# Upload training and testing data to Amazon S3 bucket

train_file = "train_data.csv"
train_data.to_csv(train_file, index=False, header=True)
train_data_s3_path = session.upload_data(path=train_file, key_prefix=prefix + "/train")
print("Train data uploaded to: " + train_data_s3_path)

test_file = "test_data.csv"
test_data_no_target.to_csv(test_file, index=False, header=False)
test_data_s3_path = session.upload_data(path=test_file, key_prefix=prefix + "/test")
print("Test data uploaded to: " + test_data_s3_path)

Train data uploaded to: s3://sagemaker-us-east-1-629722484321/sagemaker/autopilot-fraud/train/train_data.csv
Test data uploaded to: s3://sagemaker-us-east-1-629722484321/sagemaker/autopilot-fraud/test/test_data.csv


# SageMaker Configuration for Autopilot job - to create 5 diffrent machine learning models using Area Under Curve as the deciding metric (evaluation)


In [6]:
auto_ml_job_config = {"CompletionCriteria": {"MaxCandidates": 5}}

input_data_config = [
    {
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": "s3://{}/{}/train".format(bucket, prefix),
            }
        },
        "TargetAttributeName": "Class",
    }
]

output_data_config = {"S3OutputPath": "s3://{}/{}/output".format(bucket, prefix)}

auto_ml_job_ojective = {'MetricName': 'AUC'}

# 'create_auto_ml_job' API Launches Autopilot job using the 

In [7]:
from time import gmtime, strftime, sleep

timestamp_suffix = strftime("%d-%H-%M-%S", gmtime())

auto_ml_job_name = "automl-creditcard-" + timestamp_suffix
print("AutoMLJobName: " + auto_ml_job_name)

sm.create_auto_ml_job(
    AutoMLJobName=auto_ml_job_name,
    InputDataConfig=input_data_config,
    OutputDataConfig=output_data_config,
    ProblemType='BinaryClassification',
    AutoMLJobObjective=auto_ml_job_ojective,
    AutoMLJobConfig=auto_ml_job_config,
    RoleArn=role,
)

AutoMLJobName: automl-creditcard-23-01-33-45


{'AutoMLJobArn': 'arn:aws:sagemaker:us-east-1:629722484321:automl-job/automl-creditcard-23-01-33-45',
 'ResponseMetadata': {'RequestId': '85dbbf02-61fd-4078-86b6-96f53337194f',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '85dbbf02-61fd-4078-86b6-96f53337194f',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '100',
   'date': 'Tue, 23 Nov 2021 01:33:46 GMT'},
  'RetryAttempts': 0}}

## Progress of Autopilot Job

In [8]:

print("JobStatus - Secondary Status")
print("------------------------------")


describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
print(describe_response["AutoMLJobStatus"] + " - " + describe_response["AutoMLJobSecondaryStatus"])
job_run_status = describe_response["AutoMLJobStatus"]

while job_run_status not in ("Failed", "Completed", "Stopped"):
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response["AutoMLJobStatus"]

    print(
        describe_response["AutoMLJobStatus"] + " - " + describe_response["AutoMLJobSecondaryStatus"]
    )
    sleep(30)

JobStatus - Secondary Status
------------------------------
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - Featur

In [9]:
# Return the best candidate model using the describe_auto_ml_job API

best_candidate = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)["BestCandidate"]
best_candidate_name = best_candidate["CandidateName"]
print(best_candidate)
print("\n")
print("CandidateName: " + best_candidate_name)
print(
    "FinalAutoMLJobObjectiveMetricName: "
    + best_candidate["FinalAutoMLJobObjectiveMetric"]["MetricName"]
)
print(
    "FinalAutoMLJobObjectiveMetricValue: "
    + str(best_candidate["FinalAutoMLJobObjectiveMetric"]["Value"])
)

{'CandidateName': 'automl-creditcard-23-01-33-45iz4-001-31c6fc7f', 'FinalAutoMLJobObjectiveMetric': {'MetricName': 'validation:auc', 'Value': 0.9926300048828125}, 'ObjectiveStatus': 'Succeeded', 'CandidateSteps': [{'CandidateStepType': 'AWS::SageMaker::ProcessingJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:629722484321:processing-job/automl-creditcard-23-01-33-45-db-1-e258ee4c81aa4260a0f2b30f059d', 'CandidateStepName': 'automl-creditcard-23-01-33-45-db-1-e258ee4c81aa4260a0f2b30f059d'}, {'CandidateStepType': 'AWS::SageMaker::TrainingJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:629722484321:training-job/automl-creditcard-23-01-33-45-dpp4-1-00e4a29c30c14030a21eefb1f4', 'CandidateStepName': 'automl-creditcard-23-01-33-45-dpp4-1-00e4a29c30c14030a21eefb1f4'}, {'CandidateStepType': 'AWS::SageMaker::TransformJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:629722484321:transform-job/automl-creditcard-23-01-33-45-dpp4-rpb-1-6ae138bb0eb4453b860393', 'CandidateStepName':

# Create a machine learning model from the top candidate/model using Amazon Inference Pipelines


In [10]:

model_name = "automl-credit-card-fraud-model-" + timestamp_suffix

model = sm.create_model(
    Containers=best_candidate["InferenceContainers"], ModelName=model_name, ExecutionRoleArn=role
)

print("Model ARN corresponding to the best candidate is : {}".format(model["ModelArn"]))

Model ARN corresponding to the best candidate is : arn:aws:sagemaker:us-east-1:629722484321:model/automl-credit-card-fraud-model-23-01-33-45


# Generate prediction using Sagemacker's batch transform

In [1]:
transform_job_name = "automl-credit-card-fraud-transform-" + timestamp_suffix

transform_input = {
    "DataSource": {"S3DataSource": {"S3DataType": "S3Prefix", "S3Uri": test_data_s3_path}},
    "ContentType": "text/csv",
    "CompressionType": "None",
    "SplitType": "Line",
}

transform_output = {
    "S3OutputPath": "s3://{}/{}/inference-results".format(bucket, prefix),
}

transform_resources = {"InstanceType": "ml.m5.4xlarge", "InstanceCount": 1}

sm.create_transform_job(
    TransformJobName=transform_job_name,
    ModelName=model_name,
    TransformInput=transform_input,
    TransformOutput=transform_output,
    TransformResources=transform_resources,
)

'\ntransform_job_name = "automl-credit-card-fraud-transform-" + timestamp_suffix\n\ntransform_input = {\n    "DataSource": {"S3DataSource": {"S3DataType": "S3Prefix", "S3Uri": test_data_s3_path}},\n    "ContentType": "text/csv",\n    "CompressionType": "None",\n    "SplitType": "Line",\n}\n\ntransform_output = {\n    "S3OutputPath": "s3://{}/{}/inference-results".format(bucket, prefix),\n}\n\ntransform_resources = {"InstanceType": "ml.m5.4xlarge", "InstanceCount": 1}\n\nsm.create_transform_job(\n    TransformJobName=transform_job_name,\n    ModelName=model_name,\n    TransformInput=transform_input,\n    TransformOutput=transform_output,\n    TransformResources=transform_resources,\n)'

{'TransformJobArn': 'arn:aws:sagemaker:us-east-1:629722484321:transform-job/automl-credit-card-fraud-transform-23-01-33-45',

 'ResponseMetadata': {'RequestId': 'de6e6fa9-a718-4509-a99e-27f3a792af84',
 
  'HTTPStatusCode': 200,
  
  'HTTPHeaders': {'x-amzn-requestid': 'de6e6fa9-a718-4509-a99e-27f3a792af84',
  
   'content-type': 'application/x-amz-json-1.1',
   
   'content-length': '123',
   
   'date': 'Sun, 30 Jul 2023 02:20:40 GMT'},
   
  'RetryAttempts': 0}}

# Track batch inference transformation job

In [12]:
print("JobStatus")
print("----------")


describe_response = sm.describe_transform_job(TransformJobName=transform_job_name)
job_run_status = describe_response["TransformJobStatus"]
print(job_run_status)

while job_run_status not in ("Failed", "Completed", "Stopped"):
    describe_response = sm.describe_transform_job(TransformJobName=transform_job_name)
    job_run_status = describe_response["TransformJobStatus"]
    print(job_run_status)
    sleep(30)

JobStatus
----------
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed


# Batch Inference Transformation Job results


In [13]:

s3_output_key = "{}/inference-results/test_data.csv.out".format(prefix)
local_inference_results_path = "inference_results.csv"

s3 = boto3.resource("s3")
inference_results_bucket = s3.Bucket(session.default_bucket())

inference_results_bucket.download_file(s3_output_key, local_inference_results_path)

data = pd.read_csv(local_inference_results_path, sep=";")
pd.set_option("display.max_rows", 10)  # Keep the output on one page
data


Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
56955,0
56956,0
56957,0
56958,0


# Models performance - Evaluation metric

In [15]:
candidates = sm.list_candidates_for_auto_ml_job(
    AutoMLJobName=auto_ml_job_name, SortBy="FinalObjectiveMetricValue"
)["Candidates"]
index = 1
for candidate in candidates:
    print(
        str(index)
        + "  "
        + candidate["CandidateName"]
        + "  "
        + str(candidate["FinalAutoMLJobObjectiveMetric"]["Value"])
    )
    index += 1

1  automl-creditcard-23-01-33-45iz4-001-31c6fc7f  0.9926300048828125
2  automl-creditcard-23-01-33-45iz4-003-93d5c214  0.9921299815177917
3  automl-creditcard-23-01-33-45iz4-002-ff710ea4  0.9918100237846375
4  automl-creditcard-23-01-33-45iz4-004-bf802de4  0.9850599765777588
5  automl-creditcard-23-01-33-45iz4-005-9c1634db  0.98471999168396


In [16]:
# Model notebook - downloading

sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)["AutoMLJobArtifacts"][
    "CandidateDefinitionNotebookLocation"
]

's3://sagemaker-us-east-1-629722484321/sagemaker/autopilot-fraud/output/automl-creditcard-23-01-33-45/sagemaker-automl-candidates/automl-creditcard-23-01-33-45-pr-1-3b032cbe79e64862871eff1bc78d/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb'

In [17]:
# Download Data Exploration Notebook

sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)["AutoMLJobArtifacts"][
    "DataExplorationNotebookLocation"
]

's3://sagemaker-us-east-1-629722484321/sagemaker/autopilot-fraud/output/automl-creditcard-23-01-33-45/sagemaker-automl-candidates/automl-creditcard-23-01-33-45-pr-1-3b032cbe79e64862871eff1bc78d/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb'

# Cleaning up resources after process completion

In [None]:
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket)

job_outputs_prefix = '{}/output/{}'.format(prefix,auto_ml_job_name)
bucket.objects.filter(Prefix=job_outputs_prefix).delete()