In [28]:
!pip install ucimlrepo
import os
import pandas as pd
import boto3
import sagemaker
import sagemaker.session
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.parameters import ParameterInteger, ParameterString
from ucimlrepo import fetch_ucirepo 


# Retrieves the current AWS region name.
region = boto3.Session().region_name
# Creates a new SageMaker session.
sagemaker_session = sagemaker.session.Session()
# Gets the IAM execution role for SageMaker.
role = sagemaker.get_execution_role()
# Retrieves the default S3 bucket for the SageMaker session.
default_bucket = sagemaker_session.default_bucket()
# Creates a new PipelineSession for working with SageMaker Pipelines.
pipeline_session = PipelineSession()
# Defines a name for a SageMaker Model Package Group, which can be used to group related model versions.
model_package_group_name = "EndEndPackage"





In [22]:
default_bucket

'sagemaker-us-east-1-992382634893'

## Steps ##
- Preprocessing
- Training
- Evaluation
- Condition Evaluation
- Model Registration

In [23]:
# Dowload the dataset - Predicting the age of abalone from physical measurements
# https://archive.ics.uci.edu/dataset/1/abalone/
# # https://archive.ics.uci.edu/static/public/1/abalone.zip

# Prepare the location to store the data
# Ensure the data directory exists
os.makedirs('data', exist_ok=True)
local_path = "data/abalone-dataset.csv"

# fetch dataset from UCI
abalone = fetch_ucirepo(id=1) 
  
# data (as pandas dataframes) 
features = abalone.data.features
# the target are the Abalone Rings
targets = abalone.data.targets 
  
# metadata 
print(abalone.metadata) 
  
# variable information 
print(abalone.variables) 

features.to_csv(local_path, index=False)


# OR Download from your S3 bucket if applicable
#s3 = boto3.resource("s3")
# s3.Bucket(f"sagemaker-servicecatalog-seedcode={region}").download_file(
#    "dataset/abalone-dataset.csv",
#    local_path)


{'uci_id': 1, 'name': 'Abalone', 'repository_url': 'https://archive.ics.uci.edu/dataset/1/abalone', 'data_url': 'https://archive.ics.uci.edu/static/public/1/data.csv', 'abstract': 'Predict the age of abalone from physical measurements', 'area': 'Biology', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Tabular'], 'num_instances': 4177, 'num_features': 8, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': [], 'target_col': ['Rings'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1994, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C55C7W', 'creators': ['Warwick Nash', 'Tracy Sellers', 'Simon Talbot', 'Andrew Cawthorn', 'Wes Ford'], 'intro_paper': None, 'additional_info': {'summary': 'Predicting the age of abalone from physical measurements.  The age of abalone is determined by cutting the shell through the cone, staining it, and counting the number of rings through a microscope -- 

In [24]:
base_uri = f"s3://{default_bucket}/aws-mlops-live"
input_data_uri = sagemaker.s3.S3Uploader.upload(
    local_path = local_path,
    desired_s3_uri = base_uri)
print(input_data_uri)

s3://sagemaker-us-east-1-992382634893/aws-mlops-live/abalone-dataset.csv


## Definition of Pipeline Parameters ##
Default variables for your pipeline
- processing_instance_count 
- input_data
- batch_data
- model_approval_status

In [27]:
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="PendingManualApproval")
input_data = ParameterString(name="InputData", default_value=input_data_uri)
batch_data = ParameterString(name="BatchData", default_value=batch_data_uri)

NameError: name 'batch_data_uri' is not defined

## Defining the Processing Step ##

In [29]:
os.makedirs('abalone', exist_ok=True)
df = pd.read_csv(local_path)
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055


In [30]:
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055
