In [1]:
import sagemaker
import boto3
from sklearn.model_selection import train_test_split
import pandas as pd
from dotenv import load_dotenv
import os

# Load environment variables from the local .env file
load_dotenv()

# Credentials and the execution role are read from the environment so that
# nothing secret is written into the notebook itself
SAGEMAKER_ACCESS_KEY = os.getenv('SAGEMAKER_ACCESS_KEY')
SAGEMAKER_SECRET_KEY = os.getenv('SAGEMAKER_SECRET_KEY')
SAGEMAKER_ROLE = os.getenv('SAGEMAKER_ROLE')
REGION_NAME = 'us-east-1'

# Create a boto3 session with the provided credentials
boto3_session = boto3.Session(
    aws_access_key_id=SAGEMAKER_ACCESS_KEY,
    aws_secret_access_key=SAGEMAKER_SECRET_KEY,
    region_name=REGION_NAME
)

# Build the low-level SageMaker client FROM the credentialed session, so that it
# talks to the same account/region as `sess` below. A bare boto3.client(...)
# would silently fall back to whatever default credentials are configured.
sm_boto3 = boto3_session.client('sagemaker', region_name=REGION_NAME)

# Create a SageMaker session on top of the same boto3 session
sess = sagemaker.Session(boto_session=boto3_session)

# Define the S3 bucket name
BUCKET = 'sagemakermlops'

# Print the region and bucket name
print(f"Region: {REGION_NAME}")
print(f"Bucket: {BUCKET}")


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/blackitalian/.config/sagemaker/config.yaml
Region: us-east-1
Bucket: sagemakermlops


Your .env file should be configured with the following:

```bash
SAGEMAKER_ACCESS_KEY=YourInfoHere
SAGEMAKER_SECRET_KEY=YourInfoHere
SAGEMAKER_ROLE=YourInfoHere
```

In [2]:
# Read the raw training CSV from the local data/ directory into a DataFrame
df = pd.read_csv('data/train.csv')

In [3]:
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [4]:
df.shape

(2000, 21)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [6]:
# Share of rows in each price_range class (normalize=True returns proportions, not counts)
df['price_range'].value_counts(normalize=True)

price_range
1    0.25
2    0.25
3    0.25
0    0.25
Name: proportion, dtype: float64

In [7]:
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [8]:
df.isnull().mean()*100

battery_power    0.0
blue             0.0
clock_speed      0.0
dual_sim         0.0
fc               0.0
four_g           0.0
int_memory       0.0
m_dep            0.0
mobile_wt        0.0
n_cores          0.0
pc               0.0
px_height        0.0
px_width         0.0
ram              0.0
sc_h             0.0
sc_w             0.0
talk_time        0.0
three_g          0.0
touch_screen     0.0
wifi             0.0
price_range      0.0
dtype: float64

In [9]:
# Start from every column name; the target is separated out afterwards
features = df.columns.tolist()

In [10]:
# The target is the last column of the frame. Both names are derived from
# `df.columns` (instead of popping from `features`) so that re-running this cell
# is harmless: a repeated pop would strip a second column and change the label.
label = df.columns[-1]
features = [col for col in df.columns if col != label]
label

'price_range'

In [11]:
# Separate the predictor columns from the target column
x, y = df[features], df[label]

In [12]:
x.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0


In [13]:
y.head()

0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

In [14]:
x.shape

(2000, 20)

In [15]:
y.value_counts()

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

In [16]:
# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=42,
)

In [17]:
# Report the shape of each split, features first and then targets
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(1700, 20)
(300, 20)
(1700,)
(300,)


In [18]:
# Re-attach the target to each split so every CSV carries features + label.
# `assign` returns a brand-new frame, so X_train / X_test are never modified.
# (pd.DataFrame(X_train) does not copy, which makes a follow-up column
# assignment write into data shared with X_train on some pandas versions.)
# The Series is aligned on the row index, and the label becomes the last column.
trainX = X_train.assign(**{label: y_train})

testX = X_test.assign(**{label: y_test})

In [19]:
trainX.shape, testX.shape

((1700, 21), (300, 21))

In [20]:
trainX.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
581,1512,1,0.5,0,8,1,18,0.1,88,3,...,1079,1897,3607,12,10,6,1,1,1,3
76,1114,0,2.8,0,4,1,9,0.4,197,3,...,1040,1071,907,10,7,17,1,1,0,0
1916,1176,1,2.1,0,2,1,62,0.5,168,2,...,674,1455,1534,6,2,17,1,0,0,1
1414,1550,1,2.7,0,2,0,32,0.1,126,7,...,1541,1619,1571,12,3,14,0,0,0,2
780,1042,0,2.2,0,15,1,11,0.6,139,5,...,68,1018,2826,18,0,2,1,0,0,2


In [21]:
trainX.isnull().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [22]:
testX.isnull().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [23]:
# Persist both splits locally, without the row index, ready for upload
for split_frame, csv_path in (
    (trainX, 'data/train-V-1.csv'),
    (testX, 'data/test-V-1.csv'),
):
    split_frame.to_csv(csv_path, index=False)

In [24]:
# Push both CSVs to S3 under a shared key prefix so SageMaker can read them
sk_prefix = 'sagemaker/mobile_price_classification/sklearncontainer'

# The generator is consumed left to right: train is uploaded first, then test
trainpath, testpath = (
    sess.upload_data(path=local_csv, bucket=BUCKET, key_prefix=sk_prefix)
    for local_csv in ('data/train-V-1.csv', 'data/test-V-1.csv')
)

print(trainpath)
print(testpath)

s3://sagemakermlops/sagemaker/mobile_price_classification/sklearncontainer/train-V-1.csv
s3://sagemakermlops/sagemaker/mobile_price_classification/sklearncontainer/test-V-1.csv


In [25]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import sklearn
import joblib
import argparse
import os
import numpy as np
import pandas as pd 

def model_fn(model_dir):
    """Load and return the classifier stored as ``model.joblib`` in ``model_dir``.

    NOTE(review): presumably called by the SageMaker scikit-learn serving
    container at endpoint start-up - confirm against the container docs.
    """
    return joblib.load(os.path.join(model_dir, 'model.joblib'))

if __name__ == '__main__':
    # Training entry point: parse arguments, load the train/test CSVs, fit a
    # RandomForest, persist it as model.joblib, then report test-set metrics.

    print('[INFO] Extracting arguments')
    arg_parser = argparse.ArgumentParser()

    # Hyperparameters arrive from the client as command-line flags
    arg_parser.add_argument('--n_estimators', type=int, default=100)
    arg_parser.add_argument('--random_state', type=int, default=42)

    # Model output directory and data channels; defaults come from SM_* environment variables
    arg_parser.add_argument('--model-dir', type=str, default=os.environ.get("SM_MODEL_DIR"))
    arg_parser.add_argument('--train', type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    arg_parser.add_argument('--test', type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    arg_parser.add_argument('--train-file', type=str, default="train-V-1.csv")
    arg_parser.add_argument('--test-file', type=str, default="test-V-1.csv")

    # Unrecognised flags are tolerated; only the parsed namespace is kept
    opts = arg_parser.parse_known_args()[0]

    print(f'Sklearn Version: {sklearn.__version__}')
    print(f'Joblib Version: {joblib.__version__}')

    print('[INFO] Reading data')
    print()
    train_csv = os.path.join(opts.train, opts.train_file)
    train_df = pd.read_csv(train_csv)
    test_csv = os.path.join(opts.test, opts.test_file)
    test_df = pd.read_csv(test_csv)

    # The last column of the training file is the target; everything before it is a feature
    features = train_df.columns.tolist()
    label = features.pop()

    print('Building training and testing datasets')
    print()
    X_train, y_train = train_df[features], train_df[label]
    X_test, y_test = test_df[features], test_df[label]

    print('Column order:')
    print(features)
    print()

    print('Data Shape:')
    print()
    print('---- SHAPE OF TRAINING DATA ----')
    print(X_train.shape)
    print(y_train.shape)
    print()
    print('---- SHAPE OF TESTING DATA ----')
    print(X_test.shape)
    print(y_test.shape)
    print()

    print('Training RandomForest Model....')
    print()
    model = RandomForestClassifier(
        n_estimators=opts.n_estimators,
        random_state=opts.random_state,
    )
    model.fit(X_train, y_train)
    print()

    # Persist the model under the file name that model_fn loads
    model_path = os.path.join(opts.model_dir, 'model.joblib')
    joblib.dump(model, model_path)
    print(f'Model persisted at {model_path}')
    print()

    # Score the held-out test set
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)

    print()
    print('---- METRICS RESULTS FOR TESTING DATA ----')
    print()
    print(f'Total Rows are: {X_test.shape[0]}')
    print(f'[TESTING] Model Accuracy is: {test_acc}')
    print(test_rep)


Overwriting script.py


In [26]:
from sagemaker.sklearn.estimator import SKLearn
from datetime import datetime

# Version of the scikit-learn framework container (reused later when building the model)
FRAMEWORK_VERSION = '0.23-1'

# Timestamped S3 location for model artifacts. This is only a placeholder:
# it is NOT passed to the estimator below (see the commented-out argument).
timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
output_path = f's3://your-custom-bucket/path/to/save/model/{timestamp}/'

sklearn_estimator = SKLearn(
    # What to run and with which container
    entry_point='script.py',
    framework_version=FRAMEWORK_VERSION,
    # Forwarded to script.py as --n_estimators / --random_state
    hyperparameters={
        'n_estimators': 100,
        'random_state': 0
    },
    # Who runs it, and through which session
    role=SAGEMAKER_ROLE,
    sagemaker_session=sess,
    # Where it runs
    base_job_name='RF-custom-sklearn',
    instance_type='ml.m5.large',
    instance_count=1,
    # Spot training with run / wait limits
    use_spot_instances=True,
    max_run=3600,
    max_wait=7200
    # output_path=output_path
)

In [27]:
# Launch the training job on the uploaded train/test channels and block until it finishes (wait=True)
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)


INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2024-07-05-00-14-58-864


2024-07-05 00:15:00 Starting - Starting the training job...
2024-07-05 00:15:18 Starting - Preparing the instances for training...
2024-07-05 00:15:46 Downloading - Downloading input data...
2024-07-05 00:16:16 Downloading - Downloading the training image...
2024-07-05 00:16:57 Training - Training image download completed. Training in progress..2024-07-05 00:17:02,461 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-07-05 00:17:02,466 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-07-05 00:17:02,529 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-07-05 00:17:02,752 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-07-05 00:17:02,771 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-07-05 00:17:02,790 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-07-05 00:

## Deploy to a real-time endpoint
### Deploy with Python SDK
An Estimator can be deployed directly after training with `Estimator.deploy()`. Here we instead showcase the more extensive process of creating a model from S3 artifacts, which can also be used to deploy a model that was trained in a different session or even outside of SageMaker.

In [28]:
# Make sure the most recent training job has finished before reading its outputs
latest_job = sklearn_estimator.latest_training_job
latest_job.wait(logs="None")

# Ask SageMaker where the trained model archive was written
job_description = sm_boto3.describe_training_job(TrainingJobName=latest_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print(f'Model artifact persisted at {artifact}')


2024-07-05 00:17:21 Starting - Preparing the instances for training
2024-07-05 00:17:21 Downloading - Downloading the training image
2024-07-05 00:17:21 Training - Training image download completed. Training in progress.
2024-07-05 00:17:21 Uploading - Uploading generated training model
2024-07-05 00:17:21 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-066243913450/RF-custom-sklearn-2024-07-05-00-14-58-864/output/model.tar.gz


In [30]:
artifact

's3://sagemaker-us-east-1-066243913450/RF-custom-sklearn-2024-07-05-00-14-58-864/output/model.tar.gz'

In [31]:
from sagemaker.sklearn.model import SKLearnModel
from time import strftime

# Unique, timestamped name for the SageMaker model resource
model_name = f'Custom-sklearn-model-{strftime("%Y-%m-%d-%H-%M-%S")}'

# Wrap the trained artifact in a deployable model. The credentialed session is
# passed explicitly so that model and endpoint creation use the same account
# and region as the training job, rather than whatever default credentials
# the SDK would otherwise pick up.
model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    role=SAGEMAKER_ROLE,
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    sagemaker_session=sess
)

In [32]:
# Build a unique endpoint name from the current date and time
deploy_stamp = strftime("%Y-%m-%d-%H-%M-%S")
endpoint_name = f'Custom-sklearn-model-{deploy_stamp}'

# Deploy the model to a single-instance real-time endpoint
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type='ml.m5.large',
    initial_instance_count=1
)

print(f"Model deployed at endpoint: {endpoint_name}")


INFO:sagemaker:Creating model with name: Custom-sklearn-model-2024-07-04-20-32-53
INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model-2024-07-04-20-33-45
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model-2024-07-04-20-33-45


------!Model deployed at endpoint: Custom-sklearn-model-2024-07-04-20-33-45


In [40]:
testX[features].head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
1860,1646,0,2.5,0,3,1,25,0.6,200,2,5,211,1608,686,8,6,11,1,1,0
353,1182,0,0.5,0,7,1,8,0.5,138,8,16,275,986,2563,19,17,19,1,0,0
1333,1972,0,2.9,0,9,0,14,0.4,196,7,18,293,952,1316,8,1,8,1,1,0
905,989,1,2.0,0,4,0,17,0.2,166,3,19,256,1394,3892,18,7,19,1,1,0
1289,615,1,0.5,1,7,0,58,0.5,130,5,8,1021,1958,1906,14,5,5,1,0,0


In [37]:
# Send every test row to the endpoint as a nested list of feature values
# (label column excluded) and print the predicted class for each row.
print(predictor.predict(testX[features].values.tolist()))

[0 2 1 3 1 2 2 0 3 1 0 0 2 3 3 2 3 3 1 0 0 1 1 2 0 1 2 2 2 0 0 0 3 0 1 1 2
 0 3 0 2 3 2 0 3 2 1 1 3 1 3 1 0 0 1 1 1 2 0 0 0 3 3 2 0 0 3 3 1 2 2 3 0 1
 3 0 0 3 2 2 3 2 1 0 1 3 2 3 3 0 3 3 2 1 3 2 2 3 1 1 0 0 1 0 0 3 2 0 1 1 0
 0 3 2 2 2 3 3 0 2 1 3 2 1 3 3 0 3 0 2 3 0 2 2 0 3 1 0 0 2 3 0 2 3 0 0 0 1
 1 2 3 1 1 0 2 2 0 1 0 1 2 3 2 3 1 0 0 2 2 3 3 1 1 0 3 1 1 2 1 0 0 0 0 0 3
 2 0 3 0 0 0 0 1 3 3 1 0 1 2 1 1 2 2 2 3 3 1 2 0 0 0 2 1 1 3 1 0 2 1 1 3 1
 2 0 0 2 1 2 0 0 2 0 1 3 2 1 1 3 3 0 1 3 3 3 0 3 1 2 3 3 2 1 1 3 3 1 3 3 3
 3 3 0 1 2 2 2 2 0 2 3 2 2 2 1 0 2 0 2 3 1 3 1 0 3 1 2 0 0 3 0 1 2 3 3 3 1
 1 0 1 3]


### Delete Endpoint

In [38]:
# Delete the endpoint created above, by name, through the SageMaker boto3 client
sm_boto3.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': '59072ced-44dd-41d3-87de-67a27d8db8cc',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '59072ced-44dd-41d3-87de-67a27d8db8cc',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Fri, 05 Jul 2024 00:44:05 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}