## Sagemaker - Update Recommendation Model

In [45]:
import sklearn # Check Sklearn version
sklearn.__version__

'1.5.2'

## 1. Initialize Boto3 SDK and create S3 bucket. 

In [46]:
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
import datetime
import time
import tarfile
import boto3
import pandas as pd
from io import BytesIO

sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name

s3_bucket = "fiton-static-files"
s3_client = boto3.client('s3')
# s3_key_meta = 'exercise/exercise_data.csv'


In [47]:
# def get_last_train_timestamp(s3_bucket, s3_key):
#     try:
#         print("FETCHING LAST TRAIN TIMESTAMP .... ")
#         response = s3_client.get_object(Bucket=s3_bucket, Key=s3_key)
#         last_modified = response['Body'].read()
#         return last_modified.decode("utf-8")
#     except Exception as e:
#         print(f"Error retrieving last dump timestamp: {e}")
#         return '2024-05-12 17:40:40'  # Default timestamp if not found

In [48]:
def read_csv_from_s3(bucket_name, file_key):
    response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
    csv_content = response['Body'].read()
    df = pd.read_csv(BytesIO(csv_content))
    return df

## 3. Data Exploration and Understanding.

In [49]:
# get_last_train_timestamp(s3_bucket, "last_modified.txt")

In [50]:
# s3_key = "last_modified.txt"
s3_key = "exercise/exercise_data.csv"
print(s3_key)
df = read_csv_from_s3(s3_bucket, s3_key)

exercise/exercise_data.csv


In [51]:
df.head(10)

Unnamed: 0,email,gender,weight,height,heart_rate,steps,exercise_id,timestamp
0,sg8002@nyu.edu,M,23,183,86,21609,222,1900-11-02
1,sg8002@nyu.edu,M,23,183,88,4986,263,1900-11-06
2,sg8002@nyu.edu,M,23,183,81,2955,12,1900-11-07
3,sg8002@nyu.edu,M,23,183,111,6847,547,1900-11-08
4,sg8002@nyu.edu,M,23,183,75,4083,534,1900-11-11
5,rohanchopra96@yahoo.com,M,27,180,101,29756,572,1900-10-26
6,rohanchopra96@yahoo.com,M,27,180,90,1324,534,1900-10-28
7,rohanchopra96@yahoo.com,M,27,180,90,6114,73,1900-10-29
8,rohanchopra96@yahoo.com,M,27,180,71,8000,531,1900-11-04
9,rohanchopra96@yahoo.com,M,27,180,100,7000,249,1900-11-06


In [52]:
df.shape

(12, 8)

In [53]:
df.columns

Index(['email', 'gender', 'weight', 'height', 'heart_rate', 'steps',
       'exercise_id', 'timestamp'],
      dtype='object')

In [54]:
# ['Low_Risk','High_Risk'],[0,1]
df = df.drop(['email', 'timestamp'], axis=1)
df = pd.get_dummies(df)

In [55]:
df.columns

Index(['weight', 'height', 'heart_rate', 'steps', 'exercise_id', 'gender_M'], dtype='object')

In [56]:
df.shape

(12, 6)

In [57]:
# Find the Percentage of Values are missing
df.isnull().mean() * 100

weight         0.0
height         0.0
heart_rate     0.0
steps          0.0
exercise_id    0.0
gender_M       0.0
dtype: float64

In [58]:
features = list(df.columns)
features

['weight', 'height', 'heart_rate', 'steps', 'exercise_id', 'gender_M']

In [59]:
label = 'exercise_id'
features.remove(label)
label

'exercise_id'

In [60]:
x = df[features]
y = df[label]

In [61]:
x.head()

Unnamed: 0,weight,height,heart_rate,steps,gender_M
0,23,183,86,21609,True
1,23,183,88,4986,True
2,23,183,81,2955,True
3,23,183,111,6847,True
4,23,183,75,4083,True


In [62]:
# {0: 'Low_Risk',1: 'High_Risk'}
y.head()

0    222
1    263
2     12
3    547
4    534
Name: exercise_id, dtype: int64

In [63]:
x.shape

(12, 5)

In [64]:
y.value_counts()

exercise_id
534    2
222    1
263    1
12     1
547    1
572    1
73     1
531    1
249    1
823    1
310    1
Name: count, dtype: int64

In [65]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.25, random_state=42, shuffle = True)

In [66]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(9, 5)
(3, 5)
(9,)
(3,)


## 4. Split the data into Train/Test CSV File. 

In [67]:
trainX = pd.DataFrame(X_train)
trainX[label] = y_train

testX = pd.DataFrame(X_test)
testX[label] = y_test

In [68]:
print(trainX.shape)
print(testX.shape)

(9, 6)
(3, 6)


In [69]:
trainX.head()

Unnamed: 0,weight,height,heart_rate,steps,gender_M,exercise_id
8,27,180,71,8000,True,531
5,27,180,101,29756,True,572
2,23,183,81,2955,True,12
1,23,183,88,4986,True,263
11,25,175,20000,90,True,310


In [70]:
trainX.isnull().sum()

weight         0
height         0
heart_rate     0
steps          0
gender_M       0
exercise_id    0
dtype: int64

In [71]:
testX.isnull().sum()

weight         0
height         0
heart_rate     0
steps          0
gender_M       0
exercise_id    0
dtype: int64

## 5. Upload data into the S3 Bucket.

In [72]:
trainX.to_csv("train-V-1.csv",index = False)
testX.to_csv("test-V-1.csv", index = False)

In [73]:
# send data to S3. SageMaker will take training data from s3
sk_prefix = "sagemaker/exercise-data/sklearncontainer"
trainpath = sess.upload_data(
    path="train-V-1.csv", bucket = s3_bucket, key_prefix=sk_prefix
)

testpath = sess.upload_data(
    path="test-V-1.csv", bucket = s3_bucket, key_prefix=sk_prefix
)

In [74]:
testpath

's3://fiton-static-files/sagemaker/exercise-data/sklearncontainer/test-V-1.csv'

In [75]:
trainpath

's3://fiton-static-files/sagemaker/exercise-data/sklearncontainer/train-V-1.csv'

## 6. Create Training Script

In [84]:
%%writefile exercise_script.py


from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import validation_curve, train_test_split

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd
import boto3

s3 = boto3.client("s3")
train_file = 'train-V-1.csv'
train_path = os.path.join("./", train_file)
s3.download_file("fiton-static-files", "sagemaker/exercise-data/sklearncontainer/" + train_file, train_path)
train_df = pd.read_csv(train_path)
y_train = train_df["exercise_id"]

# inference functions ---------------

def input_fn(request_body, request_content_type):
    print(request_body)
    print(request_content_type)
    if request_content_type == "text/csv":
        request_body = request_body.strip()
        try:
            df = pd.read_csv(StringIO(request_body), header=None)
            return df
        
        except Exception as e:
            print(e)
    else:
        return """Please use Content-Type = 'text/csv' and, send the request!!""" 
 
    
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf

def predict_fn(input_data, model):
    if type(input_data) != str:
        distances, indices = model.kneighbors(input_data)
        y_pred_test = []
        for n_idx in indices:
            y_pred_test.append(y_train[n_idx])
        print(y_pred_test)
        return y_pred_test
    else:
        return input_data
        
    
if __name__ == "__main__":

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")
    parser.add_argument("--s3-bucket", type=str, default="fiton-static-files")
    parser.add_argument("--s3-data-key", type=str, default="sagemaker/exercise-data/sklearncontainer/")
    parser.add_argument("--s3-model-key", type=str, default="sagemaker/exercise-models/")

    args, _ = parser.parse_known_args()
    
    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    
    train_path = os.path.join(args.train, args.train_file)
    test_path = os.path.join(args.test, args.test_file)
    s3.download_file(args.s3_bucket, args.s3_data_key + args.train_file, train_path)
    s3.download_file(args.s3_bucket, args.s3_data_key + args.test_file, test_path)
    
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    
    print(train_df.head())
    print(test_df.head())
    
    features = list(train_df.columns)
    label = features.pop(-1)
    
    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()
    
    print("Label column is: ",label)
    print()
    
    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()
    
    
    max_k = 4
    n_fold = 10
    target = 0.1

    model = KNeighborsRegressor(max_k)
#     model = joblib.load(model_path)
    model.fit(X_train, y_train)
    

    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model,model_path)
    s3.upload_file(model_path, args.s3_bucket, args.s3_model_key + "model.joblib")
    print("Model persisted at " + args.s3_bucket + args.s3_model_key + "model.joblib")
    print()

    
    distances, indices = model.kneighbors(X_test)
    y_pred_test = []
    for n_idx in indices:
        y_pred_test.append(y_train[n_idx])
    

#     print()
#     print("---- METRICS RESULTS FOR TESTING DATA ----")
#     print()
#     print("Total Rows are: ", X_test.shape[0])
#     print('[TESTING] Model Accuracy is: ', test_acc)
#     print('[TESTING] Testing Report: ')
#     print(test_rep)

Overwriting exercise_script.py


In [85]:
! python exercise_script.py --n_estimators 100 \
                   --random_state 42 \
                   --model-dir ./ \
                   --train ./ \
                   --test ./ \

[INFO] Extracting arguments
SKLearn Version:  1.5.2
Joblib Version:  1.4.2
[INFO] Reading data

   weight  height  heart_rate  steps  gender_M  exercise_id
0      27     180          71   8000      True          531
1      27     180         101  29756      True          572
2      23     183          81   2955      True           12
3      23     183          88   4986      True          263
4      25     175       20000     90      True          310
   weight  height  heart_rate  steps  gender_M  exercise_id
0      27     180          40   9000      True          823
1      27     180         100   7000      True          249
2      23     183          86  21609      True          222
Building training and testing datasets

Column order: 
['weight', 'height', 'heart_rate', 'steps', 'gender_M']

Label column is:  exercise_id

Data Shape: 

---- SHAPE OF TRAINING DATA (85%) ----
(9, 5)
(9,)

---- SHAPE OF TESTING DATA (15%) ----
(3, 5)
(3,)

Model persisted at fiton-static-filessagemak

## 7. Train script in-side Sagemaker container.

In [86]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "1.2-1"

sklearn_estimator = SKLearn(
    entry_point="exercise_script.py",
    role="arn:aws:iam::225989339915:role/aws-elasticbeanstalk-service-role",
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="Exercise-Model",
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0,
    },
    use_spot_instances = True,
    max_wait = 7200,
    max_run = 3600
)

In [87]:
# launch training job, with asynchronous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)
# sklearn_estimator.fit({"train": datapath}, wait=True)

INFO:sagemaker:Creating training-job with name: Exercise-Model-2024-11-29-00-01-52-225


2024-11-29 00:01:59 Starting - Starting the training job...
2024-11-29 00:02:13 Starting - Preparing the instances for training...
2024-11-29 00:03:00 Downloading - Downloading the training image......
2024-11-29 00:03:56 Training - Training image download completed. Training in progress..2024-11-29 00:04:01,438 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-11-29 00:04:01,443 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-11-29 00:04:01,447 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-11-29 00:04:01,471 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-11-29 00:04:01,728 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-11-29 00:04:01,733 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-11-29 00:04:01,762 sagemaker-training-toolkit INFO    

## 8. Store Model Artifacts(model.tar.gz) into the S3 Bucket. 

In [99]:
sklearn_estimator.latest_training_job.wait(logs="None")
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


2024-11-29 00:04:22 Starting - Preparing the instances for training
2024-11-29 00:04:22 Downloading - Downloading the training image
2024-11-29 00:04:22 Training - Training image download completed. Training in progress.
2024-11-29 00:04:22 Uploading - Uploading generated training model
2024-11-29 00:04:22 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-west-2-225989339915/Exercise-Model-2024-11-29-00-01-52-225/output/model.tar.gz


In [100]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

In [101]:
model_name = "Recommendation-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

In [102]:
model_name

'Recommendation-model-2024-11-29-00-14-25'

In [103]:

print(artifact)
model = SKLearnModel(
    name =  model_name,
    model_data=artifact,
    role="arn:aws:iam::225989339915:role/aws-elasticbeanstalk-service-role",
    entry_point="exercise_script.py",
    framework_version=FRAMEWORK_VERSION,
)

s3://sagemaker-us-west-2-225989339915/Exercise-Model-2024-11-29-00-01-52-225/output/model.tar.gz


In [106]:
endpoint_name = "Recommend-Exercise-Model"
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
    update_endpoint = True
)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


EndpointName=Recommend-Exercise-Model


INFO:sagemaker:Creating model with name: Recommendation-model-2024-11-29-00-14-25
INFO:sagemaker:Creating endpoint-config with name Recommend-Exercise-Model
INFO:sagemaker:Creating endpoint with name Recommend-Exercise-Model


------!

In [107]:
import boto3

# Initialize SageMaker client
sagemaker_client = boto3.client('sagemaker')

# Specify the S3 location of the new model artifact
new_model_url = artifact

# Specify the name of the existing SageMaker endpoint configuration
endpoint_config_name = 'Recommend-Exercise-Model-1'

# Specify the name of the existing SageMaker endpoint
endpoint_name = 'Recommend-Exercise-Model'

response = sagemaker_client.create_endpoint_config(
    EndpointConfigName = endpoint_config_name,
    ProductionVariants=[{
        'VariantName': 'variant-1',
        'ModelName': model_name,
        'InitialInstanceCount': 1,
        'InstanceType': 'ml.m5.large',
        'InitialVariantWeight': 1
    }]
)

print(response)

{'EndpointConfigArn': 'arn:aws:sagemaker:us-west-2:225989339915:endpoint-config/Recommend-Exercise-Model-1', 'ResponseMetadata': {'RequestId': '8e67bd0d-7399-47f6-ab0d-3cb3c7264809', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '8e67bd0d-7399-47f6-ab0d-3cb3c7264809', 'content-type': 'application/x-amz-json-1.1', 'content-length': '107', 'date': 'Fri, 29 Nov 2024 00:18:32 GMT'}, 'RetryAttempts': 0}}


In [108]:
# Update the SageMaker endpoint configuration to use the updated model
response = sagemaker_client.update_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
    RetainAllVariantProperties=True,
)

# Print the response
print(response)


{'EndpointArn': 'arn:aws:sagemaker:us-west-2:225989339915:endpoint/Recommend-Exercise-Model', 'ResponseMetadata': {'RequestId': '58859af5-0563-43d5-9610-54168f0110bf', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '58859af5-0563-43d5-9610-54168f0110bf', 'content-type': 'application/x-amz-json-1.1', 'content-length': '92', 'date': 'Fri, 29 Nov 2024 00:18:37 GMT'}, 'RetryAttempts': 0}}


## 9. Deploy Sagemaker Endpoint(API) for trained model, and test it. 

In [109]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

model_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(artifact)
model = SKLearnModel(
    name =  model_name,
    model_data=artifact,
    role="arn:aws:iam::225989339915:role/aws-elasticbeanstalk-service-role",
    entry_point="exercise_script.py",
    framework_version=FRAMEWORK_VERSION,
)

s3://sagemaker-us-west-2-225989339915/Exercise-Model-2024-11-29-00-01-52-225/output/model.tar.gz


In [110]:
endpoint_name = "Recommend-Exercise-Model"
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
    update_endpoint = True
)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


EndpointName=Recommend-Exercise-Model


INFO:sagemaker:Creating model with name: Custom-sklearn-model-2024-11-29-00-18-41
INFO:sagemaker:Creating endpoint-config with name Recommend-Exercise-Model


ClientError: An error occurred (ValidationException) when calling the CreateEndpointConfig operation: Cannot create already existing endpoint configuration "arn:aws:sagemaker:us-west-2:225989339915:endpoint-config/Recommend-Exercise-Model".

In [111]:
testX.iloc[:,:-1]

Unnamed: 0,weight,height,heart_rate,steps,gender_M
10,27,180,40,9000,True
9,27,180,100,7000,True
0,23,183,86,21609,True


In [112]:
text_csv = testX.iloc[:, :-1].to_csv(index = False, header = False)
print(text_csv)

27,180,40,9000,True
27,180,100,7000,True
23,183,86,21609,True



In [113]:
import requests
import json

In [114]:
sagemaker_runtime = boto3.client('runtime.sagemaker')

In [115]:
response = sagemaker_runtime.invoke_endpoint(EndpointName = endpoint_name,
                                            ContentType = 'text/csv',
                                            Body = text_csv)

In [None]:
print(response)

In [116]:
result = json.loads(response['Body'].read().decode())
print(result)

[[531, 547, 73, 263], [547, 73, 531, 263], [572, 531, 547, 73]]


In [117]:
test_features = np.array(testX[features][0:2].values.tolist()[:-1])
test_features

array([[  27,  180,   40, 9000,    1]])

In [118]:
print(predictor.predict(testX[features][0:2].values.tolist()))

Please use Content-Type = 'text/csv' and, send the request!!


## Don't forget to delete the endpoint !

In [None]:
sm_boto3.delete_endpoint(EndpointName=endpoint_name)

### Don't forget to Subscribe Machine Learning Hub YouTube Channel. 