# Reference: https://github.com/vinayak-shanawad/AI-ML-Projects/blob/main/AWS-SageMaker-Examples/03_MultiModelEndpointWithHuggingFace/huggingface-sagemaker-multi-model-endpoint.ipynb

# Upload models into S3 bucket

In [1]:
!pip install transformers --quiet

import os
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from sagemaker import get_execution_role
import boto3
import sagemaker

# ---- Notebook-wide configuration ----
# Key prefix under which the model archives are stored in S3.
prefix = "multimodel-classification"
#account_id = "<redacted>"

# IAM execution role attached to the current notebook environment.
role = get_execution_role()

# Region configured for the default boto3 session.
region = boto3.Session().region_name

# SageMaker SDK session and the default bucket it resolves for this account.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Low-level SageMaker client used for the model / endpoint API calls below.
sm_client = boto3.client("sagemaker")



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [20]:
# #HF_TOKEN = "<redacted>"

# models = [
#     "jordanfan/modernBERT_depression",
#     "jordanfan/modernBERT_suicide_base",
#     "jordanfan/mental-roberta_depression_v2", # Needed to upload manually 
#     "jordanfan/mental-BERT_depression_v2", # Needed to upload manually 
# ]

# for model_name in models: 
#     print(model_name)
#     model_path = f"models/{model_name.split('/')[-1]}/model"
#     if not os.path.exists(model_path):
#         os.mkdir(model_path)

#     tokenizer = AutoTokenizer.from_pretrained(model_name,
#                                               token = HF_TOKEN)
#     model = AutoModelForSequenceClassification.from_pretrained(model_name,
#                                                               token = HF_TOKEN)
#     model.save_pretrained(save_directory = model_path)
#     # Manually save the model weights as pytorch_model.bin
#     torch.save(model.state_dict(), f"{model_path}/pytorch_model.bin")
#     tokenizer.save_pretrained(save_directory = model_path)

In [21]:
# #Copy over BERT inference script for each classification folder
# #Make slight changes to replace BERT_tokenizer with AutoTokenizer and BERTModel with AutoModelForSequenceClassification

# !mkdir models/modernBERT_depression/code
# ! cp source_dir/model1/inference.py models/modernBERT_depression/code/inference.py

# !mkdir models/modernBERT_suicide_base/code
# ! cp source_dir/model1/inference.py models/modernBERT_suicide_base/code/inference.py

# !mkdir models/mental-BERT_depression/code
# ! cp source_dir/model1/inference.py models/mental-BERT_depression/code/inference.py

# !mkdir models/mental-roberta_depression/code
# ! cp source_dir/model1/inference.py models/mental-roberta_depression/code/inference.py


In [48]:
# Compress files into tar file
# !tar -czvf models/modernBERT_depression.tar.gz -C models/modernBERT_depression/ .
# !tar -czvf models/modernBERT_suicide_base.tar.gz -C models/modernBERT_suicide_base/ .
# !tar -czvf models/mental-BERT_depression.tar.gz -C models/mental-BERT_depression/ .
# !tar -czvf models/mental-roberta_depression.tar.gz -C models/mental-roberta_depression/ .


./
./model/
./model/special_tokens_map.json
./model/model.safetensors
./model/config.json
./model/tokenizer.json
./model/pytorch_model.bin
./model/tokenizer_config.json
./.ipynb_checkpoints/
./code/
./code/.ipynb_checkpoints/
./code/.ipynb_checkpoints/inference-checkpoint.py
./code/inference.py


In [49]:
# # Upload into S3 bucket 
# from sagemaker.s3 import S3Uploader

# bucket = "my-genzen-bucket"

# models_path = 's3://{0}/{1}/models'.format(bucket,prefix)

# S3Uploader.upload('models/mental-BERT_depression.tar.gz', models_path)
# S3Uploader.upload('models/mental-roberta_depression.tar.gz', models_path)
# S3Uploader.upload('models/modernBERT_depression.tar.gz', models_path)
# S3Uploader.upload('models/modernBERT_suicide_base.tar.gz', models_path)

's3://my-genzen-bucket/multimodel-classification/models/modernBERT_suicide_base.tar.gz'

# Deploy Multi-Model Endpoint

In [85]:
from sagemaker import image_uris
# Look up the URI of the Hugging Face inference container that pairs the
# requested transformers and PyTorch versions (nothing is built here; this
# only resolves an image URI for the given region and instance type).
image_uri = image_uris.retrieve(
    framework="huggingface",
    region=region,
    version="4.48.0",  # transformers version
    base_framework_version="pytorch2.3.0",  # pytorch version
    image_scope="inference",
    instance_type="ml.m5.xlarge",
)

# Base name for the SageMaker model; the endpoint config and endpoint reuse it
# with "-epc" / "-ep" suffixes.
deployment_name = "huggingface-multi-model-classification"

# S3 prefix that contains every model archive the endpoint can serve.
multimodels_path = f"s3://my-genzen-bucket/{prefix}/models/"

# Environment variables handed to the inference container.
container_environment = {
    "SAGEMAKER_PROGRAM": "inference.py",
    "SAGEMAKER_REGION": region,
    "SAGEMAKER_SUBMIT_DIRECTORY": multimodels_path,
}

# Container definition: 'MultiModel' mode means ModelDataUrl is an S3 prefix
# of archives rather than a single model artifact.
primary_container = {
    "Image": image_uri,
    "Mode": "MultiModel",
    "ModelDataUrl": multimodels_path,
    "Environment": container_environment,
}

In [86]:
# Register the multi-model container definition as a SageMaker model.
# Reuses the `role` ARN resolved in the setup cell instead of calling
# get_execution_role() a second time, so the whole notebook deploys with
# one consistently defined role.
create_model_response = sm_client.create_model(
    ModelName=deployment_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)


In [87]:
# Resource names derived once from the deployment name so the config and the
# endpoint cannot drift apart.
endpoint_config_name = f"{deployment_name}-epc"
endpoint_name = f"{deployment_name}-ep"

# create SageMaker Endpoint configuration
# A single variant receives all traffic on one ml.m5.xlarge instance.
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            'InstanceType': 'ml.m5.xlarge',
            'InitialInstanceCount': 1,
            'ModelName': deployment_name,
            'VariantName': 'AllTraffic',
            'InitialVariantWeight': 1,
        }
    ],
)

print('Endpoint configuration arn:  {}'.format(endpoint_config_response['EndpointConfigArn']))

# create SageMaker Endpoint
endpoint_params = {
    'EndpointName': endpoint_name,
    'EndpointConfigName': endpoint_config_name,
}
endpoint_response = sm_client.create_endpoint(**endpoint_params)
print('EndpointArn = {}'.format(endpoint_response['EndpointArn']))

# create_endpoint returns as soon as provisioning has been requested; the
# endpoint cannot be invoked until it reaches InService. Block here so the
# invocation cells that follow do not fail against an endpoint that is still
# being created. The waiter raises botocore.exceptions.WaiterError if the
# endpoint ends up in a failed state or the wait times out.
sm_client.get_waiter('endpoint_in_service').wait(EndpointName=endpoint_name)

Endpoint configuration arn:  arn:aws:sagemaker:us-east-2:975049977273:endpoint-config/huggingface-multi-model-classification-epc
EndpointArn = arn:aws:sagemaker:us-east-2:975049977273:endpoint/huggingface-multi-model-classification-ep


In [99]:
import boto3

# Runtime client used for every inference request in the cells below.
invoke_client = boto3.client("sagemaker-runtime")

# Sample text sent unchanged to each model hosted on the endpoint.
prompt = "I feel so lost and I am so stressed. I want to kill myself. I feel so horrible and hate myself"

# TargetModel names the archive under the multi-model S3 prefix that should
# serve this request.
response = invoke_client.invoke_endpoint(
    EndpointName=f"{deployment_name}-ep",
    TargetModel="modernBERT_suicide_base.tar.gz",
    ContentType="text/csv",
    Body=prompt.encode("UTF-8"),
)

# Read the streaming response body into raw bytes.
suicide_probas = response["Body"].read()

In [100]:
# Display the raw bytes returned by the modernBERT_suicide_base.tar.gz request.
suicide_probas

b'[0.49434823 0.12845325 0.37719852]'

In [9]:
# Send the same prompt to the modernBERT depression archive on the endpoint.
response = invoke_client.invoke_endpoint(
    EndpointName=f"{deployment_name}-ep",
    TargetModel="modernBERT_depression.tar.gz",
    ContentType="text/csv",
    Body=prompt.encode("UTF-8"),
)

# Read the streaming response body into raw bytes.
depression_modernbert_proba = response["Body"].read()

In [10]:
# Display the raw bytes returned by the modernBERT_depression.tar.gz request.
depression_modernbert_proba

b'[4.8430782e-04 2.2920099e-01 7.7031475e-01]'

In [11]:
# Send the same prompt to the mental-BERT depression archive on the endpoint.
response = invoke_client.invoke_endpoint(
    EndpointName=f"{deployment_name}-ep",
    TargetModel="mental-BERT_depression.tar.gz",
    ContentType="text/csv",
    Body=prompt.encode("UTF-8"),
)

# Read the streaming response body into raw bytes.
depression_mental_bert_proba = response["Body"].read()

In [12]:
# Display the raw bytes returned by the mental-BERT_depression.tar.gz request.
depression_mental_bert_proba

b'[3.5191016e-04 1.6750398e-01 8.3214408e-01]'

In [13]:
# Send the same prompt to the mental-roberta depression archive on the endpoint.
response = invoke_client.invoke_endpoint(
    EndpointName=f"{deployment_name}-ep",
    TargetModel="mental-roberta_depression.tar.gz",
    ContentType="text/csv",
    Body=prompt.encode("UTF-8"),
)

# Read the streaming response body into raw bytes.
depression_mental_roberta_proba = response["Body"].read()

In [14]:
# Display the raw bytes returned by the mental-roberta_depression.tar.gz request.
depression_mental_roberta_proba

b'[6.0263724e-04 2.3136440e-01 7.6803297e-01]'

In [134]:
# Tear down in dependency order. The endpoint goes first so its deployed
# resources are released and nothing live still references the config;
# deleting an endpoint config that an active endpoint uses is warned
# against in the SageMaker API documentation. The endpoint config and the
# model it points at are removed afterwards.
sm_client.delete_endpoint(EndpointName=f"{deployment_name}-ep")
sm_client.delete_endpoint_config(EndpointConfigName=f"{deployment_name}-epc")
sm_client.delete_model(ModelName=deployment_name)

{'ResponseMetadata': {'RequestId': 'bb4387d9-5461-4d81-a58e-7e95d870ac83',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'bb4387d9-5461-4d81-a58e-7e95d870ac83',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Sun, 16 Mar 2025 22:18:46 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}