In [None]:
import os
import boto3
import time
import numpy as np
import sagemaker

# S3 naming pieces; later cells combine them into the job output prefix
# and the dataset URI.
bucket_name = "deependu-my-personal-projects"
project_directory = "image-captioning-project"  # 'dummy-image-captioning-dataset'
dataset_folder = "datasets"
jobs_folder = "jobs"

# AWS handles: a boto3 session with a SageMaker client, plus the SageMaker SDK
# session and the execution role that the estimator below receives as `role`.
sess = boto3.Session()
sm = sess.client(service_name="sagemaker")
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

In [None]:
# S3 prefix under which this project's training jobs are stored.
output_path = f"s3://{bucket_name}/{project_directory}/{jobs_folder}"

# Training job name: fixed prefix plus a UTC timestamp
# (the trailing %j field is the day of the year).
utc_now = time.gmtime()
current_time = time.strftime("%Y-%m-%d-%H-%M-%S-%j", utc_now)
job_name = f"pytorch-smddp-dist-image-captioning-{current_time}"

# Passed to the estimator through its `hyperparameters` argument.
hyperparameters = {
    "epochs": 2,
    "lr": 0.01,
    "batch-size": 256,
    "backend": "smddp",
}

In [None]:
# Distribution settings passed to the PyTorch estimator: enables the
# `dataparallel` option under `smdistributed`.
distribution = {
    "smdistributed": {
        "dataparallel": {
            "enabled": True,
        },
    },
}

In [None]:
from sagemaker.pytorch import PyTorch

# Estimator for the distributed training job; arguments are grouped by concern.
estimator = PyTorch(
    # Execution role and framework/Python versions.
    role=role,
    framework_version="2.0.1",
    py_version="py310",
    # Training code: `training_script.py` inside the `code` directory.
    source_dir="code",
    entry_point="training_script.py",
    # Cluster shape and distribution settings.
    instance_type="ml.p4d.24xlarge",  # 'ml.p3.16xlarge', 'ml.p3dn.24xlarge', 'ml.p4d.24xlarge',
    instance_count=2,
    distribution=distribution,
    # Hyperparameters defined in the earlier cell.
    hyperparameters=hyperparameters,
    # S3 destinations; note that output_path gets a trailing slash while
    # code_location does not.
    code_location=output_path,
    output_path=f"{output_path}/",
)

In [None]:
# S3 location of the dataset; the bare name on the last line displays it.
datasets = f"s3://{bucket_name}/{project_directory}/{dataset_folder}"
datasets

In [None]:
# Start training with the dataset on the "train" channel, under the explicit
# job name built earlier; wait=True is passed so the call waits on the job.
estimator.fit(
    inputs={"train": datasets},
    job_name=job_name,
    wait=True,
)

---

## Deploy as an endpoint

In [None]:
# Model artifact of one specific, already-finished training job.
# NOTE(review): this URI is pinned to a past job name, not the job launched
# above — confirm it is the artifact you intend to deploy.
s3_trained_model_uri = (
    "s3://deependu-my-personal-projects"
    "/image-captioning-project"
    "/jobs"
    "/pytorch-smddp-dist-image-captioning-2024-03-16-08-33-01-076"
    "/output/model.tar.gz"
)
s3_trained_model_uri

In [None]:
import os
import boto3
import sagemaker
from sagemaker.pytorch import PyTorchModel

# Recreate the AWS handles and execution role in this cell, so it does not
# depend on the ones created in the first cell.
sess = boto3.Session()
sm = sess.client(service_name="sagemaker")
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# Model object built from the trained artifact, with `inference_script.py`
# from the `code` directory as its entry point.
model = PyTorchModel(
    role=role,
    model_data=s3_trained_model_uri,
    entry_point="inference_script.py",
    source_dir="code",
    py_version="py310",
    framework_version="2.0.1",
)

In [None]:
# Deploy the model on a single ml.p3.2xlarge instance and keep the returned predictor.
predictor = model.deploy(
    instance_type="ml.p3.2xlarge",
    initial_instance_count=1,
)

---

## Make an API request to the endpoint using the boto3 library (after serializing the tensor inputs)