# Fine-tuning and deploying a Hugging Face model on SageMaker with ICD10 dataset

## Installation

_*Note:* we install the required libraries from Hugging Face and AWS. Additionally, we make sure we have a compatible PyTorch version installed_

In [2]:
!pip install "sagemaker>=2.48.0" --upgrade

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.1.2[0m[39;49m -> [0m[32;49m22.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
!pip install datasets=='1.8.0'

[0m

*Note: Restart the kernel after installing the above packages.*

In [5]:
from IPython.display import display_html


def restartkernel():
    """Emit a raw HTML <script> snippet that calls Jupyter.notebook.kernel.restart().

    NOTE(review): relies on the front end exposing the `Jupyter` JavaScript
    object - confirm it is available in the environment this notebook runs in.
    """
    restart_script = "<script>Jupyter.notebook.kernel.restart()</script>"
    display_html(restart_script, raw=True)

In [6]:
# Restart the kernel so the packages installed above are picked up.
restartkernel()

## Permissions

_If you are going to use SageMaker in a local environment, you need access to an IAM role with the required permissions for SageMaker. You can find out more about it [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html)._

In [3]:
import sagemaker

# Bucket used for uploading data, models and logs.
# Leave as None to fall back to the session's default bucket
# (SageMaker creates that bucket automatically if it does not exist).
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    # no bucket name given -> use the default bucket of the session
    sagemaker_session_bucket = sess.default_bucket()

# Execution role handed to the training job later on
role = sagemaker.get_execution_role()

# Re-create the session so that it is bound to the chosen bucket
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::552327042361:role/service-role/AmazonSageMaker-ExecutionRole-20201202T215511
sagemaker bucket: sagemaker-ap-southeast-1-552327042361
sagemaker session region: ap-southeast-1


# Preparing the dataset

In [4]:
import pandas as pd
import numpy as np

# Read the raw ICD-10 CSV file into a DataFrame
path_to_input_file = "ICD-10-GT-AAA.csv"
df = pd.read_csv(filepath_or_buffer=path_to_input_file)



In [6]:
# Preview the loaded frame (pandas truncates the middle rows in the rendered output)
df

Unnamed: 0,dx_text,label
0,"Cholera due to Vibrio cholerae 01, biovar chol...",A0
1,"Cholera due to Vibrio cholerae 01, biovar eltor",A0
2,"Cholera, unspecified",A0
3,"Typhoid fever, unspecified",A0
4,Typhoid meningitis,A0
...,...,...
3584,Postprocedural testicular hypofunction,E8
3585,Postprocedural adrenocortical (-medullary) hyp...,E8
3586,Postprocedural hemorrhage and hematoma of an e...,E8
3587,Postprocedural hemorrhage and hematoma of an e...,E8


In [5]:
# Map every two-character label ("A0" ... "E8") to a consecutive integer id,
# in alphabetical order starting at 0. "D9" and "E9" are not part of the label
# set, so they are skipped and the ids stay contiguous (D8 -> 38, E0 -> 39).
label2id = {
    code: idx
    for idx, code in enumerate(
        f"{letter}{digit}"
        for letter in "ABCDE"
        for digit in range(10)
        if f"{letter}{digit}" not in ("D9", "E9")
    )
}

In [7]:
# Encode the string labels as the integer ids defined in `label2id`.
# Values without a mapping (including ids produced by an earlier run of this
# cell) are passed through unchanged, so the cell is safe to re-run.
encoded_labels = df["label"].map(lambda value: label2id.get(value, value))

# Fail loudly instead of silently writing a mixed string/int label column.
known_ids = set(label2id.values())
unmapped_labels = [value for value in encoded_labels.unique() if value not in known_ids]
if unmapped_labels:
    raise ValueError(f"labels missing from label2id: {unmapped_labels}")

# Make the integer dtype explicit for the downstream CSV export.
df["label"] = encoded_labels.astype(int)

In [8]:
# Preview the frame again; `label` now holds the integer ids
df

Unnamed: 0,dx_text,label
0,"Cholera due to Vibrio cholerae 01, biovar chol...",0
1,"Cholera due to Vibrio cholerae 01, biovar eltor",0
2,"Cholera, unspecified",0
3,"Typhoid fever, unspecified",0
4,Typhoid meningitis,0
...,...,...
3584,Postprocedural testicular hypofunction,47
3585,Postprocedural adrenocortical (-medullary) hyp...,47
3586,Postprocedural hemorrhage and hematoma of an e...,47
3587,Postprocedural hemorrhage and hematoma of an e...,47


In [9]:
# Seed NumPy's global generator so the random train/test split is identical
# on every run (Restart Kernel -> Run All then yields the same files).
np.random.seed(42)

# Scratch column of random draws, one value per row (1-D, as a column should be).
# It does not drive the split and is dropped from both subsets in later cells.
df['split'] = np.random.randn(df.shape[0])

# Boolean mask: each row is assigned to the training set with probability 0.8.
msk = np.random.rand(len(df)) <= 0.8

train = df[msk]
test = df[~msk]

In [10]:
# Total number of rows before splitting
len(df)

3589

In [11]:
# Remove the scratch 'split' column. errors='ignore' keeps this cell safe to
# re-run once the column is already gone (a plain drop would raise KeyError).
train = train.drop(columns='split', errors='ignore')
train.shape[0]

2849

In [12]:
# Remove the scratch 'split' column. errors='ignore' keeps this cell safe to
# re-run once the column is already gone (a plain drop would raise KeyError).
test = test.drop(columns='split', errors='ignore')
test.shape[0]

740

In [13]:
# First ten rows of the training subset
train.head(10)

Unnamed: 0,dx_text,label
1,"Cholera due to Vibrio cholerae 01, biovar eltor",0
2,"Cholera, unspecified",0
5,Typhoid fever with heart involvement,0
6,Typhoid pneumonia,0
8,Typhoid osteomyelitis,0
9,Typhoid fever with other complications,0
11,Paratyphoid fever B,0
14,Salmonella enteritis,0
15,Salmonella sepsis,0
17,Salmonella meningitis,0


In [14]:
# First ten rows of the test subset
test.head(10)

Unnamed: 0,dx_text,label
0,"Cholera due to Vibrio cholerae 01, biovar chol...",0
3,"Typhoid fever, unspecified",0
4,Typhoid meningitis,0
7,Typhoid arthritis,0
10,Paratyphoid fever A,0
12,Paratyphoid fever C,0
13,"Paratyphoid fever, unspecified",0
16,"Localized salmonella infection, unspecified",0
18,Salmonella pneumonia,0
21,Salmonella pyelonephritis,0


In [16]:
# Persist both subsets as CSV files without writing the DataFrame index
train.to_csv(path_or_buf='nih_train.csv', index=False)
test.to_csv(path_or_buf='nih_test.csv', index=False)

## Uploading data to S3

Upload the `dataset` files to the default bucket in Amazon S3

In [17]:
import os
from sagemaker.s3 import S3Uploader

# Local CSV files written by the previous cell
local_train_dataset = "nih_train.csv"
local_test_dataset = "nih_test.csv"

# S3 prefixes for the datasets; both splits share one prefix in the
# session's default bucket
remote_train_dataset = f"s3://{sess.default_bucket()}/NIH-ICD10/data"
remote_test_dataset = f"s3://{sess.default_bucket()}/NIH-ICD10/data"

# Upload each local file to its S3 prefix
S3Uploader.upload(
    local_path=os.path.join(local_train_dataset),
    desired_s3_uri=remote_train_dataset,
)
S3Uploader.upload(
    local_path=os.path.join(local_test_dataset),
    desired_s3_uri=remote_test_dataset,
)

print(f"train dataset uploaded to: {remote_train_dataset}/{local_train_dataset}")
print(f"test dataset uploaded to: {remote_test_dataset}/{local_test_dataset}")

train dataset uploaded to: s3://sagemaker-ap-southeast-1-552327042361/NIH-ICD10/data/nih_train.csv
test dataset uploaded to: s3://sagemaker-ap-southeast-1-552327042361/NIH-ICD10/data/nih_test.csv


# Fine-tuning & starting Sagemaker Training Job



## Creating an Estimator and starting a training job

The training script that performs fine-tuning is located here: `scripts/train2.py`. Navigate to the source code location and open the `train2.py` file to go through its contents.

In [19]:
from sagemaker.huggingface import HuggingFace
import time

# Hyperparameters that are passed into the training job
hyperparameters = dict(
    epochs=1,                              # number of training epochs
    train_batch_size=32,                   # batch size for training
    eval_batch_size=64,                    # batch size for evaluation
    learning_rate=3e-5,                    # learning rate used during training
    model_id='distilbert-base-uncased',    # pre-trained model
    fp16=True,                             # whether to use 16-bit (mixed) precision training
    train_file=local_train_dataset,        # file name of the training dataset
    test_file=local_test_dataset,          # file name of the test dataset
)

List of supported models: https://huggingface.co/models?library=pytorch,transformers&sort=downloads

We create a `metric_definitions` list of dictionaries containing regex-based definitions that will be used to parse the job logs and extract metrics. You can read more about parsing the CloudWatch logs [here](https://docs.aws.amazon.com/sagemaker/latest/dg/training-metrics.html).

In [20]:
# Regex for one logged metric value: digits, an optional fractional part and an
# optional exponent (e.g. "1", "0.75", "5e-05", "1.5e-05"). The single capture
# group holds the complete number. The dot is escaped and the pattern is a raw
# string, so "1.5e-05" is no longer cut off at "1.5" and no invalid "\-" escape
# is left in a normal string literal.
_METRIC_VALUE_REGEX = r"([0-9]+(?:\.[0-9]+)?(?:e[-+]?[0-9]+)?)"

# One definition per evaluation metric, matching log fragments such as
# "'eval_loss': 0.6931,".
metric_definitions = [
    {'Name': name, 'Regex': f"'{name}': {_METRIC_VALUE_REGEX},?"}
    for name in ('eval_loss', 'eval_accuracy', 'eval_f1', 'eval_precision')
]

In [21]:
# Timestamped name handed to the estimator as `base_job_name`
job_name = f'nih-icd10-{time.strftime("%Y-%m-%d-%H-%M-%S")}'

# Create the Estimator
huggingface_estimator = HuggingFace(
    entry_point='train2.py',                # fine-tuning script used in the training job
    source_dir='scripts',                   # directory where the fine-tuning script is stored
    instance_type='ml.p3.2xlarge',          # instance type used for the training job
    instance_count=1,                       # number of instances used for training
    base_job_name=job_name,                 # base name for the training job
    role=role,                              # IAM role used in the training job to access AWS resources, e.g. S3
    transformers_version='4.6',             # transformers version used in the training job
    pytorch_version='1.7',                  # PyTorch version used in the training job
    py_version='py36',                      # Python version used in the training job
    hyperparameters=hyperparameters,        # hyperparameters used for running the training job
    metric_definitions=metric_definitions,  # regex definitions used to extract metrics from the logs
)

In [22]:
# Input channels of the training job, each pointing at the uploaded S3 URI
training_data = dict(train=remote_train_dataset, test=remote_test_dataset)

# Start the training job on the uploaded datasets and wait for it to complete
huggingface_estimator.fit(inputs=training_data, wait=True)

2022-07-28 03:58:42 Starting - Starting the training job...ProfilerReport-1658980722: InProgress
...
2022-07-28 03:59:36 Starting - Preparing the instances for training.........
2022-07-28 04:01:13 Downloading - Downloading input data...
2022-07-28 04:01:36 Training - Downloading the training image.....................
2022-07-28 04:05:05 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-07-28 04:05:08,147 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-07-28 04:05:08,175 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-07-28 04:05:08,184 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-07-28 04:05:08,592 sagemaker-training-toolkit INFO     Installing dependencies from requirement

# Accessing Training Metrics

In [None]:
from sagemaker import TrainingJobAnalytics

# Name of the most recent training job started by the estimator
training_job_name = huggingface_estimator.latest_training_job.name
print(f"Training jobname: {training_job_name}")

# Captured metrics can be accessed as a pandas DataFrame. It is stored under
# its own name so the dataset frame `df` from the data-preparation cells is
# not overwritten and those cells stay re-runnable.
metrics_df = TrainingJobAnalytics(training_job_name=training_job_name).dataframe()
metrics_df

## Deploying the endpoint

To deploy our endpoint, we call `deploy()` on our HuggingFace estimator object, passing in our desired number of instances and instance type.

In [None]:
# Deploy the fine-tuned model behind an endpoint with a single instance
predictor = huggingface_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.xlarge",
)

Then, we use the returned predictor object to call the endpoint.

In [None]:
# Example request: a diagnosis description like the `dx_text` values the model
# was fine-tuned on (the earlier sentiment sentence was a template leftover).
dx_input = {"inputs": "Typhoid fever with heart involvement"}

predictor.predict(dx_input)

Finally, we delete the endpoint again.

In [None]:
# Remove the endpoint that was created by the deploy() call
predictor.delete_endpoint()