# Hugging Face on Amazon SageMaker - fine-tune a DistilBERT model
From https://github.com/huggingface/notebooks/blob/main/sagemaker/01_getting_started_pytorch/sagemaker-notebook.ipynb

# Development environment


In [None]:
# Install/upgrade the AWS SDK stack (boto3 and botocore both pinned to 1.36.4)
# together with the libraries used below to prepare and upload the dataset.
!pip install --upgrade boto3=="1.36.4" botocore=="1.36.4" s3transfer datasets s3fs numpy


In [None]:
import boto3
import botocore
import s3transfer
import datasets

# Report the versions importable in this kernel so that a mismatch with the
# install cell above is visible straight away.
for package in (boto3, botocore, s3transfer, datasets):
    print(f"{package.__name__} version:", package.__version__)


In [None]:
import sagemaker
import sagemaker.huggingface
import boto3

# Name of the S3 bucket used for uploaded data, models and logs.
# Leave it as None to fall back to the session's default bucket
# (SageMaker creates that bucket automatically if it does not exist).
sagemaker_session_bucket = None

# A first session is only needed to look up the default bucket name.
sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# IAM execution role for SageMaker: replace the placeholder with the ARN of
# a role that has the required permissions.
role = "arn:aws:iam::<YOUR SAGEMAKER EXECUTION ROLE ARN>"

# Session used by the rest of the notebook, bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

# Prepare data

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Name of the dataset to fine-tune on.
dataset_name = "imdb"

# Checkpoint whose tokenizer is used during preprocessing.
tokenizer_name = "distilbert-base-uncased"

# S3 key prefix under which the processed splits are stored.
s3_prefix = "samples/datasets/imdb"

In [None]:
# Load the dataset once; both splits used below are taken from this object
# instead of loading a hard-coded dataset name a second time.
dataset = load_dataset(dataset_name)

# Download the tokenizer for the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)


def tokenize(batch):
    """Tokenize the ``text`` entries of a batch of examples.

    Args:
        batch: Mapping with a ``text`` entry holding the raw strings.

    Returns:
        The tokenizer output for the batch, padded with
        ``padding='max_length'`` and truncated with ``truncation=True``.
    """
    return tokenizer(batch['text'], padding='max_length', truncation=True)


def _prepare_for_training(split):
    """Tokenize a split, rename ``label`` to ``labels`` and set torch format."""
    split = split.map(tokenize, batched=True)
    split = split.rename_column("label", "labels")
    split.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
    return split


train_dataset = dataset['train']
# Reduce the test split to 10k shuffled examples (fixed seed).
test_dataset = dataset['test'].shuffle(seed=42).select(range(10000))

train_dataset = _prepare_for_training(train_dataset)
test_dataset = _prepare_for_training(test_dataset)

In [None]:
# s3fs must be installed for the s3:// destinations below; importing it here
# surfaces a missing installation before any upload starts.
import s3fs

# The uploads write to the session bucket, so they must run with the
# notebook's AWS credentials. Do not pass anonymous (`anon=True`) storage
# options here: unsigned requests cannot write to the bucket.
bucket_prefix = f's3://{sess.default_bucket()}/{s3_prefix}'

# save train_dataset to s3
training_input_path = f'{bucket_prefix}/train'
train_dataset.save_to_disk(training_input_path)

# save the (reduced) test split to s3 under the "validation" key
test_input_path = f'{bucket_prefix}/validation'
test_dataset.save_to_disk(test_input_path)

# Train the model

In [None]:
# Print the training script that the estimator below uses as its entry point.
!pygmentize ./scripts/train_sagemaker.py


In [None]:
from sagemaker.huggingface import HuggingFace

# Values handed to the training job as hyperparameters.
hyperparameters = dict(
    epochs=1,
    train_batch_size=32,
    learning_rate=5e-5,
    warmup_steps=500,
    model_name='distilbert-base-uncased',
)

In [None]:
# Collect the estimator settings in one place so they are easy to scan.
estimator_settings = {
    # training script and the directory that contains it
    'entry_point': 'train_sagemaker.py',
    'source_dir': './scripts',
    # training infrastructure
    'instance_type': 'ml.p3.2xlarge',
    'instance_count': 1,
    'role': role,
    # framework versions for the training job
    'transformers_version': '4.26',
    'pytorch_version': '1.13',
    'py_version': 'py39',
    'hyperparameters': hyperparameters,
}

huggingface_estimator = HuggingFace(**estimator_settings)

In [None]:
# starting the train job with our uploaded datasets as input
# Map each input channel name to the S3 location of its uploaded dataset.
input_channels = {
    'train': training_input_path,
    'test': test_input_path,
}
huggingface_estimator.fit(input_channels)

# Deploy the endpoint

In [None]:
# Deploy the trained model to an endpoint backed by a single instance.
predictor = huggingface_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.xlarge",
)


In [None]:
# Smoke-test the endpoint with a single short review.
sentiment_input = dict(inputs="a cinematic masterpiece")

predictor.predict(sentiment_input)

# Test the model

In [None]:
# First 100 test examples, used for a quick check of the endpoint.
# test_dataset is formatted to return only input_ids / attention_mask / labels,
# so slicing it directly would drop the raw 'text' column that is sent to the
# endpoint. with_format(type=None) returns an unformatted view in which a
# slice contains every column as plain Python objects.
test_dataset_sample = test_dataset.with_format(type=None)[:100]

In [None]:
# Endpoint label names and the integer class ids they stand for.
_LABEL_TO_ID = {'LABEL_0': 0, 'LABEL_1': 1}


def map_labels(label):
    """Return the integer class id for an endpoint label name.

    Raises:
        KeyError: If ``label`` is neither ``'LABEL_0'`` nor ``'LABEL_1'``.
    """
    return _LABEL_TO_ID[label]


# Send the sampled reviews to the endpoint in a single request ...
sentiment_input = {"inputs": test_dataset_sample["text"]}
test_output = predictor.predict(sentiment_input)

# ... and turn each returned label name into its integer class id.
predicted_names = (item['label'] for item in test_output)
test_predictions = list(map(map_labels, predicted_names))


In [None]:
from sklearn.metrics import accuracy_score

# Fraction of sampled test examples whose predicted class matches the label.
accuracy_score(y_true=test_dataset_sample['labels'], y_pred=test_predictions)

In [None]:
# show examples of review and labels
import pandas as pd

# Number of examples to display side by side.
n_examples = 10

# Read the raw review texts through an unformatted view so that a plain list
# of strings comes back. Both columns are cut to the same length, because
# pandas refuses to build a DataFrame from columns of different lengths.
reviews = test_dataset.with_format(type=None)[:n_examples]['text']
df = pd.DataFrame({"Review": reviews,
                   "Predicted label": test_predictions[:n_examples]})

# Leave the frame as the last expression so the notebook renders it.
df

In [None]:
# Tear down the deployed resources: first the model, then the endpoint.
for teardown in (predictor.delete_model, predictor.delete_endpoint):
    teardown()

# What to try next
- How does the experience of using a SageMaker training job compare to running the training in a notebook? Which mode of working do you prefer, and why?
- Work through the Hugging Face on Amazon SageMaker getting-started guide: https://huggingface.co/docs/sagemaker/getting-started

In [None]:
# Inspect the label column of the prepared test split.
test_dataset["labels"]