## Introduction

The following code is meant to be run in a SageMaker Notebook Instance. We will train a Transformer model on the IMDB dataset while logging data to Comet in real time.

### Install Comet and other Dependencies

In [None]:
# Install the notebook's dependencies; transformers is pinned to 4.26.1, in line with
# the transformers_version="4.26" requested from the HuggingFace estimator further down.
%pip install -U "comet_ml>=3.44.0" "sagemaker>=2.140.0" "transformers==4.26.1" "datasets[s3]" "boto3"

### Log in to Comet

In [None]:
import comet_ml

# Name of the Comet project used by this notebook.
COMET_PROJECT_NAME = (
    "comet-example-sagemaker-custom-transformers-text-classification"
)

# Log in to Comet, handing the project name to the login call.
comet_ml.login(project_name=COMET_PROJECT_NAME)

### Fetch SageMaker Credentials

In [None]:
import sagemaker

# Key prefix under which the prepared datasets are stored in the bucket.
prefix = "sagemaker/DEMO-huggingface-imdb"

# SageMaker session, the session's default S3 bucket, and the execution role
# that is later handed to the estimator.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

### Fetch the Data

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# tokenizer used in preprocessing
tokenizer_name = "distilbert-base-uncased"

# dataset used
dataset_name = "imdb"

# number of examples sampled from each split to keep the demo small
TRAIN_SAMPLES = 1000
TEST_SAMPLES = 100

# fixed seed so that re-running the cell draws the same samples
SHUFFLE_SEED = 42

# download tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)


# tokenizer helper function
def tokenize(batch):
    """Tokenize the ``text`` field of a batch.

    Sequences are padded to the tokenizer's maximum length and truncated when
    longer, so every example ends up with the same number of tokens.

    Args:
        batch: A batch of examples exposing a ``"text"`` entry.

    Returns:
        The tokenizer output for ``batch["text"]``.
    """
    return tokenizer(batch["text"], padding="max_length", truncation=True)


# load dataset
train_dataset, test_dataset = load_dataset(dataset_name, split=["train", "test"])

# Sample each split from itself. The training sample must come from the train
# split: drawing it from the test split would train on evaluation data and let
# the two samples overlap.
train_dataset = train_dataset.shuffle(seed=SHUFFLE_SEED).select(range(TRAIN_SAMPLES))
test_dataset = test_dataset.shuffle(seed=SHUFFLE_SEED).select(range(TEST_SAMPLES))

# tokenize dataset
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# set format for pytorch
train_dataset = train_dataset.rename_column("label", "labels")
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset = test_dataset.rename_column("label", "labels")
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# save train_dataset to s3 (bucket is the session's default bucket fetched earlier)
training_input_path = f"s3://{bucket}/{prefix}/train"
train_dataset.save_to_disk(training_input_path)

# save test_dataset to s3
test_input_path = f"s3://{bucket}/{prefix}/test"
test_dataset.save_to_disk(test_input_path)

### Set Training Parameters

In [None]:
# Instance settings for the training job: how many instances, and of which type.
AWS_INSTANCE_COUNT = 1
AWS_INSTANCE_TYPE = "ml.g4dn.xlarge"

# Hyperparameters handed to the estimator. Built from (key, value) pairs so the
# key order is explicit; note that "log-interval" uses a hyphen, unlike the
# other keys.
HYPERPARAMETERS = dict(
    (
        ("epochs", 1),
        ("train_batch_size", 32),
        ("log-interval", 1),
        ("model_name", "distilbert-base-uncased"),
    )
)

### Set Up the SageMaker Estimator

In [None]:
from sagemaker.huggingface import HuggingFace

# Read the Comet configuration once and pull out the API key and project name.
comet_config = comet_ml.config.get_config()
COMET_API_KEY = comet_config["comet.api_key"]
COMET_PROJECT_NAME = comet_config["comet.project_name"]

# Comet settings passed to the estimator as environment variables.
training_environment = {
    "COMET_API_KEY": COMET_API_KEY,
    "COMET_PROJECT_NAME": COMET_PROJECT_NAME,
}

# Hugging Face estimator that runs src/text_classification.py with the
# instance settings and hyperparameters configured above.
estimator = HuggingFace(
    entry_point="text_classification.py",
    source_dir="src",
    role=role,
    instance_type=AWS_INSTANCE_TYPE,
    instance_count=AWS_INSTANCE_COUNT,
    transformers_version="4.26",
    pytorch_version="1.13",
    py_version="py39",
    hyperparameters=HYPERPARAMETERS,
    environment=training_environment,
)

### Run the Training Job

In [None]:
# Start the training job, passing the saved train and test datasets as the
# "train" and "test" inputs.
training_inputs = {"train": training_input_path, "test": test_input_path}
estimator.fit(training_inputs)