## Introduction

Welcome to our end-to-end binary text-classification example. In this demo, we use the Hugging Face `transformers` and `datasets` libraries to fine-tune a pre-trained transformer for binary text classification. Specifically, the pre-trained model is fine-tuned on the IMDb dataset.

In [2]:
%pip install torch transformers datasets

Keyring is skipped due to an exception: 'keyring.backends'
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
from datasets import load_dataset, load_from_disk

In [None]:
import sagemaker

# Session object used here to look up the default S3 bucket.
sm_session = sagemaker.Session()

# Root S3 prefix shared by this notebook: the processed datasets, the remote
# function's s3_root_uri, and the training output are all placed under it.
s3_root_folder = f's3://{sm_session.default_bucket()}/pathways/huggingface'

## Load and process the data set

We load the IMDb dataset from Hugging Face, preprocess it, and upload the result to S3 so that it can be reused without downloading it from Hugging Face and re-running the transforms each time.

In [None]:
# Name of the pre-trained checkpoint whose tokenizer is used for preprocessing.
tokenizer_name = 'distilbert-base-uncased'

# Load that tokenizer once; it is reused by `tokenize`, passed to the Trainer,
# and handed to the inference pipeline at the end of the notebook.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [None]:
# tokenizer helper function
def tokenize(batch):
    """Tokenize the ``text`` entries of a batch with the notebook-level tokenizer.

    Sequences are truncated and padded using the ``'max_length'`` padding
    strategy. The tokenizer's output is returned unchanged.
    """
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

In [None]:
# Fetch the IMDb train and test splits in a single call.
train_dataset, test_dataset = load_dataset('imdb', split=['train', 'test'])

# Keep the demo quick: shuffle the test split and keep only 5,000 examples.
# The training split is used in full.
test_dataset = test_dataset.shuffle().select(range(5000))

# Tokenize each split in batches, then rename the "label" column to "labels".
train_dataset = train_dataset.map(tokenize, batched=True).rename_column("label", "labels")
test_dataset = test_dataset.map(tokenize, batched=True).rename_column("label", "labels")

# Return only the listed columns, formatted for PyTorch.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# S3 URIs always use "/" as the separator, so build them explicitly instead of
# with os.path.join, which inserts "\" when the notebook runs on Windows.
train_data_path = f"{s3_root_folder}/data/train"
test_data_path = f"{s3_root_folder}/data/test"

# Persist the processed splits under the shared S3 prefix so later steps can
# load them with `load_from_disk` instead of repeating the preprocessing.
train_dataset.save_to_disk(train_data_path)
test_dataset.save_to_disk(test_data_path)

## Run the training remotely with a GPU instance


The following function computes the metrics (accuracy, F1, precision, and recall) used to evaluate the binary classifier.

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute binary-classification metrics for an evaluation prediction.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores; the predicted class is the
            argmax over the last axis).

    Returns:
        dict: ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    accuracy = accuracy_score(y_true, y_pred)
    # The fourth element of the result (support) is not needed here.
    scores = precision_recall_fscore_support(y_true, y_pred, average="binary")
    precision, recall, f1 = scores[:3]

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

In [None]:
import s3fs
import json

from sagemaker.remote_function import remote

@remote(s3_root_uri=s3_root_folder, keep_alive_period_in_seconds=600)
def train_hf_model(
    train_input_path,
    test_input_path,
    s3_output_path = None,
    *,
    epochs = 1,
    train_batch_size = 32,
    eval_batch_size = 64,
    warmup_steps = 500,
    learning_rate = 5e-5
):
    """Fine-tune ``distilbert-base-uncased`` for sequence classification.

    The datasets are loaded with ``load_from_disk``, the model is trained and
    saved to the local ``model`` directory, and then evaluated on the test
    dataset. When ``s3_output_path`` is given, the evaluation results are
    written to ``eval_results.txt`` (as JSON) and the model directory is
    uploaded, both under that path.

    Args:
        train_input_path: Location of the training dataset, as accepted by
            ``load_from_disk``.
        test_input_path: Location of the evaluation dataset, as accepted by
            ``load_from_disk``.
        s3_output_path: Optional S3 location for the evaluation results and
            the trained model. Nothing is uploaded when it is empty or None.
        epochs: Number of training epochs.
        train_batch_size: Per-device batch size used for training.
        eval_batch_size: Per-device batch size used for evaluation.
        warmup_steps: Number of learning-rate warmup steps.
        learning_rate: Learning rate; converted with ``float`` before use.

    Returns:
        tuple: ``(model_location, eval_result)``. ``model_location`` is the
        S3 location of the uploaded model directory when ``s3_output_path``
        is given, otherwise the local model directory name.
        ``eval_result`` is the metrics dict returned by ``Trainer.evaluate``.
    """
    model_dir = 'model'

    train_dataset = load_from_disk(train_input_path, keep_in_memory=True)
    test_dataset = load_from_disk(test_input_path, keep_in_memory=True)

    model_name = 'distilbert-base-uncased'
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    training_args = TrainingArguments(
        output_dir=model_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        warmup_steps=warmup_steps,
        evaluation_strategy="epoch",
        logging_dir="logs/",
        learning_rate=float(learning_rate),
    )

    # create Trainer instance; `tokenizer` and `compute_metrics` come from the
    # notebook's global scope
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
    )

    print("Starting model training..")
    trainer.train()

    trainer.save_model(model_dir)

    print("Evaluating the model...")
    eval_result = trainer.evaluate(eval_dataset=test_dataset)

    if not s3_output_path:
        # No upload requested. Previously this path crashed with a TypeError
        # in os.path.join(None, ...) after training had already completed.
        return model_dir, eval_result

    # Compute the upload destination once so the value returned to the caller
    # is exactly where the model was written.
    model_location = os.path.join(s3_output_path, model_dir)

    fs = s3fs.S3FileSystem()
    with fs.open(os.path.join(s3_output_path, "eval_results.txt"), "w") as file:
        json.dump(eval_result, file)

    fs.put(model_dir, model_location, recursive=True)

    return model_location, eval_result

In [None]:
## Train the model
# The output location is an S3 URI, so join it with "/" explicitly;
# os.path.join would insert "\" when the notebook runs on Windows.
model_path, evaluation = train_hf_model(train_data_path, test_data_path, f"{s3_root_folder}/run_1/output")

In [None]:
# Display the evaluation metrics returned by the training call.
evaluation

## Classify text using our trained model

The text classification model we just trained returns a label based on the sentiment of the text sent to it for inference.
`LABEL_0` is for negative sentiment and `LABEL_1` is for positive sentiment.

In [None]:
# Copy the trained model directory from S3 into the local 'model' folder.
fs = s3fs.S3FileSystem()
fs.get(rpath=model_path, lpath='model', recursive=True)

In [None]:
# Load the fine-tuned model from the local 'model' directory downloaded above.
trained_model = AutoModelForSequenceClassification.from_pretrained('model')

In [None]:
# Sample sentence to classify.
inputs = "I love using SageMaker."

In [None]:
from transformers import pipeline

# Build a text-classification pipeline from the fine-tuned model and the
# tokenizer that was used for preprocessing.
classifier = pipeline(
    task="text-classification",
    model=trained_model,
    tokenizer=tokenizer,
)

In [None]:
# Run the pipeline on the sample sentence and display its prediction.
classifier(inputs)