## Text classification boilerplate code

### Table of contents
0. Setup & Data I/O
1. DeBERTa (v3-large / v2-xxlarge)

### Compute
* Fine-tuned on 1x A100-SXM4-40GB
* nvcc version 12.2

## Setup & Data I/O

In [None]:
# Show the visible NVIDIA GPU(s) together with driver version and memory usage.
!nvidia-smi

In [None]:
# Print the version of the installed CUDA compiler (nvcc).
!nvcc --version

In [None]:
# Install the libraries this notebook relies on (-q: quiet, -U: upgrade to the latest release).
!pip install -qU transformers datasets evaluate accelerate sentencepiece wandb scikit-learn

In [None]:
#!pip list

In [None]:
import wandb

# Authenticate this session with Weights & Biases.
wandb.login()

In [None]:
import numpy as np
import evaluate
import torch
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoTokenizer,
    DataCollatorWithPadding,
)
from datasets import load_dataset
from transformers import AutoTokenizer
from huggingface_hub import notebook_login

In [None]:
# Log in to the Hugging Face Hub; the training cell below pushes the model to the Hub.
notebook_login()

In [None]:
# Load the IMDB dataset; the notebook uses its "train" and "test" splits.
imdb = load_dataset("imdb")
imdb  # display the dataset summary

In [None]:
# Peek at one raw test example (later cells read its "text" and "label" fields).
imdb["test"][0]

## 1. DeBERTa (v3-large / v2-xxlarge)

In [None]:
# Checkpoint to fine-tune. DeBERTa uses relative position embeddings, so no
# model_max_length is set for it.
# Alternative checkpoint: "microsoft/deberta-v3-large"
model_id = "microsoft/deberta-v2-xxlarge"

In [None]:
# `use_fast=False` is the AutoTokenizer option that requests the slow (Python)
# tokenizer; the previous `fast=False` is not an AutoTokenizer option and did
# not select it.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
# Optional: cap the sequence length so the GPU doesn't run out of memory
# (with truncation=True, tokenization then truncates at this length).
# tokenizer.model_max_length = 1500
tokenizer

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of *examples* with the notebook's tokenizer.

    Truncation is requested but no padding is applied here.

    Args:
        examples: Mapping with a ``"text"`` entry (a batch when used with
            ``map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Apply preprocess_function to the dataset, one batch of examples at a time.
tokenized_imdb = imdb.map(preprocess_function, batched=True)

In [None]:
# Batches are padded here, at collate time, because tokenization above does not pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Metrics reported after every evaluation pass, in the order they appear in the result.
_METRIC_NAMES = ("accuracy", "f1", "precision", "recall")
# Metric objects are cached so `evaluate.load` runs once per metric rather than
# on every evaluation.
_LOADED_METRICS = {}


def _get_metric(name):
    """Return the ``evaluate`` metric called *name*, loading it on first use only."""
    if name not in _LOADED_METRICS:
        _LOADED_METRICS[name] = evaluate.load(name)
    return _LOADED_METRICS[name]


def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` over its last axis.

    Returns:
        Dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``, each holding the value computed by the metric of the
        same name.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        name: _get_metric(name).compute(predictions=predictions, references=labels)[name]
        for name in _METRIC_NAMES
    }

In [None]:
# Class index -> human-readable label name, plus the inverse mapping derived from it.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Load the pretrained checkpoint as a sequence classifier configured for two
# labels with readable label names; weights are requested in bfloat16.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    torch_dtype=torch.bfloat16,
)

In [None]:
# Display the loaded model.
model

In [None]:
# Fine-tuning configuration: cosine learning-rate schedule with warmup, logging
# every 25 steps, evaluation and checkpointing once per epoch.
training_args = TrainingArguments(
    output_dir="deberta-v2-xxl-imdb-v0.1",
    learning_rate=2e-5,
    # Let the Trainer pick the batch size instead of the fixed sizes below.
    auto_find_batch_size=True,
    # per_device_train_batch_size=8,
    # per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.2,
    logging_strategy="steps",
    logging_steps=25,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

# NOTE(review): the IMDB test split doubles as the evaluation set, so with
# load_best_model_at_end=True the kept checkpoint is selected on test data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop.
trainer.train()

In [None]:
# Upload the trained model to the Hugging Face Hub.
trainer.push_to_hub()

In [None]:
# Reload the tokenizer and model from the Hub repository and move the model to
# the GPU for inference. This rebinds the notebook-level `tokenizer` and `model`.
tokenizer = AutoTokenizer.from_pretrained("dfurman/deberta-v2-xxl-imdb-v0.1")
model = AutoModelForSequenceClassification.from_pretrained(
    "dfurman/deberta-v2-xxl-imdb-v0.1"
).to("cuda")

In [None]:
# Two hand-written reviews used as a quick smoke test of the reloaded model.
texts = [
    "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.",
    "This wasn't too good. Not a fan of the story line, but some of the acting was good. Not my favorite.",
]

# Classify each review on its own and print the predicted label name.
for text in texts:
    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = logits.argmax().item()
    label_name = model.config.id2label[predicted_class_id]
    print(label_name)

In [None]:
# test set accuracy:
from tqdm.auto import tqdm

# Score the test split one example at a time and count exact label matches.
# Iterating the split directly reads each row once; class ids are compared as
# integers rather than through their label names.
test_split = imdb["test"]
correct = 0
for example in tqdm(test_split):
    # Put the inputs on whichever device the model is on.
    inputs = tokenizer(example["text"], return_tensors="pt", truncation=True).to(
        model.device
    )
    with torch.no_grad():
        logits = model(**inputs).logits
    # One example per forward pass, so the last-axis argmax is a single class id.
    predicted_class_id = logits.argmax(dim=-1).item()
    if predicted_class_id == example["label"]:
        correct += 1

In [None]:
# Test-set accuracy as a percentage.
100 * correct / len(imdb["test"])