# Load the Dataset

In [1]:
from datasets import load_dataset

In [2]:
# Load the 'emotion' dataset; later cells read its 'train' and 'validation'
# splits and the class names of its 'label' feature.
emotion = load_dataset('emotion')

# Load the DistilBERT Tokenizer

In [3]:
from transformers import AutoTokenizer

In [4]:
# Checkpoint name reused below for the tokenizer, the classification model,
# and the output directory of the fine-tuned model.
model_ckpt = "distilbert-base-uncased"
# Tokenizer that belongs to the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [5]:
# Sanity check: encode one sentence and map the ids back to token strings.
demo_ids = tokenizer("The DistilBERT Tokenizer is working!").input_ids
demo_tokens = tokenizer.convert_ids_to_tokens(demo_ids)
print(demo_tokens)

['[CLS]', 'the', 'di', '##sti', '##lbert', 'token', '##izer', 'is', 'working', '!', '[SEP]']


Now we can create a `tokenize` function that tokenizes the dataset in the format required by the DistilBERT model.

In [6]:
def tokenize(batch):
    """Tokenize the ``'text'`` entries of a batch with the notebook's tokenizer.

    Args:
        batch: Mapping whose ``'text'`` entry holds the texts to encode
            (e.g. a slice of a dataset split).

    Returns:
        The tokenizer output for the batch. Padding and truncation are both
        enabled, so the encoded sequences of one batch share a common length.
    """
    texts = batch['text']
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

In [7]:
# Try the function on the first two training examples.
first_two_rows = emotion['train'][:2]
tokenize(first_two_rows)

{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [8]:
# Apply `tokenize` to every split in batched mode. With batch_size=None each
# split is presumably passed as a single batch (confirm against the
# `datasets` map documentation), so padding is uniform within a split.
emotion_tokenized = emotion.map(
    tokenize,
    batched=True,
    batch_size=None,
)

This creates the additional columns `input_ids` and `attention_mask` in the dataset.

In [9]:
# Show the splits with their feature names and row counts after tokenization.
print(emotion_tokenized)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})


# Use DistilBERT with a Custom Classification Head

In [10]:
import torch
from transformers import AutoModelForSequenceClassification

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

In [11]:
# Class names of the dataset's `label` feature.
labels = emotion['train'].features['label'].names
num_labels = len(labels)

# Store the id <-> name mappings in the model config so predictions and saved
# checkpoints report emotion names instead of generic LABEL_<n> placeholders.
# Assumes the position of a name in `labels` equals its class id.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

# Load the pretrained checkpoint with a sequence-classification head sized for
# this task and move it to the selected device. The head weights are newly
# initialized, so the model must be fine-tuned before it is used.
model = (
    AutoModelForSequenceClassification
    .from_pretrained(
        model_ckpt,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )
    .to(device)
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Define Quality Metrics

In [12]:
from sklearn.metrics import accuracy_score, f1_score

In [13]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a set of predictions.

    Passed to ``Trainer`` as its ``compute_metrics`` callback.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores; the class with the highest
            score along the last axis is taken as the prediction).

    Returns:
        dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }

## Train the Model

In [17]:
from transformers import Trainer, TrainingArguments

batch_size = 64
# Log roughly once per pass over the training split. Clamp to at least 1 so
# that a train split smaller than `batch_size` (e.g. a quick subsample) does
# not produce logging_steps=0, which step-based logging does not accept.
logging_steps = max(1, len(emotion_tokenized['train']) // batch_size)
# Also used as the output directory for checkpoints.
model_name = f"{model_ckpt}-finetuned-emotion"
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Evaluate and save on the same schedule (once per epoch);
    # load_best_model_at_end then restores the best checkpoint after training.
    evaluation_strategy='epoch',
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    save_strategy='epoch',
    load_best_model_at_end=True,
    log_level='error',
)

In [18]:
# Wire the model, the training configuration, the tokenized splits and the
# metric callback together, then start fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotion_tokenized['train'],
    eval_dataset=emotion_tokenized['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 