In [None]:
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)
import torch
import requests
from bs4 import BeautifulSoup
import re

from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
import evaluate
import glob


# 🤖 & 📥: Loading Tokenizer, downloading dataset

In [None]:
# Checkpoint name shared by the tokenizer below and anything else that loads MODEL.
MODEL = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# "simplified" configuration of GoEmotions; `labels` holds the class names
# declared on the train split's `labels` feature.
dataset = load_dataset("google-research-datasets/go_emotions", "simplified")
labels = dataset["train"].features["labels"].feature.names

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are truncated to the tokenizer's maximum length but are NOT
    padded here. Padding every example to the model maximum at this stage
    wastes memory and compute on padding tokens; padding is left to the
    ``DataCollatorWithPadding`` used at training time, which pads each batch
    only as far as needed.

    No ``return_tensors`` is requested: un-padded sequences have different
    lengths and cannot be stacked into one tensor, and ``Dataset.map`` stores
    plain lists anyway.

    Parameters
    ----------
    examples : mapping
        Batch passed in by ``Dataset.map``; must provide a ``"text"`` entry.

    Returns
    -------
    The tokenizer output for ``examples["text"]``.
    """
    return tokenizer(examples["text"], truncation=True)

# 📊 -> 🗃️: Dataset preparation
> Regroup the 28 GoEmotions classes (27 emotions + neutral) into the 6 basic human emotions plus neutral (7 classes in total)

In [None]:
def get_single_labeled(df):
    """Keep only the rows carrying exactly one label and unwrap that label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``labels`` column whose entries are sequences of label
        ids (e.g. ``[8]`` or ``[3, 17]``).

    Returns
    -------
    pandas.DataFrame
        A copy of the single-labelled rows, keeping their original index
        values, with ``labels`` converted from a one-element sequence to its
        only element (``[8]`` -> ``8``). The input frame is not modified.
    """
    # Select with a boolean mask rather than feeding index labels to a
    # positional indexer, so the result does not depend on the frame having a
    # default 0..n-1 index.
    is_single = df['labels'].apply(len) == 1

    # Work on a copy so the assignment below neither touches `df` nor raises
    # SettingWithCopyWarning.
    single_df = df.loc[is_single].copy()

    # Transform the single-labeled data labels from list (e.g. [8]) into int (e.g. 8)
    single_df['labels'] = single_df['labels'].apply(lambda x: x[0])

    return single_df

In [None]:
# Load every split into a pandas DataFrame, keeping only the single-labelled
# rows (get_single_labeled also turns each label from a list into an int).
train_df = get_single_labeled(pd.DataFrame(dataset['train']))
val_df = get_single_labeled(pd.DataFrame(dataset['validation']))
test_df = get_single_labeled(pd.DataFrame(dataset['test']))

original_labels = dataset['train'].features['labels'].feature.names
# 6 basic emotion types (sadness, happiness, fear, anger, surprise, disgust) plus neutral
new_labels = ['sadness', 'happiness', 'fear', 'anger', 'surprise', 'disgust', "neutral"]

# regroup the emotions into 6 basic emotion types
labels_reordering = {
    'sadness':   ['grief', 'disappointment', 'remorse', 'sadness'],
    'happiness': ['admiration', 'amusement', 'approval', 'caring', 'excitement', 'gratitude', 'joy', 'love', 'optimism', 'pride', 'relief', 'desire'],
    'fear':      ['fear', 'nervousness'],
    'anger':     ['anger', 'annoyance', 'disapproval'],
    'surprise':  ['surprise', 'realization', 'confusion', 'curiosity'],
    'disgust':   ['disgust', 'embarrassment'],
    "neutral":   ["neutral"]
}

# Direct lookup from an original class id to its regrouped class id. Mapping
# ids to ids in one step avoids writing string labels into the integer
# `labels` column (an incompatible-dtype assignment that newer pandas
# deprecates) and keeps the column integer-typed throughout.
old_id_to_new_id = {
    original_labels.index(old_name): new_labels.index(new_name)
    for new_name, old_names in labels_reordering.items()
    for old_name in old_names
}

# Every original class must be assigned to a group; otherwise the map below
# would silently produce missing values.
assert len(old_id_to_new_id) == len(original_labels), "labels_reordering does not cover every original label"

train_df['labels'] = train_df['labels'].map(old_id_to_new_id).astype('int64')
val_df['labels'] = val_df['labels'].map(old_id_to_new_id).astype('int64')
test_df['labels'] = test_df['labels'].map(old_id_to_new_id).astype('int64')

# Lookup tables between regrouped class ids and their names.
id2label = {idx: label for idx, label in enumerate(new_labels)}
label2id = {label: idx for idx, label in enumerate(new_labels)}

# ⚙️ & 📚: Training Configuration

In [None]:
# Training hyper-parameters and output location.
BATCH_SIZE = 32
NUM_PROCS = 8
LR = 5e-5
EPOCHS = 5
OUT_DIR = 'story_vibe_output'

# turn the preprocessed dataset back to Dataset format to use the tokenization function as is
# preserve_index=False: the frames were filtered, so their pandas index is no
# longer a plain range and would otherwise be stored as an extra column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Same batched, multi-process tokenization settings for every split.
map_kwargs = dict(batched=True, batch_size=BATCH_SIZE, num_proc=NUM_PROCS)

train_tokenized = train_dataset.map(tokenize_function, **map_kwargs)
val_tokenized = val_dataset.map(tokenize_function, **map_kwargs)
test_tokenized = test_dataset.map(tokenize_function, **map_kwargs)

"""
Real-world sentences vary in length, so shorter sequences are padded with a special padding token.
DataCollatorWithPadding ensures this happens automatically during training.
Because we pass it our tokenizer, it knows the appropriate padding token and maximum length to use.
"""
# Collator built from the same tokenizer that is used to tokenize the datasets.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Accuracy metric loaded through the `evaluate` library; compute_metrics calls its .compute().
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    ``eval_pred`` is unpacked into two items: an array of per-class scores and
    the reference labels. The predicted class of each row is the position of
    its largest score along axis 1.

    Returns whatever ``accuracy.compute`` returns for those predictions and
    references.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Label-related settings handed to the checkpoint loader: the number of
# classes plus both directions of the id <-> name mapping.
label_config = {
    "num_labels": len(id2label),
    "id2label": id2label,
    "label2id": label2id,
}

# Sequence-classification model initialised from the MODEL checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(MODEL, **label_config)


# Training part

In [None]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=OUT_DIR,
    # Optimisation settings.
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; the two strategies are kept
    # identical because load_best_model_at_end is enabled.
    # NOTE(review): recent transformers releases spell the first argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=3,
    report_to='tensorboard',
    # Enable fp16 mixed precision only when a CUDA device is present, so the
    # notebook also runs on CPU-only machines instead of failing here.
    fp16=torch.cuda.is_available(),
)

In [None]:

# Wire the model, its training configuration, the tokenized train/validation
# splits, and the batching/metric helpers into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
)

# Run training; the value returned by trainer.train() is kept in `history`.
history = trainer.train()