In [20]:
# Import libraries used for dataset, model, tokenization, and probabilities
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F

In [None]:
# Load the "emotion" dataset from HuggingFace
ds = load_dataset("emotion")

# Print one sample to understand structure (text + label)
ds["train"][0]

{'text': 'i didnt feel humiliated', 'label': 0}

In [34]:
print(type(ds))
ds

<class 'datasets.dataset_dict.DatasetDict'>


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [36]:
# See label names (human readable)
label_names = ds["train"].features["label"].names
label_names

['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

In [23]:
# See the raw input text and its label
sample = ds["train"][100]
sample

{'text': 'i wont let me child cry it out because i feel that loving her and lily when she was little was going to be opportunities that only lasted for those short few months',
 'label': 2}

In [24]:
# Load BERT tokenizer (turns text → tokens → numbers)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [25]:
# Convert text into token IDs, pad/truncate to same length, return tensors
tok = tokenizer(
    sample["text"],
    padding="max_length",
    truncation=True,
    max_length=64,
    return_tensors="pt"
)

tok

{'input_ids': tensor([[ 101, 1045, 2180, 2102, 2292, 2033, 2775, 5390, 2009, 2041, 2138, 1045,
         2514, 2008, 8295, 2014, 1998, 7094, 2043, 2016, 2001, 2210, 2001, 2183,
         2000, 2022, 6695, 2008, 2069, 6354, 2005, 2216, 2460, 2261, 2706,  102,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [26]:
# Load BERT for emotion classification (6 labels)
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=6
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Send tokenized input into model (forward pass)
outputs = model(**tok)

# Raw model output (logits) before softmax
outputs.logits

tensor([[ 0.0931,  0.2348, -0.6436,  0.0639, -0.3557, -0.1919]],
       grad_fn=<AddmmBackward0>)

In [28]:
# Convert logits to probabilities
probs = F.softmax(outputs.logits, dim=-1)
probs

tensor([[0.2003, 0.2308, 0.0959, 0.1945, 0.1279, 0.1506]],
       grad_fn=<SoftmaxBackward0>)

In [29]:
# Pick the index of max probability
pred_idx = probs.argmax(dim=-1).item()

# Convert to label name
pred_label = label_names[pred_idx]
pred_label

'joy'

In [30]:
"Truth: " + label_names[sample["label"]] + "   |   Pred: " + pred_label

'Truth: love   |   Pred: joy'

In [31]:
# Summary (just a text cell maybe)
'''
Flow observed:
1. raw text
2. tokenization to input_ids & mask
3. model forward
4. logits
5. softmax probabilities
6. predicted label

Next: fine-tuning to improve predictions
'''

'\nFlow observed:\n1. raw text\n2. tokenization to input_ids & mask\n3. model forward\n4. logits\n5. softmax probabilities\n6. predicted label\n\nNext: fine-tuning to improve predictions\n'