In [20]:
from datasets import load_dataset

In [21]:
# Fetch the Enron spam dataset from the Hugging Face Hub and display the first
# training record to see which fields each example carries.
dataset_id = "SetFit/enron_spam"
dataset = load_dataset(dataset_id)
dataset["train"][0]

Repo card metadata block was not found. Setting CardData to empty.


{'message_id': 33214,
 'text': 'any software just for 15 $ - 99 $ understanding oem software\nlead me not into temptation ; i can find the way myself .\n# 3533 . the law disregards trifles .',
 'label': 1,
 'label_text': 'spam',
 'subject': 'any software just for 15 $ - 99 $',
 'message': 'understanding oem software\nlead me not into temptation ; i can find the way myself .\n# 3533 . the law disregards trifles .',
 'date': datetime.datetime(2005, 6, 18, 0, 0)}

In [22]:
# Hold out 20% of the loaded "train" split for evaluation.
# A fixed seed makes the shuffle deterministic, so the train/test membership
# (and therefore the reported metrics) is reproducible across kernel restarts.
# NOTE(review): this rebinds `dataset`; re-running this cell on its own would
# split the already-reduced train split again, and any other split present in
# the loaded DatasetDict is no longer reachable afterwards. Re-run from the
# loading cell instead.
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['message_id', 'text', 'label', 'label_text', 'subject', 'message', 'date'],
        num_rows: 25372
    })
    test: Dataset({
        features: ['message_id', 'text', 'label', 'label_text', 'subject', 'message', 'date'],
        num_rows: 6344
    })
})

In [23]:
# Short handles for the two splits used by the tokenization and training cells.
train_ds, eval_ds = dataset["train"], dataset["test"]

### Tokenizer, model and processing

In [24]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint identifier; the same name is reused when the model is loaded so
# tokenizer and model always come from the same checkpoint.
model_name = "distilbert-base-uncased"

# Load the tokenizer that belongs to this checkpoint (its vocabulary and
# tokenization rules), then display it.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [25]:
# For each batch of inputs:
# 1. Convert text -> input ids. Neural nets cannot consume raw text; they only
#    accept integer token ids, and the tokenizer must match the model checkpoint.
# 2. Add the attention mask, and pad/truncate every sequence to exactly
#    `max_length` tokens (256 by default).
def tokenize_fn(batch, max_length=256, tok=None):
    """Tokenize the ``"text"`` field of a batch into fixed-length model inputs.

    Args:
        batch: Mapping with a ``"text"`` entry holding the strings to tokenize
            (the shape `Dataset.map(..., batched=True)` passes in).
        max_length: Length every sequence is padded or truncated to.
            Defaults to 256, the value this notebook trains with.
        tok: Tokenizer callable to use. Defaults to the notebook-level
            ``tokenizer``; pass one explicitly to avoid relying on global state.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    # Resolve the global lazily so an explicit `tok` works without it existing.
    active_tokenizer = tokenizer if tok is None else tok
    return active_tokenizer(
        batch["text"],
        padding="max_length",  # fixed-size rows so examples stack without a padding collator
        truncation=True,
        max_length=max_length,
    )

In [29]:
# Run tokenize_fn over both splits in batches (train first, then eval).
# The mapped datasets keep their existing columns and gain the tokenizer's
# outputs as new ones:
#   input_ids
#   attention_mask
train_tok, eval_tok = (
    split.map(tokenize_fn, batched=True) for split in (train_ds, eval_ds)
)

Map:   0%|          | 0/25372 [00:00<?, ? examples/s]

In [30]:
# Keep only the columns the model consumes and drop everything else.
# The raw dataset also carries string/datetime fields (message_id, text,
# label_text, subject, message, date) that cannot be batched into tensors.
# Selecting by keep-list, rather than removing "text" by name, also makes this
# cell safe to re-run: a second pass simply finds nothing left to drop.
model_columns = ("input_ids", "attention_mask", "label")

train_tok = train_tok.remove_columns(
    [name for name in train_tok.column_names if name not in model_columns]
)
eval_tok = eval_tok.remove_columns(
    [name for name in eval_tok.column_names if name not in model_columns]
)

# Return torch tensors when the datasets are indexed.
train_tok.set_format("torch")
eval_tok.set_format("torch")


In [31]:
# Load the pretrained DistilBERT encoder and attach a sequence-classification
# head sized for our labels. List position doubles as the class id:
# 0 -> "ham", 1 -> "spam".
label_names = ["ham", "spam"]

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_names),
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# Print the installed transformers version so the run can be matched to a
# specific library release when reproducing these results.
import transformers
print(transformers.__version__)

4.57.3


In [34]:
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np

# Load the "accuracy" metric through the evaluate library; compute_metrics
# below calls accuracy.compute(...) on the predicted and reference labels.
accuracy = evaluate.load("accuracy")

"""
During evaluation, Trainer gives you:

logits: model outputs (shape [batch, num_labels])

labels: ground truth integers

What you do

Convert logits → class predictions

Compute accuracy

Why logits, not probabilities?

Softmax is monotonic

argmax(logits) == argmax(softmax(logits))

Skipping softmax is faster and numerically safer
"""
def compute_metrics(eval_pred):
    """Turn raw evaluation outputs into an accuracy score.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``accuracy.compute`` for the predicted vs. reference labels.
    """
    scores, gold = eval_pred
    # argmax over the last axis picks the highest-scoring class per example;
    # softmax is unnecessary because it does not change the argmax.
    predicted = np.argmax(scores, axis=-1)
    return accuracy.compute(references=gold, predictions=predicted)

# Fine-tuning hyperparameters, gathered in one place so they are easy to tweak.
training_config = {
    # Checkpoints are written under this directory.
    "output_dir": "./enron-spam-distilbert",
    # Evaluate and checkpoint once per epoch; the two strategies are kept in
    # step because load_best_model_at_end is enabled below.
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 32,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    # Reload the best checkpoint when training finishes. No
    # metric_for_best_model is set, so the library default decides what
    # "best" means (evaluation loss, per the TrainingArguments docs).
    "load_best_model_at_end": True,
}

training_args = TrainingArguments(**training_config)

# Wire the model, hyperparameters, tokenized splits and metric function together.
tokenized_splits = {"train_dataset": train_tok, "eval_dataset": eval_tok}

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    **tokenized_splits,
)

# Fine-tune, then export the model and its tokenizer into the same directory
# so the folder can be loaded as a self-contained checkpoint.
final_model_dir = "./enron-spam-model"

trainer.train()
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0363,0.030437,0.993064
2,0.0116,0.02862,0.994798
3,0.0006,0.029917,0.995429


('./enron-spam-model/tokenizer_config.json',
 './enron-spam-model/special_tokens_map.json',
 './enron-spam-model/vocab.txt',
 './enron-spam-model/added_tokens.json',
 './enron-spam-model/tokenizer.json')