In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm import tqdm

from transformers import GPT2ForSequenceClassification, GPT2Tokenizer, TrainingArguments, Trainer

In [3]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

In [5]:
import evaluate
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


## Load the data

In [6]:
# loading data
data = pd.read_csv('/content/drive/MyDrive/emails.csv')
print(data.shape)
data.head()

(5728, 2)


Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


### Class imbalance

In updated versions of this file, we will address class imbalance and how to tackle possible adverse effects.

In [12]:
data['spam'].value_counts()

Unnamed: 0_level_0,count
spam,Unnamed: 1_level_1
0,4360
1,1368


### Preprocessing

In contrast with the LSTM version, I chose to keep many stopwords in the text, since the attention mechanism will handle their relative importance. Sometimes stopwords such as "and" or "through" do add significant meaning to the text, and help us avoid misunderstandings. Also, I don't need to turn upper case letters to lower case, since the GPT-2 tokenizer will handle them accordingly.

In [None]:
###

## Tokenizer

We are using the GPT-2 tokenizer.

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

### Prepare the data

In [8]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize text
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)
        }

In [9]:
# Create dataset
dataset = CustomDataset(
    texts=data["text"].tolist(),
    labels=data["spam"].tolist(),
    tokenizer=tokenizer,
    max_length=tokenizer.model_max_length
)

# train-test split
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

# DataLoader for batching
# train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
# test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

## GPT-2 Model

GPT-2 stands for "Generative Pre-trained Transformer 2". Developed by OpenAI, it is a transformer-based model.

In [10]:
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
model.config.pad_token_id = tokenizer.eos_token_id

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Evaluation metrics

Accuracy, precision, recall, and F1-score are typical metrics used in classification tasks. Especially when it comes to imbalanced datasets, accuracy alone is not a good indicator of whether the model is trained well.

In [11]:
# Load metrics
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

# Compute multiple metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Calculate each metric
    acc = accuracy.compute(predictions=predictions, references=labels)
    prec = precision.compute(predictions=predictions, references=labels, average="weighted")
    rec = recall.compute(predictions=predictions, references=labels, average="weighted")
    f1_score = f1.compute(predictions=predictions, references=labels, average="weighted")

    # Return a dictionary of all metrics
    return {
        "accuracy": acc["accuracy"],
        "precision": prec["precision"],
        "recall": rec["recall"],
        "f1": f1_score["f1"]
    }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

### Training setup

I am using a very small batch size (1) due to compute limitations. By default the model uses the Adam optimizer with a linear schedule for the specified (by us) learning rate. Also, the loss function that is used is binary cross entropy, a golden standard for classification tasks.

In [12]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="test_trainer",
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=2,
    save_strategy="epoch",  # Save the model every epoch
    learning_rate=5e-5,
    logging_dir="./logs",
    logging_steps=50,  # Log every 50 steps
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Replace with your train dataset
    eval_dataset=test_dataset,  # Replace with your eval dataset
    tokenizer=tokenizer,  # Add tokenizer for data collators
    compute_metrics=compute_metrics  # Optional: Function to calculate accuracy, etc.
)

  trainer = Trainer(


In [13]:
if torch.cuda.is_available():
    print(f"GPU is available: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU found.")

GPU is available: Tesla T4


In [16]:
if "COLAB_TPU_ADDR" in os.environ:
    print("TPU is available!")
else:
    print("No TPU found.")

No TPU found.


In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

cuda


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

### Training the model

In [18]:
# Train the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0005,0.220946,0.972949,0.975442,0.972949,0.973406


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0005,0.220946,0.972949,0.975442,0.972949,0.973406
2,0.0,0.040507,0.995637,0.995639,0.995637,0.995629


TrainOutput(global_step=9164, training_loss=0.11184777860218224, metrics={'train_runtime': 2845.1138, 'train_samples_per_second': 3.221, 'train_steps_per_second': 3.221, 'total_flos': 4789046844850176.0, 'train_loss': 0.11184777860218224, 'epoch': 2.0})

In [19]:
trainer.evaluate()

{'eval_loss': 0.0405070036649704,
 'eval_accuracy': 0.9956369982547993,
 'eval_precision': 0.9956392064519243,
 'eval_recall': 0.9956369982547993,
 'eval_f1': 0.9956286387874682,
 'eval_runtime': 87.1166,
 'eval_samples_per_second': 13.155,
 'eval_steps_per_second': 13.155,
 'epoch': 2.0}

I will address the results in more detail in the future. However, so far they look much better than the LSTM model I designed a few months ago.