Install requirements

In [None]:
!pip install transformers

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

Download annotated SPAM corpora

In [None]:
!wget https://github.com/gbella/NLP/raw/main/SPAM/spam.zip
!unzip spam.zip

--2023-06-12 15:15:31--  https://github.com/gbella/NLP/raw/main/SPAM/spam.zip
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/gbella/NLP/main/SPAM/spam.zip [following]
--2023-06-12 15:15:31--  https://raw.githubusercontent.com/gbella/NLP/main/SPAM/spam.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3848060 (3.7M) [application/zip]
Saving to: ‘spam.zip’


2023-06-12 15:15:32 (108 MB/s) - ‘spam.zip’ saved [3848060/3848060]

Archive:  spam.zip
  inflating: email_spam.csv          
  inflating: sms_spam.csv            


Load the two corpora and fuse them into a single dataset

In [None]:
# Read the two annotated corpora; the SMS file is semicolon-delimited.
data = pd.read_csv('email_spam.csv')
data2 = pd.read_csv('sms_spam.csv', sep = ';')

# For each column, stack the e-mail rows on top of the SMS rows and keep only
# the underlying array: first the message texts, then their labels.
messages, labels = (
    pd.concat([data[column], data2[column]]).values
    for column in ('message', 'label')
)

Preprocess data

In [None]:
# Accumulators filled by the encoding loop below: one entry per message.
token_id, attention_masks = [], []

# Cased BERT vocabulary, so lower-casing is explicitly disabled.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-cased',
    do_lower_case = False,
)

def preprocessing(input_text, tokenizer, max_length = 32):
  '''
  Encodes a single text with `tokenizer.encode_plus` as a fixed-length sequence.

  Parameters:
    - input_text: the text to encode.
    - tokenizer: object exposing `encode_plus` (a BertTokenizer in this notebook).
    - max_length: target sequence length (default 32). Longer inputs are truncated
      and shorter ones are padded up to this length.

  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields
  (as PyTorch tensors, because return_tensors = 'pt'):
    - input_ids: token ids
    - token_type_ids: token type ids
    - attention_mask: 0/1 flags specifying which tokens should be considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = max_length,
                        # `pad_to_max_length` is deprecated; this is its explicit replacement.
                        padding = 'max_length',
                        # Truncate explicitly instead of relying on the implicit fallback
                        # that triggers the "Truncation was not explicitly activated" warning.
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )

# Encode every message, then collect the ids and masks from the encodings.
encoded_messages = [preprocessing(text, tokenizer) for text in messages]
token_id.extend(encoding['input_ids'] for encoding in encoded_messages)
attention_masks.extend(encoding['attention_mask'] for encoding in encoded_messages)

# Join the per-message tensors along dim 0 and turn the labels into a tensor.
token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Fraction of the dataset held out from training. The held-out part is then
# halved into a validation set and a test set (10% each with val_ratio = 0.2).
val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 16

# Split the dataset indices into a training set and a remaining (held-out) set
train_idx, rem_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True)

# Split the held-out indices themselves into validation and test sets.
# Splitting np.arange(len(rem_idx)) here would instead select positions
# 0..len(rem_idx)-1 of the full dataset, which overlap with train_idx and
# leak training samples into validation and test.
val_idx, test_idx = train_test_split(
    rem_idx,
    test_size = 0.5,
    shuffle = True)

# Sanity checks: the three index sets are pairwise disjoint and cover the dataset
train_ids, val_ids, test_ids = (set(idx.tolist()) for idx in (train_idx, val_idx, test_idx))
assert train_ids.isdisjoint(val_ids), 'training and validation sets overlap'
assert train_ids.isdisjoint(test_ids), 'training and test sets overlap'
assert val_ids.isdisjoint(test_ids), 'validation and test sets overlap'
assert len(train_ids) + len(val_ids) + len(test_ids) == len(labels)

train_set = TensorDataset(token_id[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx],
                        attention_masks[val_idx],
                        labels[val_idx])

test_set = TensorDataset(token_id[test_idx],
                         attention_masks[test_idx],
                         labels[test_idx])

# Prepare DataLoader: training batches are drawn in random order, while
# validation and test batches are read sequentially.
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

test_dataloader = DataLoader(
            test_set,
            sampler = SequentialSampler(test_set),
            batch_size = batch_size
        )

In [None]:
# Compute evaluation metrics: accuracy, precision, recall, F-measure

def b_tp(preds, labels):
  '''Returns True Positives (TP): number of samples predicted as class 1 whose actual class is 1'''
  return sum(pred == label and pred == 1 for pred, label in zip(preds, labels))

def b_fp(preds, labels):
  '''Returns False Positives (FP): number of samples predicted as class 1 whose actual class differs'''
  return sum(pred != label and pred == 1 for pred, label in zip(preds, labels))

def b_tn(preds, labels):
  '''Returns True Negatives (TN): number of samples predicted as class 0 whose actual class is 0'''
  return sum(pred == label and pred == 0 for pred, label in zip(preds, labels))

def b_fn(preds, labels):
  '''Returns False Negatives (FN): number of samples predicted as class 0 whose actual class differs'''
  return sum(pred != label and pred == 0 for pred, label in zip(preds, labels))

def b_metrics(preds, labels):
  '''
  Computes binary classification metrics from raw model scores.

  `preds` holds one row of class scores per sample (the predicted class is the
  argmax over axis 1); `labels` holds the actual classes.

  Returns the tuple (accuracy, precision, recall, f1), where:
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - f1          = 2 * precision * recall / (precision + recall)
  Precision, recall and f1 are the string 'nan' whenever their denominator is zero.
  '''
  predicted = np.argmax(preds, axis = 1).flatten()
  actual = labels.flatten()

  # Confusion-matrix counts
  tp, tn = b_tp(predicted, actual), b_tn(predicted, actual)
  fp, fn = b_fp(predicted, actual), b_fn(predicted, actual)

  b_accuracy = (tp + tn) / len(actual)

  # Precision and recall are only defined for a non-zero denominator
  predicted_positives = tp + fp
  actual_positives = tp + fn
  b_precision = tp / predicted_positives if predicted_positives > 0 else 'nan'
  b_recall = tp / actual_positives if actual_positives > 0 else 'nan'

  # F1 needs both precision and recall to be defined and not both zero
  if predicted_positives > 0 and actual_positives > 0 and (b_precision + b_recall) > 0:
    b_f1 = 2.0 * b_precision * b_recall / (b_precision + b_recall)
  else:
    b_f1 = 'nan'

  return b_accuracy, b_precision, b_recall, b_f1

In [None]:
# Load the BertForSequenceClassification model
# Load the pretrained cased BERT encoder with a 2-class classification head
model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

# Run on GPU when one is available, otherwise fall back to CPU
# (an unconditional model.cuda() raises on machines without CUDA).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to its device before building the optimizer, so that the
# optimizer is created over the parameters as they will be trained.
model.to(device)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 5e-5,
                              eps = 1e-08
                              )

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initi

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
# NOTE: 8 is above that recommended range.
epochs = 8

for _ in trange(epochs, desc = 'Epoch'):

    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Tracking variables: summed batch losses and number of batches
    tr_loss = 0
    nb_tr_steps = 0

    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass (passing labels makes the model return the loss)
        train_output = model(b_input_ids,
                             token_type_ids = None,
                             attention_mask = b_input_mask,
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Collect the predictions of every batch, so that the metrics are computed
    # once over the whole validation set. Averaging per-batch precision/recall/F1
    # (and skipping batches where they are undefined) does not give the
    # dataset-level values.
    val_logits = []
    val_label_ids = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass
            eval_output = model(b_input_ids,
                                token_type_ids = None,
                                attention_mask = b_input_mask)
        val_logits.append(eval_output.logits.detach().cpu())
        val_label_ids.append(b_labels.to('cpu'))

    # Calculate validation metrics; precision, recall and F-measure are the
    # string 'nan' when undefined
    val_accuracy, val_precision, val_recall, val_f1 = b_metrics(
        torch.cat(val_logits, dim = 0).numpy(),
        torch.cat(val_label_ids, dim = 0).numpy())

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy:  {:.4f}'.format(val_accuracy))
    print('\t - Validation Precision: {:.4f}'.format(val_precision) if val_precision != 'nan' else '\t - Validation Precision: NaN')
    print('\t - Validation Recall:    {:.4f}'.format(val_recall) if val_recall != 'nan' else '\t - Validation Recall: NaN')
    print('\t - Validation F-measure: {:.4f}\n'.format(val_f1) if val_f1 != 'nan' else '\t - Validation F-measure: NaN')

Epoch:  12%|█▎        | 1/8 [01:02<07:16, 62.42s/it]


	 - Train loss: 0.1470
	 - Validation Accuracy:  0.9839
	 - Validation Precision: 0.9737
	 - Validation Recall:    0.9368
	 - Validation F-measure: 0.9555



Epoch:  25%|██▌       | 2/8 [02:00<06:00, 60.02s/it]


	 - Train loss: 0.0462
	 - Validation Accuracy:  0.9899
	 - Validation Precision: 0.9842
	 - Validation Recall:    0.9609
	 - Validation F-measure: 0.9665



Epoch:  38%|███▊      | 3/8 [02:59<04:57, 59.60s/it]


	 - Train loss: 0.0259
	 - Validation Accuracy:  0.9879
	 - Validation Precision: 0.9698
	 - Validation Recall:    0.9552
	 - Validation F-measure: 0.9646



Epoch:  50%|█████     | 4/8 [03:58<03:56, 59.06s/it]


	 - Train loss: 0.0179
	 - Validation Accuracy:  0.9909
	 - Validation Precision: 0.9784
	 - Validation Recall:    0.9730
	 - Validation F-measure: 0.9707



Epoch:  62%|██████▎   | 5/8 [04:56<02:56, 58.91s/it]


	 - Train loss: 0.0235
	 - Validation Accuracy:  0.9879
	 - Validation Precision: 0.9590
	 - Validation Recall:    0.9644
	 - Validation F-measure: 0.9627



Epoch:  75%|███████▌  | 6/8 [05:55<01:57, 58.78s/it]


	 - Train loss: 0.0223
	 - Validation Accuracy:  0.9909
	 - Validation Precision: 0.9885
	 - Validation Recall:    0.9624
	 - Validation F-measure: 0.9700



Epoch:  88%|████████▊ | 7/8 [06:53<00:58, 58.62s/it]


	 - Train loss: 0.0266
	 - Validation Accuracy:  0.9919
	 - Validation Precision: 0.9831
	 - Validation Recall:    0.9586
	 - Validation F-measure: 0.9743



Epoch: 100%|██████████| 8/8 [07:51<00:00, 59.00s/it]


	 - Train loss: 0.0150
	 - Validation Accuracy:  0.9808
	 - Validation Precision: 0.9741
	 - Validation Recall:    0.9244
	 - Validation F-measure: 0.9385






In [None]:
# Tracking variables
# Make sure dropout is disabled even when this cell is run on its own
model.eval()

# Collect the predictions of every batch, so that the metrics are computed once
# over the whole test set. Averaging per-batch precision/recall/F1 (and skipping
# batches where they are undefined) does not give the dataset-level values.
test_logits = []
test_label_ids = []

for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        # Forward pass
        test_output = model(b_input_ids,
                            token_type_ids = None,
                            attention_mask = b_input_mask)
    test_logits.append(test_output.logits.detach().cpu())
    test_label_ids.append(b_labels.to('cpu'))

# Calculate test metrics; precision, recall and F-measure are the string 'nan'
# when undefined
test_accuracy, test_precision, test_recall, test_f1 = b_metrics(
    torch.cat(test_logits, dim = 0).numpy(),
    torch.cat(test_label_ids, dim = 0).numpy())

print('\t - Testing Accuracy:  {:.4f}'.format(test_accuracy))
print('\t - Testing Precision: {:.4f}'.format(test_precision) if test_precision != 'nan' else '\t - Testing Precision: NaN')
print('\t - Testing Recall:    {:.4f}'.format(test_recall) if test_recall != 'nan' else '\t - Testing Recall: NaN')
print('\t - Testing F-measure: {:.4f}\n'.format(test_f1) if test_f1 != 'nan' else '\t - Testing F-measure: NaN')



	 - Testing Accuracy:  0.9929
	 - Testing Precision: 0.9944
	 - Testing Recall:    0.9619
	 - Testing F-measure: 0.9723

