In [2]:
import pandas as pd

In [3]:
data=pd.read_csv("/content/spam_ham_dataset.csv")
data=pd.DataFrame(data.head(400))

In [4]:
from sklearn.model_selection import train_test_split
train_data, test_data=train_test_split(data, test_size=0.2, random_state=42)

In [5]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
18,1629,ham,Subject: meter variances - ua 4 clean - up\r\n...,0
202,2035,ham,"Subject: enron / hpl actuals for november 13 ,...",0
250,3828,spam,Subject: ! ebay reguiar verification\r\n% word...,1
274,3728,spam,Subject: get a bu _ lky p ^ 0 le * dcrgvabyssy...,1


In [6]:
#load the tokenizer and checkpoint
from transformers import BertTokenizer
checkpoint="bert-base-uncased"
tokenizer=BertTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
#tokenize the training and test data
train_encoding=tokenizer(list(train_data['text']), truncation=True, padding=True,max_length=512)
test_encoding=tokenizer(list(test_data['text']),truncation=True, padding=True, max_length=512)

In [13]:
#make the data as suitable to transformers library
import torch
from torch.utils.data import Dataset

class EmailDataset(Dataset):
  def __init__(self,encodings,labels):
    self.encodings=encodings
    self.labels=labels

  def __getitem__(self,idx):
    item={key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels']=torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

In [14]:
train_dataset=EmailDataset(train_encoding,train_data['label_num'].tolist())
test_dataset=EmailDataset(test_encoding, test_data['label_num'].tolist())

In [24]:
inp0=test_dataset[0]['input_ids']
tokenizer.decode(inp0)

'[CLS] subject : hpl curtailment plan enron north america corp. from : tom shelton 09 / 15 / 2000 11 : 44 am to : barbara n gray / hou / ect @ ect, steve hpl schneider / hou / ect @ ect, thomas a martin / hou / ect @ ect, edward d gottlob / hou / ect @ ect, james mckay / hou / ect @ ect, jim schwieger / hou / ect @ ect, greg brazaitis / hou / ect @ ect, lee l papayoti / hou / ect @ ect, jill t zivley / hou / ect @ ect, michael eiben / hou / ect @ ect, jim pond / corp / enron @ enron, jem @ ctw. com cc : brian redmond / hou / ect @ ect subject : hpl curtailment plan we are going to be reviewing the hpl curtailment plan, with the ultimate issue being how does hpl best structure its curtailment program to comply with regulatory requirements and to best suit hpl, s business purposes. attached is a memo meant to provide a brief summary about what a curtailment plan is and some of its ramifications. we, ll be setting up a meeting soon to discuss the issues surrounding the curtailment regulat

In [9]:
#convert the data using DataLoader
from torch.utils.data import DataLoader
train_dataloader=DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader=DataLoader(test_dataset, batch_size=8,shuffle=True)


In [10]:
#load the pretrained model
from transformers import BertForSequenceClassification
model=BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
device='cuda' if torch.cuda.is_available() else 'cpu'
device
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [12]:
#check the results without training the model
model.eval()
all_predictions_data = []
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    input_ids = batch['input_ids'].cpu().numpy()
    labels = batch['labels'].cpu().numpy()

    # Store all predictions with decoded texts
    for ids, pred, label in zip(input_ids, predictions.cpu().numpy(), labels):
        decoded_text = tokenizer.decode(ids, skip_special_tokens=True)
        all_predictions_data.append({
            'decoded_text': decoded_text,
            'predicted_label': pred,
            'true_label': label
        })

# Create a DataFrame from the collected data
predictions_df = pd.DataFrame(all_predictions_data)


In [13]:
pd.crosstab(predictions_df['predicted_label'],predictions_df['true_label'])

true_label,0,1
predicted_label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,58,14
1,2,6


In [14]:
#load the optimizer and scheduler
from torch.optim import AdamW
from transformers import get_scheduler

optimizer=AdamW(model.parameters(), lr=5e-5)
num_epochs=3
num_training_steps=num_epochs*len(train_dataloader)

scheduler=get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [15]:
#train the model
model.train()
for epoch in range(num_epochs):
  for batch in train_dataloader:
    batch={k: v.to(device) for k, v in batch.items()}
    outputs=model(**batch)
    loss=outputs.loss
    loss.backward()

    optimizer.step()
    scheduler.step()

    optimizer.zero_grad()

    print(f"epoch:{epoch} , Loss: {loss.item()}")






epoch:0 , Loss: 0.6886758208274841
epoch:0 , Loss: 0.674239456653595
epoch:0 , Loss: 0.7028271555900574
epoch:0 , Loss: 0.39279448986053467
epoch:0 , Loss: 0.7332675457000732
epoch:0 , Loss: 0.6679748892784119
epoch:0 , Loss: 0.7570545673370361
epoch:0 , Loss: 0.6911116242408752
epoch:0 , Loss: 0.6723517775535583
epoch:0 , Loss: 0.6477510929107666
epoch:0 , Loss: 0.5583497285842896
epoch:0 , Loss: 0.48992425203323364
epoch:0 , Loss: 0.668667733669281
epoch:0 , Loss: 0.6390959620475769
epoch:0 , Loss: 0.4224821627140045
epoch:0 , Loss: 0.4153954088687897
epoch:0 , Loss: 0.4197361469268799
epoch:0 , Loss: 0.39218857884407043
epoch:0 , Loss: 0.8371015191078186
epoch:0 , Loss: 0.2881072163581848
epoch:0 , Loss: 0.23938815295696259
epoch:0 , Loss: 0.3564598858356476
epoch:0 , Loss: 0.3493785560131073
epoch:0 , Loss: 0.3821188509464264
epoch:0 , Loss: 0.4927365779876709
epoch:0 , Loss: 0.30337387323379517
epoch:0 , Loss: 0.1784733384847641
epoch:0 , Loss: 0.2684180438518524
epoch:0 , Loss: 0

In [16]:
model.eval()
total_eval_acc=0
total_eval_loss=0


for batch in test_dataloader:
  batch={k:v.to(device) for k,v in batch.items()}

  with torch.no_grad():
    outputs=model(**batch)

  loss=outputs.loss
  total_eval_loss +=loss.item()

  logits=outputs.logits
  predictions=torch.argmax(logits,dim=-1)
  total_eval_acc +=(predictions == batch['labels']).float().mean()

avg_test_loss = total_eval_loss / len(test_dataloader)
avg_test_accuracy = total_eval_acc / len(test_dataloader)
print(f"Test Loss: {avg_test_loss}, Test Accuracy: {avg_test_accuracy}")

Test Loss: 0.08601684407331049, Test Accuracy: 0.949999988079071


In [17]:

# Move model to evaluation mode
model.eval()

# List to store all prediction entries
all_predictions_data = []

# Evaluate the model
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    input_ids = batch['input_ids'].cpu().numpy()
    labels = batch['labels'].cpu().numpy()

    # Store all predictions with decoded texts
    for ids, pred, label in zip(input_ids, predictions.cpu().numpy(), labels):
        decoded_text = tokenizer.decode(ids, skip_special_tokens=True)
        all_predictions_data.append({
            'decoded_text': decoded_text,
            'predicted_label': pred,
            'true_label': label
        })

# Create a DataFrame from the collected data
predictions_df = pd.DataFrame(all_predictions_data)




In [18]:
predictions_df.head()

Unnamed: 0,decoded_text,predicted_label,true_label
0,subject : pipeline nominations away from the o...,0,0
1,"subject : feelings of guilt, worthlessness, he...",1,1
2,subject : 23 rd noms - - - - - - - - - - - - -...,0,0
3,subject : natural gas nomination for 11 / 00 e...,0,0
4,"subject : re : aekdju, the cat raised our us l...",1,1


In [19]:
pd.crosstab(predictions_df['predicted_label'],predictions_df['true_label'])

true_label,0,1
predicted_label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,58,2
1,2,18


In [20]:
predictions_df.shape

(80, 3)