## SPAM MAIL CLASSIFICATION USING BERT

In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel,BertTokenizerFast
# Select the compute device: use the GPU when CUDA is available, otherwise
# fall back to the CPU so every later `.to(device)` call still works.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Read the labelled message corpus from disk and preview its first rows
csv_path = '/content/spamdata_v2.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Relative frequency of each label value (fractions sum to 1)
label_column = df['label']
label_column.value_counts(normalize=True)

0    0.865937
1    0.134063
Name: label, dtype: float64

In [6]:
# Hold out 30% of the corpus, then cut that hold-out in half:
# 70% train / 15% validation / 15% test.
# Both splits are stratified on the label so each subset keeps the class ratio.
split_seed = 2018

train_text, temp_text, train_labels, temp_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.3,
    stratify=df['label'],
    random_state=split_seed,
)

val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=split_seed,
)

In [7]:
# The encoder and its tokenizer are both taken from the same pretrained checkpoint
checkpoint_name = 'bert-base-uncased'

# pretrained BERT-base encoder
bert = AutoModel.from_pretrained(checkpoint_name)

# matching fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained(checkpoint_name)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
# Every message is padded or truncated to exactly this many tokens
max_seq_len = 25


def _encode(texts):
    """Tokenize a collection of messages into fixed-length model inputs.

    Args:
        texts: Object exposing ``.tolist()`` that yields a list of strings
            (here, the pandas Series produced by the train/val/test split).

    Returns:
        The tokenizer's encoding, providing ``'input_ids'`` and
        ``'attention_mask'`` entries, each padded/truncated to ``max_seq_len``.
    """
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`
    return tokenizer(
        texts.tolist(),
        max_length=max_seq_len,
        padding='max_length',
        truncation=True,
    )


# tokenize and encode sequences in the training, validation and test sets
tokens_train = _encode(train_text)
tokens_val = _encode(val_text)
tokens_test = _encode(test_text)



In [9]:
def _as_tensors(encoded, labels):
    """Return (input_ids, attention_mask, labels) as torch tensors.

    Args:
        encoded: Tokenizer output providing 'input_ids' and 'attention_mask'.
        labels: Label container exposing ``.tolist()``.
    """
    return (
        torch.tensor(encoded['input_ids']),
        torch.tensor(encoded['attention_mask']),
        torch.tensor(labels.tolist()),
    )


# One (token ids, attention mask, labels) triple per data split
train_seq, train_mask, train_y = _as_tensors(tokens_train, train_labels)
val_seq, val_mask, val_y = _as_tensors(tokens_val, val_labels)
test_seq, test_mask, test_y = _as_tensors(tokens_test, test_labels)

In [10]:

# Data loaders for the training and validation splits
from torch.utils.data import TensorDataset,DataLoader,RandomSampler,SequentialSampler

# number of examples per mini-batch
batch_size = 32

# Training set: batches are drawn in random order
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation set: batches follow the dataset order
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    batch_size=batch_size,
    sampler=val_sampler,
)

In [11]:
# Model architecture: freeze every pretrained BERT parameter so that
# backpropagation computes no gradients for the encoder weights
for frozen_param in bert.parameters():
    frozen_param.requires_grad_(False)

In [12]:
class BERT_Arch(nn.Module):
    """Classification head stacked on top of a BERT-style encoder.

    The encoder's pooled output goes through
    Linear -> ReLU -> Dropout -> Linear -> LogSoftmax, so ``forward`` returns
    log-probabilities (suitable for ``nn.NLLLoss``).

    Args:
        bert: Encoder module. Calling
            ``bert(sent_id, attention_mask=mask, return_dict=False)`` must
            return a tuple whose second element is the pooled representation
            of shape (batch, encoder_dim).
        hidden_size: Width of the intermediate dense layer (default 512).
        num_classes: Number of output classes (default 2).
        dropout: Dropout probability applied after the ReLU (default 0.1).
    """

    def __init__(self, bert, hidden_size=512, num_classes=2, dropout=0.1):
        super().__init__()
        self.bert = bert

        # Take the encoder width from its config when it exposes one; otherwise
        # keep the previously hard-coded value of 768.
        encoder_dim = getattr(getattr(bert, 'config', None), 'hidden_size', 768)

        # Layers are created in the same order and under the same attribute
        # names as before, so existing checkpoints still load.
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(encoder_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities for a batch.

        Args:
            sent_id: Token-id tensor passed to the encoder as its first argument.
            mask: Attention-mask tensor passed as ``attention_mask``.

        Returns:
            Tensor of shape (batch, num_classes) holding log-probabilities.
        """
        # With return_dict=False the encoder returns a tuple; only the second
        # element (pooled output) is used here.
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return self.softmax(x)

In [13]:
# Wrap the encoder in the classifier and place the whole network on the compute device
model = BERT_Arch(bert).to(device)

In [14]:
# `transformers.AdamW` is deprecated and no longer shipped by recent
# transformers releases; `torch.optim.AdamW` is the maintained replacement.
from torch.optim import AdamW
from sklearn.utils.class_weight import compute_class_weight

# eps=1e-6 and weight_decay=0.0 reproduce the defaults of the transformers
# optimizer this replaces, so training behaves as before.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# 'balanced' weighting gives the rarer class a proportionally larger weight,
# compensating for the label imbalance in the training split.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)

print('Class Weights',class_weights)



Class Weights [0.57743559 3.72848948]


In [15]:
# Per-class weights as a float tensor living on the compute device
weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Class-weighted negative log-likelihood loss (the model emits log-probabilities)
cross_entropy = nn.NLLLoss(weight=weights)

# NOTE: this value is re-assigned to 20 in the training-loop cell before it is used
epochs = 10

In [16]:
def train():
    """Run one training epoch over ``train_dataloader``.

    Uses the module-level ``model``, ``device``, ``train_dataloader``,
    ``cross_entropy`` and ``optimizer``.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the mean of
        the per-batch losses and ``total_preds`` is a NumPy array with the
        model outputs of every batch concatenated along axis 0.
    """
    model.train()
    model.to(device)
    total_loss = 0
    total_preds = []

    for step, batch in enumerate(train_dataloader):
        # progress update every 50 batches
        if step % 50 == 0 and not step == 0:
            print('Batch {:>5,} of {:>5,}.'.format(step,len(train_dataloader)))

        # move the batch to the compute device
        sent_id, mask, labels = [r.to(device) for r in batch]

        # clear gradients left over from the previous step
        model.zero_grad()

        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        total_loss = total_loss + loss.item()

        loss.backward()

        # clip the gradient norm to 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # Collect the predictions of EVERY batch (previously only the final
        # batch was kept because the append sat outside the loop).
        total_preds.append(preds.detach().cpu().numpy())

    avg_loss = total_loss / len(train_dataloader)

    # shape: (number of samples, number of classes)
    total_preds = np.concatenate(total_preds, axis=0)
    return avg_loss, total_preds

In [17]:
def evaluate():
    """Evaluate the model on ``val_dataloader`` without updating weights.

    Uses the module-level ``model``, ``device``, ``val_dataloader`` and
    ``cross_entropy``.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the mean of
        the per-batch validation losses and ``total_preds`` is a NumPy array
        with the model outputs of every batch concatenated along axis 0.
    """
    print("\nEvaluating...")

    # deactivate dropout layers
    model.eval()

    total_loss = 0

    # empty list to save the model predictions
    total_preds = []

    # iterate over batches
    for step, batch in enumerate(val_dataloader):

        # Progress update every 50 batches. The former elapsed-time
        # computation referenced undefined names (format_time, time, t0) and
        # its result was never used, so it has been removed.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        # push the batch to the compute device
        sent_id, mask, labels = [t.to(device) for t in batch]

        # deactivate autograd
        with torch.no_grad():

            # model predictions
            preds = model(sent_id, mask)

            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()

            total_preds.append(preds.detach().cpu().numpy())

    # compute the validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader)

    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [18]:
# Lowest validation loss seen so far; +inf guarantees the first epoch is checkpointed
best_valid_loss = float('inf')

epochs = 20

# per-epoch loss history
train_losses, valid_losses = [], []

for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # one pass over the training data, then one over the validation data;
    # only the average losses are needed here
    train_loss, _ = train()
    valid_loss, _ = evaluate()

    # keep the weights of the best-performing epoch on disk
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(),'saved_weights.pt')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 20
Batch    50 of   122.
Batch   100 of   122.

Evaluating...

Training Loss: 0.674
Validation Loss: 0.648

 Epoch 2 / 20
Batch    50 of   122.
Batch   100 of   122.

Evaluating...

Training Loss: 0.642
Validation Loss: 0.619

 Epoch 3 / 20
Batch    50 of   122.
Batch   100 of   122.

Evaluating...

Training Loss: 0.616
Validation Loss: 0.597

 Epoch 4 / 20
Batch    50 of   122.
Batch   100 of   122.

Evaluating...

Training Loss: 0.591
Validation Loss: 0.570

 Epoch 5 / 20
Batch    50 of   122.
Batch   100 of   122.

Evaluating...

Training Loss: 0.558
Validation Loss: 0.537

 Epoch 6 / 20
Batch    50 of   122.
Batch   100 of   122.

Evaluating...

Training Loss: 0.541
Validation Loss: 0.517

 Epoch 7 / 20
Batch    50 of   122.
Batch   100 of   122.

Evaluating...

Training Loss: 0.520
Validation Loss: 0.495

 Epoch 8 / 20
Batch    50 of   122.
Batch   100 of   122.

Evaluating...

Training Loss: 0.508
Validation Loss: 0.475

 Epoch 9 / 20
Batch    50 of   122.
Batch   100

## Load Weights

In [21]:
# Load the weights of the best model. The checkpoint is read from the same
# relative path the training loop writes it to, and `map_location` places the
# tensors on whichever device is in use (so a GPU-saved checkpoint also loads
# on a CPU-only machine).
path = 'saved_weights.pt'
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

## Make Predictions

In [22]:
# get predictions for test data
# Switch to eval mode explicitly so dropout is disabled regardless of which
# cell ran last.
model.eval()

test_log_probs = []
with torch.no_grad():
    # Feed the test set in mini-batches instead of one giant forward pass,
    # which keeps memory use bounded.
    for seq_chunk, mask_chunk in zip(test_seq.split(batch_size), test_mask.split(batch_size)):
        chunk_out = model(seq_chunk.to(device), mask_chunk.to(device))
        test_log_probs.append(chunk_out.detach().cpu().numpy())

# model's performance
# argmax over the class axis turns log-probabilities into predicted labels
preds = np.argmax(np.concatenate(test_log_probs, axis=0), axis=1)
print(classification_report(test_y, preds))

              precision    recall  f1-score   support

           0       0.98      0.90      0.94       724
           1       0.56      0.87      0.68       112

    accuracy                           0.89       836
   macro avg       0.77      0.88      0.81       836
weighted avg       0.92      0.89      0.90       836

