In [35]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np


In [36]:
from transformers import BertTokenizer,BertModel

In [37]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [38]:
# Load the raw SMS dataset; the preview shows a `Category` label column
# and a `Message` text column.
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [39]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
# The column is overwritten in place, so the mapping must be idempotent:
# a plain dict `.map` would turn the already-encoded 0/1 values into NaN if
# this cell were executed a second time. Unknown values pass through unchanged.
label_to_id={"ham":0,"spam":1}
df["Category"]=df["Category"].map(lambda label: label_to_id.get(label,label))
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [40]:
# Class balance of the encoded labels (0 = ham, 1 = spam).
df["Category"].value_counts()

Category
0    4825
1     747
Name: count, dtype: int64

In [41]:
# Keep every spam message (label 1); this is the minority class.
df_spam = df.loc[df["Category"].eq(1)]
df_spam.shape

(747, 2)

In [42]:
# Down-sample the ham class (label 0) to 1000 rows to reduce class imbalance.
# A fixed seed keeps the sampled subset, and therefore the train/test split and
# every metric derived from it, identical across kernel restarts.
df_ham=df[df["Category"]==0].sample(1000,random_state=42)
df_ham.shape

(1000, 2)

In [43]:
# Stack all spam rows on top of the sampled ham rows (original index labels are kept).
df_new = pd.concat([df_spam, df_ham], axis=0)
df_new.head()

Unnamed: 0,Category,Message
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
5,1,FreeMsg Hey there darling it's been 3 week's n...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...
11,1,"SIX chances to win CASH! From 100 to 20,000 po..."


In [44]:
# Size of the combined frame: all spam rows plus the sampled ham rows.
df_new.shape

(1747, 2)

In [45]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the balanced data for validation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    df_new["Message"],
    df_new["Category"],
    test_size=0.25,
    random_state=42,
)

In [46]:
# Label distribution in the training split (0 = ham, 1 = spam).
y_train.value_counts()

Category
0    761
1    549
Name: count, dtype: int64

In [47]:
# Label distribution in the held-out split (0 = ham, 1 = spam).
y_test.value_counts()

Category
0    239
1    198
Name: count, dtype: int64

In [48]:
tokeniser=BertTokenizer.from_pretrained("bert-base-uncased")

def tokenise_function(texts,labels,max_length=128):
    """Tokenise a batch of messages and bundle them with their labels.

    Args:
        texts: list of message strings to encode.
        labels: list of numeric class labels, one per message.
        max_length: number of tokens every sequence is padded or truncated to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        A tuple ``(input_ids, attention_mask, labels)``. The first two are the
        PyTorch tensors produced by the tokeniser; the labels are returned as a
        float tensor, which is what ``nn.BCELoss`` expects as its target.
    """
    encodings=tokeniser(texts,padding="max_length",max_length=max_length,truncation=True,return_tensors="pt")
    return encodings["input_ids"],encodings["attention_mask"],torch.tensor(labels,dtype=torch.float)

# Quick sanity check on two toy messages.
tokenise_function(["Hurry","Meet you"],[1,0])

(tensor([[ 101, 9241,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0],
         [ 101, 3113, 2017,  102,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,

In [49]:
# Encode the training messages and their labels into tensors.
train_ids, train_attention_mask, train_labels = tokenise_function(
    X_train.tolist(),
    y_train.tolist(),
)

In [50]:
# Encode the held-out messages and their labels into tensors.
val_ids, val_attention_mask, val_labels = tokenise_function(
    X_test.tolist(),
    y_test.tolist(),
)

In [51]:
# Peek at the token ids of the first two encoded training messages.
train_ids[:2]

tensor([[  101,  3374,  2026, 18328,  2015,  2165,  5091,  1010,  2009,  7929,
          2065,  1045,  2272,  2011,  2085,  1029,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [52]:
# Bundle (input ids, attention mask, label) triples so they are indexed together.
train_datasets = torch.utils.data.TensorDataset(
    train_ids, train_attention_mask, train_labels
)
val_datasets = torch.utils.data.TensorDataset(
    val_ids, val_attention_mask, val_labels
)

In [53]:
# Mini-batches of 64: training batches are reshuffled each epoch,
# validation batches keep their original order.
train_loader = DataLoader(
    train_datasets,
    batch_size=64,
    shuffle=True,
)
val_loader = DataLoader(
    val_datasets,
    batch_size=64,
    shuffle=False,
)

In [54]:
# Load the pretrained encoder once to inspect the width of its hidden states,
# which is the input size required by a classification head on top of it.
# NOTE(review): the classifier defined below loads its own encoder instance,
# so this standalone copy appears to be used for inspection only.
bert = BertModel.from_pretrained("bert-base-uncased")
bert.config.hidden_size

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.bias                       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


768

In [55]:
class spam_classifier(nn.Module):
    """Binary spam classifier: a frozen BERT encoder plus a small trainable MLP head.

    The encoder is used purely as a fixed feature extractor. Only the head in
    ``self.classifier`` has trainable parameters.

    Args:
        model_name: pretrained checkpoint passed to ``BertModel.from_pretrained``.
        hidden_dim: width of the hidden layer in the classification head.
        dropout: dropout probability applied inside the classification head.
    """
    def __init__(self,model_name="bert-base-uncased",hidden_dim=256,dropout=0.3):
        super().__init__()
        self.bert=BertModel.from_pretrained(model_name)

        # Freeze the encoder: none of its weights receive gradients.
        for param in self.bert.parameters():
            param.requires_grad=False
        self.bert.eval()

        # Read the embedding width from the checkpoint instead of hard-coding 768,
        # so a different `model_name` still gets a correctly sized head.
        embedding_dim=self.bert.config.hidden_size

        self.classifier=nn.Sequential(
            nn.Linear(embedding_dim,hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim,1),
            nn.Sigmoid()
        )

    def train(self,mode=True):
        """Switch the head between train/eval mode while keeping BERT in eval mode.

        ``nn.Module.train`` recurses into every sub-module, which would turn on
        the frozen encoder's dropout and make its features noisy during
        training. The encoder is therefore forced back to eval mode.
        NOTE: if the encoder is ever unfrozen for fine-tuning, remove this override.
        """
        super().train(mode)
        self.bert.eval()
        return self

    def forward(self,input_ids,attention_mask):
        """Return the spam probability for each sequence.

        Args:
            input_ids: token-id tensor for the batch.
            attention_mask: matching mask tensor forwarded to the encoder.

        Returns:
            Tensor with one sigmoid output per sequence (last dimension of size 1).
        """
        bert_output=self.bert(input_ids=input_ids,attention_mask=attention_mask)
        # The hidden state at position 0 (the first token) is the sentence embedding.
        sentence_embeddings=bert_output.last_hidden_state[:,0,:]
        return self.classifier(sentence_embeddings)

In [56]:
# Instantiate the classifier on the selected device.
model = spam_classifier().to(device)

# Binary cross-entropy on probabilities: the model already ends in a Sigmoid.
criterion = nn.BCELoss().to(device)

# Adam over the model's parameters; the frozen encoder weights never receive
# gradients, so in practice only the classification head is updated.
optimiser = optim.Adam(model.parameters(), lr=0.001)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.bias                       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [57]:
epochs=3

# Train for a fixed number of epochs and report the mean batch loss per epoch.
for epoch in range(epochs):
    model.train()
    running_loss=0.0  # sum of the per-batch losses for this epoch

    for input_ids,attention_masks,labels in train_loader:
        input_ids=input_ids.to(device)
        attention_masks=attention_masks.to(device)
        labels=labels.to(device)

        optimiser.zero_grad()

        # Drop only the trailing size-1 output axis. A bare `.squeeze()` would
        # also remove the batch axis when a batch holds a single sample, and
        # the loss would then fail on mismatched input/target shapes.
        outputs=model(input_ids,attention_masks).squeeze(-1)
        loss=criterion(outputs,labels)

        loss.backward()
        optimiser.step()

        running_loss +=loss.item()
    print(running_loss/(len(train_loader)))

0.2963479934703736
0.10849650017917156
0.08360247012405168


In [62]:
# Evaluate on the held-out set: mean batch loss and overall accuracy.
model.eval()
val_loss=0.0
# Start the counter as a tensor so `.double()` below is valid even if the
# loader yields no batches (an int 0 has no `.double()` method).
correct_pred=torch.zeros((),dtype=torch.long,device=device)
with torch.no_grad():
    for input_ids,attention_masks,labels in val_loader:
        input_ids=input_ids.to(device)
        attention_masks=attention_masks.to(device)
        labels=labels.to(device)

        # Drop only the trailing size-1 output axis so a single-sample batch
        # keeps its batch dimension and still matches the shape of `labels`.
        outputs=model(input_ids,attention_masks).squeeze(-1)
        loss=criterion(outputs,labels)
        val_loss +=loss.item()
        # Threshold the predicted probability at 0.5 to get a hard 0/1 label.
        pred=(outputs>0.5).float()
        correct_pred +=torch.sum(pred==labels)

print("Average validation loss",val_loss/len(val_loader))
val_accuracy=correct_pred.double()/len(val_datasets)
print("Accuracy",val_accuracy)




Average validation loss 0.08425269116248403
Accuracy tensor(0.9748, device='cuda:0', dtype=torch.float64)
