In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('data/spam.csv',encoding='latin1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
data.shape

(5572, 5)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [5]:
data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1,inplace=True)
data.columns = ["label",'text']

In [6]:
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
data.label.value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [8]:
data['label'] = data['label'].replace({'ham':0,'spam':1})

In [9]:
data.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
def preprocessing_text(text):
    text = re.sub(r'<[^>]*>', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphanumeric
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return tokens
data['token'] = data['text'].apply(preprocessing_text)

In [11]:
data.head()

Unnamed: 0,label,text,token
0,0,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,0,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


In [12]:
# Build Vocabulary
all_words = [word for tokens in data['token'] for word in tokens]
word_counts = Counter(all_words)
vocab = {word: i+1 for i, word in enumerate(word_counts.keys())}  # Add 1 to reserve 0 for padding

In [13]:
# Convert tokens to sequences
def tokens_to_sequence(tokens, vocab, max_len=50):
    seq = [vocab[word] for word in tokens if word in vocab]
    return seq[:max_len] + [0] * (max_len - len(seq))  # Pad or truncate
data['sequence'] = data['token'].apply(lambda x: tokens_to_sequence(x, vocab))

In [14]:
data.head()

Unnamed: 0,label,text,token,sequence
0,0,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
1,0,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[17, 18, 19, 20, 21, 22, 0, 0, 0, 0, 0, 0, 0, ..."
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,...","[23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 3..."
3,0,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]","[21, 41, 42, 43, 44, 21, 45, 46, 42, 0, 0, 0, ..."
4,0,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t...","[47, 48, 49, 50, 51, 52, 53, 54, 0, 0, 0, 0, 0..."


In [15]:
# Create PyTorch Dataset
class SpamDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.float)

In [16]:
# Split Data
X_train, X_test, y_train, y_test = train_test_split(data['sequence'].tolist(), data['label'].tolist(), test_size=0.2, random_state=42)
train_ds = SpamDataset(X_train, y_train)
test_ds = SpamDataset(X_test, y_test)
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=32, shuffle=False)

In [35]:
class SpamCollection(nn.Module):
    def __init__(self,vocab_size,embed_size,hidden_size,output_size,model_type = "RNN"):
        super(SpamCollection,self).__init__()
        self.embedding = nn.Embedding(vocab_size,embed_size)
        self.rnn = None
        self.fc = nn.Linear(hidden_size,output_size)
        self.sigmoid = nn.Sigmoid()
        if model_type == "RNN":
            self.rnn = nn.RNN(embed_size,hidden_size,batch_first = True)
        elif model_type == "LSTM":
            self.rnn = nn.LSTM(embed_size,hidden_size,batch_first = True)

        else:
            seelf.rnn = nn.GRU(embed_size,hidden_size,batch_first = True)

    def forward(self,x):
        x = self.embedding(x)
        h0 = torch.zeros(1,x.size(0),hidden_size).to(x.device)
        c0 = torch.zeros(1,x.size(0),hidden_size).to(x.device)
        if model_type == "LSTM":
            out,_ = self.rnn(x,(h0,c0))
        else:
            out,_ = self.rnn(x,h0)
        out = self.fc(out[:,-1,:])
        return self.sigmoid(out)

In [37]:
vocab_size = len(vocab)+1
embed_size = 100
hidden_size = 128
output_size = 1
lr = 0.001
epochs = 10
model_type = "LSTM"

In [39]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [41]:
model = SpamCollection(vocab_size,embed_size,hidden_size,output_size,model_type).to(device)

In [43]:
criterion = F.binary_cross_entropy
optimizer = Adam(model.parameters(),lr=lr)

In [45]:
train_loss = 0

In [57]:
%%time
from tqdm import trange,tqdm
pbar = trange(0,epochs,leave=False,desc="Epoch")
for epoch in pbar:
    # pbar = tqdm(data_dl,desc="Training")
    model.train()    
    pbar.set_postfix_str('Loss: %.4f' % (train_loss/len(train_dl)))
    train_loss = 0
    for text,lbls in tqdm(train_dl, desc="Training"):
        text,lbls = text.to(device),lbls.to(device)
        output = model(text).squeeze()
        loss = criterion(output,lbls)
        train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Epoch:   0%|                                                                      | 0/10 [00:00<?, ?it/s, Loss: 0.0110]
Training:   0%|                                                                                | 0/140 [00:00<?, ?it/s][A
Training:   2%|█▌                                                                      | 3/140 [00:00<00:05, 25.56it/s][A
Training:   6%|████▋                                                                   | 9/140 [00:00<00:02, 43.78it/s][A
Training:  14%|██████████▏                                                            | 20/140 [00:00<00:01, 71.44it/s][A
Training:  23%|████████████████▏                                                      | 32/140 [00:00<00:01, 87.97it/s][A
Training:  31%|██████████████████████▎                                                | 44/140 [00:00<00:00, 97.81it/s][A
Training:  40%|████████████████████████████                                          | 56/140 [00:00<00:00, 103.72it/s][A
Training:  49%|████

CPU times: total: 2.94 s
Wall time: 12 s




In [59]:
correct = 0
total = 0
model.eval()
with torch.no_grad():
    for text,lb in test_dl:
        text,lb = text.to(device),lb.to(device)
        out = model(text)
        _,pred = torch.max(out.data,1)
        correct += (pred == lb).sum().item()
        total += lb.size(0)
accuracy = 100 * correct/total
print(f"{correct}/{total}, acc : {accuracy}")

965/1115, acc : 86.54708520179372


In [61]:
ind = 500
text,lb = test_ds[ind]
out = model(text.unsqueeze(0).to(device))
lb = "ham" if lb == 0 else "spam"
if out < 0.5:
    print(f"Original : {lb} , Predict : ham")
else:
    print(f"Original : {lb} , Predict : spam")

Original : spam , Predict : spam
