In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import numpy as np
import torch
import torchtext  # 0.6.0
from torchtext.data import Field, BucketIterator, TabularDataset
import torch.nn as nn
import torch.optim as optim

import nltk

nltk.download("punkt")
from nltk import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\natha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
data = pd.read_csv("datasets/ham-spam/spam.csv", encoding="latin-1")
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
data = data.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)

In [4]:
data = data.rename(index=str, columns={"v1": "labels", "v2": "text"})
data.head()

Unnamed: 0,labels,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [6]:
train.reset_index(drop=True), test.reset_index(drop=True)

(     labels                                               text
 0       ham  No I'm in the same boat. Still here at my moms...
 1      spam  (Bank of Granite issues Strong-Buy) EXPLOSIVE ...
 2       ham     They r giving a second chance to rahul dengra.
 3       ham     O i played smash bros  &lt;#&gt;  religiously.
 4      spam  PRIVATE! Your 2003 Account Statement for 07973...
 ...     ...                                                ...
 4452    ham  I came hostel. I m going to sleep. Plz call me...
 4453    ham                             Sorry, I'll call later
 4454    ham      Prabha..i'm soryda..realy..frm heart i'm sory
 4455    ham                         Nt joking seriously i told
 4456    ham                In work now. Going have in few min.
 
 [4457 rows x 2 columns],
      labels                                               text
 0       ham  Funny fact Nobody teaches volcanoes 2 erupt, t...
 1       ham  I sent my scores to sophas and i had to do sec...
 2      spam

In [7]:
train.shape, test.shape

((4457, 2), (1115, 2))

In [8]:
train.to_csv("datasets/ham-spam/train.csv", index=False)
test.to_csv("datasets/ham-spam/test.csv", index=False)

In [9]:
TEXT = torchtext.data.Field(tokenize=word_tokenize)

In [10]:
LABEL = torchtext.data.LabelField(dtype=torch.float)


In [11]:
datafields = [("labels", LABEL), ("text", TEXT)]

In [12]:
trn, tst = torchtext.data.TabularDataset.splits(
    path="./datasets/ham-spam",
    train="train.csv",
    test="test.csv",
    format="csv",
    skip_header=True,
    fields=datafields,
)

In [13]:
trn[:5]

[<torchtext.data.example.Example at 0x1d6a454c9d0>,
 <torchtext.data.example.Example at 0x1d6a454c9a0>,
 <torchtext.data.example.Example at 0x1d6a454cd90>,
 <torchtext.data.example.Example at 0x1d6a454c4c0>,
 <torchtext.data.example.Example at 0x1d6a454e6e0>]

In [14]:
print(f"Number of training examples: {len(trn)}")
print(f"Number of testing examples: {len(tst)}")


Number of training examples: 4457
Number of testing examples: 1115


In [15]:
trn[5].__dict__.keys()

dict_keys(['labels', 'text'])

In [16]:
trn[5].text

['G', 'says', 'you', 'never', 'answer', 'your', 'texts', ',', 'confirm/deny']

In [17]:
trn[5].labels

'ham'

In [18]:
print(vars(trn.examples[5]))

{'labels': 'ham', 'text': ['G', 'says', 'you', 'never', 'answer', 'your', 'texts', ',', 'confirm/deny']}


In [19]:
TEXT.build_vocab(trn, max_size=10500)


In [20]:
LABEL.build_vocab(trn)

In [21]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 10213
Unique tokens in LABEL vocabulary: 2


In [22]:
print(TEXT.vocab.freqs.most_common(50))

[('.', 3847), ('to', 1750), ('I', 1574), (',', 1468), ('you', 1462), ('?', 1256), ('!', 1134), ('a', 1068), ('the', 946), ('...', 923), ('&', 772), ('i', 760), ('and', 673), ('in', 663), ('is', 647), (';', 641), ('u', 636), ('me', 600), (':', 570), ('..', 544), ('for', 527), ('my', 494), ('of', 471), ('it', 470), ('your', 461), ('have', 395), ('on', 394), (')', 393), ('2', 390), ('that', 385), ("'s", 384), ("'m", 320), ('now', 318), ('are', 316), ('do', 312), ('call', 307), ('at', 301), ('U', 300), ('or', 298), ('not', 295), ("n't", 281), ('be', 275), ('*', 270), ('lt', 267), ('gt', 267), ('with', 267), ('get', 265), ('will', 264), ('so', 257), ('#', 245)]


In [23]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', '.', 'to', 'I', ',', 'you', '?', '!', 'a']


In [24]:
print(LABEL.vocab.stoi)

defaultdict(None, {'ham': 0, 'spam': 1})


In [25]:
batch_size = 64

train_iterator, test_iterator = torchtext.data.BucketIterator.splits(
    (trn, tst),
    batch_size=batch_size,
    sort_key=lambda x: len(x.text),
    sort_within_batch=False,
)

In [26]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, text):
        embedded = self.embedding(text)
        embedded_dropout = self.dropout(embedded)
        # output, hidden = self.rnn(embedded)
        output, (hidden, _) = self.rnn(embedded_dropout)
        hidden_1D = hidden.squeeze(0)
        assert torch.equal(output[-1, :, :], hidden_1D)
        return self.fc(hidden_1D)

In [27]:
input_dim = len(TEXT.vocab)
embedding_dim = 100
hidden_dim = 256
output_dim = 1

In [28]:
model = RNN(input_dim, embedding_dim, hidden_dim, output_dim)

In [29]:
optimizer = optim.Adam(model.parameters(), lr=1e-6)

In [30]:
criterion = nn.BCEWithLogitsLoss()

In [31]:
def train(model, iterator, optimzer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.labels)
        rounded_pred = torch.round(torch.sigmoid(predictions))
        correct = (rounded_pred == batch.labels).float()
        acc = correct.sum() / len(correct)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [32]:
num_epochs = 5
for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    print(
        f"| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%"
    )

| Epoch: 01 |Train Loss: 0.668 | Train Acc: 84.00%
| Epoch: 02 |Train Loss: 0.657 | Train Acc: 85.60%
| Epoch: 03 |Train Loss: 0.646 | Train Acc: 85.72%
| Epoch: 04 |Train Loss: 0.636 | Train Acc: 85.76%
| Epoch: 05 |Train Loss: 0.626 | Train Acc: 85.75%


In [33]:
epoch_loss = 0
epoch_acc = 0


In [34]:
model.eval()


RNN(
  (embedding): Embedding(10213, 100)
  (rnn): LSTM(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [35]:
with torch.no_grad():
    for batch in test_iterator:
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.labels)
        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == batch.labels).float()
        acc = correct.sum() / len(correct)
        epoch_loss += loss.item()
        epoch_acc += acc.item()

test_loss = epoch_loss / len(test_iterator)
test_acc = epoch_acc / len(test_iterator)

print(f"| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |")

| Test Loss: 0.664 | Test Acc: 76.48% |
