In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from Encoder import Encoder
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator



In [2]:
hparams = {
    "embedding_dim": 512,
    "n_heads": 8,
    "n_encoders": 2,
    "dff": 50,
    "n_class": 2,
    "lr": 1e-4,
    "epoch": 10,
    "batch_size": 4
}

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
data = pd.read_csv("data.csv")
data = data.sort_values(by='Category')
ham = data[:700]
spam = data[-641:]
data.value_counts().get("spam")
spam.head()
data = pd.concat([spam,ham])
data.tail()

Unnamed: 0,Category,Message
2797,ham,Tell your friends what you plan to do on Valen...
2788,ham,"Forgot it takes me 3 years to shower, sorry. W..."
2786,ham,Yeah get the unlimited
5570,ham,The guy did some bitching but I acted like i'd...
2784,ham,"Just arrived, see you in a couple days &lt;3"


In [5]:
df_train, df_test = train_test_split(data, test_size=0.1)

In [6]:
len(df_train), len(df_test)

(1206, 135)

In [7]:
labels = df_train["Category"].unique()
labels

array(['spam', 'ham'], dtype=object)

In [8]:
l2i = {j:i for i,j in enumerate(labels)}
i2l = {i:j for i,j in enumerate(labels)}
l2i, i2l

({'spam': 0, 'ham': 1}, {0: 'spam', 1: 'ham'})

In [9]:
tokenizer = get_tokenizer('basic_english')

class TextIterator(Dataset):
    def __init__(self, data):
        super().__init__()
        self.data = data["Message"].values.tolist()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def yield_token(data_iter):
    for i in data_iter:
        yield tokenizer(i)

data_iter = TextIterator(data)
vocab = build_vocab_from_iterator(yield_token(data_iter), specials=["<pad>", "<unk>"])
vocab.set_default_index(vocab["<unk>"])
print(vocab.get_stoi())

{'independently': 3303, 'division': 1599, '0721072': 2088, 'smsservices': 4049, 'if': 62, '1/2price': 2258, '2nights': 2342, 'homeowners': 1158, 'ur': 24, '08714712412': 2149, '078498****7': 2096, "'": 10, 'latest': 158, 'mtalk': 1782, '83110': 2465, 'points': 265, '<pad>': 0, 'smoke': 1255, '.': 2, 'sheets': 1891, '<unk>': 1, 'pushes': 3844, '18': 210, 'no>': 3633, 'yourself': 1002, '08719181259': 2175, 'lambu': 3401, 'moji': 3557, 'oz': 1810, 'efreefone': 3020, 'that': 41, '!': 4, 'rip': 3930, 'seems': 1886, 'her': 157, 'outdoors': 3699, 'terms': 529, 'group': 3208, 'nat': 1789, 'a': 7, 'mobiles': 516, 'sf': 4003, 'foley': 1634, 'age': 494, 'you': 6, '09066364311': 2230, 'eek': 1611, 'land': 280, ')': 35, 'reflection': 3888, 'every': 118, 'ac/u/natalie2k9': 2525, '10': 336, 'credit': 638, 'vat': 4333, 'to': 3, 'bsn': 2744, '08708034412': 1344, 'or': 19, 'message': 123, '08715203652': 2152, 'all': 71, '1000': 616, 'ias': 1692, 'i': 8, 'lets': 1180, 'ig11': 3294, 'nighters': 3621, '804

In [10]:
class TextDataSet(nn.Module):
    def __init__(self, data):
        self.text = data["Message"].values.tolist()
        self.labels = data["Category"].values.tolist()

    def get_seq_tokens(self, idx):
        tokens = [vocab[i] for i in tokenizer(self.text[idx])]
        l = len(tokens)
        return tokens, l

    def get_labels(self, idx):
        return int(l2i[self.labels[idx]])

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        seq, seq_len = self.get_seq_tokens(idx)
        label = self.get_labels(idx)
        return seq, label, seq_len

def collat_fn(batch):
    seq, label, seq_len = zip(*batch)
    max_len = max(seq_len)
    for i in range(len(batch)):
        if len(seq[i]) != max_len:
            for j in range(len(seq[i]), max_len):
                seq[i].append(vocab["<pad>"])
    return torch.tensor(seq, dtype=torch.long), torch.tensor(label, dtype=torch.long) 
        

In [11]:
def train(model, dataset, epochs, lr, bs):
    criterion = nn.CrossEntropyLoss()
    optim = torch.optim.Adam(model.parameters(), lr=lr)
    train_dataset = TextDataSet(dataset)
    train_loader = DataLoader(train_dataset, batch_size=bs, collate_fn=collat_fn, shuffle=True)

    for epoch in range(epochs):
        total_loss_train = 0
        total_acc_train = 0 
        for seq, label in tqdm(train_loader):
            pred = model(seq.to(device))
            label = label.to(device)
            loss = criterion(pred, label)
            optim.zero_grad()
            
            correct = pred.argmax(axis=1) == label
            acc = correct.sum().item() / correct.size(0)
            total_acc_train += correct.float().sum().item()
            total_loss_train += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optim.step()

        print(f"epoch: {epoch+1} loss: {total_loss_train/ len(train_dataset)} acc: {total_acc_train/len(train_dataset)}")

In [12]:
from Classifier import Classifier
model = Classifier(len(vocab), hparams["embedding_dim"], hparams["n_heads"], hparams["n_encoders"], hparams["dff"], hparams["batch_size"]).to(device)

In [13]:
train(model, df_train, hparams["epoch"], hparams["lr"], hparams["batch_size"])

100%|████████████████████████████████████████████████████████████████████████████████| 302/302 [00:04<00:00, 69.56it/s]


epoch: 1 loss: 0.231069816878779 acc: 0.8092868988391376


100%|████████████████████████████████████████████████████████████████████████████████| 302/302 [00:04<00:00, 71.34it/s]


epoch: 2 loss: 0.20192402022988049 acc: 0.9361525704809287


100%|████████████████████████████████████████████████████████████████████████████████| 302/302 [00:04<00:00, 71.77it/s]


epoch: 3 loss: 0.19517867714413759 acc: 0.9643449419568823


100%|████████████████████████████████████████████████████████████████████████████████| 302/302 [00:04<00:00, 71.28it/s]


epoch: 4 loss: 0.19223913397164288 acc: 0.9751243781094527


100%|████████████████████████████████████████████████████████████████████████████████| 302/302 [00:04<00:00, 72.55it/s]


epoch: 5 loss: 0.1903100008296334 acc: 0.9850746268656716


100%|████████████████████████████████████████████████████████████████████████████████| 302/302 [00:04<00:00, 72.54it/s]


epoch: 6 loss: 0.18908430554380465 acc: 0.988391376451078


100%|████████████████████████████████████████████████████████████████████████████████| 302/302 [00:04<00:00, 70.41it/s]


epoch: 7 loss: 0.18762117377165735 acc: 0.9941956882255389


100%|████████████████████████████████████████████████████████████████████████████████| 302/302 [00:04<00:00, 73.07it/s]


epoch: 8 loss: 0.18775641799566165 acc: 0.9950248756218906


100%|████████████████████████████████████████████████████████████████████████████████| 302/302 [00:04<00:00, 72.27it/s]


epoch: 9 loss: 0.18723937364953075 acc: 0.9958540630182421


100%|████████████████████████████████████████████████████████████████████████████████| 302/302 [00:05<00:00, 55.24it/s]

epoch: 10 loss: 0.18669389724533753 acc: 0.9983416252072969





In [14]:
df_train["Category"].value_counts(), df_train["Category"].count()

(Category
 ham     634
 spam    572
 Name: count, dtype: int64,
 1206)

In [15]:
def predict(text):
    tokens = tokenizer(text)
    tokens = [vocab[w] for w in tokens]
    tokens = torch.tensor([tokens])
    pred = model(tokens.to(device))
    return pred.argmax(dim=1)

In [16]:
def test(model, dataset, bs=4):
    criterion = nn.CrossEntropyLoss()
    train_dataset = TextDataSet(dataset)
    train_loader = DataLoader(train_dataset, batch_size=bs, collate_fn=collat_fn, shuffle=True)

    total_loss_train = 0
    total_acc_train = 0 
    model.eval()
    for seq, label in tqdm(train_loader):
        pred = model(seq.to(device))
        label = label.to(device)
        loss = criterion(pred, label)
        
        correct = pred.argmax(axis=1) == label
        acc = correct.sum().item() / correct.size(0)
        total_acc_train += correct.float().sum().item()
        total_loss_train += loss.item()

    print(f"val_loss: {total_loss_train/ len(train_dataset)} acc: {total_acc_train/len(train_dataset)}")

In [17]:
test(model, df_test)

100%|█████████████████████████████████████████████████████████████████████████████████| 34/34 [00:00<00:00, 116.64it/s]

val_loss: 0.20010613379655062 acc: 0.9481481481481482



