In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from Encoder import Encoder
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator



In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
data = pd.read_csv("data.csv")
data.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df_train, df_test = train_test_split(data, test_size=0.1)

In [5]:
len(df_train), len(df_test)

(5014, 558)

In [6]:
labels = df_train["Category"].unique()
labels

array(['ham', 'spam'], dtype=object)

In [7]:
l2i = {j:i for i,j in enumerate(labels)}
i2l = {i:j for i,j in enumerate(labels)}
l2i, i2l

({'ham': 0, 'spam': 1}, {0: 'ham', 1: 'spam'})

In [8]:
tokenizer = get_tokenizer('basic_english')

class TextIterator(Dataset):
    def __init__(self, data):
        super().__init__()
        self.data = data["Message"].values.tolist()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def yield_token(data_iter):
    for i in data_iter:
        yield tokenizer(i)

data_iter = TextIterator(data)
vocab = build_vocab_from_iterator(yield_token(data_iter), specials=["<pad>", "<unk>"])
vocab.set_default_index(vocab["<unk>"])
print(vocab.get_stoi())



In [9]:
class TextDataSet(nn.Module):
    def __init__(self, data):
        self.text = data["Message"].values.tolist()
        self.labels = data["Category"].values.tolist()

    def get_seq_tokens(self, idx):
        tokens = [vocab[i] for i in tokenizer(self.text[idx])]
        l = len(tokens)
        return tokens, l

    def get_labels(self, idx):
        return int(l2i[self.labels[idx]])

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        seq, seq_len = self.get_seq_tokens(idx)
        label = self.get_labels(idx)
        return seq, label, seq_len

def collat_fn(batch):
    seq, label, seq_len = zip(*batch)
    max_len = max(seq_len)
    for i in range(len(batch)):
        if len(seq[i]) != max_len:
            for j in range(len(seq[i]), max_len):
                seq[i].append(vocab["<pad>"])
    return torch.tensor(seq, dtype=torch.long), torch.tensor(label, dtype=torch.long) 
        

In [10]:
def train(model, dataset, epochs, lr, bs):
    criterion = nn.CrossEntropyLoss()
    optim = torch.optim.Adam(model.parameters(), lr=lr)
    train_dataset = TextDataSet(dataset)
    train_loader = DataLoader(train_dataset, batch_size=bs, collate_fn=collat_fn, shuffle=True)

    for epoch in range(epochs):
        total_loss_train = 0
        total_acc_train = 0 
        for seq, label in tqdm(train_loader):
            pred = model(seq.to(device))
            label = label.to(device)
            loss = criterion(pred, label)
            optim.zero_grad()
            
            correct = pred.argmax(axis=1) == label
            acc = correct.sum().item() / correct.size(0)
            total_acc_train += correct.float().sum().item()
            total_loss_train += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optim.step()

        print(f"epoch: {epoch+1} loss: {total_loss_train/ len(train_dataset)} acc: {total_acc_train/len(train_dataset)}")

In [11]:
from Classifier import Classifier
model = Classifier(len(vocab), 100, 4, 6, 2).to(device)

In [12]:
train(model, df_train, 10, 1e-2, 10)

  return self._call_impl(*args, **kwargs)
100%|████████████████████████████████████████████████████████████████████████████████| 502/502 [00:18<00:00, 26.43it/s]


epoch: 1 loss: 0.045054664204545736 acc: 0.8619864379736737


100%|████████████████████████████████████████████████████████████████████████████████| 502/502 [00:19<00:00, 25.36it/s]


epoch: 2 loss: 0.044995496525772075 acc: 0.8639808536098923


100%|████████████████████████████████████████████████████████████████████████████████| 502/502 [00:19<00:00, 25.44it/s]


epoch: 3 loss: 0.04496557774608431 acc: 0.8639808536098923


100%|████████████████████████████████████████████████████████████████████████████████| 502/502 [00:19<00:00, 25.62it/s]


epoch: 4 loss: 0.04502540973015501 acc: 0.8639808536098923


100%|████████████████████████████████████████████████████████████████████████████████| 502/502 [00:19<00:00, 25.26it/s]


epoch: 5 loss: 0.04502541016999782 acc: 0.8639808536098923


100%|████████████████████████████████████████████████████████████████████████████████| 502/502 [00:19<00:00, 25.31it/s]


epoch: 6 loss: 0.044995493916434315 acc: 0.8639808536098923


100%|████████████████████████████████████████████████████████████████████████████████| 502/502 [00:19<00:00, 25.36it/s]


epoch: 7 loss: 0.04496557757371348 acc: 0.8639808536098923


100%|████████████████████████████████████████████████████████████████████████████████| 502/502 [00:19<00:00, 25.16it/s]


epoch: 8 loss: 0.045025410146222536 acc: 0.8639808536098923


100%|████████████████████████████████████████████████████████████████████████████████| 502/502 [00:22<00:00, 22.75it/s]


epoch: 9 loss: 0.044995493684625264 acc: 0.8639808536098923


100%|████████████████████████████████████████████████████████████████████████████████| 502/502 [00:20<00:00, 23.91it/s]

epoch: 10 loss: 0.04499549377972641 acc: 0.8639808536098923





In [13]:
df_train["Category"].value_counts(), df_train["Category"].count()

(Category
 ham     4332
 spam     682
 Name: count, dtype: int64,
 5014)

In [14]:
for name, param in model.named_parameters():
    print(name, param.grad)

encoder.layers.0.mha.W_k.weight tensor([[-2.5436e-16,  4.3253e-16,  5.6743e-16,  ...,  9.7732e-17,
         -5.5557e-16, -8.6212e-17],
        [-2.1894e-16,  1.3813e-16, -1.4228e-16,  ..., -1.6660e-17,
         -1.3561e-16, -5.1851e-17],
        [ 1.0972e-16, -3.4952e-16, -7.0586e-16,  ..., -1.4725e-16,
          4.5821e-16,  8.0652e-17],
        ...,
        [-6.5284e-17, -2.3580e-16,  3.9823e-17,  ..., -1.5380e-16,
         -1.1793e-16, -1.5611e-16],
        [-2.2282e-16, -3.8805e-18, -1.9166e-16,  ..., -1.5270e-16,
          1.7564e-16, -1.0640e-16],
        [-1.6725e-17, -2.1043e-17, -3.3540e-18,  ...,  1.9259e-17,
         -1.4674e-17,  5.4799e-18]], device='cuda:0')
encoder.layers.0.mha.W_q.weight tensor([[ 9.9952e-17,  5.0565e-17,  7.5463e-17,  ..., -1.4139e-16,
         -1.5489e-16, -6.5641e-17],
        [-5.7799e-17, -1.1179e-16,  1.7077e-16,  ...,  6.2102e-17,
          5.3461e-17,  7.9233e-17],
        [ 6.0943e-17,  9.9492e-17,  3.9838e-16,  ...,  9.9235e-17,
         -1.31