# Text Classifier

Chapter 5 of Programming PyTorch for Deep Learning, but using samples from the [TREC 2005 Spam Corpus](https://trec.nist.gov/data/spam.html) instead of tweets.

In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data 
import torchtext

In [2]:
def my_tokenize(s):
    return s.split()

In [3]:
TEXT = data.Field(lower=True, tokenize=my_tokenize)
LABEL = data.Field(lower=True)
samples = data.TabularDataset(path='./data/ham-spam-samples.tsv',
                              format='tsv', 
                              fields=[("label",LABEL), ("statement",TEXT)],
                              skip_header=False)

In [4]:
(training, testing, validating) = samples.split(split_ratio=[0.6,0.2,0.2])
(len(training),len(testing),len(validating))

(240, 80, 80)

In [5]:
vocab_size = 80
TEXT.build_vocab(training, max_size = vocab_size)
LABEL.build_vocab(training)
TEXT.vocab.freqs.most_common(10)

[('2001', 500),
 ('jul', 464),
 ('by', 326),
 ('with', 297),
 ('for', 271),
 ('from:', 270),
 ('to:', 260),
 ('id', 249),
 ('5', 200),
 ('from', 197)]

In [6]:
len(TEXT.vocab)

82

In [7]:
vars(training.examples[1])

{'label': ['spam'],
 'statement': ['y',
  'mailman.enron.com',
  '(8.10.1/8.10.1/corp-1.06)',
  'with',
  'esmtp',
  'id',
  'g343bbl51389',
  'for',
  '<matt.motley@enron.com>;',
  'wed,',
  '4',
  'jul',
  '2001',
  '14:51:50',
  '-0500',
  '(cdt)',
  'date:',
  'wed,',
  '4',
  'jul',
  '2001',
  '19:48:22',
  '+0000',
  'from:',
  '=?windows-1251?b?0oxq6+ds7e7lio/w5etr7ubl7ejl?=',
  '<dongming@aguascalientes.com',
  '>',
  'x-mailer:',
  'the',
  'bat!',
  '(v2.01)',
  'reply-to:',
  '=?windows-1251?b?0oxq6+ds7e7lio/w5etr7ubl7ejl?=',
  '<dongming@aguascalientes.com',
  '>',
  'x-priority:',
  '3',
  '(normal)',
  'message-id:',
  '<183920372.20041230020824@>',
  'to:',
  'matt.motley@enron.com',
  'subject:',
  '=?windows-1251?b?0oxq6+ds4dogwmvkzspoy9wgyidnzsloznmgw87e0yehisddy8jszdvfimlizcag?=',
  '=?windows-1251?b?ycdkzs3c38riiseh?=',
  'mime-version:',
  '1.0',
  'content-type:',
  'text/plain;',
  'charset=windows-1251',
  'content-transfer-encoding:',
  '8bit',
  '��������',
 

In [8]:
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")

In [9]:
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
(training, validating, testing), 
batch_size = 32,
device = device,
sort_key = lambda x: len(x.statement),
sort_within_batch = False)

# Defining the model

Start with a simple [Long short-term memory (LSTM)](https://en.wikipedia.org/wiki/Long_short-term_memory) model

In [10]:
class BasicLSTM(nn.Module):
    def __init__(self, hidden_size, embedding_dim, vocab_size):
        super(BasicLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1)
        self.predictor = nn.Linear(hidden_size, 1)

    def forward(self, seq):
        output, (hidden, _) = self.encoder(self.embedding(seq))
        preds = self.predictor(hidden.squeeze(0))
        return preds.double()

In [11]:
def train(epochs, model, optimizer, criterion, train_iterator, valid_iterator):
    m = nn.Sigmoid()
    for epoch in range(1, epochs + 1):
        training_loss = 0.0
        valid_loss = 0.0
        model.train()
        for batch_idx, batch in enumerate(train_iterator):
            optimizer.zero_grad()
            predict = model(batch.statement)
            loss = criterion(m(predict), batch.label.double())
            loss.backward()
            optimizer.step()
            training_loss += loss.data.item() * batch.statement.size(0)
        training_loss /= len(train_iterator)
 
        model.eval()
        for batch_idx,batch in enumerate(valid_iterator):
            predict = model(batch.statement)
            loss = criterion(m(predict), batch.label.double())
            valid_loss += loss.data.item() * batch.statement.size(0)
 
        valid_loss /= len(valid_iterator)
        print('Epoch: {}, Training Loss: {:.2f}, Validation Loss: {:.2f}'.format(epoch, training_loss, valid_loss))

In [12]:
model = BasicLSTM(100, 300, 82)
model.to(device)

BasicLSTM(
  (embedding): Embedding(82, 300)
  (encoder): LSTM(300, 100)
  (predictor): Linear(in_features=100, out_features=1, bias=True)
)

In [13]:
optimizer = optim.Adam(model.parameters(), lr=2e-2)
criterion = nn.BCELoss()

In [14]:
train(5, model, optimizer, criterion, train_iterator, valid_iterator)

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


Epoch: 1, Training Loss: -848.09, Validation Loss: -1355.50
Epoch: 2, Training Loss: -2253.47, Validation Loss: -2485.72
Epoch: 3, Training Loss: -3649.51, Validation Loss: -3622.20
Epoch: 4, Training Loss: -4975.94, Validation Loss: -4550.91
Epoch: 5, Training Loss: -5842.60, Validation Loss: -4923.16
