# Text Classifier

Chapter 5 of Programming PyTorch for Deep Learning, but using samples from the [TREC 2005 Spam Corpus](https://trec.nist.gov/data/spam.html) instead of tweets.

In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data 
import torchtext

In [2]:
def my_tokenize(s):
    return s.split()

In [3]:
TEXT = data.Field(lower=True, tokenize=my_tokenize)
LABEL = data.Field(lower=True)
samples = data.TabularDataset(path='./data/ham-spam-samples.tsv',
                              format='tsv', 
                              fields=[("label",LABEL), ("statement",TEXT)],
                              skip_header=False)

In [4]:
(training, testing, validating) = samples.split(split_ratio=[0.6,0.2,0.2])
(len(training),len(testing),len(validating))

(181, 60, 60)

In [5]:
vocab_size = 30000
TEXT.build_vocab(training, max_size = vocab_size)
LABEL.build_vocab(training)
TEXT.vocab.freqs.most_common(10)

[('the', 598),
 ('to', 413),
 ('and', 299),
 ('of', 278),
 ('a', 223),
 ('for', 213),
 ('in', 200),
 ('you', 175),
 ('is', 174),
 ('your', 170)]

In [6]:
len(TEXT.vocab)

4619

In [11]:
vars(training.examples[30])

{'label': ['ham'],
 'statement': ['rod-many',
  'thanks',
  'for',
  'helping',
  'me',
  'out',
  'with',
  'sabic.',
  'it',
  'was',
  'nice',
  'to',
  'host',
  'them',
  'on',
  'my',
  'turf',
  'rather',
  'than',
  'look',
  'across',
  'the',
  'table',
  'at',
  'a',
  'bunch',
  'of',
  'guys',
  'in',
  'dish',
  'dashes.',
  "i've",
  'gotten',
  'great',
  'feedback',
  'and',
  "we're",
  'a',
  'step',
  'closer',
  'in',
  'landing',
  'them',
  'as',
  'a',
  'partner.',
  'thanks',
  'again.',
  'terry']}

In [12]:
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")

In [13]:
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
(training, validating, testing), 
batch_size = 32,
device = device,
sort_key = lambda x: len(x.statement),
sort_within_batch = False)

# Defining the model

Start with a simple [Long short-term memory (LSTM)](https://en.wikipedia.org/wiki/Long_short-term_memory) model. 

Unlike the book, which relies on a three-part classifier, this model is doing a binary comparison, with an activation ([sigmoid](https://en.wikipedia.org/wiki/Sigmoid_function)) function.

In [14]:
class BasicLSTM(nn.Module):
    def __init__(self, hidden_size, embedding_dim, vocab_size):
        super(BasicLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1)
        self.predictor = nn.Linear(hidden_size, 1)
        self.activator = nn.Sigmoid()

    def forward(self, seq):
        output, (hidden, _) = self.encoder(self.embedding(seq))
        prediction = self.predictor(torch.squeeze(hidden))
        prediction = self.activator(prediction)
        return prediction

In [15]:
example = next(iter(train_iterator))

In [16]:
vars(example)

{'batch_size': 21,
 'dataset': <torchtext.data.dataset.Dataset at 0x7f43f68881d0>,
 'fields': dict_keys(['label', 'statement']),
 'input_fields': ['label', 'statement'],
 'target_fields': [],
 'label': tensor([[3, 2, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 3, 2, 3, 3, 2, 2]]),
 'statement': tensor([[ 188,  474,  559,  ...,  754,    2, 1506],
         [ 178,   14,  203,  ..., 4073,  245,   77],
         [ 183,  778,  241,  ..., 1879,  588,   14],
         ...,
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1]])}

In [17]:
model = BasicLSTM(100, 300, 30000)
model.to(device)

BasicLSTM(
  (embedding): Embedding(30000, 300)
  (encoder): LSTM(300, 100)
  (predictor): Linear(in_features=100, out_features=1, bias=True)
  (activator): Sigmoid()
)

In [43]:
def train(epochs, nn, optimizer, criterion, train_iterator, valid_iterator):
    for epoch in range(1, epochs + 1):

        training_loss = 0.0
        valid_loss = 0.0
        nn.train()
        for batch_idx, batch in enumerate(train_iterator):
            optimizer.zero_grad()
            predict = nn(batch.statement)
            loss = criterion(predict, batch.label.float())
            loss.backward()
            optimizer.step()
            training_loss += loss.data.item() * batch.statement.size(0)
        training_loss /= len(train_iterator)
 
        
        model.eval()
        for batch_idx,batch in enumerate(valid_iterator):
            predict = model(batch.statement)
            loss = criterion(predict, batch.label.float())
            valid_loss += loss.data.item() * batch.statement.size(0)
 
        valid_loss /= len(valid_iterator)
        print('Epoch: {}, Training Loss: {:.2f}, Validation Loss: {:.2f}'.format(epoch, training_loss, valid_loss))

In [46]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

In [47]:
train(20, model, optimizer, criterion, train_iterator, valid_iterator)

Epoch: 1, Training Loss: -6739.23, Validation Loss: -5026.87
Epoch: 2, Training Loss: -6885.04, Validation Loss: -5026.87
Epoch: 3, Training Loss: -6868.81, Validation Loss: -5026.87
Epoch: 4, Training Loss: -6762.66, Validation Loss: -5026.87
Epoch: 5, Training Loss: -6782.96, Validation Loss: -5026.87
Epoch: 6, Training Loss: -6854.80, Validation Loss: -5026.87
Epoch: 7, Training Loss: -6954.90, Validation Loss: -5026.87
Epoch: 8, Training Loss: -6868.28, Validation Loss: -5026.87
Epoch: 9, Training Loss: -6868.24, Validation Loss: -5026.87
Epoch: 10, Training Loss: -6855.04, Validation Loss: -5026.87
Epoch: 11, Training Loss: -6918.01, Validation Loss: -5026.87
Epoch: 12, Training Loss: -6864.77, Validation Loss: -5026.87
Epoch: 13, Training Loss: -6825.58, Validation Loss: -5026.87
Epoch: 14, Training Loss: -6945.06, Validation Loss: -5026.87
Epoch: 15, Training Loss: -6866.98, Validation Loss: -5026.87
Epoch: 16, Training Loss: -6841.51, Validation Loss: -5026.87
Epoch: 17, Traini

# Making Predictions

In [48]:
def classify_text(text):
    categories = {0: 'ham', 1: 'spam'}
    processed = TEXT.process([TEXT.preprocess(text)])
    processed = processed.to(device)
    return categories[model(processed).argmax().item()]

In [49]:
classify_text(testing.examples[0].statement)

'ham'

In [17]:
testing.examples[0].label[0]

'ham'

In [50]:
correct = 0
examined = 0
for test_example in testing.examples:
    actual = test_example.label[0]
    predicted = classify_text(test_example.statement)
    examined += 1
    if actual == predicted:
        correct += 1
        print('Correct   --> {}/{} right overall {:.2%}'.format(correct, examined, correct / examined))
    else:
        print('Incorrect --> {}/{} right overall {:.2%}'.format(correct, examined, correct / examined))

Incorrect --> 0/1 right overall 0.00%
Correct   --> 1/2 right overall 50.00%
Correct   --> 2/3 right overall 66.67%
Incorrect --> 2/4 right overall 50.00%
Correct   --> 3/5 right overall 60.00%
Correct   --> 4/6 right overall 66.67%
Correct   --> 5/7 right overall 71.43%
Correct   --> 6/8 right overall 75.00%
Incorrect --> 6/9 right overall 66.67%
Correct   --> 7/10 right overall 70.00%
Incorrect --> 7/11 right overall 63.64%
Incorrect --> 7/12 right overall 58.33%
Incorrect --> 7/13 right overall 53.85%
Correct   --> 8/14 right overall 57.14%
Incorrect --> 8/15 right overall 53.33%
Incorrect --> 8/16 right overall 50.00%
Correct   --> 9/17 right overall 52.94%
Correct   --> 10/18 right overall 55.56%
Correct   --> 11/19 right overall 57.89%
Correct   --> 12/20 right overall 60.00%
Correct   --> 13/21 right overall 61.90%
Incorrect --> 13/22 right overall 59.09%
Correct   --> 14/23 right overall 60.87%
Correct   --> 15/24 right overall 62.50%
Correct   --> 16/25 right overall 64.00%
In