# Text Classifier

This notebook follows Chapter 5 of *Programming PyTorch for Deep Learning*, but classifies ham/spam e-mail samples from the [TREC 2005 Spam Corpus](https://trec.nist.gov/data/spam.html) instead of tweets.

In [10]:
import torch
from torchtext import data 
import torchtext

In [2]:
def my_tokenize(s):
    """Split ``s`` into a list of tokens on runs of whitespace.

    This is a plain ``s.split()``: leading and trailing whitespace is
    ignored, no empty tokens are produced, and punctuation stays attached
    to the neighbouring word (e.g. ``"from:"`` is a single token).
    """
    tokens = s.split()
    return tokens

In [3]:
# Field for the message body: lower-cased and tokenized with my_tokenize.
TEXT = data.Field(tokenize=my_tokenize, lower=True)
# Field for the label column, lower-cased.
# NOTE(review): this is a plain Field rather than a dedicated label field, so
# its vocabulary presumably also carries special tokens — confirm intended.
LABEL = data.Field(lower=True)
# Load the tab-separated samples; column 1 is the label, column 2 the text.
# The file has no header row, so the first line is read as data.
samples = data.TabularDataset(
    path="./data/ham-spam-samples.tsv",
    format="tsv",
    skip_header=False,
    fields=[("label", LABEL), ("statement", TEXT)],
)

In [5]:
# Split 60/20/20. Per the torchtext docs, split_ratio is read as
# [train, test, valid], but Dataset.split *returns* the pieces in
# (train, validation, test) order, so unpack in that order to keep each
# name bound to the split it describes.
(train, valid, test) = samples.split(split_ratio=[0.6, 0.2, 0.2])

In [6]:
# Show the number of examples in each split as (train, test, valid).
tuple(map(len, (train, test, valid)))

(240, 80, 80)

In [8]:
# Build both vocabularies from the training split only.
vocab_size = 80
LABEL.build_vocab(train)
# max_size limits how many tokens the statement vocabulary keeps.
TEXT.build_vocab(train, max_size=vocab_size)
# Cell output: the ten most frequent tokens recorded while building the vocab.
TEXT.vocab.freqs.most_common(10)

[('2001', 516),
 ('jul', 467),
 ('by', 343),
 ('with', 300),
 ('from:', 275),
 ('to:', 257),
 ('for', 254),
 ('id', 252),
 ('5', 201),
 ('received:', 199)]

In [11]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
# One batching iterator per split, returned in the same order as the
# datasets are passed in. Batches hold 32 examples and are created on
# `device`; the sort key is the token count of each example's statement.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test),
    batch_size=32,
    device=device,
    sort_key=lambda example: len(example.statement),
    sort_within_batch=False,
)