In [None]:
pip install torchtext==0.4

Uploading the dataset:

In [2]:
import torchtext

ngrams = 1
train_csv_path = 'ag_news_csv/train.csv'
test_csv_path = 'ag_news_csv/test.csv'
vocab = torchtext.vocab.build_vocab_from_iterator(
    torchtext.datasets.text_classification._csv_iterator(train_csv_path, ngrams))
train_data, train_labels = torchtext.datasets.text_classification._create_data_from_iterator(
        vocab, torchtext.datasets.text_classification._csv_iterator(train_csv_path, ngrams, yield_cls=True), False)
test_data, test_labels = torchtext.datasets.text_classification._create_data_from_iterator(
        vocab, torchtext.datasets.text_classification._csv_iterator(test_csv_path, ngrams, yield_cls=True), False)
if len(train_labels ^ test_labels) > 0:
    raise ValueError("Training and test labels don't match")
agnews_train = torchtext.datasets.TextClassificationDataset(vocab, train_data, train_labels)
agnews_test = torchtext.datasets.TextClassificationDataset(vocab, test_data, test_labels)

120000lines [00:01, 63537.39lines/s]
120000lines [00:03, 35966.08lines/s]
7600lines [00:00, 38329.06lines/s]


In [3]:
print(agnews_train[0])

(2, tensor([  432,   426,     2,  1606, 14839,   114,    67,     3,   849,    14,
           28,    15,    28,    16, 50726,     4,   432,   375,    17,    10,
        67508,     7, 52259,     4,    43,  4010,   784,   326,     2]))


In [4]:
len(agnews_train[0][1])

29

In [5]:
len(agnews_train[1][1])

42

Padding and truncating to make them the same length

In [6]:
from torch.nn.utils.rnn import pad_sequence

padded_exs = pad_sequence([agnews_train[0][1], agnews_train[1][1]])
print("First sequence padded: {}".format(padded_exs[:,0]))
print("First sequence length: {}".format(len(padded_exs[:,0])))
print("Second sequence padded: {}".format(padded_exs[:,1]))
print("Second sequence length: {}".format(len(padded_exs[:,1])))

First sequence padded: tensor([  432,   426,     2,  1606, 14839,   114,    67,     3,   849,    14,
           28,    15,    28,    16, 50726,     4,   432,   375,    17,    10,
        67508,     7, 52259,     4,    43,  4010,   784,   326,     2,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0])
First sequence length: 42
Second sequence padded: tensor([15875,  1073,   855,  1311,  4251,    14,    28,    15,    28,    16,
          930,   798,   321, 15875,    99,     4, 27658,    29,     6,  4460,
           12,   565, 52791,     9, 80618,  2126,     8,     3,   526,   242,
            4,    29,  3891, 82815,  6575,    11,   207,   360,     7,     3,
          127,     2])
Second sequence length: 42


Creating the DataLoader

In [7]:
import numpy as np
import torch

def collator(batch):
    labels = torch.tensor([example[0] for example in batch])
    sentences = [example[1] for example in batch]
    data = pad_sequence(sentences)
    
    return [data, labels]

In [8]:
BATCH_SIZE = 128

train_loader = torch.utils.data.DataLoader(agnews_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collator)
test_loader = torch.utils.data.DataLoader(agnews_test, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collator)

Hyperparameters defined:

In [9]:
VOCAB_SIZE = len(agnews_train.get_vocab())
EMBED_DIM = 100
HIDDEN_DIM = 64
NUM_OUTPUTS = len(agnews_train.get_labels())
NUM_EPOCHS = 3

SWEM example. Here, the word vector parameters are learned as well:

In [10]:
import torch.nn as nn
import torch.nn.functional as F

class SWEM(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_dim, num_outputs):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        
        self.fc1 = nn.Linear(embedding_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_outputs)

    def forward(self, x):
        embed = self.embedding(x)
        embed_mean = torch.mean(embed, dim=0)
        
        h = self.fc1(embed_mean)
        h = F.relu(h)
        h = self.fc2(h)
        return h

Training and evaluating:

In [11]:
model = SWEM(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, NUM_OUTPUTS)

In [12]:
from tqdm.notebook import tqdm

# Cross entropy (CE) Loss and Adam Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Iterate through train set minibatchs 
for epoch in range(NUM_EPOCHS):
    correct = 0
    num_examples = 0
    for inputs, labels in tqdm(train_loader):
        # Zero out the gradients
        optimizer.zero_grad()
        
        # Forward pass
        y = model(inputs)
        loss = criterion(y, labels)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        predictions = torch.argmax(y,dim=1)
        correct += torch.sum((predictions == labels).float())        
        num_examples += len(labels)        
    # Print training progress
    # Not working for some reason
    ##if epoch % 25 == 0:
      ##  acc = correct/num_examples
        ##print("Epochs {0}: \t Train Loss: {1} \t Train Acc: {2}".format(epoch, loss, acc))

## Testing
correct = 0
num_test = 0

with torch.no_grad():
    # Iterate through test set minibatchs 
    for inputs, labels in tqdm(test_loader):
        # Forward pass
        y = model(inputs)        
        predictions =  torch.argmax(y,dim=1)
        correct += torch.sum((predictions == labels).float())
        num_test += len(labels) 
print('Test accuracy: {}'.format(correct/num_test))

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

Test accuracy: 0.9019736647605896


In [13]:
pytorch_total_params_SWEM = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(pytorch_total_params_SWEM)

9587924


Let's try a RNN now:

In [14]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_dim, num_outputs):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        
        self.rnn = nn.RNN(embedding_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_outputs)

    def forward(self, x):
        embed = self.embedding(x)
        output, h_final = self.rnn(embed)
        output = self.fc2(h_final.squeeze(0))
        return output

Training:

In [15]:
model_rnn = RNN(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, NUM_OUTPUTS)

In [None]:
# Cross entropy (CE) Loss and Adam Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_rnn.parameters(), lr=0.001)

# Iterate through train set minibatchs 
for epoch in range(NUM_EPOCHS):
    correct = 0
    num_examples = 0
    for inputs, labels in tqdm(train_loader):
        # Zero out the gradients
        optimizer.zero_grad()
        
        # Forward pass
        y = model_rnn(inputs)
        loss = criterion(y, labels)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        predictions = torch.argmax(y,dim=1)
        correct += torch.sum((predictions == labels).float())        
        num_examples += len(labels)        
    # Print training progress
    # Not working for some reason
    ##if epoch % 25 == 0:
      ##  acc = correct/num_examples
        ##print("Epochs {0}: \t Train Loss: {1} \t Train Acc: {2}".format(epoch, loss, acc))

## Testing
correct = 0
num_test = 0

with torch.no_grad():
    # Iterate through test set minibatchs 
    for inputs, labels in tqdm(test_loader):
        # Forward pass
        y = model_rnn(inputs)        
        predictions =  torch.argmax(y,dim=1)
        correct += torch.sum((predictions == labels).float())
        num_test += len(labels) 
print('Test accuracy: {}'.format(correct/num_test))

In [16]:
pytorch_total_params = sum(p.numel() for p in model_rnn.parameters() if p.requires_grad)
print(pytorch_total_params)

9592084
