# Text Classification using torchtext and an Embedding-FNN model in PyTorch

## Installing the libraries

In [1]:
!pip install torch torchdata



In [16]:
!pip install -U portalocker>=2.0.0 wandb

In [26]:
import torch
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

from torch import nn
import wandb
import time

from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

### Download the dataset and visualize it

In [5]:
# Download the dataset from torchtext datasets.
# `train_iter` keeps the dataset object itself rather than a one-shot iterator, so the
# preview below does not consume the first sample that later cells iterate over.
train_iter = AG_NEWS(split="train")

# Show an example (read through a throwaway iterator so `train_iter` is left untouched)
next(iter(train_iter))

(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

In [20]:
# Human-readable class names keyed by the dataset's 1-based label ids
ag_news_label = dict(enumerate(("World", "Sports", "Business", "Sci/Tec"), start=1))

### Data preprocessing

Here is an example of typical NLP data processing with a tokenizer and a vocabulary. The first step is to build a vocabulary from the raw training dataset. Here we use the built-in factory function build_vocab_from_iterator, which accepts an iterator that yields lists (or iterators) of tokens. Users can also pass any special symbols to be added to the vocabulary.

In [7]:
# Load the basic English tokenizer
tokenizer = get_tokenizer("basic_english")


def yield_tokens(data_iter):
    """Yield the token list of every (label, text) sample found in `data_iter`."""
    yield from (tokenizer(raw_text) for _label, raw_text in data_iter)


# Build the vocabulary from the training samples, with "<unk>" added as a special symbol
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
# Use the index of "<unk>" as the default index of the vocabulary
vocab.set_default_index(vocab["<unk>"])
# Look up the indices of an already tokenized example
vocab(['here', 'is', 'an', 'example'])

[475, 21, 30, 5297]

Prepare the text processing pipeline with the tokenizer and vocabulary. The text and label pipelines will be used to process the raw data strings from the dataset iterators. The text pipeline converts a text string into a list of integers based on the lookup table defined in the vocabulary. The label pipeline converts the label into integers

In [10]:
def text_pipeline(x):
    """Tokenize the string `x` and look its tokens up in the vocabulary."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert a 1-based label (an int or a numeric string) into a 0-based int."""
    return int(x) - 1


# Try out both pipelines
print(text_pipeline('here is the an example'))
print(label_pipeline('4'))

[475, 21, 2, 30, 5297]
3


### Generate data batches

In [13]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def collate_batch(batch):
    """Collate (label, text) samples into the flat tensors consumed by the model.

    Returns a tuple ``(labels, tokens, offsets)`` moved to `device`:
    ``labels`` holds one pipeline-converted label per sample, ``tokens`` is the
    concatenation of every sample's token ids, and ``offsets`` holds the start
    position of each sample inside ``tokens``.
    """
    labels = [label_pipeline(raw_label) for raw_label, _ in batch]
    token_tensors = [
        torch.tensor(text_pipeline(raw_text), dtype=torch.int64) for _, raw_text in batch
    ]
    # Start positions are the running sum of the lengths of all preceding samples
    lengths = [0] + [tokens.size(0) for tokens in token_tensors]
    starts = torch.tensor(lengths[:-1]).cumsum(dim=0)
    labels = torch.tensor(labels, dtype=torch.int64)
    flat_tokens = torch.cat(token_tensors)
    return labels.to(device), flat_tokens.to(device), starts.to(device)


# Create a DataLoader that batches samples with the collate function above
dataloader = DataLoader(train_iter, collate_fn=collate_batch, batch_size=8, shuffle=False)

## Create the model

The model is composed of the nn.EmbeddingBag layer plus a linear layer for the classification purpose. nn.EmbeddingBag with the default mode of “mean” computes the mean value of a “bag” of embeddings. Although the text entries here have different lengths, nn.EmbeddingBag module requires no padding here since the text lengths are saved in offsets.

Additionally, since nn.EmbeddingBag accumulates the average across the embeddings on the fly, nn.EmbeddingBag can enhance the performance and memory efficiency to process a sequence of tensors.

In [15]:
class TextClassificationModel(nn.Module):
    """Text classifier made of an nn.EmbeddingBag followed by a single linear layer."""

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Fill the embedding and linear weights from U(-0.5, 0.5) and zero the bias."""
        bound = 0.5
        # Same order as the layers were created: embedding first, then the classifier
        for layer in (self.embedding, self.fc):
            layer.weight.data.uniform_(-bound, bound)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        """Return the class scores for the bags of `text` delimited by `offsets`."""
        return self.fc(self.embedding(text, offsets))

## Initialize the training process

Connect to Weights & Biases to track training metrics and model performance.

In [19]:
# Log this session in to Weights & Biases
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/studio-lab-user/.netrc


True

In [54]:
# Load the train and test splits of AG_NEWS
train_iter, test_iter = AG_NEWS(split=("train", "test"))


Define some parameters and hyperparameters for the training phase:

In [55]:
# Collect the configuration parameters of the run in one dictionary
config = dict(
    # Number of distinct labels found in the training split
    num_class=len({label for label, _ in train_iter}),
    vocab_size=len(vocab),
    validation_split=0.1,
    embedding_size=64,
    log_interval=500,
    # Hyperparameters
    epochs=8,  # epoch
    lr=5,  # learning rate
    batch_size=64,  # batch size for training
)


In [56]:
# Show the resolved configuration
print(config)

{'num_class': 4, 'vocab_size': 95810, 'validation_split': 0.1, 'embedding_size': 64, 'log_interval': 500, 'epochs': 8, 'lr': 5, 'batch_size': 64}


## Define the train and evaluation functions

In [57]:
def train(dataloader):
    """Train the notebook-level `model` for one pass over `dataloader`.

    Relies on the globals `model`, `optimizer`, `criterion`, `config`, `epoch`
    and `wandb` being defined by the time it is called.

    Args:
        dataloader: sized iterable yielding ``(label, text, offsets)`` batches.

    Every ``log_interval`` batches the accuracy of the batches seen since the
    previous report is printed and, together with their mean loss, logged to
    Weights & Biases. The statistics are reset after each report.
    """
    model.train()
    # Item access works both for the plain config dict and for wandb.config
    log_interval = config["log_interval"]
    correct, seen, loss_sum, batches = 0, 0, 0.0, 0
    for idx, (label, text, offsets) in enumerate(dataloader):
        # Reset gradients
        optimizer.zero_grad()
        # Forward pass and loss
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        # Backward pass
        loss.backward()
        # Clip gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        # Apply optimizer
        optimizer.step()
        # Accumulate the statistics of the current reporting window
        correct += (predicted_label.argmax(1) == label).sum().item()
        seen += label.size(0)
        loss_sum += loss.item()
        batches += 1
        # Register training metrics
        if idx % log_interval == 0 and idx > 0:
            accuracy = correct / seen
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(epoch, idx, len(dataloader), accuracy)
            )
            # Average over the batches actually accumulated: the first window holds
            # log_interval + 1 of them because idx starts at 0.
            wandb.log({'epoch': epoch, 'step': idx, 'loss': loss_sum / batches,
                       'accuracy': accuracy})
            correct, seen, loss_sum, batches = 0, 0, 0.0, 0


def evaluate(dataloader):
    """Return the accuracy of the notebook-level `model` over `dataloader`.

    Args:
        dataloader: iterable yielding ``(label, text, offsets)`` batches.

    Returns:
        Fraction of samples whose arg-max prediction equals the label.

    Raises:
        ZeroDivisionError: if `dataloader` yields no samples.
    """
    model.eval()
    correct, seen = 0, 0

    with torch.no_grad():
        # Only predictions are needed for accuracy, so no loss is computed here
        for label, text, offsets in dataloader:
            predicted_label = model(text, offsets)
            correct += (predicted_label.argmax(1) == label).sum().item()
            seen += label.size(0)
    return correct / seen

## Run the training process

In [58]:
# Init the W&B job to collect metrics
wandb.init(project='text_classification_demo', config=config, save_code=True, job_type='training')
# From here on `config` refers to wandb.config instead of the plain dictionary
config = wandb.config


In [59]:
# Build the model, the data loaders and run the training loop

# define the model
model = TextClassificationModel(config.vocab_size, config.embedding_size, config.num_class).to(device)

# Set the loss function, the optimizer and the lr scheduler
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=config.lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

# Best validation accuracy seen so far (None until the first epoch has finished)
total_accu = None
# Convert to Datasets
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
# Split the training dataset into a train and validation datasets
num_train = int(len(train_dataset) * (1.0-config.validation_split))
split_train_, split_valid_ = random_split(
    train_dataset, [num_train, len(train_dataset) - num_train]
)

# One loader per split, all batching through collate_batch
train_dataloader = DataLoader(
    split_train_, batch_size=config.batch_size, shuffle=True, collate_fn=collate_batch
)
valid_dataloader = DataLoader(
    split_valid_, batch_size=config.batch_size, shuffle=True, collate_fn=collate_batch
)
test_dataloader = DataLoader(
    test_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=collate_batch
)

# `epoch` is read as a global by train() for its progress messages
for epoch in range(1, config.epochs + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    # Step the scheduler only when validation accuracy fell below the best so far;
    # otherwise record the new best.
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)
    # Log the validation accuracy of this epoch to W&B
    wandb.log({'epoch': epoch, 'validation_acc': accu_val})

| epoch   1 |   500/ 1688 batches | accuracy    0.699
| epoch   1 |  1000/ 1688 batches | accuracy    0.859
| epoch   1 |  1500/ 1688 batches | accuracy    0.875
-----------------------------------------------------------
| end of epoch   1 | time: 33.35s | valid accuracy    0.892 
-----------------------------------------------------------
| epoch   2 |   500/ 1688 batches | accuracy    0.898
| epoch   2 |  1000/ 1688 batches | accuracy    0.900
| epoch   2 |  1500/ 1688 batches | accuracy    0.900
-----------------------------------------------------------
| end of epoch   2 | time: 34.55s | valid accuracy    0.899 
-----------------------------------------------------------
| epoch   3 |   500/ 1688 batches | accuracy    0.912
| epoch   3 |  1000/ 1688 batches | accuracy    0.911
| epoch   3 |  1500/ 1688 batches | accuracy    0.916
-----------------------------------------------------------
| end of epoch   3 | time: 30.89s | valid accuracy    0.908 
-------------------------------

## Evaluate the test dataset

In [60]:
# Score the test split and record the result in W&B
print("Checking the results of test dataset.")
accu_test = evaluate(test_dataloader)
print(f"test accuracy {accu_test:8.3f}")
wandb.log({'test_acc': accu_test})

Checking the results of test dataset.
test accuracy    0.908


In [61]:
# Finish the W&B run
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.010 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.120714…

0,1
accuracy,▁▆▆▇▇▇▇▇▇▇▇▇█▇██████████
epoch,▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
loss,█▄▃▃▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁
step,▁▅█▁▅█▁▅█▁▅█▁▅█▁▅█▁▅█▁▅█
test_acc,▁
validation_acc,▁▃▆▆▅███

0,1
accuracy,0.9455
epoch,8.0
loss,0.17061
step,1500.0
test_acc,0.90803
validation_acc,0.91325


## Make a prediction

In [62]:
def predict(text, text_pipeline):
    """Classify `text` with the notebook-level `model`.

    Args:
        text: raw string to classify.
        text_pipeline: callable turning a string into a list of vocabulary indices.

    Returns:
        The 1-based id of the highest-scoring class (the arg-max index plus one).
    """
    # Build the inputs on whatever device the model currently lives on
    target = next(model.parameters()).device
    with torch.no_grad():
        # An explicit integer dtype keeps an empty token list usable as an index tensor
        token_ids = torch.tensor(text_pipeline(text), dtype=torch.int64, device=target)
        # A single bag starting at position 0 covers the whole text
        offsets = torch.tensor([0], device=target)
        output = model(token_ids, offsets)
        return output.argmax(1).item() + 1


# Sample news text to classify
ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

# Run the prediction on the CPU
model = model.to("cpu")

print(f"This is a {ag_news_label[predict(ex_text_str, text_pipeline)]} news")

This is a Sports news
