# Pytorch Text - Text Classification with the torchtext library
Notebook for following along with Pytorch Text NLP tutorials that is looking to use the torchtext library to build the dataset for text classification analysis [Pytorch](https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html)  website tutorial. <br><br>

### Choices for data

<br>

### Libaries and Modules
Importing the necessary libaries and modules for the notebook.

In [1]:
#Import cell
import glob
import matplotlib as mpl
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import math
import numpy as np
import os
import pandas as pd
import pickle as pk
import random
import re
import string
import time
import torch
import torch.nn as nn
import torch.nn.functional as F

from io import open
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(1247) #setting seed value
print(f"Device: {device}. Cuda available: {torch.cuda.is_available()}")
print(f"Torch current seed = {torch.seed()}")
print("Imports complete")

Device: cpu. Cuda available: False
Torch current seed = 723166802715600
Imports complete


<br>

### Data Loading and Manipulation Functions
<b>Functions:</b><br>
<ul>
    <li>collate_batch - uses pipelines to process input batch of data</li>
    <li>yield_tokens - processes data_iter for build_vocab_from_iterator()</li>
</ul>

In [2]:
#Data loading and manipulation function definition cell
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)


def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)
        
        
label_pipeline = lambda x: int(x) - 1        
text_pipeline = lambda x: vocab(tokenizer(x))

print("Data loading and manipulation functions defined.")

Data loading and manipulation functions defined.


### Importing and preparing data sets
Importing and preparing the data for the models.

In [3]:
#Importing data sets
train_iter = iter(AG_NEWS(split='train'))

#Printing demonstration training data
for i in range(3): print(next(train_iter), "\n")

print(f"\nData sets successfully imported, running on device: {device}")

(3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.") 

(3, 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.') 

(3, "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums.") 


Data sets successfully imported, running on device: cpu


In [4]:
#Build a vocab with the raw training dataset, generating data batch and iter
tokenizer = get_tokenizer('basic_english')
train_iter = AG_NEWS(split='train')
num_class = len(set([label for (label, text) in train_iter]))
     
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])
vocab_size = len(vocab)

dataloader = DataLoader(train_iter, batch_size=8, shuffle=False,
                        collate_fn=collate_batch)

print(vocab(['here', 'is', 'an', 'example']))
print(text_pipeline('here is an example'))
print(label_pipeline('10'))

[475, 21, 30, 5297]
[475, 21, 30, 5297]
9


In [5]:
BATCH_SIZE = 64

train_iter, test_iter = AG_NEWS()
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
num_train = int(len(train_dataset)*0.95)

split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset)-num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

<br>

### Class Definitions
<b>Classes:</b><br>
<ul>
    <li>TextClassificationModel - nn.Module class with an embedding bag and a linear layer for manipulating torchtext library</li>
</ul>

In [6]:
#Class definition cell
class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class) -> None:
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()
        return None

    def init_weights(self) -> None:
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        return None
    
    def forward(self, text, offsets):
        embedded = self.embeddingtext(text, offsets)
        return self.fc(embedded)
    
print("Classes defined.")

Classes defined.


<br>

### Calculation functions
<b>Functions:</b><br>
<ul>
    <li></li>
</ul>

In [7]:
#Calculation functions cell
    
print("Calculation functions defined.")

Calculation functions defined.


<br>

### Plotting functions
<b>Functions:</b>
<ul>
    <li></li>
</ul>

In [8]:
#Plotting functions Cell
%matplotlib inline

print("Plotting functions defined.")

Plotting functions defined.


<br>

### Training Functions
<b>Functions:</b>
<ul>
    <li>evaluate - evaluation loop, takes dataloader as input, returns accuracy.</li>
    <li>train - training loop, takes dataloader as input, no return.</li>
</ul>

In [9]:
#Training Functions
def evaluate(dataloader) -> float:
    model.eval()
    total_acc, total_count = 0, 0
    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1)==label).sum().item()
            total_count += label.size(0)       
    return total_acc/total_count


def train(dataloader) -> None:
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()
    
    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1)==label).sum().item()
        total_count += label.size(0)
        if idx%log_interval == 0 and idx>0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                           total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()
        return None

print("Training functions defined.")

Training functions defined.


### Main code
The `AG_NEWS` dataset has 4 labels, and therefore for classes:
`1: World`, `2: Sports`, `3: Business` and `4:Sci/Tec`. This is defined in [Importing and preparing data sets](#Importing-and-preparing-data-sets).

In [10]:
emsize = 64
model = TextClassificationModel(vocab_size, emsize, num_class)

EPOCHS = 10
LR = 5

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

<br>