# Simple Text Classification with Torch

In [None]:
import numpy as np
import torch

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext import datasets
from torchtext.data import Field, LabelField
from torchtext.data import Iterator, BucketIterator

In [None]:
# Text field: lower-cased token sequences with the batch dimension first;
# fix_length=None leaves the sequence length flexible.
TEXT = Field(
    sequential=True,
    lower=True,
    batch_first=True,
    fix_length=None,
)
# Label field: holds the class label of each example.
LABEL = LabelField(batch_first=True)

In [None]:
# Load the IMDB train/test splits, processing text with TEXT and labels with LABEL.
train, test = datasets.IMDB.splits(TEXT, LABEL)

In [None]:
# Peek at the text of the first training example.
train.examples[0].text

In [None]:
# Peek at the label of the first training example.
train.examples[0].label

In [None]:
# Build the text vocabulary from the training split and attach pretrained
# word vectors to it. Alternative pretrained vectors can be swapped in, e.g.:
# TEXT.build_vocab(train, vectors="charngram.100d")
# TEXT.build_vocab(train, vectors="fasttext.simple.300d")
# TEXT.build_vocab(train, vectors="fasttext.en.300d")
TEXT.build_vocab(train, vectors="glove.6B.50d")

### Possible vectors
- charngram.100d
- fasttext.en.300d
- fasttext.simple.300d
- glove.42B.300d
- glove.840B.300d
- glove.twitter.27B.25d 
- glove.twitter.27B.50d 
- glove.twitter.27B.100d 
- glove.twitter.27B.200d 
- glove.6B.50d 
- glove.6B.100d 
- glove.6B.200d 
- glove.6B.300d

In [None]:
# Index the vocabulary assigns to the token "z" (stoi maps string -> index).
TEXT.vocab.stoi["z"]

In [None]:
# Token stored at index 1 (itos maps index -> string).
TEXT.vocab.itos[1]

In [None]:
# Shape of the pretrained vector matrix attached to the vocabulary.
TEXT.vocab.vectors.shape

In [None]:
# Pretrained vector stored at vocabulary index 12.
TEXT.vocab.vectors[12]

In [None]:
# Size of the second dimension of the vector matrix (used below as EMBED_DIM).
TEXT.vocab.vectors.shape[1]

In [None]:
# The ten most frequent tokens together with their counts.
TEXT.vocab.freqs.most_common(10)

### Q: Is there any problem with the vocab?

In [None]:
# Build the label vocabulary from the training split.
LABEL.build_vocab(train)

In [None]:
# Frequency of each label in the label vocabulary.
LABEL.vocab.freqs

## Creating the Iterator

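`BucketIterator` batches examples of similar length together, which minimizes the amount of padding needed while still producing freshly shuffled batches for each new epoch. See torchtext's `pool` function for the bucketing procedure used.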

In [None]:
# Batch examples of similar length together so that little padding is needed.
# `sort` is deliberately left at its default: passing sort=True orders the whole
# dataset by length, which bypasses bucketing and disables the per-epoch
# shuffling of training batches. With the default, the training iterator
# shuffles and buckets, while the test iterator is still sorted.
train_iter, test_iter = BucketIterator.splits(
    (train, test),
    batch_sizes=(8, 8),
    # Use the GPU when one is present; otherwise fall back to the CPU.
    device='cuda' if torch.cuda.is_available() else 'cpu',
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
)

In [None]:
# Draw the first batch from the training iterator and display its text tensor.
batch = next(iter(train_iter))
batch.text

In [None]:
# Length reported by the batch object.
len(batch)

In [None]:
# Names of the attributes stored on the batch object.
batch.__dict__.keys()

## Defining Text Classifier Model

![title](https://pytorch.org/tutorials/_images/text_sentiment_ngrams_model.png)

**nn.Embedding**

It is only a lookup table: given an index, it returns the corresponding vector.
The weight matrix that holds these vector representations is initialized with random values and is updated by backpropagation.

**nn.EmbeddingBag**

Because `nn.EmbeddingBag` computes the average of the embeddings on the fly, without materializing the individual embeddings, it improves performance and memory efficiency when processing a sequence of tensors.

In [None]:
class TextSentiment(nn.Module):
    """Bag-of-embeddings text classifier.

    Each input sequence is reduced to the mean of its token embeddings by an
    ``nn.EmbeddingBag`` initialised from pretrained vectors; a single linear
    layer then maps that mean vector to class scores.
    """

    def __init__(self, vocab_size, embed_dim, num_class, pretrained_vectors=None):
        """Build the embedding and classification layers.

        Args:
            vocab_size: Number of vocabulary entries. Kept for interface
                compatibility; the embedding table takes its size from the
                pretrained vectors.
            embed_dim: Width of one embedding vector; must equal the second
                dimension of the pretrained vectors.
            num_class: Number of output classes.
            pretrained_vectors: Optional 2-D float tensor used to initialise
                the embedding. When omitted, the module-level
                ``TEXT.vocab.vectors`` is used, as before.

        Raises:
            ValueError: If the pretrained vectors are not a 2-D tensor whose
                second dimension equals ``embed_dim``.
        """
        super().__init__()
        if pretrained_vectors is None:
            # Backward-compatible default: the vectors attached to the global vocab.
            pretrained_vectors = TEXT.vocab.vectors
        if pretrained_vectors.dim() != 2 or pretrained_vectors.size(1) != embed_dim:
            # Fail here rather than with a shape error inside the first forward pass.
            raise ValueError(
                "pretrained vectors of shape {} do not match embed_dim={}".format(
                    tuple(pretrained_vectors.shape), embed_dim
                )
            )
        # freeze=False keeps the pretrained embeddings trainable.
        self.embedding = nn.EmbeddingBag.from_pretrained(
            pretrained_vectors, mode='mean', freeze=False
        )
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Initialise the linear layer: uniform weights in [-0.5, 0.5], zero bias."""
        initrange = 0.5
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text):
        """Return class scores for a batch of token-index sequences.

        Args:
            text: 2-D tensor of token indices, one row per sequence. No
                offsets are supplied, so every row is treated as one bag.
                NOTE: every index in a row, padding included, enters the mean.

        Returns:
            Tensor with one row of ``num_class`` scores per input row.
        """
        embedded = self.embedding(text)
        return self.fc(embedded)

### Q: Try to replace nn.EmbeddingBag with nn.Embedding in the code and make the appropriate changes

In [None]:
# Model hyper-parameters derived from the vocabularies built above.
VOCAB_SIZE = len(TEXT.vocab)
EMBED_DIM = TEXT.vocab.vectors.shape[1]
NUN_CLASS = len(LABEL.vocab)
EPOCHS = 100
# Place the model on the GPU when one is present; otherwise fall back to the
# CPU instead of failing on machines without CUDA.
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(
    'cuda' if torch.cuda.is_available() else 'cpu'
)

In [None]:
# Cross-entropy criterion, optimised with plain SGD over all model parameters.
loss_func = nn.CrossEntropyLoss()
opt = optim.SGD(model.parameters(), lr=0.1)

## The training loop

In [None]:
import sys
import tqdm
from torch import autograd

def _run_epoch(model, batches, loss_func, opt=None):
    """Run one pass over ``batches`` and return ``(mean_loss, accuracy)``.

    Weights are updated only when ``opt`` is given. Both results are averaged
    over the examples actually seen, so they do not depend on any global
    dataset and stay correct when the last batch is smaller than the others.
    """
    # A criterion with reduction='sum' already returns a per-batch total; any
    # other criterion is assumed to return the batch mean and is re-weighted
    # by the batch size before being averaged over all examples.
    weight_by_batch = getattr(loss_func, 'reduction', 'mean') != 'sum'
    total_loss, correct, seen = 0.0, 0, 0

    for batch in batches:
        x, y = batch.text, batch.label

        if opt is not None:
            opt.zero_grad()
        preds = model(x)
        loss = loss_func(preds, y)
        if opt is not None:
            loss.backward()
            opt.step()

        batch_size = y.size(0)
        batch_loss = loss.item()
        total_loss += (batch_loss * batch_size) if weight_by_batch else batch_loss
        correct += (preds.argmax(1) == y).sum().item()
        seen += batch_size

    # Guard against an empty iterator instead of dividing by zero.
    denom = max(seen, 1)
    return total_loss / denom, correct / denom


def training_loop(model, train_iter, val_iter, loss_func, opt, EPOCHS):
    """Train ``model`` for ``EPOCHS`` epochs, validating after every epoch.

    Args:
        model: Module called as ``model(batch.text)``; must return one row of
            class scores per example.
        train_iter: Iterable of training batches exposing ``.text`` and ``.label``.
        val_iter: Iterable of validation batches with the same attributes.
        loss_func: Criterion called as ``loss_func(preds, labels)``.
        opt: Optimizer that updates the parameters of ``model``.
        EPOCHS: Number of passes over ``train_iter``.

    Returns:
        Tuple ``(train_loss, val_loss, val_acc)`` of lists holding one
        per-example average for each epoch.
    """
    history = {'train_loss': [], 'val_loss': [], 'val_acc': []}

    for _ in range(EPOCHS):
        model.train()
        train_loss, train_acc = _run_epoch(model, train_iter, loss_func, opt)
        print("Train loss:", train_loss, " train acc:", train_acc)

        model.eval()
        # No gradients are needed for validation; skipping them saves memory.
        with torch.no_grad():
            val_loss, val_acc = _run_epoch(model, val_iter, loss_func)
        print("Val loss:", val_loss, " val acc:", val_acc, "\n")

        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

    return history['train_loss'], history['val_loss'], history['val_acc']

In [None]:
# Train on the training iterator, using the test iterator for per-epoch validation.
train_loss, val_loss, val_acc = training_loop(model, train_iter, test_iter, loss_func, opt, EPOCHS)

In [None]:
import matplotlib.pyplot as plt

# Plot validation accuracy per epoch; uncomment the two lines below to also
# plot the training and validation loss curves.
# plt.plot(train_loss)
# plt.plot(val_loss)
plt.plot(val_acc)

## References

- https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
- https://github.com/keitakurita/practical-torchtext/blob/master/Lesson%201%20intro%20to%20torchtext%20with%20text%20classification.ipynb
- http://www.cse.chalmers.se/~richajo/nlp2019/l2/Text%20classification%20using%20a%20CBoW%20representation.html
- https://github.com/miyyer/dan/blob/master/dan_sentiment.py
- https://people.cs.umass.edu/~miyyer/pubs/2015_acl_dan.pdf
- https://medium.com/tech-that-works/deep-averaging-network-in-universal-sentence-encoder-465655874a04