# Simple Text Classification with Deep Averaging Network (DAN)

![title](https://miro.medium.com/max/678/1*0LezMYWUk3pXptoMdO5M_Q.png)

This model, which we call a deep averaging network (DAN), is still unordered, but its depth allows it to capture subtle variations in the input better than the standard NBOW model. Furthermore, computing each layer requires just a single matrix multiplication, so the complexity scales with the number of layers rather than the number of nodes in a parse tree.

Paper: **Deep Unordered Composition Rivals Syntactic Methods for Text Classification**

Link: https://people.cs.umass.edu/~miyyer/pubs/2015_acl_dan.pdf

* Unordered: treats the input as a bag of word embeddings.
* Accuracy can be improved with a variant of dropout that randomly drops some of the word embeddings before averaging, i.e. a dropout-inspired regularizer.
* The choice of composition function is not as important as initializing with pre-trained embeddings and using a deep network.
* Combines the training speed of unordered functions with the accuracy of syntactic functions.

In [None]:
import numpy as np
import torch

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext import datasets
from torchtext.data import Field, LabelField
from torchtext.data import Iterator, BucketIterator

In [None]:
# Text field: token sequences, lower-cased, emitted batch-first with no
# fixed length.
TEXT = Field(
    sequential=True,
    lower=True,
    batch_first=True,
    fix_length=None,
)
# Label field: one categorical target per example.
LABEL = LabelField(batch_first=True)

In [None]:
# Load the IMDB train/test splits, processing reviews with TEXT and labels
# with LABEL.
train, test = datasets.IMDB.splits(TEXT, LABEL)

In [None]:
# Inspect the text of the first training example.
train.examples[0].text

In [None]:
# Inspect the label of the first training example.
train.examples[0].label

In [None]:
# Build the text vocabulary from the training split only and attach
# pre-trained word vectors to it. Alternative vector sets that can be swapped
# in for the active line:
# TEXT.build_vocab(train, vectors="charngram.100d")
# TEXT.build_vocab(train, vectors="fasttext.simple.300d")
# TEXT.build_vocab(train, vectors="fasttext.en.300d")
TEXT.build_vocab(train, vectors="glove.6B.300d")

### Possible vectors
- charngram.100d
- fasttext.en.300d
- fasttext.simple.300d
- glove.42B.300d
- glove.840B.300d
- glove.twitter.27B.25d 
- glove.twitter.27B.50d 
- glove.twitter.27B.100d 
- glove.twitter.27B.200d 
- glove.6B.50d 
- glove.6B.100d 
- glove.6B.200d 
- glove.6B.300d

In [None]:
# Look up the integer index the vocabulary assigns to the token "z".
TEXT.vocab.stoi["z"]

In [None]:
# Show which token sits at index 1 (the index the DAN model below passes as
# padding_idx).
TEXT.vocab.itos[1]

In [None]:
# Shape of the pre-trained embedding matrix attached to the vocabulary.
TEXT.vocab.vectors.shape

In [None]:
# Embedding vector stored for vocabulary index 12.
TEXT.vocab.vectors[12]

In [None]:
# Second dimension of the embedding matrix (reused later as EMBED_DIM).
TEXT.vocab.vectors.shape[1]

In [None]:
# The ten most frequent tokens in the training split, with their counts.
TEXT.vocab.freqs.most_common(10)

### Q: Is there any problem with the vocab?

In [None]:
# Build the label vocabulary from the training split.
LABEL.build_vocab(train)

In [None]:
# Per-label counts collected while building the label vocabulary.
LABEL.vocab.freqs

## Creating the Iterator

In [None]:
# Build the batch iterators for the train and test splits.
#
# * `sort=False` lets BucketIterator do its job: it pools examples of similar
#   length (using `sort_key`) and shuffles the training batches every epoch.
#   With `sort=True` the whole dataset is sorted once and then replayed in the
#   same short-to-long order each epoch, so the training data is never shuffled.
# * `sort_within_batch=True` still orders the examples inside each batch by
#   length.
# * The device falls back to the CPU when no GPU is available instead of
#   failing on a hard-coded 'cuda'.
train_iter, test_iter = BucketIterator.splits(
    (train, test),
    batch_sizes=(64, 64),
    device='cuda' if torch.cuda.is_available() else 'cpu',
    sort=False,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
)

### Q: Take a look at the BucketIterator class; notice how it differs from the Iterator class.

In [None]:
# Pull the first batch from the training iterator and display its text tensor.
batch = next(iter(train_iter))
batch.text

In [None]:
# Length of the batch object (presumably the number of examples in it).
len(batch)

In [None]:
# List the attribute names stored on the batch object.
vars(batch).keys()

## Defining Text Classifier Model

In [None]:
class DAN(nn.Module):
    """Deep Averaging Network: average the token embeddings, then apply an MLP.

    Args:
        emb_dim: Width of the embedding vectors; input size of the first
            linear layer.
        n_layers: Number of hidden ``Linear -> ReLU`` layers (0 is allowed and
            yields a single linear classifier on the averaged embedding).
        hidden_size: Width of every hidden layer.
        n_outputs: Number of output logits.
        pad_idx: Vocabulary index of the padding token. Padding positions are
            excluded from the average and their embedding receives no gradient.
        vectors: Optional 2-D tensor of pre-trained embeddings
            (rows = vocabulary entries, columns = ``emb_dim``). When omitted,
            the module-level ``TEXT.vocab.vectors`` is used.

    Raises:
        ValueError: If the width of ``vectors`` does not equal ``emb_dim``.
    """

    def __init__(self, emb_dim, n_layers,
                 hidden_size, n_outputs, pad_idx=1, vectors=None):
        super().__init__()
        if vectors is None:
            # Backward-compatible default: vectors of the notebook's text field.
            vectors = TEXT.vocab.vectors
        if vectors.shape[1] != emb_dim:
            raise ValueError(
                f"emb_dim={emb_dim} does not match the embedding width "
                f"{vectors.shape[1]}"
            )

        self.dropout = nn.Dropout(0.3)
        self.emb_dim = emb_dim
        self.pad_idx = pad_idx
        # freeze=False: the pre-trained embeddings are fine-tuned during training.
        self.emb = nn.Embedding.from_pretrained(
            vectors, freeze=False, padding_idx=pad_idx
        )

        modules = []
        in_features = emb_dim
        for _ in range(n_layers):
            modules.append(nn.Linear(in_features, hidden_size))
            modules.append(nn.ReLU())
            in_features = hidden_size

        # Use the running width so that n_layers == 0 also produces a valid net.
        modules.append(nn.Linear(in_features, n_outputs))

        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Return the logits for a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices, batch-first
                (``batch x seq_len``), padded with ``pad_idx``.

        Returns:
            Tensor of shape ``batch x n_outputs``.
        """
        emb = self.dropout(self.emb(x))
        # Average over real tokens only, so the representation of a sequence
        # does not depend on how much padding its batch happens to carry.
        mask = x.ne(self.pad_idx).unsqueeze(-1).to(emb.dtype)
        n_tokens = mask.sum(dim=1).clamp(min=1.0)  # avoid 0/0 on all-pad rows
        averaged = (emb * mask).sum(dim=1) / n_tokens
        return self.layers(averaged)

In [None]:
# Hyper-parameters derived from the vocabularies built above.
VOCAB_SIZE = len(TEXT.vocab)
N_LAYERS = 1
EMBED_DIM = TEXT.vocab.vectors.shape[1]
HIDDEN_SIZE = EMBED_DIM * 3
N_OUTPUTS = len(LABEL.vocab)
EPOCHS = 100
# Use the GPU when one is present; otherwise fall back to the CPU rather than
# failing on a hard-coded 'cuda'.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model = DAN(EMBED_DIM, N_LAYERS, HIDDEN_SIZE, N_OUTPUTS).to(DEVICE)

In [None]:
# Cross-entropy objective over the output logits, optimised with Adam.
loss_func = nn.CrossEntropyLoss()
opt = optim.Adam(model.parameters(), lr=0.001)

## The training loop

In [None]:
import sys
import tqdm
from torch import autograd

def _run_epoch(model, batches, loss_func, opt=None):
    """Run one pass over ``batches``; train when ``opt`` is given, else evaluate.

    Returns:
        ``(mean_loss, accuracy)`` averaged over the examples actually seen.
    """
    training = opt is not None
    model.train(training)

    total_loss, n_correct, n_seen = 0.0, 0, 0
    # No autograd graph is needed (or wanted) while evaluating.
    with torch.set_grad_enabled(training):
        for batch in batches:
            x, y = batch.text, batch.label

            if training:
                opt.zero_grad()
            preds = model(x)
            loss = loss_func(preds, y)
            if training:
                loss.backward()
                opt.step()

            batch_size = y.size(0)
            # Assumes loss_func returns the batch mean (the default reduction
            # of nn.CrossEntropyLoss); re-weight it by the batch size so the
            # epoch figure is a true per-example mean.
            total_loss += loss.item() * batch_size
            n_correct += (preds.argmax(1) == y).sum().item()
            n_seen += batch_size

    n_seen = max(n_seen, 1)  # guard against an empty iterator
    return total_loss / n_seen, n_correct / n_seen


def training_loop(model, train_iter, val_iter, loss_func, opt, EPOCHS):
    """Train ``model`` for ``EPOCHS`` epochs, evaluating after each one.

    Metrics are normalised by the number of examples yielded by the given
    iterators, not by any module-level dataset, so the function works with
    any train/validation pair.

    Args:
        model: Module mapping ``batch.text`` to class logits.
        train_iter: Iterable of batches exposing ``.text`` and ``.label``.
        val_iter: Iterable of batches used for evaluation.
        loss_func: Callable ``(logits, labels) -> scalar loss``.
        opt: Optimizer over ``model``'s parameters.
        EPOCHS: Number of passes over ``train_iter``.

    Returns:
        Tuple ``(train_loss, val_loss, val_acc)`` of per-epoch lists.
    """
    history = {'train_loss': [], 'val_loss': [], 'val_acc': []}

    for epoch in range(1, EPOCHS + 1):
        train_loss, train_acc = _run_epoch(model, train_iter, loss_func, opt)
        print("Epoch:", epoch, "Train loss:", train_loss, "train acc:", train_acc)

        val_loss, val_acc = _run_epoch(model, val_iter, loss_func)
        print("Epoch:", epoch, "Val loss:", val_loss, "val acc:", val_acc, "\n")

        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

    return history['train_loss'], history['val_loss'], history['val_acc']

In [None]:
# Train for EPOCHS epochs; the test split's iterator doubles as the validation
# set here. Each returned value is a list with one entry per epoch.
train_loss, val_loss, val_acc = training_loop(model, train_iter, test_iter, loss_func, opt, EPOCHS)

In [None]:
import matplotlib.pyplot as plt

# Plot the per-epoch validation loss. Uncomment the other lines to overlay the
# training loss or the validation accuracy.
# plt.plot(train_loss)
plt.plot(val_loss)
# plt.plot(val_acc)

## References
- https://github.com/Pinafore/qb/blob/master/qanta/guesser/dan.py
- https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
- https://github.com/keitakurita/practical-torchtext/blob/master/Lesson%201%20intro%20to%20torchtext%20with%20text%20classification.ipynb
- http://www.cse.chalmers.se/~richajo/nlp2019/l2/Text%20classification%20using%20a%20CBoW%20representation.html
- https://github.com/miyyer/dan/blob/master/dan_sentiment.py
- https://people.cs.umass.edu/~miyyer/pubs/2015_acl_dan.pdf
- https://medium.com/tech-that-works/deep-averaging-network-in-universal-sentence-encoder-465655874a04

@InProceedings{Iyyer:Manjunatha:Boyd-Graber:III-2015,
    Title = {Deep Unordered Composition Rivals Syntactic Methods for Text Classification},
    Booktitle = {Association for Computational Linguistics},
    Author = {Mohit Iyyer and Varun Manjunatha and Jordan Boyd-Graber and Hal {Daum\'{e} III}},
    Year = {2015},
    Location = {Beijing, China}
}