<a href="https://colab.research.google.com/github/graviraja/100-Days-of-NLP/blob/applications%2Fclassification/applications/classification/Document%20Classification%20with%20HAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Musical_Instruments_5.json.gz

--2020-06-06 17:22:56--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Musical_Instruments_5.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2460495 (2.3M) [application/x-gzip]
Saving to: ‘reviews_Musical_Instruments_5.json.gz’


2020-06-06 17:22:59 (832 KB/s) - ‘reviews_Musical_Instruments_5.json.gz’ saved [2460495/2460495]



In [2]:
!ls

reviews_Musical_Instruments_5.json.gz  sample_data


In [0]:
!gunzip reviews_Musical_Instruments_5.json.gz

In [4]:
!ls

reviews_Musical_Instruments_5.json  sample_data


In [5]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
import os
import json
import time
import numpy as np
import pandas as pd

from nltk.tokenize import sent_tokenize
from fastai.text import Tokenizer, Vocab
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
datafile = 'reviews_Musical_Instruments_5.json'

cuda


In [0]:
with open(datafile, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

texts = []
labels = []

for line in lines[:-1]:
    data = json.loads(line)
    texts.append(data["reviewText"].lower())
    labels.append(int(data["overall"]) - 1)

In [9]:
train_data, valid_data, train_label, valid_label = train_test_split(
        texts, labels, train_size=0.8, random_state=1
    )

print(f"Number of training data examples: {len(train_label)}")
print(f"Number of validation data examples: {len(valid_label)}")

Number of training data examples: 8208
Number of validation data examples: 2053


In [0]:
train_label = np.array(train_label, dtype="int32")
valid_label = np.array(valid_label, dtype="int32")

In [0]:
class HANPreprocessor:
    """
    Preprocessor to prepare the data for Hierarchical Attention Networks.
    It will tokenize a document into sentences and sentences into tokens
    """

    def __init__(self, max_vocab, min_freq, percentile, tokenizer):
        self.max_vocab = max_vocab
        self.min_freq = min_freq
        self.percentile = percentile
        self.tokenizer = tokenizer
        self.vocab = None

    def _make_sentences(self, texts):
        texts_sents = [sent_tokenize(text) for text in texts]
        print(f"Sample sentences: {texts_sents[0]}")
        return texts_sents
    
    def tokenize(self, texts):
        print(f"Processing {len(texts)} documents")
        texts_sents = self._make_sentences(texts)
        all_sents = [s for sent in texts_sents for s in sent]

        texts_length = [0] + [len(s) for s in texts_sents]
        range_idx = [sum(texts_length[: i + 1]) for i in range(len(texts_length))]

        print(f"Tokenizing {len(all_sents)} sentences")
        sents_tokens = self.tokenizer(all_sents)

        # calculating lengths of tokens in each sentence for padding purposes
        sents_length = [len(s) for s in sents_tokens]

        if self.vocab is None:
            self.vocab = Vocab.create(sents_tokens, max_vocab=self.max_vocab, min_freq=self.min_freq)

        sents_nums = [self.vocab.numericalize(s) for s in sents_tokens]

        # group sentences into documents
        texts_nums = [sents_nums[range_idx[i]: range_idx[i + 1]] for i in range(len(range_idx[:-1]))]

        # compute max lengths for padding purposes
        self.maxlen_sent = int(np.quantile(sents_length, q=self.percentile))
        self.maxlen_doc = int(np.quantile(texts_length[1:], q=self.percentile))

        print("Padding sentences and documents...")
        self.pad_token = self.vocab.stoi['xxpad']

        padded_texts = [pad_nested_sequences(r, self.maxlen_sent, self.maxlen_doc, self.pad_token) for r in texts_nums]
        return np.stack(padded_texts, axis=0)
    
    def transform(self, texts):
        return self.tokenize(texts)

In [0]:
def tokenizer(texts):
    tokens = Tokenizer().process_all(texts)
    print(f"sentence: {texts[0]}")
    print(f"tokens: {tokens[0]}")
    return tokens

In [0]:
def pad_sequences(seq, max_len, pad_idx):
    if len(seq) > max_len:
        return np.array(seq[:max_len]).astype("int32")
    else:
        res = np.zeros(max_len, dtype="int32") + pad_idx
        res[:len(seq)] = seq
        return res

In [0]:
def pad_nested_sequences(seq, maxlen_sent, maxlen_doc, pad_idx):
    if len(seq) == 0:
        return np.array([[pad_idx] * maxlen_sent] * maxlen_doc).astype("int32")

    # pad the sentences in all docs
    seq = [pad_sequences(s, maxlen_sent, pad_idx) for s in seq]

    # padding the documents
    if len(seq) > maxlen_doc:
        return np.array(seq[:maxlen_doc])
    else:
        res = np.array([[pad_idx] * maxlen_sent] * maxlen_doc).astype("int32")
        res[:len(seq)] = seq
        return res

In [0]:
MAX_VOCAB = 5000
MIN_FREQ = 5
PERCENTILE = 0.8
BATCH_SIZE = 32
processor = HANPreprocessor(MAX_VOCAB, MIN_FREQ, PERCENTILE, tokenizer)


In [16]:
train_seq = processor.transform(train_data)
valid_seq = processor.transform(valid_data)


Processing 8208 documents
Tokenizing 42003 sentences
sentence: this mxl studio 24 usb microphone is a decent choice for those needing a portable or a good quality studio microphone.
tokens: ['this', 'mxl', 'studio', '24', 'usb', 'microphone', 'is', 'a', 'decent', 'choice', 'for', 'those', 'needing', 'a', 'portable', 'or', 'a', 'good', 'quality', 'studio', 'microphone', '.']
Padding sentences and documents...
Processing 2053 documents
Sample sentences: ["i'm very pleased with this purchase.", 'the cables are flexible, quiet and no pops or loss of signal.', 'not much else to say.', 'buy them.']
Tokenizing 10294 sentences
sentence: i'm very pleased with this purchase.
tokens: ['i', "'m", 'very', 'pleased', 'with', 'this', 'purchase', '.']
Padding sentences and documents...


In [0]:
train_set = TensorDataset(
    torch.from_numpy(train_seq).long(),
    torch.from_numpy(train_label).long()
)

train_loader = DataLoader(
    dataset=train_set,
    batch_size=BATCH_SIZE,
    shuffle=True)


In [0]:
valid_set = TensorDataset(
    torch.from_numpy(valid_seq).long(),
    torch.from_numpy(valid_label).long()
)

valid_loader = DataLoader(
    dataset=valid_set,
    batch_size=BATCH_SIZE,
    shuffle=False)

In [0]:
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()

        self.attn = nn.Linear(hidden_dim, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)
    
    def forward(self, inp):
        # inp => [batch_size, seq_len, hidden_dim]

        energy = torch.tanh(self.attn(inp))
        # energy => [batch_size, seq_len, hidden_dim]

        attention = F.softmax(self.v(energy), dim=1)
        # attention => [batch_size, seq_len, 1]

        return attention

In [0]:

class WordAttention(nn.Module):
    def __init__(self, input_dim, emb_dim, pad_idx, hidden_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.rnn = nn.GRU(emb_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.word_attn = Attention(hidden_dim * 2)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inp, hidden):
        # inp => [batch_size, seq_len]
        # hidden => [num_dir * num_layers, batch_size, hidden_dim]

        embedded = self.embedding(inp)
        embedded = self.dropout(embedded)
        # embedded => [batch_size, seq_len, emb_dim]

        output, hidden = self.rnn(embedded, hidden)
        # output => [batch_size, seq_len, hidden_dim * num_dir]
        #        => [batch_size, seq_len, hidden_dim * 2]
        # hidden => [num_dir * n_layers, batch_size, hidden_dim]

        attention = self.word_attn(output)
        # attention => [batch_size, seq_len, 1]

        # output => [batch_size, seq_len, hid_dim * 2]
        weighted = torch.sum(attention * output, dim=1)
        # weighted => [batch_size, hidden_dim * 2]

        attention = attention.permute(0, 2, 1)
        # attention => [batch_size, 1, seq_len]

        weighted = weighted.unsqueeze(1)
        # weighted => [batch_size, 1, hidden_dim * 2]

        return attention, weighted, hidden

In [0]:
class SentenceAttention(nn.Module):
    def __init__(self, word_hidden_dim, sent_hidden_dim, pad_idx, dropout):
        super().__init__()

        self.rnn = nn.GRU(word_hidden_dim * 2, sent_hidden_dim, bidirectional=True)
        self.sent_attn = Attention(sent_hidden_dim * 2)
    
    def forward(self, inp):
        # inp => [batch_size, seq_len, word_hid_dim * 2]

        output, hidden = self.rnn(inp)
        # output => [batch_size, seq_len, sent_hid_dim * 2]
        # hidden => [num_layers * num_dir, batch_size, sent_hid_dim]

        attention = self.sent_attn(output)
        # attention => [batch_size, seq_len, 1]

        # output => [batch_size, seq_len, hid_dim * 2]
        weighted = torch.sum(attention * output, dim=1)
        # weighted => [batch_size, hidden_dim * 2]

        attention = attention.permute(0, 2, 1)
        # attention => [batch_size, 1, seq_len]

        return attention, weighted

In [0]:

class HierarchicalAttention(nn.Module):
    def __init__(self, input_dim, emb_dim, word_hid_dim, sent_hid_dim, pad_idx, output_dim, dropout, device):
        super().__init__()

        self.word_hid_dim = word_hid_dim
        self.device = device
        self.word_attention = WordAttention(input_dim, emb_dim, pad_idx, word_hid_dim, dropout)
        self.sent_attention = SentenceAttention(word_hid_dim, sent_hid_dim, pad_idx, dropout)

        self.fc = nn.Linear(sent_hid_dim * 2, output_dim)
    
    def forward(self, inp):
        # inp => [batch_size, max_sents, max_words]
        
        batch_size = inp.shape[0]
        inp = inp.permute(1, 0 , 2)
        # inp => [max_sents, batch_size, max_words]

        # initialize word rnn hiddens state
        hidden = torch.nn.Parameter(torch.zeros(2, batch_size, self.word_hid_dim)).to(self.device)

        word_attentions, sents_reps = [], []

        for sent in inp:
            word_attn, sent_rep, hidden = self.word_attention(sent, hidden)
            word_attentions.append(word_attn)
            sents_reps.append(sent_rep)
        
        sents = torch.cat(sents_reps, 1)
        # sents => [batch_size, max_sents, word_hid_dim * 2]

        word_attns = torch.cat(word_attentions, 1)
        # word_attns => [batch_size, max_sents, max_words]

        sent_attn, doc_rep = self.sent_attention(sents)
        sent_attn = sent_attn.squeeze(1)
        # sent_attn => [batch_size, max_sents]
        # doc_rep => [batch_size, sent_hid_dim * 2]

        logits = self.fc(doc_rep)
        # logits => [batch_size, 5]

        return logits, sent_attn, word_attns

In [23]:
input_dim = len(processor.vocab.itos)
emb_dim = 50
word_hid_dim = 32
sent_hid_dim = 32
pad_idx = processor.pad_token
output_dim = 5
dropout = 0.5
model = HierarchicalAttention(input_dim, emb_dim, word_hid_dim, sent_hid_dim, pad_idx, output_dim, dropout, device)
model = model.to(device)
print(model)


HierarchicalAttention(
  (word_attention): WordAttention(
    (embedding): Embedding(5000, 50, padding_idx=1)
    (rnn): GRU(50, 32, batch_first=True, bidirectional=True)
    (word_attn): Attention(
      (attn): Linear(in_features=64, out_features=64, bias=True)
      (v): Linear(in_features=64, out_features=1, bias=False)
    )
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (sent_attention): SentenceAttention(
    (rnn): GRU(64, 32, bidirectional=True)
    (sent_attn): Attention(
      (attn): Linear(in_features=64, out_features=64, bias=True)
      (v): Linear(in_features=64, out_features=1, bias=False)
    )
  )
  (fc): Linear(in_features=64, out_features=5, bias=True)
)


In [0]:
optimizer = optim.AdamW(model.parameters(), lr=0.01, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()


In [0]:
def train(model, iterator, criterion, optimizer, clip):
    model.train()
    epoch_loss = 0
    epoch_accuracy = 0
    for batch in iterator:
        optimizer.zero_grad()
        inp, labels = batch
        inp = inp.to(device)
        labels = labels.to(device)

        logits, _, _ = model(inp)
        # logits => [batch_size, 5]
        # labels => [batch_size]

        loss = criterion(logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()

        _, predictions = torch.max(logits, 1)
        correct = torch.sum((predictions == labels))
        epoch_accuracy = (correct / len(labels)).item()
    
    return epoch_loss / len(iterator), epoch_accuracy / len(iterator)


In [0]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    epoch_accuracy = 0
    with torch.no_grad():
        for batch in iterator:
            inp, labels = batch
            inp = inp.to(device)
            labels = labels.to(device)

            logits, _, _ = model(inp)
            # logits => [batch_size, 5]
            # labels => [batch_size]

            loss = criterion(logits, labels)
            epoch_loss += loss.item()

            _, predictions = torch.max(logits, 1)
            correct = torch.sum((predictions == labels))
            epoch_accuracy = (correct / len(labels)).item()
    
    return epoch_loss / len(iterator), epoch_accuracy / len(iterator)

In [0]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [29]:
N_EPOCHS = 20
CLIP = 1
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, CLIP)
    valid_loss, valid_acc = evaluate(model, valid_loader, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 8s
	Train Loss: 0.870 | Train Acc: 0.00%
	 Val. Loss: 0.880 |  Val. Acc: 1.54%
Epoch: 02 | Epoch Time: 0m 8s
	Train Loss: 0.830 | Train Acc: 0.00%
	 Val. Loss: 0.863 |  Val. Acc: 1.54%
Epoch: 03 | Epoch Time: 0m 8s
	Train Loss: 0.793 | Train Acc: 0.00%
	 Val. Loss: 0.875 |  Val. Acc: 1.54%
Epoch: 04 | Epoch Time: 0m 8s
	Train Loss: 0.765 | Train Acc: 0.00%
	 Val. Loss: 0.875 |  Val. Acc: 1.54%
Epoch: 05 | Epoch Time: 0m 8s
	Train Loss: 0.746 | Train Acc: 0.00%
	 Val. Loss: 0.921 |  Val. Acc: 1.54%
Epoch: 06 | Epoch Time: 0m 8s
	Train Loss: 0.722 | Train Acc: 0.00%
	 Val. Loss: 0.876 |  Val. Acc: 1.54%
Epoch: 07 | Epoch Time: 0m 8s
	Train Loss: 0.696 | Train Acc: 0.00%
	 Val. Loss: 0.888 |  Val. Acc: 0.00%
Epoch: 08 | Epoch Time: 0m 8s
	Train Loss: 0.678 | Train Acc: 0.00%
	 Val. Loss: 0.987 |  Val. Acc: 1.54%
Epoch: 09 | Epoch Time: 0m 8s
	Train Loss: 0.659 | Train Acc: 0.00%
	 Val. Loss: 0.938 |  Val. Acc: 1.54%
Epoch: 10 | Epoch Time: 0m 8s
	Train Loss: 0.6