<a href="https://colab.research.google.com/github/epigos/learning-pytorch/blob/master/05_LSTM_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from collections import defaultdict, deque
import math
import time
import copy
import itertools
import warnings
from datetime import datetime
from pathlib import Path
warnings.filterwarnings('ignore')

import numpy as np
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torchtext import data 
import torchtext
from torchtext import datasets
from torchtext.vocab import GloVe

import pandas as pd
import spacy

In [115]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available(): print('Device name:', torch.cuda.get_device_name())

Device name: Tesla P100-PCIE-16GB


# Load IMDB dataset

In [0]:
# define the text and label field
# lowercase all the text, tokenize the text, and trim it to a maximum length of 200
TEXT = data.Field(lower=True, batch_first=False, fix_length=200)
LABEL = data.Field(sequential=False)

In [0]:
# load dataset
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [118]:
print(len(train_data), len(test_data))

25000 25000


In [119]:
print("Train fields:", train_data.fields)

Train fields: {'text': <torchtext.data.field.Field object at 0x7faff4d92198>, 'label': <torchtext.data.field.Field object at 0x7faff4d92d30>}


In [120]:
print(vars(train_data[0]))

{'text': ['richard', 'widmark', 'is', 'a', 'tainted', 'character', 'in', 'this', 'movie.', 'he', 'is', 'a', 'professional', 'pickpocket.', "he's", 'been', 'in', 'prison', 'three', 'times,', 'yet', 'at', 'the', 'beginning', 'of', 'the', 'film,', 'he', 'tries', 'to', 'make', 'it', 'four.', 'thelma', 'ritter', 'is', 'a', 'busy', 'body', 'selling', 'information', 'to', 'almost', 'everybody.', 'jean', 'peters', 'is', 'amazing', 'as', 'the', 'girl', 'flamed', 'by', 'widmark.<br', '/><br', '/>this', 'is', 'a', 'period', 'piece', 'during', 'the', 'mccarthy', 'era', 'where', 'the', 'red', 'scare', 'ruled', 'the', 'politics', 'and', 'is', 'worked', 'into', 'this', 'plot', 'quite', 'nicely.', 'what', 'is', 'unusual', 'about', 'this', 'film', 'is', 'that', 'peters', '&', 'ritter', 'are', 'both', 'victims', 'of', 'violent', 'beatings', 'in', 'an', 'era', 'where', 'women', 'were', 'seldom', 'more', 'than', 'sex', 'objects', 'in', 'films.', 'this', 'is', 'what', 'makes', 'this', 'film', 'noir', 'as',

## Create word vocabulary

In [0]:
# create vocabulary using GloVe with vocab size of 10000 and min word frequency of 10
TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300), max_size=10000, min_freq=10)
LABEL.build_vocab(train_data)

In [122]:
# how big is our vocabulary
(len(TEXT.vocab), len(LABEL.vocab))

(10002, 3)

In [123]:
# most common words
TEXT.vocab.freqs.most_common(10)

[('the', 322198),
 ('a', 159953),
 ('and', 158572),
 ('of', 144462),
 ('to', 133967),
 ('is', 104171),
 ('in', 90527),
 ('i', 70480),
 ('this', 69714),
 ('that', 66292)]

## Create data loader

We use the torchtext BucketIterator function for creating batches, and the size of the batches will be sequence length and batches

In [0]:
train_loader, test_loader = data.BucketIterator.splits(
    (train_data, test_data), batch_size=32, device=device, repeat=False,
    sort_key = lambda x: len(x.text), sort_within_batch = False
)

# Create the network

In [0]:
class IMDBLSTM(nn.Module):

    def __init__(self, vocab_size, hidden_size, output_size, num_layers=2, 
                 batch_size=32):
        super().__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=num_layers)
        self.fcl = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=-1)
        self.dropout = nn.Dropout(0.8)

    def forward(self, seq):
        batch_size = seq.size()[1]
        if batch_size != self.batch_size:
            self.batch_size = batch_size
        # get embedding output
        x = self.embed(seq)
        # get initial hidden output
        h0 = c0 = Variable(x.data.new(*(self.num_layers, self.batch_size, self.hidden_size)).zero_())
        # get output from lstm layer
        output, _ = self.lstm(x, (h0, c0))
        # get output from linear layer
        fc = self.dropout(self.fcl(output[-1]))
        return self.softmax(fc)


In [0]:
# test model
vocab_size = len(TEXT.vocab)
hidden_size = 100
output_size = len(LABEL.vocab)
model = IMDBLSTM(vocab_size, hidden_size, output_size).to(device)

In [142]:
model

IMDBLSTM(
  (embed): Embedding(10002, 100)
  (lstm): LSTM(100, 100, num_layers=2)
  (fcl): Linear(in_features=100, out_features=3, bias=True)
  (softmax): LogSoftmax()
  (dropout): Dropout(p=0.8, inplace=False)
)

In [143]:
# input size 200 * 32
out = model(torch.randint(0, vocab_size, (200, 32), device=device))
out

tensor([[-1.0986, -1.0986, -1.0986],
        [-1.1273, -1.0436, -1.1273],
        [-1.1940, -1.0821, -1.0269],
        [-1.0348, -1.0348, -1.2398],
        [-1.2031, -1.0502, -1.0502],
        [-1.3127, -1.0066, -1.0066],
        [-1.0986, -1.0986, -1.0986],
        [-1.0907, -1.1146, -1.0907],
        [-1.0986, -1.0986, -1.0986],
        [-1.2371, -1.0360, -1.0360],
        [-1.0986, -1.0986, -1.0986],
        [-1.2116, -1.0466, -1.0466],
        [-1.1235, -1.0507, -1.1235],
        [-1.0221, -1.0221, -1.2719],
        [-1.2982, -1.0120, -1.0120],
        [-1.1128, -1.0708, -1.1128],
        [-1.0986, -1.0986, -1.0986],
        [-1.0870, -1.1223, -1.0870],
        [-1.0986, -1.0986, -1.0986],
        [-1.0986, -1.0986, -1.0986],
        [-1.0442, -1.0443, -1.2171],
        [-1.0294, -1.0294, -1.2532],
        [-1.0986, -1.0986, -1.0986],
        [-1.0986, -1.0986, -1.0986],
        [-1.0986, -1.0986, -1.0986],
        [-1.0986, -1.0986, -1.0986],
        [-1.1194, -0.9998, -1.1856],
 

In [144]:
out.size()

torch.Size([32, 3])

## Train the network

In [0]:
def train(model, data_loader, epochs=5):
    model = model.to(device)
    # define training functions
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.NLLLoss()
    # define training variables
    since = time.time()
    best_weights = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    loss_history = defaultdict(lambda: deque(maxlen=epochs))
    accuracy_history = defaultdict(lambda: deque(maxlen=epochs))

    for epoch in range(1, epochs + 1):
        print('\nEpoch {}/{}'.format(epoch, epochs))
        print('-' * 60)    
        
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            for batch in data_loader[phase]:
                text, target = batch.text.to(device), batch.label.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()
                
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # make predictions
                    outputs = model(text)
                    loss = criterion(outputs, target)

                    _, preds = torch.max(outputs, 1)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                    # statistics
                running_loss += loss.item() * text.size(0)
                running_corrects += preds.eq(target.view_as(preds)).cpu().sum()
                
            data_size = len(data_loader[phase].dataset)
            epoch_loss = running_loss / data_size
            epoch_acc = running_corrects.double().item() / data_size

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            loss_history[phase].append(epoch_loss)
            accuracy_history[phase].append(epoch_acc)
            
            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_weights = copy.deepcopy(model.state_dict())
            

    time_elapsed = time.time() - since
    print('-' * 60)
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.3f} %'.format(100 * best_acc))
    print('=' * 60, '\n')
    # load best weights
    model.load_state_dict(best_weights)
    return loss_history, accuracy_history
 

In [148]:
# define data loader
data_loader = {'train': train_loader, 'val': test_loader}
# instantiate model
model = IMDBLSTM(vocab_size, hidden_size, output_size)
# train model
loss_history, accuracy_history = train(model, data_loader, epochs=25)


Epoch 1/25
------------------------------------------------------------
train Loss: 6.2792 Acc: 0.2450
val Loss: 5.0806 Acc: 0.5093

Epoch 2/25
------------------------------------------------------------
train Loss: 6.2629 Acc: 0.2444
val Loss: 4.9133 Acc: 0.5255

Epoch 3/25
------------------------------------------------------------
train Loss: 6.2245 Acc: 0.2422
val Loss: 4.8287 Acc: 0.5540

Epoch 4/25
------------------------------------------------------------
train Loss: 6.1424 Acc: 0.2428
val Loss: 4.7686 Acc: 0.5942

Epoch 5/25
------------------------------------------------------------
train Loss: 6.1680 Acc: 0.2414
val Loss: 4.7011 Acc: 0.5593

Epoch 6/25
------------------------------------------------------------
train Loss: 5.9822 Acc: 0.2511
val Loss: 3.9085 Acc: 0.7623

Epoch 7/25
------------------------------------------------------------
train Loss: 5.7062 Acc: 0.2575
val Loss: 3.7242 Acc: 0.7761

Epoch 8/25
---------------------------------------------------------