In [219]:
# Check whether a GPU is available (prints the driver/CUDA versions and current memory usage).
!nvidia-smi


Tue Mar  3 04:00:43 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.48.02    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    29W /  70W |  12989MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

# Sentiment Classification


1. Use Pytorch to load the IMDb movie dataset and do preprocessing;
2. Develop a Recurrent Neural Network (RNN) Classifier for the same dataset;
3. Convert the RNN to a bidirectional Long-Short-Term-Memory (LSTM) model


## 1. Loading dataset

In [0]:
import torch
from torchtext import data

# Fixed seed so that repeated runs start from the same random state.
SEED = 12138

torch.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Torchtext lets us load the text and the labels separately.
# Reviews are tokenized with spaCy; labels are stored as floats, which matches the
# BCEWithLogitsLoss criterion used further down.
TEXT = data.Field(tokenize = 'spacy')
LABEL = data.LabelField(dtype = torch.float)

In [221]:
 # Follow the prompts to authorize Colab to access your Google Drive data.
 # NOTE: never paste the one-time authorization code into the notebook; it is a credential.
 from google.colab import drive
 drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [222]:
# Make sure that you can see the .ipynb files and IMDB.gz.
# NOTE(review): the recorded output reports that this directory does not exist — confirm the Drive path.
!ls  gdrive/My\ Drive/Colab\ Notebooks/nlp_hw2/

ls: cannot access 'gdrive/My Drive/Colab Notebooks/nlp_hw2/': No such file or directory


## Data loading
Read more: https://pytorchnlp.readthedocs.io/en/latest/_modules/index.html


In [223]:
from torchtext import datasets
import os

# Set up the paths.
# A Python string is not a shell command line: backslash-escaped spaces and a
# leading blank would become part of the path and make it unresolvable, so the
# directory is written literally (trailing slash kept for string concatenation).
ROOT_DIR = 'gdrive/My Drive/Colab Notebooks/nlp_hw2/'
DATA_DIR = os.path.join(ROOT_DIR, 'IMDB.gz')

# Load data, this may take a while.
# `IMDB.splits` is a classmethod: it builds the train/test datasets itself, so
# it is called on the class directly instead of on a throw-away instance.
# NOTE(review): with default arguments torchtext fetches/caches the corpus under
# its own default root, not under DATA_DIR — confirm whether the Drive copy
# should be used instead.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

print ('Loading finished!')

Loading finished!


In [224]:
# Sanity check: report the size of the two splits returned by the loader.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of testing examples: 25000


In [225]:
import random
# Split the training data into train and validation sets.
# random.seed() returns None, so passing its result as `random_state` only
# works through its side effect. Seed explicitly, then hand the resulting
# generator state to split() so the shuffle is visibly reproducible.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [0]:
# Build the vocabularies from the training split only, so that no validation
# or test tokens leak into the model's input space.
# NOTE(review): len(TEXT.vocab) is printed as 25002 further down, i.e. two more
# than max_size — presumably torchtext's special tokens; verify.
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)


In [0]:
def count_parameters(model):
    """Return the number of trainable parameters of ``model``.

    Only parameters with ``requires_grad=True`` are counted, so frozen
    tensors do not contribute to the total.
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)



##### Define iterator

Define an iterator that batches examples of similar lengths together. 
There are other options. For more: https://torchtext.readthedocs.io/en/latest/data.html



In [0]:
# Number of reviews per batch.
BATCH_SIZE = 64

# If there is a GPU available, we will use it; otherwise we fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, in the same order as the datasets are passed in;
# `device` is forwarded so the iterators know where batches should live.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device)

## 2. Recurrent Neural Network


The RNN model has the following structure:
1. start by an embedding layer; shape:  (input_dim, embedding_dim)
2. then we put the RNN layer; shape: (embedding_dim, hidden_dim)
3. last, we add a linear layer; shape: (hidden_dim, output_dim)

In [0]:
import torch.nn as nn
import torch.optim as optim

## TODO: define the RNN class
class RNN(nn.Module):
    """Vanilla recurrent sentiment classifier: embedding -> RNN -> linear head."""

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        # Layers are declared in the order in which forward() applies them.
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text):
        """Map a batch of token indices to one raw score per example."""
        embedded = self.embedding(text)

        # The per-step outputs are not needed; only the final hidden state is used.
        # last_hidden = [1, batch size, hid dim]
        _, last_hidden = self.rnn(embedded)

        # Drop the leading axis of size 1 before the linear head.
        return self.fc(last_hidden.squeeze(0))

## Model Training



In [230]:
# Define some hyperparameters.
INPUT_DIM = len(TEXT.vocab)  # vocabulary size; passed to the model as the embedding's input dimension
print(INPUT_DIM)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1  # the model emits a single score per review
LEARNING_RATE = 1e-3


25002


In [231]:

# Instantiate the RNN model with the hyperparameters defined above.
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
print(f'The model has {count_parameters(model):,} trainable parameters')


optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

criterion = nn.BCEWithLogitsLoss()  # binary task with a single raw output per example, so the loss applies the sigmoid itself


The model has 2,592,105 trainable parameters


In [0]:

## Move the model and the loss to the selected device (GPU when available, see `device`).
model = model.to(device)
criterion = criterion.to(device)

### Calculate accuracy

In [0]:
## Return the accuracy given the predictions (preds) and the true values (y).
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (8 right out of 10 gives 0.8, NOT 8).

    ``preds`` are raw model outputs (logits); ``y`` holds the 0/1 targets.
    The result is returned as a zero-dimensional tensor.
    """
    # Squash the logits into (0, 1) and round to the nearest integer label.
    predicted_labels = torch.sigmoid(preds).round()
    hits = predicted_labels.eq(y).float()
    return hits.sum() / len(hits)

## Training function

In [0]:

def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator``.

    Returns a tuple ``(mean batch loss, mean batch accuracy)``, both averaged
    over the number of batches.
    """
    model.train()

    total_loss = 0
    total_acc = 0

    for batch in iterator:
        # Gradients accumulate across backward() calls, so reset them for every batch.
        optimizer.zero_grad()

        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
        total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

### Evaluation function

This step is to copy and paste what you did in the training function into the evaluate function. This time, there’s no additional optimization after the predictions, loss, and accuracy are calculated.

In [1]:

def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating any weights.

    Returns a tuple ``(mean batch loss, mean batch accuracy)``, both averaged
    over the number of batches.
    """
    model.eval()

    total_loss = 0
    total_acc = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            batch_loss = criterion(logits, batch.label)
            batch_acc = binary_accuracy(logits, batch.label)

            total_loss += batch_loss.item()
            total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

### Start training
It may take a few minutes in total. The validation accuracy is around 50-51%.



In [236]:
N_EPOCHS = 5

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float('inf')
# Train for N_EPOCHS epochs, evaluating on the validation set after each one.
for epoch in range(N_EPOCHS):
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
      
    # Keep track of the best model (by validation loss) and save its weights.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_model.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

	Train Loss: 0.697 | Train Acc: 49.89%
	 Val. Loss: 0.701 |  Val. Acc: 50.17%
	Train Loss: 0.700 | Train Acc: 49.54%
	 Val. Loss: 0.695 |  Val. Acc: 50.57%
	Train Loss: 0.697 | Train Acc: 50.18%
	 Val. Loss: 0.697 |  Val. Acc: 49.17%
	Train Loss: 0.697 | Train Acc: 49.72%
	 Val. Loss: 0.695 |  Val. Acc: 48.22%
	Train Loss: 0.697 | Train Acc: 50.37%
	 Val. Loss: 0.694 |  Val. Acc: 50.12%


### Restore the best model and evaluate

The test accuracy is around 47%



In [237]:
# Restore the weights saved at the lowest validation loss, then score them on the test set.
model.load_state_dict(torch.load('best_model.pt'))
test_loss, test_acc = evaluate(model, test_iterator, criterion)


print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.690 | Test Acc: 53.38%


## 3. LSTM


In [0]:

class RNN(nn.Module):
    """LSTM sentiment classifier: embedding -> (bi)directional LSTM -> linear head.

    The class keeps the name ``RNN`` so the cells that instantiate it keep
    working; note that it replaces the vanilla-RNN class of the same name
    defined earlier in the notebook.

    Args:
        input_dim: vocabulary size (number of embedding rows).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state per direction.
        output_dim: number of output scores per example.
        bidirectional: whether the LSTM reads the sequence in both directions.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, bidirectional):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=bidirectional)
        # A bidirectional LSTM produces one final hidden state per direction, so
        # the linear head must accept twice as many features in that case only.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        # NOTE(review): this layer is not applied in forward(); it is kept so the
        # module's attributes stay unchanged. Confirm whether dropout on the
        # final hidden state was intended.
        self.dropout = nn.Dropout(0.3)

    def forward(self, text):
        """Map a batch of token indices to one raw score per example."""
        embeds = self.embedding(text)
        _, (hidden, _) = self.lstm(embeds)
        if self.lstm.bidirectional:
            # Concatenate the final forward (-2) and backward (-1) hidden states.
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Only one direction exists; indexing -2 would be out of range here.
            hidden = hidden[-1, :, :]

        return self.fc(hidden)

In [239]:

# Read every review in both directions.
BIDIRECTIONAL = True

# Fresh model, optimizer and loss, so nothing carries over from the vanilla-RNN run.
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, BIDIRECTIONAL)
optimizer = optim.Adam(model.parameters(),lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()
print(f'The model has {count_parameters(model):,} trainable parameters')
## Move the model and the loss to the selected device.
model = model.to(device)
criterion = criterion.to(device)

The model has 3,233,897 trainable parameters


It may take a few minutes in total. The validation accuracy is around 50%.

In [240]:

# Reset the best validation loss so the LSTM run is checkpointed independently.
best_valid_loss = float('inf')
# Train for N_EPOCHS epochs, evaluating on the validation set after each one.
for epoch in range(N_EPOCHS):
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
      
    # Keep track of the best model (by validation loss) and save its weights
    # to a separate file so the vanilla-RNN checkpoint is not overwritten.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_model_LSTM.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

	Train Loss: 0.685 | Train Acc: 55.86%
	 Val. Loss: 0.672 |  Val. Acc: 58.40%
	Train Loss: 0.624 | Train Acc: 65.51%
	 Val. Loss: 0.612 |  Val. Acc: 66.50%
	Train Loss: 0.544 | Train Acc: 72.76%
	 Val. Loss: 0.547 |  Val. Acc: 72.89%
	Train Loss: 0.449 | Train Acc: 79.74%
	 Val. Loss: 0.472 |  Val. Acc: 78.55%
	Train Loss: 0.391 | Train Acc: 82.95%
	 Val. Loss: 0.452 |  Val. Acc: 80.55%


In [241]:
# Restore the LSTM weights saved at the lowest validation loss, then score them on the test set.
model.load_state_dict(torch.load('best_model_LSTM.pt'))
test_loss, test_acc = evaluate(model, test_iterator, criterion)


print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.489 | Test Acc: 78.56%



1.   LSTM mitigates the vanishing-gradient problem by carrying an extra recurrent state called a cell.
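2.   Within the same 5 epochs the LSTM actually learns (about 80% validation accuracy) while the vanilla RNN stays near chance, even though the LSTM has more parameters and so needs more resources and time per epoch.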
3. LSTM has a higher accuracy because it's more controllable compared to RNN.

Comparing model complexity:
1. LSTM has more parameters (3,233,897 vs. 2,592,105 for the vanilla RNN), and thus would take more training time (which we can measure with `import time`)

