## Sample RNN layers

In [1]:
import torch
import torch.nn as nn

torch.manual_seed(1)
rnn_layer = nn.RNN(input_size=5, 
                   hidden_size=2,
                   num_layers=1, 
                   batch_first=True)

w_xh = rnn_layer.weight_ih_l0
w_hh = rnn_layer.weight_hh_l0
b_xh = rnn_layer.bias_ih_l0
b_hh = rnn_layer.bias_hh_l0

print('w_xh shape: ', w_xh.shape)
print('w_hh shape: ', w_hh.shape)
print('b_xh shape: ', b_xh.shape)
print('b_hh shape: ', b_hh.shape)

w_xh shape:  torch.Size([2, 5])
w_hh shape:  torch.Size([2, 2])
b_xh shape:  torch.Size([2])
b_hh shape:  torch.Size([2])


## Load Data

In [2]:
from torchtext.datasets import IMDB

train_data = IMDB(split='train')
test_data = IMDB(split='test')

## Data Preprocessing

In [9]:
## Split training dataset into training and validation sets
from torch.utils.data.dataset import random_split

torch.manual_seed(1)
train_dataset, valid_dataset = random_split(list(train_data), [20000, 5000])

In [10]:
## Identify unique words in training set
import re
from collections import Counter, OrderedDict

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
           ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

token_counts = Counter()
for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)
    
print('vocab size:', len(token_counts))

vocab size: 69381


In [19]:
## Map each unique word to a unique integer and encode the review text into encoded integers
from torchtext.vocab import vocab

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x:x[1], reverse=True)

ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = vocab(ordered_dict)
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

In [22]:
## Divide the datset into mini-batches as input to the model

### define transformation pipeline
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: 1. if x == 'pos' else 0.