### RNN Implmentation for NLP

In [2]:
# Importing the libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt

In [3]:
with open('Data/shakespeare.txt', 'r', encoding = 'utf8') as t:
    text = t.read()

In [4]:
print(text[:500])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bu


In [5]:
len(text)

5445609

In [6]:
all_char = set(text)

In [7]:
len(all_char)

84

In [8]:
# decoder
decoder = dict(enumerate(all_char))

In [9]:
# encoder
encoder = {char: i for i,char in decoder.items()}

In [10]:
encoded_text = np.array([encoder[char] for char in text])

In [11]:
encoded_text[:100]

array([18, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
       43, 43, 43, 43, 43, 83, 18, 43, 43, 33, 71, 72, 12, 43, 46, 36, 54,
       71, 45, 26, 19, 43, 64, 71, 45, 36, 19, 20, 71, 45, 26, 43, 81, 45,
       43, 82, 45, 26, 54, 71, 45, 43, 54,  3, 64, 71, 45, 36, 26, 45, 21,
       18, 43, 43, 14, 65, 36, 19, 43, 19, 65, 45, 71, 45, 63, 76, 43, 63,
       45, 36, 20, 19, 76, 41, 26, 43, 71, 72, 26, 45, 43, 12, 54])

In [12]:
# one hot encoding
def one_hot_enc(batch_text, uni_chars):
    one_hot = np.zeros((batch_text.size, uni_chars))
    one_hot = one_hot.astype(np.float32)
    
    one_hot[np.arange(one_hot.shape[0]), batch_text.flatten()] = 1.0
    
    one_hot = one_hot.reshape((*batch_text.shape, uni_chars))
    
    return one_hot

In [13]:
# Example
x = np.array([1,2,0])

In [14]:
one_hot_enc(x, 3)

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

In [26]:
# generate batches for training
def gen_batch(en_text, sample_size = 10, seq_len = 50):
    
    char_len = sample_size * seq_len
    num_batches = int(len(text) / char_len)
    
    en_text = en_text[: num_batches * char_len]
    en_text = en_text.reshape((sample_size, -1))
    
    for n in range(0,en_text.shape[-1], seq_len):
        x = en_text[:, n : n + seq_len]
        y = np.zeros_like(x)
        
        try:
            y[:, : -1] = x[:, 1:]
            y[:, -1] = en_text[:, n + seq_len]
        
        except:
            y[:, : -1] = x[:, 1:]
            y[:, -1] = en_text[:, 0]
        
        yield x,y

In [32]:
# Example 
sample_text = np.arange(20)

In [33]:
sample_text

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [34]:
batch_gen = gen_batch(sample_text, sample_size = 2, seq_len = 5)

In [35]:
x,y = next(batch_gen)

In [36]:
x

array([[ 0,  1,  2,  3,  4],
       [10, 11, 12, 13, 14]])

In [37]:
y

array([[ 1,  2,  3,  4,  5],
       [11, 12, 13, 14, 15]])

In [40]:
# RNN model
class CharRNN(nn.Module):
    def __init__(self, all_chars, num_hidden = 256, num_layers = 4, drop_prob = 0.5):
        
        super().__init__()
        
        self.drop_pob = drop_prob
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        self.all_chars = all_chars
        
        self.decoder = dict(enumerate(all_chars))
        self.encoder = {char: i for i, char in decoder.items()}
        
        # Architecture
        self.lstm = nn.LSTM(len(all_chars), num_hidden, num_layers, dropout = drop_prob, batch_first = True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc1 = nn.Linear(num_hidden, len(self.all_chars))
        
    def forward(self, x, hidden):
        
        lstm_out, hidden = self.lstm(x, hidden)
        drop_out = self.dropout(lstm_out)
        drop_out = drop_out.contiguous().view(-1, self.num_hidden)
        output = self.fc1(drop_out)
        
        return output, hidden
    
    def hidden_state(self, batch_size):
        
        hidden = (torch.zeros(self.num_layers, batch_size, self.num_hidden).cuda(),
                  torch.zeros(self.num_layers, batch_size, self.num_hidden).cuda())
        
        return hidden

In [41]:
model = CharRNN(all_chars = all_char, num_hidden = 512, num_layers = 3, drop_prob = 0.5)

In [42]:
model

CharRNN(
  (lstm): LSTM(84, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=512, out_features=84, bias=True)
)

In [43]:
params = []

for p in model.parameters():
    params.append(int(p.numel()))

In [45]:
sum(params)
# have some of params roughly equal to size of text data set to prevent over fitting

5470292

In [49]:
# hyperparams
lr = 0.001
train_per = 0.9
optimizer = torch.optim.Adam(model.parameters(), lr = lr)
criterion = nn.CrossEntropyLoss()
train_ind = int(len(encoded_text) * train_per)

In [50]:
train_data = encoded_text[:train_ind]
test_data = encoded_text[train_ind:]

In [51]:
# training