### RNN Implmentation for NLP

In [1]:
# Importing the libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt

In [2]:
with open('Data/shakespeare.txt', 'r', encoding = 'utf8') as t:
    text = t.read()

In [3]:
print(text[:500])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bu


In [4]:
len(text)

5445609

In [3]:
all_char = set(text)

In [4]:
len(all_char)

84

In [4]:
# decoder
decoder = dict(enumerate(all_char))

In [5]:
# encoder
encoder = {char: i for i,char in decoder.items()}

In [6]:
encoded_text = np.array([encoder[char] for char in text])

In [8]:
encoded_text[:100]

array([ 0,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
        7,  7,  7,  7,  7, 22,  0,  7,  7, 14, 68,  3, 31,  7, 38, 67, 28,
       68, 64, 37, 83,  7, 32, 68, 64, 67, 83, 55, 68, 64, 37,  7,  9, 64,
        7, 10, 64, 37, 28, 68, 64,  7, 28, 69, 32, 68, 64, 67, 37, 64, 70,
        0,  7,  7, 82, 18, 67, 83,  7, 83, 18, 64, 68, 64, 19, 60,  7, 19,
       64, 67, 55, 83, 60, 43, 37,  7, 68,  3, 37, 64,  7, 31, 28])

In [7]:
# one hot encoding
def one_hot_enc(batch_text, uni_chars):
    one_hot = np.zeros((batch_text.size, uni_chars))
    one_hot = one_hot.astype(np.float32)
    
    one_hot[np.arange(one_hot.shape[0]), batch_text.flatten()] = 1.0
    
    one_hot = one_hot.reshape((*batch_text.shape, uni_chars))
    
    return one_hot

In [10]:
# Example
x = np.array([1,2,0])

In [11]:
one_hot_enc(x, 3)

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

In [8]:
# generate batches for training
def gen_batch(en_text, sample_size = 10, seq_len = 50):
    
    char_len = sample_size * seq_len
    num_batches = int(len(en_text) / char_len)
    
    en_text = en_text[: num_batches * char_len]
    en_text = en_text.reshape((sample_size, -1))
    
    for n in range(0,en_text.shape[-1], seq_len):
        x = en_text[:, n : n + seq_len]
        y = np.zeros_like(x)
        
        try:
            y[:, : -1] = x[:, 1:]
            y[:, -1] = en_text[:, n + seq_len]
        
        except:
            y[:, : -1] = x[:, 1:]
            y[:, -1] = en_text[:, 0]
        
        yield x,y

In [13]:
# Example 
sample_text = np.arange(20)

In [14]:
sample_text

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [15]:
batch_gen = gen_batch(sample_text, sample_size = 2, seq_len = 5)

In [16]:
x,y = next(batch_gen)

In [17]:
x

array([[ 0,  1,  2,  3,  4],
       [10, 11, 12, 13, 14]])

In [18]:
y

array([[ 1,  2,  3,  4,  5],
       [11, 12, 13, 14, 15]])

In [9]:
# RNN model
class CharRNN(nn.Module):
    def __init__(self, all_chars, num_hidden = 256, num_layers = 4, drop_prob = 0.5):
        
        super().__init__()
        
        self.drop_pob = drop_prob
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        self.all_chars = all_chars
        
        self.decoder = dict(enumerate(all_chars))
        self.encoder = {char: i for i, char in decoder.items()}
        
        # Architecture
        self.lstm = nn.LSTM(len(all_chars), num_hidden, num_layers, dropout = drop_prob, batch_first = True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc1 = nn.Linear(num_hidden, len(self.all_chars))
        
    def forward(self, x, hidden):
        
        lstm_out, hidden = self.lstm(x, hidden)
        drop_out = self.dropout(lstm_out)
        drop_out = drop_out.contiguous().view(-1, self.num_hidden)
        output = self.fc1(drop_out)
        
        return output, hidden
    
    def hidden_state(self, batch_size):
        
        hidden = (torch.zeros(self.num_layers, batch_size, self.num_hidden).cuda(),
                  torch.zeros(self.num_layers, batch_size, self.num_hidden).cuda())
        
        return hidden

In [10]:
model = CharRNN(all_chars = all_char, num_hidden = 512, num_layers = 3, drop_prob = 0.5)

In [21]:
model

CharRNN(
  (lstm): LSTM(84, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=512, out_features=84, bias=True)
)

In [22]:
params = []

for p in model.parameters():
    params.append(int(p.numel()))

In [23]:
sum(params)
# have some of params roughly equal to size of text data set to prevent over fitting

5470292

In [11]:
# hyperparams
lr = 0.001
train_per = 0.9
optimizer = torch.optim.Adam(model.parameters(), lr = lr)
criterion = nn.CrossEntropyLoss()
train_ind = int(len(encoded_text) * train_per)

In [25]:
train_ind

4901048

In [12]:
train_data = encoded_text[:train_ind]
test_data = encoded_text[train_ind:]

In [27]:
train_data[:50]

array([ 0,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
        7,  7,  7,  7,  7, 22,  0,  7,  7, 14, 68,  3, 31,  7, 38, 67, 28,
       68, 64, 37, 83,  7, 32, 68, 64, 67, 83, 55, 68, 64, 37,  7,  9])

In [13]:
# training hyperparams
epochs = 20
batch_size = 100
seq_len = 100
t = 0
num_char = max(encoded_text) + 1

In [29]:
num_char

84

In [30]:
# training
model.train()
model.cuda()

for i in range(epochs):
    
    hidden = model.hidden_state(batch_size)
    
    for x,y in gen_batch(train_data, batch_size, seq_len):
        t += 1
        
        x = one_hot_enc(x, num_char)
        inputs = torch.from_numpy(x)
        targets = torch.from_numpy(y)
        
        inputs = inputs.cuda()
        targets = targets.cuda()
        
        hidden = tuple([state.data for state in hidden])
        
        model.zero_grad()
        
        lstm_out, hidden = model.forward(inputs, hidden)
        
        loss = criterion(lstm_out, targets.view(batch_size * seq_len).long())
        loss.backward()
        
        nn.utils.clip_grad_norm_(model.parameters(), max_norm = 5)
        
        optimizer.step()
        
        if t % 25 == 0:
            
            val_hidden = model.hidden_state(batch_size)
            val_losses = []
            
            model.eval()
            
            for x,y in gen_batch(test_data, batch_size, seq_len):
                
                x = one_hot_enc(x, num_char)
                inputs = torch.from_numpy(x)
                targets = torch.from_numpy(y)
        
                inputs = inputs.cuda()
                targets = targets.cuda()
                
                val_hidden = tuple([state.data for state in val_hidden])
                
                lstm_out,val_hidden = model.forward(inputs, val_hidden)
                val_loss = criterion(lstm_out, targets.view(batch_size * seq_len).long())
                
                val_losses.append(val_loss.item())
            
            model.train()
            
            print(f"epoch: {i} | step: {t} | val loss {val_loss.item()}")

epoch: 0 | step: 25 | val loss 3.203482151031494
epoch: 0 | step: 50 | val loss 3.1927154064178467
epoch: 0 | step: 75 | val loss 3.1937904357910156
epoch: 0 | step: 100 | val loss 3.183349132537842
epoch: 0 | step: 125 | val loss 3.0834155082702637
epoch: 0 | step: 150 | val loss 2.9948766231536865
epoch: 0 | step: 175 | val loss 2.990663528442383
epoch: 0 | step: 200 | val loss 2.8331055641174316
epoch: 0 | step: 225 | val loss 2.7366182804107666
epoch: 0 | step: 250 | val loss 2.6722817420959473
epoch: 0 | step: 275 | val loss 2.5732550621032715
epoch: 0 | step: 300 | val loss 2.4460690021514893
epoch: 0 | step: 325 | val loss 2.361585855484009
epoch: 0 | step: 350 | val loss 2.2889010906219482
epoch: 0 | step: 375 | val loss 2.2331292629241943
epoch: 0 | step: 400 | val loss 2.196660280227661
epoch: 0 | step: 425 | val loss 2.1434803009033203
epoch: 0 | step: 450 | val loss 2.1132051944732666
epoch: 0 | step: 475 | val loss 2.068819761276245
epoch: 1 | step: 500 | val loss 2.036888

epoch: 8 | step: 4025 | val loss 1.3488202095031738
epoch: 8 | step: 4050 | val loss 1.346407413482666
epoch: 8 | step: 4075 | val loss 1.34852933883667
epoch: 8 | step: 4100 | val loss 1.3438316583633423
epoch: 8 | step: 4125 | val loss 1.3467200994491577
epoch: 8 | step: 4150 | val loss 1.3416556119918823
epoch: 8 | step: 4175 | val loss 1.3414490222930908
epoch: 8 | step: 4200 | val loss 1.3484151363372803
epoch: 8 | step: 4225 | val loss 1.34339439868927
epoch: 8 | step: 4250 | val loss 1.3423597812652588
epoch: 8 | step: 4275 | val loss 1.3435819149017334
epoch: 8 | step: 4300 | val loss 1.3435900211334229
epoch: 8 | step: 4325 | val loss 1.3469713926315308
epoch: 8 | step: 4350 | val loss 1.3461576700210571
epoch: 8 | step: 4375 | val loss 1.3414428234100342
epoch: 8 | step: 4400 | val loss 1.3343620300292969
epoch: 9 | step: 4425 | val loss 1.3403033018112183
epoch: 9 | step: 4450 | val loss 1.338165044784546
epoch: 9 | step: 4475 | val loss 1.339518666267395
epoch: 9 | step: 45

epoch: 16 | step: 7950 | val loss 1.317154049873352
epoch: 16 | step: 7975 | val loss 1.3117581605911255
epoch: 16 | step: 8000 | val loss 1.316208839416504
epoch: 16 | step: 8025 | val loss 1.3148536682128906
epoch: 16 | step: 8050 | val loss 1.3142485618591309
epoch: 16 | step: 8075 | val loss 1.3082797527313232
epoch: 16 | step: 8100 | val loss 1.3092001676559448
epoch: 16 | step: 8125 | val loss 1.311583161354065
epoch: 16 | step: 8150 | val loss 1.3118605613708496
epoch: 16 | step: 8175 | val loss 1.3058611154556274
epoch: 16 | step: 8200 | val loss 1.3160910606384277
epoch: 16 | step: 8225 | val loss 1.3135119676589966
epoch: 16 | step: 8250 | val loss 1.3198447227478027
epoch: 16 | step: 8275 | val loss 1.3144415616989136
epoch: 16 | step: 8300 | val loss 1.3120951652526855
epoch: 16 | step: 8325 | val loss 1.3125619888305664
epoch: 17 | step: 8350 | val loss 1.3114821910858154
epoch: 17 | step: 8375 | val loss 1.3085685968399048
epoch: 17 | step: 8400 | val loss 1.3151229619979

In [14]:
# converges around a 1.30 loss
# premise: loss does not reduce after a whole epoch
# save model
name = 'CharRNN_hidden512_layers3_shakes.net'

In [32]:
torch.save(model.state_dict(), name)

In [15]:
# Prediction 
def predict_next(model, char, hidden = None, k = 1):
    
    encoded_text = model.encoder[char]
    encoded_text = np.array([[encoded_text]])
    encoded_text = one_hot_enc(encoded_text, len(model.all_chars))
    
    inputs = torch.from_numpy(encoded_text)
    inputs = inputs.cuda()
    
    hidden = tuple([state.data for state in hidden])
    
    lstm_out, hidden = model(inputs, hidden)
    
    probs = F.softmax(lstm_out, dim = 1).data
    probs = probs.cpu()
    
    probs, index_pos = probs.topk(k)
    index_pos = index_pos.numpy().squeeze()
    probs = probs.numpy().flatten()
    probs = probs/probs.sum()
    
    char = np.random.choice(index_pos, p = probs)
    
    return model.decoder[char], hidden

In [None]:
# Generate text
def generate_text(model, size, seed = 'The', k = 1):
    