### RNN Implmentation for NLP

In [1]:
# Importing the libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt

In [2]:
with open('Data/shakespeare.txt', 'r', encoding = 'utf8') as t:
    text = t.read()

In [3]:
print(text[:500])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bu


In [4]:
len(text)

5445609

In [5]:
all_char = set(text)

In [6]:
len(all_char)

84

In [7]:
# decoder
decoder = dict(enumerate(all_char))

In [8]:
# encoder
encoder = {char: i for i,char in decoder.items()}

In [9]:
encoded_text = np.array([encoder[char] for char in text])

In [10]:
encoded_text[:100]

array([24, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
       38, 38, 38, 38, 38, 77, 24, 38, 38, 45, 79, 21,  0, 38, 68, 36, 50,
       79, 72, 59, 42, 38,  8, 79, 72, 36, 42, 74, 79, 72, 59, 38, 58, 72,
       38, 78, 72, 59, 50, 79, 72, 38, 50, 17,  8, 79, 72, 36, 59, 72, 47,
       24, 38, 38, 51, 64, 36, 42, 38, 42, 64, 72, 79, 72, 16, 80, 38, 16,
       72, 36, 74, 42, 80, 11, 59, 38, 79, 21, 59, 72, 38,  0, 50])

In [11]:
# one hot encoding
def one_hot_enc(batch_text, uni_chars):
    one_hot = np.zeros((batch_text.size, uni_chars))
    one_hot = one_hot.astype(np.float32)
    
    one_hot[np.arange(one_hot.shape[0]), batch_text.flatten()] = 1.0
    
    one_hot = one_hot.reshape((*batch_text.shape, uni_chars))
    
    return one_hot

In [12]:
# Example
x = np.array([1,2,0])

In [13]:
one_hot_enc(x, 3)

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

In [14]:
# generate batches for training
def gen_batch(en_text, sample_size = 10, seq_len = 50):
    
    char_len = sample_size * seq_len
    num_batches = int(len(en_text) / char_len)
    
    en_text = en_text[: num_batches * char_len]
    en_text = en_text.reshape((sample_size, -1))
    
    for n in range(0,en_text.shape[-1], seq_len):
        x = en_text[:, n : n + seq_len]
        y = np.zeros_like(x)
        
        try:
            y[:, : -1] = x[:, 1:]
            y[:, -1] = en_text[:, n + seq_len]
        
        except:
            y[:, : -1] = x[:, 1:]
            y[:, -1] = en_text[:, 0]
        
        yield x,y

In [15]:
# Example 
sample_text = np.arange(20)

In [16]:
sample_text

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [17]:
batch_gen = gen_batch(sample_text, sample_size = 2, seq_len = 5)

In [18]:
x,y = next(batch_gen)

In [19]:
x

array([[ 0,  1,  2,  3,  4],
       [10, 11, 12, 13, 14]])

In [20]:
y

array([[ 1,  2,  3,  4,  5],
       [11, 12, 13, 14, 15]])

In [21]:
# RNN model
class CharRNN(nn.Module):
    def __init__(self, all_chars, num_hidden = 256, num_layers = 4, drop_prob = 0.5):
        
        super().__init__()
        
        self.drop_pob = drop_prob
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        self.all_chars = all_chars
        
        self.decoder = dict(enumerate(all_chars))
        self.encoder = {char: i for i, char in decoder.items()}
        
        # Architecture
        self.lstm = nn.LSTM(len(all_chars), num_hidden, num_layers, dropout = drop_prob, batch_first = True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc1 = nn.Linear(num_hidden, len(self.all_chars))
        
    def forward(self, x, hidden):
        
        lstm_out, hidden = self.lstm(x, hidden)
        drop_out = self.dropout(lstm_out)
        drop_out = drop_out.contiguous().view(-1, self.num_hidden)
        output = self.fc1(drop_out)
        
        return output, hidden
    
    def hidden_state(self, batch_size):
        
        hidden = (torch.zeros(self.num_layers, batch_size, self.num_hidden).cuda(),
                  torch.zeros(self.num_layers, batch_size, self.num_hidden).cuda())
        
        return hidden

In [22]:
model = CharRNN(all_chars = all_char, num_hidden = 512, num_layers = 3, drop_prob = 0.5)

In [23]:
model

CharRNN(
  (lstm): LSTM(84, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=512, out_features=84, bias=True)
)

In [24]:
params = []

for p in model.parameters():
    params.append(int(p.numel()))

In [25]:
sum(params)
# have some of params roughly equal to size of text data set to prevent over fitting

5470292

In [26]:
# hyperparams
lr = 0.001
train_per = 0.9
optimizer = torch.optim.Adam(model.parameters(), lr = lr)
criterion = nn.CrossEntropyLoss()
train_ind = int(len(encoded_text) * train_per)

In [27]:
train_ind

4901048

In [28]:
train_data = encoded_text[:train_ind]
test_data = encoded_text[train_ind:]

In [29]:
train_data[:50]

array([24, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
       38, 38, 38, 38, 38, 77, 24, 38, 38, 45, 79, 21,  0, 38, 68, 36, 50,
       79, 72, 59, 42, 38,  8, 79, 72, 36, 42, 74, 79, 72, 59, 38, 58])

In [33]:
# training hyperparams
epochs = 20
batch_size = 100
seq_len = 100
t = 0
num_char = max(encoded_text) + 1

In [31]:
num_char

84

In [32]:
# training
model.train()
model.cuda()

for i in range(epochs):
    
    hidden = model.hidden_state(batch_size)
    
    for x,y in gen_batch(train_data, batch_size, seq_len):
        t += 1
        
        x = one_hot_enc(x, num_char)
        inputs = torch.from_numpy(x)
        targets = torch.from_numpy(y)
        
        inputs = inputs.cuda()
        targets = targets.cuda()
        
        hidden = tuple([state.data for state in hidden])
        
        model.zero_grad()
        
        lstm_out, hidden = model.forward(inputs, hidden)
        
        loss = criterion(lstm_out, targets.view(batch_size * seq_len).long())
        loss.backward()
        
        nn.utils.clip_grad_norm_(model.parameters(), max_norm = 5)
        
        optimizer.step()
        
        if t % 25 == 0:
            
            val_hidden = model.hidden_state(batch_size)
            val_losses = []
            
            model.eval()
            
            for x,y in gen_batch(test_data, batch_size, seq_len):
                
                x = one_hot_enc(x, num_char)
                inputs = torch.from_numpy(x)
                targets = torch.from_numpy(y)
        
                inputs = inputs.cuda()
                targets = targets.cuda()
                
                val_hidden = tuple([state.data for state in val_hidden])
                
                lstm_out,val_hidden = model.forward(inputs, val_hidden)
                val_loss = criterion(lstm_out, targets.view(batch_size * seq_len).long())
                
                val_losses.append(val_loss.item())
            
            model.train()
            
            print(f"epoch: {i} | step: {t} | val loss {val_loss.item()}")

epoch : 0 | step: 25 | val loss 3.2135159969329834
epoch : 0 | step: 50 | val loss 3.1979482173919678
epoch : 0 | step: 75 | val loss 3.1852169036865234
epoch : 0 | step: 100 | val loss 3.067037582397461
epoch : 0 | step: 125 | val loss 2.9654741287231445
epoch : 0 | step: 150 | val loss 2.8195881843566895
epoch : 0 | step: 175 | val loss 2.719484329223633
epoch : 0 | step: 200 | val loss 2.6193275451660156
epoch : 0 | step: 225 | val loss 2.464092493057251
epoch : 0 | step: 250 | val loss 2.353654146194458
epoch : 0 | step: 275 | val loss 2.2722887992858887
epoch : 0 | step: 300 | val loss 2.204639434814453
epoch : 0 | step: 325 | val loss 2.154618263244629
epoch : 0 | step: 350 | val loss 2.104109287261963
epoch : 0 | step: 375 | val loss 2.0644683837890625
epoch : 0 | step: 400 | val loss 2.024813175201416
epoch : 0 | step: 425 | val loss 2.0004148483276367
epoch : 0 | step: 450 | val loss 1.9739222526550293
epoch : 0 | step: 475 | val loss 1.9516977071762085
epoch : 1 | step: 500 |

epoch : 8 | step: 3950 | val loss 1.24131441116333
epoch : 8 | step: 3975 | val loss 1.2407772541046143
epoch : 8 | step: 4000 | val loss 1.2465368509292603
epoch : 8 | step: 4025 | val loss 1.251021146774292
epoch : 8 | step: 4050 | val loss 1.254417896270752
epoch : 8 | step: 4075 | val loss 1.251602292060852
epoch : 8 | step: 4100 | val loss 1.2540996074676514
epoch : 8 | step: 4125 | val loss 1.251752495765686
epoch : 8 | step: 4150 | val loss 1.2541669607162476
epoch : 8 | step: 4175 | val loss 1.251842737197876
epoch : 8 | step: 4200 | val loss 1.2560356855392456
epoch : 8 | step: 4225 | val loss 1.2550889253616333
epoch : 8 | step: 4250 | val loss 1.2542452812194824
epoch : 8 | step: 4275 | val loss 1.2521226406097412
epoch : 8 | step: 4300 | val loss 1.2560330629348755
epoch : 8 | step: 4325 | val loss 1.2524782419204712
epoch : 8 | step: 4350 | val loss 1.2523118257522583
epoch : 8 | step: 4375 | val loss 1.2511502504348755
epoch : 8 | step: 4400 | val loss 1.2470382452011108


KeyboardInterrupt: 

In [None]:
# converges around a X.XX loss
# premise: loss does not reduce after a whole epoch
# save model
name = 'CharRNN_hidden512_layers3_shakes.net'

In [None]:
torch.save(model.state_dict(), name)