In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

In [282]:
# Define the default tensor type at the top
torch.set_default_tensor_type(torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

The project goal is to create an RNN that operates at the character level to generate text. This means, given a sequence of characters, the RNN will generate the next character in the sequence. If the model is trained well, this can be done repeatedly to generate text that resembles the training corpus. I'll try to make everything as generic as possible so any source text can be used.  

I've been working with the first two books from the Wheel of Time by Robert Jordan.

### Text preprocessing

In [442]:
# Import the text file as one big string
path = "../data/WoT/"
filenames = ["WoT1.txt", "WoT2.txt", "WoT3.txt"]
text = ""
for f in filenames:
    text += open(f'{path}{f}').read()
print(len(text))

4544942


In [336]:
text[1711173:1711500]

'THE WHEEL OF TIME\n\n\n\nBook Two\n\n\n\nTHE GREAT HUNT\n\n“Jordan has come to dominate the world that Tolkien began to reveal. . . . The battle scenes have the breathless urgency of firsthand experience, and the . . . evil laced into the forces of good, the dangers latent in any promised salvation, the sense of the unavoidable onslaug'

In [443]:
"""
The vocabulary is all the unique symbols used in the text. This is the real benefit of working with a character-
level RNN. The vocabulary is tiny, and you don't need to deal with unkown words.
"""
chars = sorted(set(text))
print(type(chars))
print(chars)
vocab_size = len(chars)
print(vocab_size)

<class 'list'>
['\n', ' ', '!', '"', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '©', '®', '–', '—', '‘', '’', '“', '”', '…', '™']
89


In [444]:
# Create dictionaries from character->index and index->character
c_to_idx = {c:i for i, c in enumerate(chars)}
idx_to_c = {i:c for i, c in enumerate(chars)}

In [445]:
# Convert whole text to indices. Want each character to be represented by its index in the vocabulary. This is how
# we'll feed it to the RNN.
text_idx = [c_to_idx[c] for c in text]
text_len = len(text_idx)
text_idx[:10]

[85, 45, 60, 57, 1, 30, 77, 57, 1, 67]

In [446]:
# Check it works to convert back: Join up the indices
print(text[25:100])
print("-----------")
print(''.join([idx_to_c[i] for i in text_idx[25:100]]))

the best of its genre.”

—The Ottawa Citizen



“A splendid tale of heroic 
-----------
the best of its genre.”

—The Ottawa Citizen



“A splendid tale of heroic 


### Create a dataloader

In [447]:
# Sequence of characters passed to RNN at a time. This dictates the length of the unrolled model (# timesteps)
# Batch size affects splitting of raw data as well as model architecture.
seq_len = 8
batch_size = 512

In [85]:
# Want a non-verlapping set of inputs and outputs. Each X should be equal to the sequence length. So should Y,
# but should be shifted by 1. Want to shift X by the sequence length each step.
# Note that we don't go right to the end so that there's room for Y and for a whole final sequence.
idx_in_data = [text_idx[idx:idx+seq_len] for idx in range(0, text_len-1-seq_len, seq_len)]

# idx_in_data = [[text_idx[idx] for idx in range(text_pt, text_pt+seq_len)] \
#                for text_pt in range(0, text_len - 1 - seq_len, seq_len)]

In [86]:
# Convert these inputs into a numpy array and provide some info. Note that the dimensions are the total number of
# sequences in the corpus and the sequence length
inp = np.array(idx_in_data)
print(inp.shape)
print(inp[:3, :])

(75111, 8)
[[39 41 28 29 24 26 28  0]
 [ 0  0 42 44 39 39 38 42]
 [32 37 30  1 72 60 53 72]]


In [87]:
# Do the same thing for Y
idx_out_data = [text_idx[idx:idx+seq_len] for idx in range(1, text_len-seq_len, seq_len)]

# idx_out_data = [[text_idx[idx] for idx in range(text_pt, text_pt+seq_len)] \
#                 for text_pt in range(1, text_len - seq_len, seq_len)]

In [88]:
# Confirm that the target array is the input array shifted by 1. We'll be predicting the next character in the
# sequence.
outp = np.array(idx_out_data)
print(outp.shape)
print(outp[:3,:])

(75111, 8)
[[41 28 29 24 26 28  0  0]
 [ 0 42 44 39 39 38 42 32]
 [37 30  1 72 60 53 72  1]]


In [89]:
'''
Split up the input and target data into training and test sets.
Return 4 numpy arrays - training input, training targets, test input, and test targets
'''
def train_test_split(inp_data, out_data, train_fraction):
    trn_idx = np.random.rand(len(inp_data)) < train_fraction
    
    inp_trn = inp_data[trn_idx]
    inp_test = inp_data[~trn_idx]
    
    outp_trn = out_data[trn_idx]
    outp_test = out_data[~trn_idx]
    return inp_trn, outp_trn, inp_test, outp_test

In [90]:
# Split the data into 90% training, 10% test. This ratio could change with bigger corpus
x_trn, y_trn, x_val, y_val = train_test_split(inp, outp, 0.9)

In [448]:
'''
PyTorch Dataset class for character level text generation.
X and Y have widths equal to the sequence length.
'''
class CharSeqDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        
    def __len__(self):
        return len(self.X);
    
    def __getitem__(self, idx):
        item = self.X[idx];
        label = self.Y[idx];
        
        return (item, label);

In [415]:
# Training and validation datasets
train_ds = CharSeqDataset(x_trn, y_trn)
val_ds = CharSeqDataset(x_val, y_val)

In [93]:
# Turn these into PyTorch dataloaders with batch size = batch_size.
# This will take care of the shuffling and batching,.
train_dl = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True);
val_dl = DataLoader(dataset=val_ds, batch_size=batch_size, shuffle=True);

A couple experiments with the data loaders.  
1. The X and Y values are paired. Show that shuffling keeps the lined up.
2. You get a different order whenever you iterate over a dataloader

In [94]:
exp_iter = iter(train_dl)
x_exp, y_exp = next(exp_iter)

In [95]:
# Exp 1.
print(x_exp.shape) # batch size by sequence length
print(type(x_exp))
print(x_exp[:2, :])
print("*****")
print(y_exp.shape)
print(type(y_exp))
print(y_exp[:2, :])

torch.Size([512, 8])
<class 'torch.Tensor'>
tensor([[57, 56,  1, 72, 60, 57, 70, 57],
        [77,  1, 72, 67,  1, 61, 65, 68]], device='cpu')
*****
torch.Size([512, 8])
<class 'torch.Tensor'>
tensor([[56,  1, 72, 60, 57, 70, 57, 54],
        [ 1, 72, 67,  1, 61, 65, 68, 73]], device='cpu')


In [96]:
# Exp 2.
exp_iter2 = iter(train_dl)
x_exp2, y_exp2 = next(exp_iter2)

print(x_exp2.shape) # batch size by sequence length
print(type(x_exp2))
print(x_exp2[:2, :])
print("*****")
print(y_exp2.shape)
print(type(y_exp2))
print(y_exp2[:2, :])

torch.Size([512, 8])
<class 'torch.Tensor'>
tensor([[54, 57, 71, 72,  7,  1, 75, 53],
        [ 1, 72, 60, 53, 72,  1, 53, 70]], device='cpu')
*****
torch.Size([512, 8])
<class 'torch.Tensor'>
tensor([[57, 71, 72,  7,  1, 75, 53, 66],
        [72, 60, 53, 72,  1, 53, 70, 57]], device='cpu')


### Character level RNN model class, using Pytorch

In [449]:
# Dimension for character's learned embeddings. Number of hidden units in the RNN
emb_dim = 42
n_hidden = 256

In [450]:
'''
Pytorch model.
One sequence step involves embedding layer->RNN->fully connected layer->softmax over vocabulary
A couple tricky points:
-Want to keep the hidden activation values after a forward pass. So I have to detach h after a 
forward pass so BPTT doesn't have to go through all the steps back to the very beginning of the corpus.
-Output predictions are rank 3 tensor of batch_size x seq_len x vocab length (it's a prediction over the vocab
for each char in the sequence and for each sequence in the minibatch). Softmax only accepts rank 2, so need to
reshape this into a (batch_size * seq_len) x vocab_length tensor.
'''
class CharRnn(nn.Module):
    def __init__(self, vocab_size, emb_dim, bs):
        super().__init__()
        self.e = nn.Embedding(vocab_size, emb_dim) # Going from vocab size down to embedding size
        # Automatically runs for N sequence steps, which is known from input data size
        self.rnn = nn.RNN(emb_dim, n_hidden) # embedding size to number of hidden units
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.h = self.init_h(bs)
        
    def forward(self, cs):
        bs = cs.shape[0]
        if self.h.shape[1] != bs:
            self.h = self.init_h(bs)
        inp = self.e(cs)
        inp = torch.transpose(inp, 0, 1)
        outp, h = self.rnn(inp, self.h)
        self.h = Variable(h.data) # Save hidden values for next forward pass. Remove from BPTT by rewrapping in Var
        outp = F.log_softmax(self.l_out(outp), dim=-1)
        outp = torch.transpose(outp, 0, 1)
        return outp.contiguous().view(-1, vocab_size) #This is tricky! Write myself a note it
    
    def init_h(self, bs):
        return Variable(torch.zeros(1, bs, n_hidden))

In [451]:
# Training function does 1 epoch (pass through the data)
def train(model, opt, crit, train_loader):
    losses = []
    model.train()
    
    for i, (inputs, targets) in enumerate(train_loader):
        opt.zero_grad()
        outputs = model(inputs.to(device))
        targets = targets.view(-1).to(device)
        loss = crit(outputs, targets)
        loss.backward()
        opt.step()

        losses.append(loss.data);
    return losses

In [452]:
# Test function calculates average loss over all the test data.
def test(model, test_loader, crit):
    # Put model in evaluation mode. Read up on what it does
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs.to(device))
            targets = targets.view(-1).to(device)
#             l = F.nll_loss(outputs, targets, reduction='sum').item() / len(targets)# sum up batch loss
            l = crit(outputs, targets)
            test_loss += l.item()
            pred = outputs.max(1, keepdim=True)[1] # get the index of the max log-probability (char index)
            correct += pred.eq(targets.view_as(pred)).sum().item()
    test_loss /= len(test_loader)
    return test_loss

In [24]:
model = CharRnn(vocab_size, emb_dim, batch_size)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss();

In [25]:
epochs = 4
for ep in range(epochs):
    tr_loss = train(model, optimizer, criterion, train_dl)
    test_loss = test(model, val_dl, criterion)
    print(f'Epoch: {ep+1} / {epochs}, Training Loss: {tr_loss[-1]:.4f}, Validation Loss: {test_loss:.4f}')

Epoch: 1 / 4, Training Loss: 2.4561, Validation Loss: 2.4268
Epoch: 2 / 4, Training Loss: 2.2769, Validation Loss: 2.2383
Epoch: 3 / 4, Training Loss: 2.1676, Validation Loss: 2.1346
Epoch: 4 / 4, Training Loss: 2.0905, Validation Loss: 2.0739


### Testing - look at this quantitatively and see how well it works

In [26]:
'''
Given an input and a trained model, do a forward pass and predict the next character in the input sequence.
Return this character as its integer index in the vocabulary.
'''
def next_letter(my_model, inp):

    inp = torch.tensor([inp])
    model_out = my_model(inp)
    # Grab the last letter from the model output
    # And sample from the vocabulary based on the weighted probability for character in the vocab.
    # This makes this result non-deterministic, there can be variance between the next letter in the sequence
    # depending on the sampling. Especially if multiple character get assigned similar probabilities.
    next_idx = torch.multinomial(model_out[-1].exp(), 1).item()
    
    # return the next character index in the sequence
    return next_idx

In [27]:
mytext = "thos"
mytext = [c_to_idx[i] for i in mytext]
nl = next_letter(model, mytext)
print(nl, idx_to_c[nl])

7 ,


In [28]:
'''
Keep generating the next character in the sequence. Repeatedly move the sampling window to include the latest
prediction and predict the next letter. Goes for num_chars repetitions.
'''
def gen_text(my_model, inp, num_chars):
    text = inp
    inp = [c_to_idx[i] for i in inp]
    for c in range(num_chars):
        l = next_letter(my_model, inp)
        text += idx_to_c[l]
        inp = inp[1:]+[l]
    print(text)

In [29]:
gen_text(model, "Hello", 400)

Helloranoponof
val angs by
itsed Ipower alves feiks thems concaus hemean the itself cheasio revenoss, Qe NUE The dormed best of the againssicct is sas the aroutnt a ATSASNLTECOSSSothingle et mslever a saysantions out myencersind which ty that we all to the wor have nom the lstion of a dosed as what it. For smecopher cour, the we was, to the sersthoundikn Rir po once;--the loge houver
man sen ghighatura


### Step 1 Summary

It looks like it's starting to work alright, especially since it hasn't trained for too long. I found that benefits of continued training started to level off after ~30 epochs or so.

I learned something important here though. When I split up the corpus into sequences of length 8 (sequence length / bptt length), characters 1 - 8 are the first training example in batch 1, 9 - 16 are the second etc.
What that means is that the hidden states after the forward pass are meaningless for the next batch. There's no information gained about the previous sequence to help you out with the current sequence! 

Here's a different idea -> What if characters 1 - 8 make up the first training example of the first batch, then characters 9 - 16 make up the first training example of the second batch. That way (since we're saving activation values) when character 9 gets passed in as the first step to the RNN, the activations correspond to what came out after character 8, which was the last character of example 1 in the previous minibatch.

### Split the data up into "vertical stacks" as explained above

In [319]:
print(f'Corpus length: {len(text)}')
print(f'Batch size: {batch_size}')
print(f'Sequence length / bptt length: {seq_len}')

Corpus length: 1711173
Batch size: 512
Sequence length / bptt length: 8


Want to split the corpus into a number of chunks equal to the number of mini batches (512) because each chunk will represent a row example in successive minibatches. Also, the sequences need to still be seq_len long. So it's easiest to figure out how long (number of chars) a block can be if we need to get 512 into the corpus, then round that length to something evenly divisible by the sequene length. We lose a little bit of potential information, but it MAY be easier than having the final minibatch have a shorter sequence. Another option would be to zero-pad that last sequence, but I'm not going to worry about that since there's a lot of training data.

In [453]:
'''
-Pass in text, batch size, and sequence length to get back numpy array where consecutive text is lined up
across minibatches.
-Remember, in a list comprehension, the second for executes fully (it's the nested one). Each pass through the text, 
we grab a sequence lengthed bit of text from each "mini-batch block". Then next pass the index is shifted over.
The idea is that you build an array where mini_batch example i makes continuous text across the mini batches,
rather than within a minibatch.
'''
def vertical_chunk(text, bs, sl):
    s_per_block = len(text) // sl // bs
    c_per_block = s_per_block * sl
    tl = c_per_block * bs
    
    r = [text[b+i : b+i+sl] for i in range(0,c_per_block,sl) for b in range(0,tl,c_per_block)]
    return np.array(r)

In [454]:
stacked_inp = vertical_chunk(text_idx, batch_size, seq_len)
stacked_outp = vertical_chunk(text_idx[1:], batch_size, seq_len)

In [455]:
print(stacked_inp.shape)
print(stacked_outp.shape)

(567808, 8)
(567808, 8)


In [456]:
# Show that continuous text is split over minibatch indices
print(text_idx[:16])
print("********")
print(stacked_inp[0])
print(stacked_outp[0])
print("********")
print(stacked_inp[512])
print(stacked_outp[512])

[85, 45, 60, 57, 1, 30, 77, 57, 1, 67, 58, 1, 72, 60, 57, 1]
********
[85 45 60 57  1 30 77 57]
[45 60 57  1 30 77 57  1]
********
[ 1 67 58  1 72 60 57  1]
[67 58  1 72 60 57  1 48]


Time to make a dataloader. But we don't want to shuffle the data, because the continuity is important for the activations. So just split up the data into test and train by index, then make a dataloader without shuffle on.

In [457]:
# Don't want to randomly split. Just take the first half
# st_x_trn, st_y_trn, st_x_val, st_y_val = train_test_split(stacked_inp, stacked_outp, 0.9)
def data_split_nonrandom(in_data, out_data, train_frac):
    portion = int(len(in_data) * train_frac)
    return in_data[:portion], out_data[:portion], in_data[portion:], out_data[portion:]

In [458]:
st_x_trn, st_y_trn, st_x_val, st_y_val = data_split_nonrandom(stacked_inp, stacked_outp, 0.9)

In [459]:
# Training and validation datasets
st_train_ds = CharSeqDataset(st_x_trn, st_y_trn)
st_val_ds = CharSeqDataset(st_x_val, st_y_val)

In [460]:
# Unshuffled dataloaders
# Is there a way to shuffle cross batch? Does this even have a point?
st_train_dl = DataLoader(dataset=st_train_ds, batch_size=batch_size, shuffle=False);
st_val_dl = DataLoader(dataset=st_val_ds, batch_size=batch_size, shuffle=False);

In [461]:
# Test that we're not shuffling
test_iter = iter(st_train_dl)
x_test, y_test = next(test_iter)
print(x_test.shape)
print(x_test[0])
print(y_test[0])

torch.Size([512, 8])
tensor([85, 45, 60, 57,  1, 30, 77, 57], device='cpu')
tensor([45, 60, 57,  1, 30, 77, 57,  1], device='cpu')


In [462]:
x_test, y_test = next(test_iter)
print(x_test.shape)
print(x_test[0])
print(y_test[0])

torch.Size([512, 8])
tensor([ 1, 67, 58,  1, 72, 60, 57,  1], device='cpu')
tensor([67, 58,  1, 72, 60, 57,  1, 48], device='cpu')


In [463]:
x_test, y_test = next(test_iter)
print(x_test.shape)
print(x_test[0])
print(y_test[0])

torch.Size([512, 8])
tensor([48, 67, 70, 64, 56,  1, 61, 71], device='cpu')
tensor([67, 70, 64, 56,  1, 61, 71,  1], device='cpu')


In [381]:
print(text_idx[16:24])

[48, 67, 70, 64, 56, 1, 61, 71]


### Train the model with vertically stacked data

The data all looks great now. Try training the existing model and see what we get.

In [464]:
st_model = CharRnn(vocab_size, emb_dim, batch_size).to(device)
st_optimizer = torch.optim.Adam(st_model.parameters(), lr=1e-3)
st_criterion = nn.CrossEntropyLoss();

In [465]:
len(st_train_dl) # Books 1 and 2 was 697

999

In [466]:
epochs = 20
for ep in range(epochs):
    tr_loss = train(st_model, st_optimizer, st_criterion, st_train_dl)
    test_loss = test(st_model, st_val_dl, st_criterion)
    print(f'Epoch: {ep+1} / {epochs}, Training Loss: {tr_loss[-1]:.4f}, Validation Loss: {test_loss:.4f}')

Epoch: 1 / 20, Training Loss: 2.1074, Validation Loss: 1.5616
Epoch: 2 / 20, Training Loss: 1.9761, Validation Loss: 1.4362
Epoch: 3 / 20, Training Loss: 1.8828, Validation Loss: 1.3812
Epoch: 4 / 20, Training Loss: 1.7989, Validation Loss: 1.3507
Epoch: 5 / 20, Training Loss: 1.7317, Validation Loss: 1.3318
Epoch: 6 / 20, Training Loss: 1.6758, Validation Loss: 1.3188
Epoch: 7 / 20, Training Loss: 1.6241, Validation Loss: 1.3085
Epoch: 8 / 20, Training Loss: 1.5787, Validation Loss: 1.3001
Epoch: 9 / 20, Training Loss: 1.5404, Validation Loss: 1.2934
Epoch: 10 / 20, Training Loss: 1.5076, Validation Loss: 1.2879
Epoch: 11 / 20, Training Loss: 1.4791, Validation Loss: 1.2835
Epoch: 12 / 20, Training Loss: 1.4556, Validation Loss: 1.2798
Epoch: 13 / 20, Training Loss: 1.4358, Validation Loss: 1.2768
Epoch: 14 / 20, Training Loss: 1.4165, Validation Loss: 1.2744
Epoch: 15 / 20, Training Loss: 1.3961, Validation Loss: 1.2725
Epoch: 16 / 20, Training Loss: 1.3757, Validation Loss: 1.2708
E

This is training pretty nicely now. I'm seeing it max out around epoch 30 with a validation loss of 1.42 when looking at just the first book. From there the training loss keeps coming down and the validation loss starts to climb again. This gives us some clues for what to do next to improve things.

### Test this model out. How does it do?

In [467]:
gen_text(st_model, "Get thos", 400)

Get those get going food? Didy despite he had be who, travel as thought climbed they would grace of passing.”

Caution, if he sugs—but Perrin worse realized to be direction, the sniffed, some minute.”

“I supposed,” Moiraine was any cource. Sandfilled; they’d be?”

“So that had hardle of an edfew and never been much as he could does wool; she? I can’t calm taceles did not be.

SLove for youthon, Loial, sh


Generation looks pretty good! Structure and mainly real words. I'm not sure if I'm doing the generation completely efficiently / properly, because I'm passing in a new sequence each forward pass. Think about this some more.

### Function to create all my data

In [468]:
# Given some text, batch size, and sequence length, return the training and validation dataloaders
def create_dataloader(text, bs, sl, spl_frac):
    stacked_inp = vertical_chunk(text_idx, batch_size, sl)
    stacked_outp = vertical_chunk(text_idx[1:], batch_size, sl)
    st_x_trn, st_y_trn, st_x_val, st_y_val = data_split_nonrandom(stacked_inp, stacked_outp, spl_frac)
    st_train_ds = CharSeqDataset(st_x_trn, st_y_trn)
    st_val_ds = CharSeqDataset(st_x_val, st_y_val)
    training_dl = DataLoader(dataset=st_train_ds, batch_size=batch_size, shuffle=False);
    validation_dl = DataLoader(dataset=st_val_ds, batch_size=batch_size, shuffle=False);
    return training_dl, validation_dl

In [469]:
wot_sl = 24

In [478]:
# Try some data with different sequence lengths.
wot_trn_dl, wot_val_dl = create_dataloader(text_idx, batch_size, wot_sl, 0.95)

In [479]:
print(len(wot_trn_dl), len(wot_val_dl)) #length is dependent on sequence length.

351 19


In [480]:
# Test that everything looks ok. Sequence length is equal to wot_sl
test_iter = iter(wot_trn_dl)
x_test, y_test = next(test_iter)
print(x_test.shape)
print(x_test[0])
print(y_test[0])

torch.Size([512, 24])
tensor([85, 45, 60, 57,  1, 30, 77, 57,  1, 67, 58,  1, 72, 60, 57,  1, 48, 67,
        70, 64, 56,  1, 61, 71], device='cpu')
tensor([45, 60, 57,  1, 30, 77, 57,  1, 67, 58,  1, 72, 60, 57,  1, 48, 67, 70,
        64, 56,  1, 61, 71,  1], device='cpu')


### Can I get better results with a GRU?

In [481]:
def train_gpu(model, opt, crit, train_loader):
    losses = []
    model.train()
    
    for i, (inputs, targets) in enumerate(train_loader):
        opt.zero_grad()
        outputs = model(inputs.cuda())
        
        targets = targets.view(-1).cuda()
        loss = crit(outputs, targets)
        loss.backward()
        opt.step()

        losses.append(loss.data);
        if i+1 % 75 == 0:
            print(f'Iteration: {i}, Training Loss: {loss.data}')
    return losses

In [482]:
# Test function with gpu loading
def test_gpu(model, test_loader, crit):
    # Put model in evaluation mode. Read up on what it does
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs.cuda())
            targets = targets.view(-1).cuda()
#             l = F.nll_loss(outputs, targets, reduction='sum').item() / len(targets)# sum up batch loss
            l = crit(outputs, targets)
            test_loss += l.item()
            pred = outputs.max(1, keepdim=True)[1] # get the index of the max log-probability (char index)
            correct += pred.eq(targets.view_as(pred)).sum().item()
    test_loss /= len(test_loader)
    return test_loss

In [483]:
class CharGRU(nn.Module):
    def __init__(self, vocab_size, emb_dim, bs, num_layers):
        super().__init__()
        self.vocab_size = vocab_size
        self.layers = num_layers
        self.e = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, n_hidden, num_layers=self.layers, dropout=0.3)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs.shape[0]
        if self.h.shape[1] != bs:
            self.init_hidden(bs)
        # Does the GRU note need a transpose and a transpose back?
        inp = self.e(cs)
        inp = torch.transpose(inp, 0, 1)
        outp,h = self.rnn(inp, self.h)
        self.h = Variable(h.data)
        # Transpose back
        outp = F.log_softmax(self.l_out(outp), dim=-1)
        outp = torch.transpose(outp, 0, 1)
        return outp.contiguous().view(-1, vocab_size) #This is tricky! Write myself a note it
    
    def init_hidden(self, bs): 
        self.h = Variable(torch.zeros(self.layers, bs, n_hidden))

In [484]:
n_hidden= 256
n_layer = 2
learning_rate=1e-3
num_epochs=100

In [485]:
modelGRU = CharGRU(vocab_size, emb_dim, batch_size, n_layer).cuda()
optGRU = torch.optim.Adam(modelGRU.parameters(), lr=learning_rate)
critGRU = nn.CrossEntropyLoss();

In [486]:
for ep in range(num_epochs):
    tr_loss = train_gpu(modelGRU, optGRU, critGRU, wot_trn_dl) #st_train_dl
    test_loss = test_gpu(modelGRU, wot_val_dl, critGRU)
    print(f'Epoch: {ep+1} / {num_epochs}, Training Loss: {tr_loss[-1]:.4f}, Validation Loss: {test_loss:.4f}')

Epoch: 1 / 100, Training Loss: 1.7859, Validation Loss: 1.5686
Epoch: 2 / 100, Training Loss: 1.6115, Validation Loss: 1.3610
Epoch: 3 / 100, Training Loss: 1.5358, Validation Loss: 1.2873
Epoch: 4 / 100, Training Loss: 1.4989, Validation Loss: 1.2493
Epoch: 5 / 100, Training Loss: 1.4640, Validation Loss: 1.2242
Epoch: 6 / 100, Training Loss: 1.4514, Validation Loss: 1.2077
Epoch: 7 / 100, Training Loss: 1.4310, Validation Loss: 1.1954
Epoch: 8 / 100, Training Loss: 1.4206, Validation Loss: 1.1854
Epoch: 9 / 100, Training Loss: 1.4081, Validation Loss: 1.1768
Epoch: 10 / 100, Training Loss: 1.3967, Validation Loss: 1.1707
Epoch: 11 / 100, Training Loss: 1.3813, Validation Loss: 1.1644
Epoch: 12 / 100, Training Loss: 1.3760, Validation Loss: 1.1592
Epoch: 13 / 100, Training Loss: 1.3695, Validation Loss: 1.1554
Epoch: 14 / 100, Training Loss: 1.3730, Validation Loss: 1.1513
Epoch: 15 / 100, Training Loss: 1.3565, Validation Loss: 1.1484
Epoch: 16 / 100, Training Loss: 1.3564, Validatio

KeyboardInterrupt: 

### GRU test

In [437]:
gen_text(modelGRU, "Rand isn", 400)

Rand isn’t liketask. Med masters halfwinterstones, we’lld the worlds Raal arresses and case from the Army look at him. Just notice and found. In abivemen. Few of the Seat Sundray old ever. As you are got and born ones and trying to detead to hide and you’re calling at all, but a little stripe I have someone of leaf as the Gaidin, a man right, and though well up—the Hurin when his tongues. Rand had to be s


Validation error is around 1.12 or so and the test output looks pretty decent. To get it better, I'll have to get creative.

### LSTM?

In [110]:
# Two layer LSTM. Possible to make it 1?
class CharLSTM(nn.Module):
    def __init__(self, vocab_size, n_fac, bs, nl):
        super().__init__()
        self.vocab_size,self.nl = vocab_size,nl
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h[0].size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
#         print(type(h))
        self.h = tuple(Variable(v.data) for v in h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs):
        self.h = (Variable(torch.zeros(self.nl, bs, n_hidden)),
                  Variable(torch.zeros(self.nl, bs, n_hidden)))

In [136]:
n_hidden= 256
n_layer = 2
learning_rate=1e-3
num_epochs=100

In [137]:
modelLSTM = CharLSTM(vocab_size, emb_dim, batch_size, n_layer).cuda()
optLSTM = torch.optim.Adam(modelLSTM.parameters(), lr=learning_rate)
critLSTM = nn.CrossEntropyLoss();

In [140]:
for ep in range(num_epochs):
    tr_loss = train_gpu(modelLSTM, optLSTM, critLSTM, st_train_dl)
    test_loss = test_gpu(modelLSTM, st_val_dl, critLSTM)
    print(f'Epoch: {ep+1} / {num_epochs}, Training Loss: {tr_loss[-1]:.4f}, Validation Loss: {test_loss:.4f}')

Epoch: 1 / 100, Training Loss: 2.6875, Validation Loss: 2.6872
Epoch: 2 / 100, Training Loss: 2.5807, Validation Loss: 2.5795
Epoch: 3 / 100, Training Loss: 2.5426, Validation Loss: 2.5384
Epoch: 4 / 100, Training Loss: 2.5071, Validation Loss: 2.5186
Epoch: 5 / 100, Training Loss: 2.5020, Validation Loss: 2.5079
Epoch: 6 / 100, Training Loss: 2.4940, Validation Loss: 2.5011
Epoch: 7 / 100, Training Loss: 2.4868, Validation Loss: 2.4962
Epoch: 8 / 100, Training Loss: 2.4866, Validation Loss: 2.4924
Epoch: 9 / 100, Training Loss: 2.4715, Validation Loss: 2.4898
Epoch: 10 / 100, Training Loss: 2.4773, Validation Loss: 2.4878
Epoch: 11 / 100, Training Loss: 2.4731, Validation Loss: 2.4863
Epoch: 12 / 100, Training Loss: 2.4679, Validation Loss: 2.4852
Epoch: 13 / 100, Training Loss: 2.4697, Validation Loss: 2.4840
Epoch: 14 / 100, Training Loss: 2.4677, Validation Loss: 2.4833
Epoch: 15 / 100, Training Loss: 2.4655, Validation Loss: 2.4827
Epoch: 16 / 100, Training Loss: 2.4642, Validatio

KeyboardInterrupt: 