In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

The project goal is to create an RNN that operates at the character level to generate text. This means, given a sequence of characters, the RNN will generate the next character in the sequence. If the model is trained well, this can be done repeatedly to generate text that resembles the training corpus. I'll try to make everything as generic as possible so any source text can be used.

### Text preprocessing

In [6]:
# Import the text file as one big string.
path = "../data/"
filename = "nietzsche.txt"
# A context manager closes the file handle deterministically, and pinning the encoding
# keeps the vocabulary (which contains non-ASCII characters) independent of the
# platform's default codec.
with open(f'{path}{filename}', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
print(len(text))

600893


In [7]:
"""
The vocabulary is all the unique symbols used in the text. This is the real benefit of working with a character-
level RNN. The vocabulary is tiny, and you don't need to deal with unknown words.
"""
chars = sorted(set(text))
print(type(chars))
print(chars)
vocab_size = len(chars)
print(vocab_size)

<class 'list'>
['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Æ', 'ä', 'æ', 'é', 'ë']
84


In [8]:
# Two lookup tables over the sorted vocabulary: index -> character, and its inverse
# character -> index.
idx_to_c = dict(enumerate(chars))
c_to_idx = {char: idx for idx, char in idx_to_c.items()}

In [9]:
# Convert whole text to indices. Want each character to be represented by its index in the vocabulary. This is how
# we'll feed it to the RNN.
text_idx = [c_to_idx[c] for c in text]
text_len = len(text_idx)
text_idx[:10]

[39, 41, 28, 29, 24, 26, 28, 0, 0, 0]

In [10]:
# Check it works to convert back: Join up the indices
print(text[25:100])
print("-----------")
print(''.join([idx_to_c[i] for i in text_idx[25:100]]))

Truth is a woman--what then? Is there not ground
for suspecting that all ph
-----------
Truth is a woman--what then? Is there not ground
for suspecting that all ph


### Create a dataloader

In [14]:
# Sequence of characters passed to RNN at a time. This dictates the length of the unrolled model (# timesteps)
# Batch size affects splitting of raw data as well as model architecture.
seq_len = 8
batch_size = 512

In [22]:
# Want a non-overlapping set of inputs and outputs. Each X should be equal to the sequence length. So should Y,
# but should be shifted by 1. Want to shift X by the sequence length each step.
# Note that we don't go right to the end so that there's room for Y and for a whole final sequence.
idx_in_data = [text_idx[idx:idx+seq_len] for idx in range(0, text_len-1-seq_len, seq_len)]

# idx_in_data = [[text_idx[idx] for idx in range(text_pt, text_pt+seq_len)] \
#                for text_pt in range(0, text_len - 1 - seq_len, seq_len)]

In [23]:
# Convert these inputs into a numpy array and provide some info. Note that the dimensions are the total number of
# sequences in the corpus and the sequence length
inp = np.array(idx_in_data)
print(inp.shape)
print(inp[:3, :])

(75111, 8)
[[39 41 28 29 24 26 28  0]
 [ 0  0 42 44 39 39 38 42]
 [32 37 30  1 72 60 53 72]]


In [25]:
# Do the same thing for Y
idx_out_data = [text_idx[idx:idx+seq_len] for idx in range(1, text_len-seq_len, seq_len)]

# idx_out_data = [[text_idx[idx] for idx in range(text_pt, text_pt+seq_len)] \
#                 for text_pt in range(1, text_len - seq_len, seq_len)]

In [26]:
# Confirm that the target array is the input array shifted by 1. We'll be predicting the next character in the
# sequence.
outp = np.array(idx_out_data)
print(outp.shape)
print(outp[:3,:])

(75111, 8)
[[41 28 29 24 26 28  0  0]
 [ 0 42 44 39 39 38 42 32]
 [37 30  1 72 60 53 72  1]]


In [27]:
def train_test_split(inp_data, out_data, train_fraction, seed=None):
    '''
    Randomly split paired input and target arrays into training and test sets.

    Every row is assigned to the training set independently with probability
    train_fraction, so the resulting sizes are only approximately proportional.
    The same boolean mask is applied to both arrays, which keeps inputs and
    targets aligned.

    inp_data, out_data: arrays that support boolean-mask indexing (numpy arrays in
        this notebook) and have the same number of rows.
    train_fraction: probability that a row lands in the training set.
    seed: optional integer. When given, the mask is drawn from a dedicated
        RandomState so the split is reproducible. When None (the default) the
        draw comes from numpy's global random state, exactly as before.

    Returns 4 arrays: training input, training targets, test input, test targets.
    Raises ValueError if the two arrays do not have the same number of rows.
    '''
    if len(inp_data) != len(out_data):
        raise ValueError(
            f'inp_data and out_data must have the same length, got {len(inp_data)} and {len(out_data)}'
        )

    n_rows = len(inp_data)
    if seed is None:
        draws = np.random.rand(n_rows)
    else:
        draws = np.random.RandomState(seed).rand(n_rows)
    trn_idx = draws < train_fraction

    inp_trn = inp_data[trn_idx]
    inp_test = inp_data[~trn_idx]

    outp_trn = out_data[trn_idx]
    outp_test = out_data[~trn_idx]
    return inp_trn, outp_trn, inp_test, outp_test

In [28]:
# Split the data into 90% training, 10% test. This ratio could change with bigger corpus
x_trn, y_trn, x_val, y_val = train_test_split(inp, outp, 0.9)

In [29]:
class CharSeqDataset(Dataset):
    '''
    PyTorch Dataset for character-level text generation.

    Holds two indexable collections of equal length: X (input sequences) and
    Y (target sequences). Item i is the pair (X[i], Y[i]). In this notebook both
    have a width equal to the sequence length.
    '''

    def __init__(self, X, Y):
        self.X, self.Y = X, Y

    def __len__(self):
        # The number of samples is the number of rows in X.
        return len(self.X)

    def __getitem__(self, idx):
        # Return the input sequence together with its matching target.
        return self.X[idx], self.Y[idx]

In [30]:
# Training and validation datasets
train_ds = CharSeqDataset(x_trn, y_trn)
val_ds = CharSeqDataset(x_val, y_val)

In [31]:
# Turn these into PyTorch dataloaders with batch size = batch_size.
# This will take care of the shuffling and batching.
train_dl = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True);
val_dl = DataLoader(dataset=val_ds, batch_size=batch_size, shuffle=True);

A couple experiments with the data loaders.  
1. The X and Y values are paired. Show that shuffling keeps them lined up.
2. You get a different order whenever you iterate over a dataloader

In [39]:
exp_iter = iter(train_dl)
x_exp, y_exp = next(exp_iter)

In [40]:
# Exp 1.
print(x_exp.shape) # batch size by sequence length
print(type(x_exp))
print(x_exp[:2, :])
print("*****")
print(y_exp.shape)
print(type(y_exp))
print(y_exp[:2, :])

torch.Size([512, 8])
<class 'torch.Tensor'>
tensor([[ 1, 58, 67, 70, 59, 61, 74, 57],
        [72, 60, 57, 65,  1, 72, 53, 71]])
*****
torch.Size([512, 8])
<class 'torch.Tensor'>
tensor([[58, 67, 70, 59, 61, 74, 57, 66],
        [60, 57, 65,  1, 72, 53, 71, 72]])


In [45]:
# Exp 2.
exp_iter2 = iter(train_dl)
x_exp2, y_exp2 = next(exp_iter2)

print(x_exp2.shape) # batch size by sequence length
print(type(x_exp2))
print(x_exp2[:2, :])
print("*****")
print(y_exp2.shape)
print(type(y_exp2))
print(y_exp2[:2, :])

torch.Size([512, 8])
<class 'torch.Tensor'>
tensor([[55, 67, 73, 66, 72, 57, 70,  1],
        [55,  1, 65, 67, 56, 57,  1, 67]])
*****
torch.Size([512, 8])
<class 'torch.Tensor'>
tensor([[67, 73, 66, 72, 57, 70,  1, 68],
        [ 1, 65, 67, 56, 57,  1, 67, 58]])


### Character level RNN model class, using Pytorch

In [51]:
# Dimension for character's learned embeddings. Number of hidden units in the RNN
emb_dim = 42
n_hidden = 256

In [52]:
class CharRnn(nn.Module):
    '''
    Stateful character-level RNN.

    One forward pass runs embedding -> RNN -> fully connected layer -> log-softmax
    over the vocabulary for every character of every sequence in the minibatch.

    A couple of tricky points:
    - The hidden activations are kept after a forward pass and used as the initial
      state of the next one. They are detached first so that BPTT stops at the
      start of the current minibatch instead of running back to the beginning of
      the corpus.
    - The per-character predictions form a rank 3 tensor of
      batch_size x seq_len x vocab_size. The loss expects rank 2, so the result is
      flattened to (batch_size * seq_len) x vocab_size in batch-major order, which
      is the same order as targets.view(-1) for a (batch_size, seq_len) target.

    vocab_size: number of distinct characters.
    emb_dim: size of the learned character embedding.
    bs: batch size used to allocate the initial hidden state.
    n_hidden: number of hidden units. Defaults to 256, the value this notebook
        uses; it used to be read from a module-level global.
    '''
    def __init__(self, vocab_size, emb_dim, bs, n_hidden=256):
        super().__init__()
        # Keep the sizes on the instance so forward() does not depend on globals.
        self.vocab_size = vocab_size
        self.n_hidden = n_hidden
        self.e = nn.Embedding(vocab_size, emb_dim)  # vocab size down to embedding size
        # batch_first=True lets the RNN consume (batch, seq, emb) directly, so no
        # transposes are needed. The number of steps comes from the input shape.
        self.rnn = nn.RNN(emb_dim, n_hidden, batch_first=True)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.h = self.init_h(bs)

    def forward(self, cs):
        '''
        cs: LongTensor of character indices, shape (batch_size, seq_len).
        Returns log-probabilities of shape (batch_size * seq_len, vocab_size).
        '''
        bs = cs.shape[0]
        # Start from a fresh state when the batch size changes (e.g. the last,
        # smaller batch or single-sequence generation) or the model moved device.
        if self.h.shape[1] != bs or self.h.device != self.l_out.weight.device:
            self.h = self.init_h(bs)
        emb = self.e(cs)                  # (bs, seq_len, emb_dim)
        outp, h = self.rnn(emb, self.h)   # outp: (bs, seq_len, n_hidden)
        # Save the hidden values for the next forward pass, cut off from the graph.
        self.h = h.detach()
        log_probs = F.log_softmax(self.l_out(outp), dim=-1)
        # Flatten batch and time into one axis: row index = b * seq_len + t.
        return log_probs.reshape(-1, self.vocab_size)

    def init_h(self, bs):
        '''Return an all-zero hidden state of shape (1, bs, n_hidden) on the model's device.'''
        return torch.zeros(1, bs, self.n_hidden, device=self.l_out.weight.device)

In [53]:
# Training function does 1 epoch (pass through the data)
def train(model, opt, crit, train_loader):
    '''
    Train model for one pass over train_loader.

    model: module called as model(inputs) that returns one prediction row per
        target element.
    opt: optimizer over the model's parameters.
    crit: loss called as crit(outputs, flat_targets).
    train_loader: iterable of (inputs, targets) batches.

    Returns a list with the loss of every batch as a plain Python float, in
    iteration order.
    '''
    losses = []
    model.train()

    for inputs, targets in train_loader:
        opt.zero_grad()
        outputs = model(inputs)
        # Targets are flattened so they line up with the flattened predictions.
        loss = crit(outputs, targets.view(-1))
        loss.backward()
        opt.step()

        # .item() stores a float rather than a tensor, so the history holds no
        # reference to tensor storage.
        losses.append(loss.item())
    return losses

In [54]:
# Test function calculates average loss over all the test data.
def test(model, test_loader, crit):
    # Put model in evaluation mode. Read up on what it does
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs)
            targets = targets.view(-1)
#             l = F.nll_loss(outputs, targets, reduction='sum').item() / len(targets)# sum up batch loss
            l = crit(outputs, targets)
            test_loss += l.item()
            pred = outputs.max(1, keepdim=True)[1] # get the index of the max log-probability (char index)
            correct += pred.eq(targets.view_as(pred)).sum().item()
    test_loss /= len(test_loader)
    return test_loss

In [55]:
model = CharRnn(vocab_size, emb_dim, batch_size)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# CharRnn.forward already returns log-probabilities (F.log_softmax), so the matching
# criterion is NLLLoss. CrossEntropyLoss would apply a second, redundant log-softmax
# on top of them; the loss value is the same either way.
criterion = nn.NLLLoss()

In [56]:
epochs = 4
for ep in range(epochs):
    tr_loss = train(model, optimizer, criterion, train_dl)
    test_loss = test(model, val_dl, criterion)
    print(f'Epoch: {ep+1} / {epochs}, Training Loss: {tr_loss[-1]:.4f}, Validation Loss: {test_loss:.4f}')

Epoch: 1 / 4, Training Loss: 2.4953, Validation Loss: 2.4161
Epoch: 2 / 4, Training Loss: 2.2785, Validation Loss: 2.2416
Epoch: 3 / 4, Training Loss: 2.2211, Validation Loss: 2.1428
Epoch: 4 / 4, Training Loss: 2.0888, Validation Loss: 2.0796


### Testing - look at this quantitatively and see how well it works

In [57]:
def next_letter(my_model, inp):
    '''
    Given an input and a trained model, do a forward pass and sample the next
    character in the input sequence.

    my_model: callable that takes a (1, len(inp)) LongTensor and returns
        log-probabilities with one row per input position.
    inp: non-empty list of character indices.

    Returns the sampled character as its integer index in the vocabulary.
    Raises ValueError if inp is empty.
    '''
    if len(inp) == 0:
        raise ValueError('inp must contain at least one character index')

    batch = torch.tensor([inp])  # batch of one sequence: shape (1, len(inp))
    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        model_out = my_model(batch)
    # Take the prediction for the last position and sample from the vocabulary
    # according to its probabilities. This makes the result non-deterministic:
    # the next letter can vary between calls, especially when several characters
    # are assigned similar probabilities.
    next_idx = torch.multinomial(model_out[-1].exp(), 1).item()

    # return the next character index in the sequence
    return next_idx

In [73]:
mytext = "thos"
mytext = [c_to_idx[i] for i in mytext]
nl = next_letter(model, mytext)
print(nl, idx_to_c[nl])

57 e


In [74]:
def gen_text(my_model, inp, num_chars):
    '''
    Generate num_chars characters after the prompt inp and print the result.

    The prompt is converted to vocabulary indices (which are printed first). Then,
    num_chars times, the next character is sampled with next_letter, appended to
    the running text, and the sampling window slides forward by one so that it
    always ends with the latest prediction. Nothing is returned; the prompt plus
    the generated characters is printed at the end.
    '''
    generated = inp
    window = [c_to_idx[ch] for ch in inp]
    print(window)
    for _ in range(num_chars):
        sampled = next_letter(my_model, window)
        generated = generated + idx_to_c[sampled]
        # Drop the oldest index and append the newest so the window length is fixed.
        window = window[1:] + [sampled]
    print(generated)

In [75]:
gen_text(model, "Hello", 400)

[31, 57, 64, 64, 67]
Hellog tethere formies. Heluty: in they the adny! Th!s that is some intrual--seneq. That e telve for precul compally of whace or
mhan Has prch thightuch forturself soul that cas ats, causs, afre te stireds and small ragal
  ior sopless
dy, dimestions fage, even as, h phim lape
lands of " beveres. Bursen dv, tasting who a smelvedwer intthe
spirelf our beave casity perf ooK theque'tive ps the, they

107



### Step 1 Summary

It's working alright. Learned something important here though. When I split up the corpus into sequences of length 8 (sequence length / bptt length), characters 1 - 8 are the first training example in batch 1, 9 - 16 are the second etc.
What that means is that the hidden states after the forward pass are meaningless for the next batch. There's no information gained about the previous sequence to help you out with the current sequence!
Here's a different idea -> What if characters 1 - 8 make up the first training example of the first batch, then characters 9 - 16 make up the first training example of the second batch. That way (since we're saving activation values) when character 9 gets passed in as the first step to the RNN, the activations correspond to what came out after character 8.

The challenge is how to split up the dataset now.

In [78]:
print(f'Corpus length: {len(text)}')
print(f'Batch size: {batch_size}')
print(f'Sequence length / bptt length: {seq_len}')

Corpus length: 600893
Batch size: 512
Sequence length / bptt length: 8


In [108]:
bs_cut = 64

In [109]:
#512*8 = 4096
# print(len(text_idx)) = 600,893
# inp and outp are the previous ones of shape 75111 x 8
print(inp.shape)
print(outp.shape)

(75111, 8)
(75111, 8)


In [115]:
max_per_block = len(text_idx) // batch_size
max_per_block

1173

In [113]:
max_per_block / seq_len

146.702392578125

In [121]:
# Throw away a tiny bit of information so each block is the same length
block_len_rounded = max_per_block - (max_per_block % seq_len)
block_len_rounded

1168

In [128]:
vert_block = [text_idx[i*block_len_rounded:i*block_len_rounded+block_len_rounded] for i in range(batch_size)]

In [139]:
print(len(vert_block))
print(len(vert_block[511]))

512
1168


In [151]:
# Now need to re-piece this into an X by 8(seq_len) array where the first entry and the 512th entry are continuous
# Note: in a comprehension the first `for` is the outer loop; the second (`for n in vert_block`) is the inner loop and varies fastest.
x = [n[i:i+seq_len] for i in range(0, len(vert_block[0]), seq_len) for n in vert_block ]

In [152]:
print(len(x))
print(len(x[0]))

74752
8


In [153]:
# Original layout
vert_block[0][:16]

[39, 41, 28, 29, 24, 26, 28, 0, 0, 0, 42, 44, 39, 39, 38, 42]

In [158]:
vert_block[1][:16]

[67, 70, 65, 1, 67, 58, 1, 71, 73, 54, 62, 57, 55, 72, 8, 1]

In [182]:
# New layout. This should match above
print(x[0])
print(x[512])
print("**")
print(x[1])
print(x[513])

[39, 41, 28, 29, 24, 26, 28, 0]
[0, 0, 42, 44, 39, 39, 38, 42]
**
[67, 70, 65, 1, 67, 58, 1, 71]
[73, 54, 62, 57, 55, 72, 8, 1]


In [160]:
chunk_inp = np.array(x)
chunk_inp.shape

(74752, 8)

In [198]:
# Pass in text, batch size, and sequence length to get back numpy array where consecutive text is lined up
# across minibatches.
def vertical_chunk(text, bs, sl):
    '''
    Cut text into bs contiguous streams and interleave their sl-long pieces.

    text: sliceable sequence (a list of character indices in this notebook).
    bs: batch size, i.e. the number of parallel streams.
    sl: sequence length of every row.

    Row k*bs + b of the result holds the k-th piece of stream b, so reading the
    rows in batches of bs (unshuffled) makes row b of consecutive batches continue
    the same stretch of text. Trailing items that do not fill a whole number of
    pieces per stream are dropped.

    Returns np.array of the collected rows.
    '''
    seqs_per_stream = len(text) // sl // bs   # whole pieces available to each stream
    stream_len = seqs_per_stream * sl         # items per stream
    used_len = stream_len * bs                # items actually used
    rows = []
    for offset in range(0, stream_len, sl):
        for stream_start in range(0, used_len, stream_len):
            lo = stream_start + offset
            rows.append(text[lo:lo + sl])
    return np.array(rows)
    
    

In [202]:
# Inputs drop the last character and targets drop the first, so both slices have the
# same length and vertical_chunk always produces the same number of rows for X and Y
# (passing the full text for X could yield one extra block when the corpus length is
# an exact multiple of batch_size * seq_len).
stacked_inp = vertical_chunk(text_idx[:-1], batch_size, seq_len)
stacked_outp = vertical_chunk(text_idx[1:], batch_size, seq_len)

In [203]:
print(stacked_inp.shape)
print(stacked_outp.shape)

(74752, 8)
(74752, 8)


In [205]:
# Nice, this is working now.
print(stacked_inp[0])
print(stacked_outp[0])
print("********")
print(stacked_inp[512])
print(stacked_outp[512])

[39 41 28 29 24 26 28  0]
[41 28 29 24 26 28  0  0]
********
[ 0  0 42 44 39 39 38 42]
[ 0 42 44 39 39 38 42 32]


It's working now. Do I want to make a dataloader? I guess so but without shuffling.

In [219]:
# Don't want to randomly split. Just take the leading train_frac portion for training and the rest for validation.
# st_x_trn, st_y_trn, st_x_val, st_y_val = train_test_split(stacked_inp, stacked_outp, 0.9)
def data_split_nonrandom(in_data, out_data, train_frac):
    '''
    Split paired data in order, with no shuffling.

    The first int(len(in_data) * train_frac) rows of both collections become the
    training portion and the remaining rows the validation portion.

    Returns (train inputs, train targets, validation inputs, validation targets).
    '''
    cut = int(len(in_data) * train_frac)
    trn_in, val_in = in_data[:cut], in_data[cut:]
    trn_out, val_out = out_data[:cut], out_data[cut:]
    return trn_in, trn_out, val_in, val_out

In [220]:
st_x_trn, st_y_trn, st_x_val, st_y_val = data_split_nonrandom(stacked_inp, stacked_outp, 0.9)

In [224]:
# Training and validation datasets
st_train_ds = CharSeqDataset(st_x_trn, st_y_trn)
st_val_ds = CharSeqDataset(st_x_val, st_y_val)

In [227]:
print(stacked_inp[0])
print(stacked_inp[512])

[39 41 28 29 24 26 28  0]
[ 0  0 42 44 39 39 38 42]


In [228]:
# Unshuffled dataloaders
# Is there a way to shuffle cross batch? Does this even have a point?
st_train_dl = DataLoader(dataset=st_train_ds, batch_size=batch_size, shuffle=False);
st_val_dl = DataLoader(dataset=st_val_ds, batch_size=batch_size, shuffle=False);

In [235]:
# Test that we're not shuffling
test_iter = iter(st_train_dl)
x_test, y_test = next(test_iter)
print(x_test.shape)
print(x_test[0])
print(y_test[0])

torch.Size([512, 8])
tensor([39, 41, 28, 29, 24, 26, 28,  0])
tensor([41, 28, 29, 24, 26, 28,  0,  0])


In [236]:
x_test, y_test = next(test_iter)
print(x_test.shape)
print(x_test[0])
print(y_test[0])

torch.Size([512, 8])
tensor([ 0,  0, 42, 44, 39, 39, 38, 42])
tensor([ 0, 42, 44, 39, 39, 38, 42, 32])


In [237]:
x_test, y_test = next(test_iter)
print(x_test.shape)
print(x_test[0])
print(y_test[0])

torch.Size([512, 8])
tensor([32, 37, 30,  1, 72, 60, 53, 72])
tensor([37, 30,  1, 72, 60, 53, 72,  1])


In [238]:
print(text_idx[16:24])

[32, 37, 30, 1, 72, 60, 53, 72]


The data all looks great now. Try doing some training with this new data.

In [245]:
st_model = CharRnn(vocab_size, emb_dim, batch_size)
st_optimizer = torch.optim.Adam(st_model.parameters(), lr=1e-3)
# CharRnn.forward already returns log-probabilities (F.log_softmax), so the matching
# criterion is NLLLoss. CrossEntropyLoss would apply a second, redundant log-softmax
# on top of them; the loss value is the same either way.
st_criterion = nn.NLLLoss()

In [246]:
epochs = 30
for ep in range(epochs):
    # `train2` / `test2` are not defined anywhere in this notebook, so a fresh
    # Restart & Run All stops here with a NameError. The epoch helpers defined
    # above are `train` and `test`, which take these same arguments.
    tr_loss = train(st_model, st_optimizer, st_criterion, st_train_dl)
    test_loss = test(st_model, st_val_dl, st_criterion)
    print(f'Epoch: {ep+1} / {epochs}, Training Loss: {tr_loss[-1]:.4f}, Validation Loss: {test_loss:.4f}')

Epoch: 1 / 30, Training Loss: 2.4621, Validation Loss: 2.3361
Epoch: 2 / 30, Training Loss: 2.2171, Validation Loss: 2.0658
Epoch: 3 / 30, Training Loss: 2.1074, Validation Loss: 1.9289
Epoch: 4 / 30, Training Loss: 2.0401, Validation Loss: 1.8369
Epoch: 5 / 30, Training Loss: 1.9923, Validation Loss: 1.7731
Epoch: 6 / 30, Training Loss: 1.9564, Validation Loss: 1.7274
Epoch: 7 / 30, Training Loss: 1.9274, Validation Loss: 1.6926
Epoch: 8 / 30, Training Loss: 1.9021, Validation Loss: 1.6654
Epoch: 9 / 30, Training Loss: 1.8800, Validation Loss: 1.6446
Epoch: 10 / 30, Training Loss: 1.8595, Validation Loss: 1.6286
Epoch: 11 / 30, Training Loss: 1.8389, Validation Loss: 1.6137
Epoch: 12 / 30, Training Loss: 1.8189, Validation Loss: 1.5994
Epoch: 13 / 30, Training Loss: 1.8001, Validation Loss: 1.5878
Epoch: 14 / 30, Training Loss: 1.7821, Validation Loss: 1.5781
Epoch: 15 / 30, Training Loss: 1.7644, Validation Loss: 1.5699
Epoch: 16 / 30, Training Loss: 1.7469, Validation Loss: 1.5629
E

### Really training well now. Do some testing

In [247]:
gen_text(st_model, "Hello ev", 400)

[31, 57, 64, 64, 67, 1, 57, 74]
Hello evenO: look the
neighbously, from a develow regard on which if which
no not "pheilics untrily then have possible to
play be belief to where commations (which belong: innotical misnops of which, and wishet freedest elevation, from striving in our science--almost been is not swate: they tastes FOR COMSENNEIPEST which
at-like under that man, which this
scienw one concernion freered, disevetify, as beli


### Can I get better results with an LSTM?

In [20]:
n_hidden= 256
n_layer = 2

In [21]:
class CharSeqStatefulLSTM(nn.Module):
    '''
    Stateful multi-layer character-level LSTM:
    embedding -> LSTM -> fully connected layer -> log-softmax over the vocabulary.

    The (hidden, cell) state is kept between forward passes and detached from the
    graph, so BPTT stops at the start of the current minibatch.

    Inputs are batch-first, shape (batch_size, seq_len), which is what the
    DataLoaders in this notebook yield. The previous version read the batch size
    from cs[0].size(0) and ran a sequence-first LSTM, so on those tensors it
    treated the batch axis as the time axis.

    vocab_size: number of distinct characters.
    n_fac: embedding size.
    bs: batch size used to allocate the initial state.
    nl: number of stacked LSTM layers.
    n_hidden: hidden units per layer. Defaults to 256, the value this notebook
        uses; it used to be read from a module-level global.
    '''
    def __init__(self, vocab_size, n_fac, bs, nl, n_hidden=256):
        super().__init__()
        self.vocab_size, self.nl, self.n_hidden = vocab_size, nl, n_hidden
        self.e = nn.Embedding(vocab_size, n_fac)
        # batch_first=True: the LSTM consumes (batch, seq, emb) and returns
        # (batch, seq, n_hidden); the state tensors stay (nl, batch, n_hidden).
        self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout=0.5, batch_first=True)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        '''
        cs: LongTensor of character indices, shape (batch_size, seq_len).
        Returns log-probabilities of shape (batch_size * seq_len, vocab_size) in
        batch-major order, matching targets.view(-1) for (batch_size, seq_len) targets.
        '''
        bs = cs.size(0)
        # Re-create the state when the batch size changes (e.g. last, smaller batch).
        if self.h[0].size(1) != bs:
            self.init_hidden(bs)
        outp, h = self.rnn(self.e(cs), self.h)
        # Keep the state for the next pass, cut off from the autograd graph.
        self.h = tuple(v.detach() for v in h)
        return F.log_softmax(self.l_out(outp), dim=-1).reshape(-1, self.vocab_size)

    def init_hidden(self, bs):
        '''Reset self.h to an all-zero (hidden, cell) pair, each of shape (nl, bs, n_hidden).'''
        device = self.l_out.weight.device
        self.h = (torch.zeros(self.nl, bs, self.n_hidden, device=device),
                  torch.zeros(self.nl, bs, self.n_hidden, device=device))

In [22]:
modelLSTM = CharSeqStatefulLSTM(vocab_size, emb_dim, batch_size, n_layer)
optLSTM = torch.optim.Adam(modelLSTM.parameters(), lr=1e-3)
# CharSeqStatefulLSTM.forward already returns log-probabilities (F.log_softmax), so the
# matching criterion is NLLLoss. CrossEntropyLoss would apply a second, redundant
# log-softmax on top of them; the loss value is the same either way.
critLSTM = nn.NLLLoss()

In [23]:
def train3(model, opt, crit, train_loader, log_every=5):
    '''
    Train model for one pass over train_loader, printing the loss periodically.

    model: module called as model(inputs) that returns one prediction row per
        target element.
    opt: optimizer over the model's parameters.
    crit: loss called as crit(outputs, flat_targets).
    train_loader: iterable of (inputs, targets) batches.
    log_every: print the loss every log_every iterations (default 5, the
        previously hard-coded interval). Pass 0 or None to disable printing.

    Returns a list with the loss of every batch as a plain Python float, in
    iteration order.
    '''
    losses = []
    model.train()

    for i, (inputs, targets) in enumerate(train_loader):
        opt.zero_grad()
        outputs = model(inputs)
        # Targets are flattened so they line up with the flattened predictions.
        loss = crit(outputs, targets.view(-1))
        loss.backward()
        opt.step()

        # .item() stores a float rather than a tensor in the history.
        loss_value = loss.item()
        losses.append(loss_value)
        if log_every and i % log_every == 0:
            print(f'Iteration: {i}, Training Loss: {loss_value}')
    return losses

In [34]:
# `num_epochs` was never defined in this notebook (it only existed in a stale kernel),
# so define it here. NOTE(review): the original value is unknown - adjust as needed.
num_epochs = 4
for ep in range(num_epochs):
    # The LSTM carries its hidden state from one batch to the next, so it is fed the
    # ordered, unshuffled loader in which row b of consecutive batches is contiguous text.
    tr_loss = train3(modelLSTM, optLSTM, critLSTM, st_train_dl)
    print(f'Epoch: {ep+1} / {num_epochs}, Training Loss: {tr_loss[-1]:.4f}')

Iteration: 0, Training Loss: 4.424938678741455
Iteration: 5, Training Loss: 4.038959980010986
Iteration: 10, Training Loss: 3.1668179035186768
Iteration: 15, Training Loss: 3.1785495281219482
Iteration: 20, Training Loss: 3.131784439086914
Iteration: 25, Training Loss: 3.1282219886779785
Iteration: 30, Training Loss: 3.176347494125366
Iteration: 35, Training Loss: 3.1276752948760986
Iteration: 40, Training Loss: 3.102572202682495
Iteration: 45, Training Loss: 3.0849645137786865
Iteration: 50, Training Loss: 3.069145441055298
Iteration: 55, Training Loss: 3.097275972366333
Iteration: 60, Training Loss: 2.9951300621032715
Iteration: 65, Training Loss: 2.9081475734710693
Iteration: 70, Training Loss: 2.904214859008789
Iteration: 75, Training Loss: 2.9010636806488037
Iteration: 80, Training Loss: 2.8490655422210693
Iteration: 85, Training Loss: 2.84846830368042
Iteration: 90, Training Loss: 2.775449514389038
Iteration: 95, Training Loss: 2.7863214015960693
Iteration: 100, Training Loss: 2.

KeyboardInterrupt: 