In [119]:
import numpy as np
import torch
import sys
from helpers import one_hot, prepare_batches
from random import seed, shuffle
import torch.nn as nn
from torch.nn.utils.rnn import pad_packed_sequence

from importlib import reload
import helpers
reload(helpers)

#### Data Extraction

In [23]:
with open('data/t8.shakespeare.txt') as f:
    text = f.read()

In [43]:
shakespeare = []
#start after the header
skip = False
for line in text.split("\n")[244:]:
    if line[:2] == "<<":
        skip = True
    elif line[-2:] == ">>":
        skip = False
    if skip or line == "":
        continue
    shakespeare.append(line)

In [45]:
len(shakespeare)

113282

In [48]:
sys.getsizeof(shakespeare)

927568

#### Data Preprocessing

In [49]:
flatten = lambda l: [item for sublist in l for item in sublist]
flattened = flatten(shakespeare)

In [62]:
chars = tuple(set(flattened))
int2char = dict(enumerate(chars))
char2int = {value: key for key, value in int2char.items()}

In [142]:
len(chars)

83

In [88]:
#roughly 90% of sequences are shorter than 70 characters, so we'll make the max sequence length 70.
sorted([len(l) for l in shakespeare], reverse=True)[1100]

69

In [91]:
#90 % are longer than 39 characters... maybe we should have a shortest length too?
[len(l) for l in shakespeare][1100]

39

In [89]:
#simple character to number encoding
#truncate sequences longer than 70
numeric_sequences = [[char2int[char] for char in line][:70] for line in shakespeare]

In [106]:
seed(1609)
#randomly shuffle the sequences
shuffle(numeric_sequences)

#### Split into Training and Validation Sets

In [111]:
#90 - 10 training / validation split
n_training_sequences = int(.9 * len(numeric_sequences))
training = numeric_sequences[:n_training_sequences]
validation = numeric_sequences[n_training_sequences:]

In [110]:
len(validation), len(training)

(11329, 101953)

In [113]:
training_input = [sequence[:-1] for sequence in training]
training_target = [sequence[1:] for sequence in training]

In [291]:
reload(helpers)

<module 'helpers' from '/Users/nikku/projects/vates/helpers.py'>

In [293]:
packed_batches = helpers.prepare_batches(training_input, batch_size = 20, n_states = len(chars), sequence_length = 69)

#### Did the method work?

In [314]:
lines = []

for packed_batch in packed_batches:
    
    unpacked_sequences, sequence_lengths = pad_packed_sequence(packed_batch)
    
    for i in range(len(sequence_lengths)):
        
        length = sequence_lengths[i]
        sequence = unpacked_sequences[:,i,:][:length]
        
        numbers_sequence = [helpers.decode_one_hot(vec) for vec in sequence]
        
        lines.append([int2char[num] for num in numbers_sequence])

In [315]:
for i in range(15):
    print(''.join(lines[i]) + '\n')

    interim be but a se'nnight, Time's pace is so hard that it seems 

and three or four FOLLOWERS accordingly, with PORTIA, NERISSA, and tr

Alarum. Excursions. Enter the King, the Prince, Lord John of Lancaste

    John Doit of Staffordshire, and black George Barnes, and Francis 

  Ham. Let me see. [Takes the skull.] Alas, poor Yorick! I knew him, 

GLOUCESTER offers to put up a bill; WINCHESTER snatches it, and tears

Enter King, Queen, Polonius, Ophelia, Rosencrantz, Guildenstern, and 

  SPEED. Marry, by these special marks: first, you have learn'd, like

    never did such deeds in arms as I have done this day. I have paid

Trumpets, sennet, and cornets. Enter two VERGERS, with short silver w

  PANDARUS. Good boy, tell him I come.                       Exit Boy

    it with security. I look'd 'a should have sent me two and twenty 

    thus much for greeting. Now, my spruce companions, is all ready, 

  LAUNCE. Out with that too; it was Eve's legacy, and cannot be ta'en

    yo

In [274]:
''.join(text_sequences[4])

''

In [226]:
"".join(text_sequences[0])

' aA  GE  T      EED '

In [211]:
#padded_batches[0]

In [187]:
#comment out lines dealing w/ un-packing


#unpacked_batch, sequence_lengths = pad_packed_sequence(input_batches[0], padding_value=0, total_length=69)
text_sequences = []
for padded_sequence, seq_length in zip(unpacked_batch, sequence_lengths):
    
    #unpadded sequence
    sequence = padded_sequence[:seq_length]
    #each element of this is a one-hot vector

    numbers_sequence = [decode_one_hot(vec) for vec in sequence]
    
    text_sequences.append([int2char[num] for num in numbers_sequence])

In [205]:
''.join(text_sequences[10])

'moco  ,ads.huOo.D Ee'

#### Training the Model

In [121]:
input_size = len(chars)

In [125]:
lstm = nn.LSTM(input_size, hidden_size = 100, num_layers=2, dropout=0.5)

In [126]:
loss_function = nn.NLLLoss()

In [None]:
# def init_hidden(self, batch_size):
#     hidden = Variable(next(self.parameters()).data.new(self.num_layers, batch_size, self.hidden_size))
#     cell =  Variable(next(self.parameters()).data.new(self.num_layers, batch_size, self.hidden_size))
#     return (hidden, cell)

In [133]:
help(nn.modules.module)

Help on module torch.nn.modules.module in torch.nn.modules:

NAME
    torch.nn.modules.module

CLASSES
    builtins.object
        Module
    
    class Module(builtins.object)
     |  Base class for all neural network modules.
     |  
     |  Your models should also subclass this class.
     |  
     |  Modules can also contain other Modules, allowing to nest them in
     |  a tree structure. You can assign the submodules as regular attributes::
     |  
     |      import torch.nn as nn
     |      import torch.nn.functional as F
     |  
     |      class Model(nn.Module):
     |          def __init__(self):
     |              super(Model, self).__init__()
     |              self.conv1 = nn.Conv2d(1, 20, 5)
     |              self.conv2 = nn.Conv2d(20, 20, 5)
     |  
     |          def forward(self, x):
     |             x = F.relu(self.conv1(x))
     |             return F.relu(self.conv2(x))
     |  
     |  Submodules assigned in this way will be registered, and will have 

In [129]:
with torch.no_grad():
    scores = lstm(input_batches[0])
    print(scores)

(PackedSequence(data=tensor([[ 0.0189,  0.0024, -0.0323,  ..., -0.0147,  0.0388,  0.0309],
        [ 0.0215,  0.0096, -0.0329,  ..., -0.0156,  0.0337,  0.0302],
        [ 0.0214,  0.0103, -0.0301,  ..., -0.0076,  0.0365,  0.0240],
        ...,
        [ 0.0177,  0.0146, -0.0666,  ..., -0.0225,  0.0607,  0.0344],
        [ 0.0071, -0.0038, -0.0874,  ..., -0.0457,  0.0558,  0.0341],
        [ 0.0092,  0.0054, -0.0773,  ..., -0.0455,  0.0440,  0.0410]]), batch_sizes=tensor([20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
        20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
        20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
        20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]), sorted_indices=None, unsorted_indices=None), (tensor([[[ 2.2851e-02,  3.5399e-02,  1.4999e-01,  ...,  1.3775e-02,
          -3.7720e-02,  3.6586e-02],
         [ 1.0992e-02,  2.8352e-02,  8.5252e-02,  ..., -9.9754e-04,
   