In [2]:
import torch
from torch import nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
with open('../Data/shakespeare.txt','r',encoding='utf8') as f:
    text = f.read()
text = text[0:40000]

In [4]:
print(text[:1000])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.


                     2
  When forty winters shall besiege thy brow,
  And dig deep trenches in thy beauty's field,
  Thy youth's proud livery so gazed on now,
  Will be a tattered weed of small worth held:  
  Then being asked, where all thy beauty lies,
  Where all the treasure of thy lusty days;
  To say within thine own deep su

In [5]:
type(text)

str

In [6]:
len(text)

40000

In [7]:
#character level encoding
all_characters = set(text)

In [8]:
len(all_characters)

70

In [9]:
# Vectorization
# decoder & encoder dictionaries
# decoder: num -> letter
decoder = dict(enumerate(all_characters)) #enumerate adds indices as keys of letter values

In [10]:
# letter -> num
encoder = {char:ind for ind,char in decoder.items()}

In [12]:
encoded_text = np.array([encoder[char] for char in text]) #build one hot index array

In [13]:
encoded_text

array([23, 10, 10, ..., 38, 42, 33])

In [14]:
def one_hot_encoder(encoded_text,num_uni_chars):
    # encode - > batch of encoded text
    # num of chars -> len(set(text))
    # https://stackoverflow.com/questions/29831489/convert-array-of-indices-to-1-hot-encoded-numpy-array
    one_hot=np.zeros((encoded_text.size,num_uni_chars))
    
    one_hot = one_hot.astype(np.float32)#must change data type
    
    # example: change index 27 into 1
    one_hot[np.arange(one_hot.shape[0]),encoded_text.flatten()]=1.0
    
    one_hot = one_hot.reshape((*encoded_text.shape,num_uni_chars))
    
    return one_hot

In [33]:
arr = np.array([1,2,0])

In [34]:
arr

array([1, 2, 0])

In [38]:
one_hot_encoder(arr,3)


array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

In [16]:
# training batches - hello, ther -> ello, there
example_text = np.arange(10)
example_text

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [17]:
example_text.reshape((5,-1))#5 batches of data

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

In [16]:
def generate_batches(encoded_text, samp_per_batch=10, seq_len=50):
    
    '''
    Generate (using yield) batches for training.
    
    X: Encoded Text of length seq_len
    Y: Encoded Text shifted by one
    
    Example:
    
    X:
    
    [[1 2 3]]
    
    Y:
    
    [[ 2 3 4]]
    
    encoded_text : Complete Encoded Text to make batches from
    batch_size : Number of samples per batch
    seq_len : Length of character sequence
       
    '''
    
    # Total number of characters per batch
    # Example: If samp_per_batch is 2 and seq_len is 50, then 100
    # characters come out per batch.
    char_per_batch = samp_per_batch * seq_len
    
    
    # Number of batches available to make
    # Use int() to roun to nearest integer
    num_batches_avail = int(len(encoded_text)/char_per_batch)
    
    # Cut off end of encoded_text that
    # won't fit evenly into a batch
    encoded_text = encoded_text[:num_batches_avail * char_per_batch]
    
    
    # Reshape text into rows the size of a batch
    encoded_text = encoded_text.reshape((samp_per_batch, -1))
    

    # Go through each row in array.
    for n in range(0, encoded_text.shape[1], seq_len):
        
        # Grab feature characters
        x = encoded_text[:, n:n+seq_len]
        
        # y is the target shifted over by 1
        y = np.zeros_like(x)
       
        #
        try:
            y[:, :-1] = x[:, 1:]
            y[:, -1]  = encoded_text[:, n+seq_len]
            
        # FOR POTENTIAL INDEXING ERROR AT THE END    
        except:
            y[:, :-1] = x[:, 1:]
            y[:, -1] = encoded_text[:, 0]
            
        yield x, y

In [17]:
sample_text = encoded_text[:20]
sample_text

array([23, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10])

In [18]:
t=generate_batches(sample_text,samp_per_batch=2,seq_len=5)

In [19]:
x,y = next(t)

In [20]:
x

array([[23, 10, 10, 10, 10],
       [10, 10, 10, 10, 10]])

In [21]:
y

array([[10, 10, 10, 10, 10],
       [10, 10, 10, 10, 10]])

In [22]:
# building LSTM models
class CharModel(nn.Module):
    
    def __init__(self,all_chars,num_hidden=256,num_layers=4,drop_prob=0.5,use_gpu=False):
        
        super().__init__()
        #assign the parameters
        self.drop_prob = drop_prob
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        self.use_gpu = use_gpu
        
        #CHARACTER SET, ENCODER, and DECODER
        self.all_chars = all_chars
        self.decoder = dict(enumerate(all_chars))
        self.encoder = {char:ind for ind,char in decoder.items()}
        
        #layers
        #input shape, hidden dim, layers
        # batch_first: if T, the input and output tensors are like (batch,seq,feature)
        #self.lstm = nn.LSTM(len(all_chars),num_hidden,num_layers,dropout=drop_prob,batch_first=True)
        self.lstm = nn.LSTM(len(self.all_chars), num_hidden, num_layers, dropout=drop_prob, batch_first=True)

        self.dropout = nn.Dropout(drop_prob)
        self.fc_linear = nn.Linear(num_hidden,len(self.all_chars))
        
    def forward(self,x,hidden):
        lstm_output, hidden = self.lstm(x,hidden)
        
        drop_output = self.dropout(lstm_output)
        
        #adjust dropput results
        drop_output = drop_output.contiguous().view(-1,self.num_hidden)
        
        final_out = self.fc_linear(drop_output)
        
        return final_out, hidden
        
    def hidden_state(self,batch_size):# optional gpu use
        
        if self.use_gpu:
            
            #a tuple of hidden states and cell states
            hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda(),
                      torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda())
        else:
            hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden),
                      torch.zeros(self.num_layers,batch_size,self.num_hidden))
        return hidden

In [23]:
#create an instance of the model
model = CharModel(all_chars=all_characters,
                 num_hidden =124,
                 num_layers = 3,
                 drop_prob=0.5)
# num_hidden = 512
#model = CharModel(
#    all_chars=all_characters,
#    num_hidden=512,
#    num_layers=3,
#    drop_prob=0.5,
#    use_gpu=True,
#)
 
#model_name = 'example.net'
#model.load_state_dict(torch.load(model_name))
#model.eval()

In [24]:
model

CharModel(
  (lstm): LSTM(70, 124, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5)
  (fc_linear): Linear(in_features=124, out_features=70, bias=True)
)

In [32]:
#xx = torch.randn(1, 4).long()
encoded_text[0]

23

In [27]:
# figure our the parameters
total_param  = []
for p in model.parameters():
    total_param.append(int(p.numel()))
total_param

[34720,
 61504,
 496,
 496,
 61504,
 61504,
 496,
 496,
 61504,
 61504,
 496,
 496,
 8680,
 70]

In [28]:
sum(total_param) #important: the original para# is at least similar to the data size

353966

In [29]:
len(encoded_text)

40000

In [30]:
# training
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss()

In [31]:
train_percent =0.9
train_ind = int(len(encoded_text)*train_percent)

In [32]:
train_data = encoded_text[:train_ind]
val_data = encoded_text[train_ind:]

In [33]:
len(train_data)

36000

In [34]:
len(val_data)

4000

In [35]:
epochs = 5
batch_size = 100

seq_len = 100

tracker = 0
num_char = max(encoded_text) + 1

In [36]:
# Set model to train
model.train()


# Check to see if using GPU
if model.use_gpu:
    model.cuda()

for i in range(epochs):
    #get hidden states first
    hidden = model.hidden_state(batch_size)
    
    
    for x,y in generate_batches(train_data,batch_size,seq_len):
        
        tracker += 1
        
        # One Hot Encode incoming data
        x = one_hot_encoder(x,num_char)
        
        # Convert Numpy Arrays to Tensor
        
        inputs = torch.from_numpy(x)
        targets = torch.from_numpy(y)
        
        # Adjust for GPU if necessary
        
        if model.use_gpu:
            
            inputs = inputs.cuda()
            targets = targets.cuda()
            
        # Reset Hidden State
        # If we dont' reset we would backpropagate through all training history
        hidden = tuple([state.data for state in hidden])
        
        model.zero_grad()
        
        lstm_output, hidden = model.forward(inputs,hidden)
        loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())
        
        loss.backward()
        
        # POSSIBLE EXPLODING GRADIENT PROBLEM!
        # LET"S CLIP JUST IN CASE
        nn.utils.clip_grad_norm_(model.parameters(),max_norm=5)
        
        optimizer.step()
        
        
        
        ###################################
        ### CHECK ON VALIDATION SET ######
        #################################
        
        if tracker % 25 == 0:
            
            val_hidden = model.hidden_state(batch_size)
            val_losses = []
            model.eval()
            
            for x,y in generate_batches(val_data,batch_size,seq_len):
                
                # One Hot Encode incoming data
                x = one_hot_encoder(x,num_char)
                

                # Convert Numpy Arrays to Tensor

                inputs = torch.from_numpy(x)
                targets = torch.from_numpy(y)

                # Adjust for GPU if necessary

                if model.use_gpu:

                    inputs = inputs.cuda()
                    targets = targets.cuda()
                    
                # Reset Hidden State
                # If we dont' reset we would backpropagate through 
                # all training history
                val_hidden = tuple([state.data for state in val_hidden])
                
                lstm_output, val_hidden = model.forward(inputs,val_hidden)
                val_loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())
        
                val_losses.append(val_loss.item())
            
            # Reset to training model after val for loop
            model.train()
            
            print(f"Epoch: {i} Step: {tracker} Val Loss: {val_loss.item()}")
print("over")

over


In [37]:
type(encoded_text)

numpy.ndarray

In [38]:
len(encoded_text)

40000

In [40]:
# saving: Be careful to overwrite our original name file!
model_name = 'hidden124_layer3_shakespare.net'

In [41]:
torch.save(model.state_dict(),model_name)

In [53]:
#loading and predicting
def predict_next_char(model,char,hidden=None,k=1):#kth largest prob
    
    encoded_text = model.encoder[char]
    encoded_text = np.array([[encoded_text]])#doube list
    
    encoded_text = one_hot_encoder(encoded_text,len(model.all_chars))
    
    inputs = torch.from_numpy(encoded_text)
    
    if model.use_gpu:
        inputs = inputs.cuda()
    
    hidden = tuple([state.data for state in hidden])
    
    lstm_out, hidden = model.forward(inputs,hidden)
    
    #probabilities for every single character
    probs = F.softmax(lstm_out,dim=1).data
    
    #make sure to switch on CPU for numpy
    if model.use_gpu:
        probs = probs.cpu()
    
    probs,index_positions = probs.topk(k)
    index_positions = index_positions.numpy().squeeze()
    
    probs = probs.numpy().flatten()
    
    probs =probs/probs.sum()
    
    char = np.random.choice(index_positions,p=probs)
    
    return model.decoder[char],hidden

In [60]:
def generate_text(model, size, seed='The', k=1):
        
    
    # CHECK FOR GPU
    if(model.use_gpu):
        model.cuda()
    else:
        model.cpu()
    
    # Evaluation mode
    model.eval()
    
    # begin output from initial seed
    output_chars = [c for c in seed]
    
    # intiate hidden state
    hidden = model.hidden_state(1)
    
    # predict the next character for every character in seed
    for char in seed:
        char, hidden = predict_next_char(model, char, hidden, k=k)
    
    # add initial characters to output
    output_chars.append(char)
    
    # Now generate for size requested
    for i in range(size):
        
        # predict based off very last letter in output_chars
        char, hidden = predict_next_char(model, output_chars[-1], hidden, k=k)
        
        # add predicted character
        output_chars.append(char)
    
    # return string of predicted text
    return ''.join(output_chars)

In [61]:
# some sample results
print(generate_text(model,1000,seed="The",k=3))

Thehaah h hhh n    s r      e h   h h a     e a       d    hae   h   h d     ao     hhh  a   e     hh   h shhes    dh     a  h  l s ah     hh l  o   o  a           r       s   n       o   h h   d  n   a h o  s   a h  hehha    hdsh     ah   d      s  t   h d  d a    h he hs ah h  h    a    h     e    e h  h o a  hh hh alolh      nha   shh         n  ah    hh   h aa     t  no o  d  n drrho hh  h       a  d  hhaoe ah hs l e  e  hh       rh   a oho h   hr hh   r  h sh  di  e  l     h       a h    a        h h h h s     h ad   h  hl      e      so   h h   hu   hs a        ae     o      e n l       nh    s    h  ,a   h  a     h h         hh  o e s h       hdh        h  s        h     s       h          eh     hs      ah    h     s     hlis  s      ss    n   e     s   i          h  nha     rr s       ur rh h e   rhhh   h i  o  e d   hsi  a  h  hr a   od      h  n d d     hs  h h  h o  ah  d arh h       d        hh   h a hhlh h  h h ii   la d  h     n    n  h    h  d   h   d    ss   o       ns