<img src="../Pierian-Data-Logo.PNG">
<br>
<strong><center>Copyright 2019. Created by Jose Marcial Portilla.</center></strong>

# Generating Text (encoded variables)

We saw how to generate continuous values. Now let's see how to generalize this to generate categorical sequences (such as words or letters).

## Imports

In [70]:
# Importing libraries
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

## Grab Text Data

In [56]:
# Load the whole Shakespeare corpus into memory as one string named `text`
with open('../Data/shakespeare.txt', mode='r', encoding='utf8') as corpus_file:
    text = corpus_file.read()

In [71]:
# Showing the first 100 characters
text[:100]

"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose mi"

## Encoding and Decoding

In [72]:
# Vocabulary: every distinct character that occurs in `text`.
# NOTE: set iteration order is arbitrary, so the position of each character
# in this tuple (and therefore its integer code) is not guaranteed to be the
# same in a different interpreter session.
all_characters = tuple(set(text))

In [74]:
all_characters

('k',
 'F',
 'c',
 'f',
 'r',
 '[',
 'p',
 'J',
 '>',
 'j',
 'z',
 '_',
 'i',
 'I',
 '}',
 'h',
 '-',
 'y',
 'Z',
 '<',
 's',
 'A',
 'B',
 'E',
 ')',
 '1',
 'W',
 '|',
 '"',
 'a',
 'P',
 '4',
 'U',
 '6',
 'Q',
 'o',
 '(',
 '`',
 '.',
 'S',
 ';',
 '?',
 'V',
 '2',
 '&',
 'X',
 'L',
 'n',
 'b',
 'l',
 'O',
 '9',
 '!',
 '\n',
 '8',
 'D',
 'H',
 ',',
 'T',
 'C',
 'x',
 ':',
 "'",
 'M',
 'q',
 'e',
 'w',
 'R',
 't',
 '0',
 '7',
 'd',
 'N',
 '5',
 'K',
 'v',
 'g',
 'Y',
 ']',
 'u',
 'G',
 'm',
 ' ',
 '3')

In [80]:
# Map each integer code to its character (position in `all_characters` -> char)
decoder = {index: char for index, char in enumerate(all_characters)}

In [81]:
decoder

{0: 'k',
 1: 'F',
 2: 'c',
 3: 'f',
 4: 'r',
 5: '[',
 6: 'p',
 7: 'J',
 8: '>',
 9: 'j',
 10: 'z',
 11: '_',
 12: 'i',
 13: 'I',
 14: '}',
 15: 'h',
 16: '-',
 17: 'y',
 18: 'Z',
 19: '<',
 20: 's',
 21: 'A',
 22: 'B',
 23: 'E',
 24: ')',
 25: '1',
 26: 'W',
 27: '|',
 28: '"',
 29: 'a',
 30: 'P',
 31: '4',
 32: 'U',
 33: '6',
 34: 'Q',
 35: 'o',
 36: '(',
 37: '`',
 38: '.',
 39: 'S',
 40: ';',
 41: '?',
 42: 'V',
 43: '2',
 44: '&',
 45: 'X',
 46: 'L',
 47: 'n',
 48: 'b',
 49: 'l',
 50: 'O',
 51: '9',
 52: '!',
 53: '\n',
 54: '8',
 55: 'D',
 56: 'H',
 57: ',',
 58: 'T',
 59: 'C',
 60: 'x',
 61: ':',
 62: "'",
 63: 'M',
 64: 'q',
 65: 'e',
 66: 'w',
 67: 'R',
 68: 't',
 69: '0',
 70: '7',
 71: 'd',
 72: 'N',
 73: '5',
 74: 'K',
 75: 'v',
 76: 'g',
 77: 'Y',
 78: ']',
 79: 'u',
 80: 'G',
 81: 'm',
 82: ' ',
 83: '3'}

In [82]:
# Map each character to its integer code by inverting `decoder`.
# (The comprehension must iterate `decoder`; `encoder` does not exist yet.)
encoder = {char: i for i, char in decoder.items()}

In [83]:
encoder

{'k': 0,
 'F': 1,
 'c': 2,
 'f': 3,
 'r': 4,
 '[': 5,
 'p': 6,
 'J': 7,
 '>': 8,
 'j': 9,
 'z': 10,
 '_': 11,
 'i': 12,
 'I': 13,
 '}': 14,
 'h': 15,
 '-': 16,
 'y': 17,
 'Z': 18,
 '<': 19,
 's': 20,
 'A': 21,
 'B': 22,
 'E': 23,
 ')': 24,
 '1': 25,
 'W': 26,
 '|': 27,
 '"': 28,
 'a': 29,
 'P': 30,
 '4': 31,
 'U': 32,
 '6': 33,
 'Q': 34,
 'o': 35,
 '(': 36,
 '`': 37,
 '.': 38,
 'S': 39,
 ';': 40,
 '?': 41,
 'V': 42,
 '2': 43,
 '&': 44,
 'X': 45,
 'L': 46,
 'n': 47,
 'b': 48,
 'l': 49,
 'O': 50,
 '9': 51,
 '!': 52,
 '\n': 53,
 '8': 54,
 'D': 55,
 'H': 56,
 ',': 57,
 'T': 58,
 'C': 59,
 'x': 60,
 ':': 61,
 "'": 62,
 'M': 63,
 'q': 64,
 'e': 65,
 'w': 66,
 'R': 67,
 't': 68,
 '0': 69,
 '7': 70,
 'd': 71,
 'N': 72,
 '5': 73,
 'K': 74,
 'v': 75,
 'g': 76,
 'Y': 77,
 ']': 78,
 'u': 79,
 'G': 80,
 'm': 81,
 ' ': 82,
 '3': 83}

In [84]:
encoder['h']

15

In [85]:
decoder[15]

'h'

### Encode entire text

In [86]:
# Translate the corpus character by character into a NumPy array of integer codes
encoded_text = np.array([encoder[symbol] for symbol in text])

In [87]:
encoded_text[:50]

array([53, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82,
       82, 82, 82, 82, 82, 25, 53, 82, 82,  1,  4, 35, 81, 82,  3, 29, 12,
        4, 65, 20, 68, 82,  2,  4, 65, 29, 68, 79,  4, 65, 20, 82, 66])

In [90]:
decoder[82]

' '

## Create Batch Function

In [None]:
def gen_batches(char_arr, batch_size, seq_len):
    '''
    Yield (x, y) training batches cut from a 1-D array of encoded characters.

    char_arr : 1-D array (or sequence) of integer-encoded characters
    batch_size : number of sequences (rows) in every batch
    seq_len : number of character steps in every sequence

    Every yielded x and y has shape (batch_size, seq_len). y holds the same
    characters as x shifted one step to the left, i.e. y[:, t] is the
    character that follows x[:, t]. Trailing characters that do not fill a
    complete batch are dropped; if there is not enough data for a single
    batch, nothing is yielded.
    '''

    chars_per_batch = batch_size * seq_len

    # Integer division: only complete batches are used
    num_batches = len(char_arr) // chars_per_batch
    if num_batches == 0:
        return

    # Keep only enough characters to make full batches
    arr = np.asarray(char_arr)[:num_batches * chars_per_batch]

    # Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    n_cols = arr.shape[1]

    # Iterate through the array, one window of seq_len columns at a time
    for n in range(0, n_cols, seq_len):
        # The features
        x = arr[:, n:n + seq_len]

        # The targets, shifted by one
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]

        # The last target is the column right after the window; for the
        # final window there is none, so wrap around to the first column.
        next_col = n + seq_len
        y[:, -1] = arr[:, next_col] if next_col < n_cols else arr[:, 0]

        yield x, y

## One Hot Encoding

In [44]:
def one_hot(arr, n_labels):
    '''
    One-hot encode an array of integer labels.

    arr : array of integer labels in the range [0, n_labels), any shape
    n_labels : size of the one-hot dimension

    Returns a float32 array of shape (*arr.shape, n_labels) in which
    result[..., k] is 1 where the label equals k and 0 everywhere else.
    '''

    arr = np.asarray(arr)

    # Initialize the encoded array: one row per label, all zeros
    one_hot_arr = np.zeros((arr.size, n_labels), dtype=np.float32)

    # Fill the appropriate elements with ones
    one_hot_arr[np.arange(arr.size), arr.ravel()] = 1.

    # Finally reshape it to get back to the original array layout
    return one_hot_arr.reshape((*arr.shape, n_labels))

## Model

In [None]:
class CharRNN(nn.Module):
    '''
    Character-level recurrent network: LSTM --> DropOut ---> FC

    The fully connected layer maps every LSTM output step back to
    `input_size` scores, one per possible character.
    '''

    def __init__(self, input_size, n_hidden=256, n_layers=2,drop_prob=0.5, lr=0.001):
        '''
        input_size : width of the one-hot input vectors (also the output width)
        n_hidden : number of hidden units in every LSTM layer
        n_layers : number of stacked LSTM layers
        drop_prob : dropout probability, used between LSTM layers and
                    again before the fully connected layer
        lr : learning rate; only stored on the instance, not used by the class
        '''
        # nn.Module must be initialized before sub-modules are registered
        super().__init__()

        # For init_hidden method
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        self.lstm = nn.LSTM(input_size, n_hidden, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(n_hidden, input_size)


    def forward(self, x, hidden):
        '''
        Run one forward pass.

        x : input for the LSTM (batch first, last dimension input_size)
        hidden : (hidden state, cell state) tuple as returned by init_hidden
                 or by a previous call

        Returns (scores, hidden), where scores has one row per input step
        (batch and sequence dimensions flattened together) and `input_size`
        columns, and hidden is the updated LSTM state.
        '''

        # Get the outputs and the new hidden state from the lstm
        lstm_output, hidden = self.lstm(x, hidden)

        # Pass LSTM_Output through Dropout
        drop_output = self.dropout(lstm_output)

        # Reshape output using view for FC
        # https://stackoverflow.com/questions/48915810/pytorch-contiguous
        drop_output = drop_output.contiguous().view(-1, self.n_hidden)

        # Pass through FC
        fc_final_out = self.fc(drop_output)

        # Return the final output and the hidden state
        return fc_final_out, hidden


    def init_hidden(self, batch_size):
        '''
        Create a zeroed (hidden state, cell state) tuple for the LSTM.

        Both tensors have shape (n_layers, batch_size, n_hidden) and are
        created with the dtype and device of the model's own parameters,
        so they are usable wherever the model currently lives.
        '''
        weight = next(self.parameters()).data
        state_shape = (self.n_layers, batch_size, self.n_hidden)

        return (weight.new_zeros(state_shape),
                weight.new_zeros(state_shape))

In [46]:
# Detect once whether a CUDA device can be used and report the result
train_on_gpu = torch.cuda.is_available()
print('Training on GPU!' if train_on_gpu
      else 'No GPU available, training on CPU; consider making n_epochs very small.')

Training on GPU!


In [62]:
# Declaring the train method
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    ''' Training a network 
    
        Arguments
        ---------
        
        net: CharRNN network
        data: integer-encoded text data to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    
        The last `val_frac` of `data` is held out for validation. Every
        `print_every` steps the current training loss and the mean
        validation loss are printed. Nothing is returned; `net` is
        updated in place.
    '''
    net.train()
    
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # create training and validation data
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]
    
    if train_on_gpu:
        net.cuda()
    
    counter = 0
    # Width of the one-hot vectors = input size the network's LSTM was built with
    n_chars = net.lstm.input_size
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in gen_batches(data, batch_size, seq_length):
            counter += 1
            
            # One-hot encode our data and make them Torch tensors
            x = one_hot(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            if train_on_gpu:
                inputs, targets = inputs.cuda(), targets.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()
            
            # get the output from the model
            output, h = net(inputs, h)
            
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(batch_size*seq_length).long())
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()
            
            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                # No gradients are needed to measure the validation loss
                with torch.no_grad():
                    for val_x, val_y in gen_batches(val_data, batch_size, seq_length):
                        # One-hot encode our data and make them Torch tensors
                        val_x = one_hot(val_x, n_chars)
                        inputs, targets = torch.from_numpy(val_x), torch.from_numpy(val_y)
                        
                        # Carry only the values of the hidden state forward
                        val_h = tuple([each.data for each in val_h])
                        
                        if train_on_gpu:
                            inputs, targets = inputs.cuda(), targets.cuda()

                        output, val_h = net(inputs, val_h)
                        val_loss = criterion(output, targets.view(batch_size*seq_length).long())
                    
                        val_losses.append(val_loss.item())
                
                net.train() # reset to train mode after iterating through validation data
                
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))
        

In [63]:
              
# Define and print the net
n_hidden=512
n_layers=2

# The network's input (and output) width is the vocabulary size
net = CharRNN(len(all_characters), n_hidden, n_layers)
print(net)

# Declaring the hyperparameters
batch_size = 128
seq_length = 100
n_epochs = 100 # start smaller if you are just testing initial behavior

# train the model on the integer-encoded corpus
train(net, encoded_text, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=50)

CharRNN(
  (lstm): LSTM(84, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5)
  (fc): Linear(in_features=512, out_features=84, bias=True)
)
Epoch: 1/100... Step: 50... Loss: 3.1955... Val Loss: 3.2117
Epoch: 1/100... Step: 100... Loss: 3.1437... Val Loss: 3.1279
Epoch: 1/100... Step: 150... Loss: 2.8496... Val Loss: 2.8282
Epoch: 1/100... Step: 200... Loss: 2.5257... Val Loss: 2.4636
Epoch: 1/100... Step: 250... Loss: 2.3174... Val Loss: 2.2507
Epoch: 1/100... Step: 300... Loss: 2.1953... Val Loss: 2.1250
Epoch: 1/100... Step: 350... Loss: 2.0662... Val Loss: 2.0319
Epoch: 2/100... Step: 400... Loss: 2.0426... Val Loss: 1.9600
Epoch: 2/100... Step: 450... Loss: 1.9731... Val Loss: 1.8998
Epoch: 2/100... Step: 500... Loss: 1.8989... Val Loss: 1.8398
Epoch: 2/100... Step: 550... Loss: 1.8564... Val Loss: 1.7976
Epoch: 2/100... Step: 600... Loss: 1.8277... Val Loss: 1.7582
Epoch: 2/100... Step: 650... Loss: 1.7905... Val Loss: 1.7206
Epoch: 2/100... Step: 700..

Epoch: 17/100... Step: 6400... Loss: 1.2139... Val Loss: 1.2689
Epoch: 17/100... Step: 6450... Loss: 1.1988... Val Loss: 1.2696
Epoch: 18/100... Step: 6500... Loss: 1.1614... Val Loss: 1.2704
Epoch: 18/100... Step: 6550... Loss: 1.1721... Val Loss: 1.2690
Epoch: 18/100... Step: 6600... Loss: 1.2283... Val Loss: 1.2668
Epoch: 18/100... Step: 6650... Loss: 1.1910... Val Loss: 1.2662
Epoch: 18/100... Step: 6700... Loss: 1.1584... Val Loss: 1.2705
Epoch: 18/100... Step: 6750... Loss: 1.1958... Val Loss: 1.2720
Epoch: 18/100... Step: 6800... Loss: 1.2159... Val Loss: 1.2693
Epoch: 18/100... Step: 6850... Loss: 1.1737... Val Loss: 1.2675
Epoch: 19/100... Step: 6900... Loss: 1.1971... Val Loss: 1.2642
Epoch: 19/100... Step: 6950... Loss: 1.1968... Val Loss: 1.2671
Epoch: 19/100... Step: 7000... Loss: 1.1919... Val Loss: 1.2652
Epoch: 19/100... Step: 7050... Loss: 1.1311... Val Loss: 1.2657
Epoch: 19/100... Step: 7100... Loss: 1.1908... Val Loss: 1.2669
Epoch: 19/100... Step: 7150... Loss: 1.1

Epoch: 34/100... Step: 12800... Loss: 1.1188... Val Loss: 1.2486
Epoch: 34/100... Step: 12850... Loss: 1.1197... Val Loss: 1.2448
Epoch: 34/100... Step: 12900... Loss: 1.1157... Val Loss: 1.2470
Epoch: 34/100... Step: 12950... Loss: 1.0806... Val Loss: 1.2511
Epoch: 35/100... Step: 13000... Loss: 1.1290... Val Loss: 1.2536
Epoch: 35/100... Step: 13050... Loss: 1.1134... Val Loss: 1.2523
Epoch: 35/100... Step: 13100... Loss: 1.1285... Val Loss: 1.2485
Epoch: 35/100... Step: 13150... Loss: 1.0755... Val Loss: 1.2487
Epoch: 35/100... Step: 13200... Loss: 1.0842... Val Loss: 1.2481
Epoch: 35/100... Step: 13250... Loss: 1.1181... Val Loss: 1.2483
Epoch: 35/100... Step: 13300... Loss: 1.0919... Val Loss: 1.2495
Epoch: 35/100... Step: 13350... Loss: 1.1035... Val Loss: 1.2561
Epoch: 36/100... Step: 13400... Loss: 1.1165... Val Loss: 1.2500
Epoch: 36/100... Step: 13450... Loss: 1.0881... Val Loss: 1.2524
Epoch: 36/100... Step: 13500... Loss: 1.1278... Val Loss: 1.2519
Epoch: 36/100... Step: 13

Epoch: 51/100... Step: 19150... Loss: 1.0854... Val Loss: 1.2503
Epoch: 51/100... Step: 19200... Loss: 1.0588... Val Loss: 1.2493
Epoch: 51/100... Step: 19250... Loss: 1.0441... Val Loss: 1.2567
Epoch: 51/100... Step: 19300... Loss: 1.0744... Val Loss: 1.2533
Epoch: 51/100... Step: 19350... Loss: 1.1094... Val Loss: 1.2522
Epoch: 51/100... Step: 19400... Loss: 1.0845... Val Loss: 1.2506
Epoch: 51/100... Step: 19450... Loss: 1.0585... Val Loss: 1.2586
Epoch: 52/100... Step: 19500... Loss: 1.0727... Val Loss: 1.2527
Epoch: 52/100... Step: 19550... Loss: 1.1268... Val Loss: 1.2561
Epoch: 52/100... Step: 19600... Loss: 1.0929... Val Loss: 1.2535
Epoch: 52/100... Step: 19650... Loss: 1.1050... Val Loss: 1.2547
Epoch: 52/100... Step: 19700... Loss: 1.0779... Val Loss: 1.2545
Epoch: 52/100... Step: 19750... Loss: 1.0915... Val Loss: 1.2547
Epoch: 52/100... Step: 19800... Loss: 1.1128... Val Loss: 1.2561
Epoch: 52/100... Step: 19850... Loss: 1.0380... Val Loss: 1.2613
Epoch: 53/100... Step: 19

KeyboardInterrupt: 

In [68]:
# Saving the model
model_name = 'shake_50_epoch.net'

# CharRNN does not keep the vocabulary itself, so the tokens are taken from
# `all_characters`, the tuple the encoder/decoder mappings were built from.
checkpoint = {'n_hidden': net.n_hidden,
              'n_layers': net.n_layers,
              'state_dict': net.state_dict(),
              'tokens': all_characters}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)
    

In [69]:
# Saving the model
model_name = 'shake_50_epoch.net'

# CharRNN does not keep the vocabulary itself, so the tokens are taken from
# `all_characters`, the tuple the encoder/decoder mappings were built from.
checkpoint = {'n_hidden': net.n_hidden,
              'n_layers': net.n_layers,
              'state_dict': net.state_dict(),
              'tokens': all_characters}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)
   

In [66]:
 
# Defining a method to generate the next character
def predict(net, char, h=None, top_k=None):
    ''' Given a character, predict the next character.
        Returns the predicted character and the hidden state.

        Arguments
        ---------

        net: CharRNN network
        char: the current character; must be a key of `encoder`
        h: LSTM (hidden, cell) state to continue from; a fresh zero
           state for a batch of one is created when it is None
        top_k: if given, sample only among the top_k most probable
               characters; otherwise sample among all characters
    '''
    # Width of the one-hot vectors = input size the network's LSTM was built with
    n_chars = net.lstm.input_size

    if h is None:
        h = net.init_hidden(1)

    # tensor inputs: a single character as a (1, 1, n_chars) one-hot batch
    x = np.array([[encoder[char]]])
    x = one_hot(x, n_chars)
    inputs = torch.from_numpy(x)

    if train_on_gpu:
        inputs = inputs.cuda()

    # detach hidden state from history
    h = tuple([each.data for each in h])
    # get the output of the model; no gradients are needed for sampling
    with torch.no_grad():
        out, h = net(inputs, h)

    # get the character probabilities
    p = F.softmax(out, dim=1).data
    if train_on_gpu:
        p = p.cpu() # move to cpu

    # get top characters
    if top_k is None:
        top_ch = np.arange(n_chars)
    else:
        p, top_ch = p.topk(top_k)
        # reshape(-1) keeps a 1-D array even when top_k == 1
        top_ch = top_ch.numpy().reshape(-1)

    # select the likely next character with some element of randomness
    p = p.numpy().reshape(-1)
    char = np.random.choice(top_ch, p=p/p.sum())

    # return the decoded predicted char and the hidden state
    return decoder[int(char)], h
        

In [67]:
# Declaring a method to generate new text
def sample(net, size, prime='The', top_k=None):
    ''' Generate text with the network.

        Arguments
        ---------

        net: CharRNN network
        size: number of predict steps run after the priming phase
        prime: non-empty string used to warm up the hidden state; the
               generated text starts with it
        top_k: forwarded to predict (sample only among the top_k most
               probable characters when given)

        Returns `prime` followed by the generated characters: one
        predicted from the end of `prime`, then `size` more.
        Raises ValueError when `prime` is empty, because there would be
        no character to start predicting from.
    '''
    if not prime:
        raise ValueError("prime must contain at least one character")

    if train_on_gpu:
        net.cuda()
    else:
        net.cpu()

    net.eval() # eval mode

    # First off, run through the prime characters to build up the hidden state
    chars = list(prime)
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    # Only the prediction that follows the last prime character is kept
    chars.append(char)

    # Now pass in the previous character and get a new one
    for _ in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)
    
# Generating new text: sample from the network with size=1000, primed with
# 'A', choosing among the 5 most probable characters at every step
print(sample(net, 1000, prime='A', top_k=5))

ALD,                               Iago, with trumpets.
                                     [They set them stroke and]

          Enter BOLINGBROKE, and other LADIES and SOLDIERS

    And, when I humbly warrant thee, sweet soldier,  
    And we are set this means to stand and beat,
    As thus I spill, and stop as true; worse, honest truth;
    By him that I might hate himself in thee,
    As I am all our crown against the spirits
    And speaked against them. I shall see at home,
    Are bound to meet the power to say to me.
    And, for the wild and that, are not to send
    The thick as there. Have you a subject time
    The conference which I would be an old measure?
    And to be made as madam to my brain.
    Who should steal her and more? To the cheer of men,
    This is a sport that sent you on the way
    To be thy father to thy lovely blessings;
    And, by those what should be my pow'ring souls,
    I warrant thee, sir, that was not but a man
    To many a senanor of my pow

In [28]:
top_ch

NameError: name 'top_ch' is not defined