In [1]:
import torch
from torch import nn
import numpy as np
import torch.nn.functional as F
import time, os

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [None]:
# Building the web scraper

In [36]:
# Download poem pages from arquivopessoa.net and cache each poem's text as
# data/<id>.txt. Ids that already have a cached file are skipped, so the cell
# can be re-run to resume an interrupted scrape.
print('start')
# Make sure the cache directory exists before the first write.
os.makedirs('data', exist_ok=True)
for n in range(1, 2000):
    if n % 100 == 0:
        print(n)
    poem_id = str(n)
    file_path = 'data/' + poem_id + '.txt'
    if os.path.isfile(file_path):
        continue

    url = 'http://arquivopessoa.net/textos/' + poem_id
    try:
        html = urlopen(url).read()
    except Exception:
        # Best-effort scrape: any failed request just skips this id.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        html = None

    if html is not None:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(html, 'html.parser')
        poem_div = soup.find('div', class_='texto-poesia')
        if poem_div is not None:
            # Written with the platform default encoding, which is also what
            # the cell that reads these files back uses.
            with open(file_path, 'w') as f:
                f.write(poem_div.get_text())
    # Throttle requests; only reached when a download was attempted.
    time.sleep(0.25)
print('end')

start
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
end


In [128]:
# Concatenate every cached poem (data/1.txt .. data/1999.txt, in id order)
# into a single training corpus.
poems = []
for n in range(1, 2000):
    file_path = 'data/' + str(n) + '.txt'
    if os.path.isfile(file_path):
        with open(file_path, 'r') as f:
            poems.append(f.read())
all_poems = ''.join(poems)

# Write the combined corpus once, after the loop, instead of rewriting the
# whole file for every poem. Nothing is written when no poem file was found.
if poems:
    with open('data/all_poems.txt', 'w') as f:
        f.write(all_poems)
text = all_poems

In [54]:
# Vocabulary: every distinct character of the corpus. Sorting makes the
# character <-> integer mapping identical on every run; iterating a bare set
# of strings gives a different order after each kernel restart.
chars = tuple(sorted(set(text)))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

# Encode the corpus as an array of integer character ids
encoded = np.array([char2int[ch] for ch in text])

In [3]:
def one_hot_encode(arr, n_labels):
    '''One-hot encode an integer array.

       Arguments
       ---------
       arr: numpy array of integer labels, of any number of dimensions
       n_labels: number of distinct labels (size of the new last axis)

       Returns
       -------
       float32 array of shape (*arr.shape, n_labels) holding 1.0 at each
       element's label index and 0.0 elsewhere.
    '''
    # One row per element of arr; arr.size works for any number of
    # dimensions, not only 2-D input.
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    # Fill the appropriate elements with ones
    one_hot[np.arange(arr.size), arr.flatten()] = 1.
    # Finally reshape it to get back to the original array layout
    return one_hot.reshape((*arr.shape, n_labels))

In [4]:
def get_batches(arr, batch_size, seq_length):
    '''Generate (x, y) mini-batches of shape batch_size x seq_length from arr.

       Arguments
       ---------
       arr: 1-D array of encoded characters to cut into batches
       batch_size: number of sequences (rows) in each batch
       seq_length: number of encoded characters in each sequence

       Yields
       ------
       x: the input window
       y: the targets, i.e. x shifted left by one position; the last target
          of the final window wraps around to the first column of the data
    '''
    chars_per_batch = batch_size * seq_length
    full_batches = len(arr) // chars_per_batch

    # Drop the tail that cannot fill a complete batch, then lay the data out
    # as batch_size parallel rows.
    rows = arr[:full_batches * chars_per_batch].reshape((batch_size, -1))
    n_cols = rows.shape[1]

    # Slide a window of seq_length columns across the rows.
    for start in range(0, n_cols, seq_length):
        stop = start + seq_length
        x = rows[:, start:stop]
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]
        # The column right after the window is the last target; the final
        # window has no such column, so it wraps to column 0.
        if stop < n_cols:
            y[:, -1] = rows[:, stop]
        else:
            y[:, -1] = rows[:, 0]
        yield x, y

In [5]:
# Detect CUDA once; the training and sampling code reads this flag to decide
# whether to move the model and tensors to the GPU.
train_on_gpu = torch.cuda.is_available()
print('Training on GPU!' if train_on_gpu
      else 'No GPU available, training on CPU; consider making n_epochs very small.')

Training on GPU!


In [6]:
class CharLSTM(nn.Module):
    ''' Character-level model: stacked LSTM -> dropout -> linear layer that
        produces one score per token.

        Arguments
        ---------
        tokens: sequence of the distinct characters (the vocabulary)
        n_hidden: number of hidden units in each LSTM layer
        n_layers: number of stacked LSTM layers
        drop_prob: dropout probability, used between LSTM layers and on the
                   LSTM output
        lr: stored on the instance as `self.lr`; not used inside this class
    '''

    def __init__(self, tokens, n_hidden=256, n_layers=2,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(tokens))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}

        # Layers: the LSTM takes one-hot vectors of size len(tokens) with the
        # batch dimension first; the linear layer maps back to token scores.
        self.lstm = nn.LSTM(len(tokens), n_hidden, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(n_hidden, len(tokens))

    def forward(self, x, hidden):
        ''' Forward pass through the network.

            x: input of shape (batch, seq, len(tokens)) (the LSTM is batch_first)
            hidden: (hidden state, cell state) tuple, e.g. from `init_hidden`

            Returns (out, hidden), where `out` has one row of token scores per
            input position, shape (batch * seq, len(tokens)), and `hidden` is
            the updated LSTM state.
        '''
        r_output, hidden = self.lstm(x, hidden)
        out = self.dropout(r_output)
        # Stack every time step of every sequence into rows for the linear layer
        out = out.contiguous().view(-1, self.n_hidden)
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden

    def init_hidden(self, batch_size):
        ''' Initializes hidden state.

            Returns a (hidden state, cell state) tuple of zero tensors, each of
            size n_layers x batch_size x n_hidden.
        '''
        # Allocate on the same device and with the same dtype as the model's
        # parameters, so the state always matches the model no matter where
        # the model currently lives (no reliance on a global GPU flag).
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.n_hidden)

        return (weight.new_zeros(shape), weight.new_zeros(shape))
        

In [7]:
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    ''' Training a network

        Trains `net` in place with Adam and cross-entropy loss and prints the
        training and validation loss every `print_every` steps. Returns None.
        Relies on the notebook-level `train_on_gpu` flag and on the
        `get_batches` / `one_hot_encode` helpers.

        Arguments
        ---------

        net: CharLSTM network
        data: encoded text data (array of integer character ids) to train on
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss

    '''
    net.train()

    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # create training and validation data (the tail of `data` is held out)
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]

    if train_on_gpu:
        net.cuda()

    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)

        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1

            # One-hot encode our data and make them Torch tensors.
            # CrossEntropyLoss needs int64 class indices; numpy's default
            # integer type is not int64 on every platform, so cast explicitly.
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y).long()

            if train_on_gpu:
                inputs, targets = inputs.cuda(), targets.cuda()

            # Detach the hidden state from its history, otherwise
            # we'd backprop through the entire training history
            h = tuple(each.detach() for each in h)

            # zero accumulated gradients
            net.zero_grad()

            # get the output from the model
            output, h = net(inputs, h)

            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(batch_size*seq_length))
            loss.backward()
            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                # No gradients are needed for validation; skipping the
                # autograd graph saves memory and time.
                with torch.no_grad():
                    for x, y in get_batches(val_data, batch_size, seq_length):
                        # One-hot encode our data and make them Torch tensors
                        x = one_hot_encode(x, n_chars)
                        inputs, targets = torch.from_numpy(x), torch.from_numpy(y).long()

                        if train_on_gpu:
                            inputs, targets = inputs.cuda(), targets.cuda()

                        output, val_h = net(inputs, val_h)
                        val_loss = criterion(output, targets.view(batch_size*seq_length))

                        val_losses.append(val_loss.item())

                net.train() # reset to train mode after iterating through validation data

                # If the validation split produced no batch, report nan
                # without triggering numpy's empty-mean warning.
                mean_val_loss = np.mean(val_losses) if val_losses else float('nan')

                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(mean_val_loss))

In [106]:
# Model hyperparameters: number of stacked LSTM layers and hidden units per layer
n_layers = 2
n_hidden = 256

# Build the network over the corpus vocabulary and show its architecture
net = CharLSTM(tokens=chars, n_hidden=n_hidden, n_layers=n_layers)
print(net)

CharLSTM(
  (lstm): LSTM(117, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5)
  (fc): Linear(in_features=256, out_features=117, bias=True)
)


In [114]:
n_epochs = 10  # start small if you are just testing initial behavior
batch_size = 50
seq_length = 20

# Hyperparameter values to sweep; every combination is trained in turn on
# the same `net`. The loops below use these lists, not the two scalars above.
lr_list = [0.001]
seq_length_list = [20]
batch_sizes = [64]

# train the model
for lr in lr_list:
    print('lr:', lr)
    for length in seq_length_list:
        print('Seq_length:', length)
        for batch_size in batch_sizes:
            print('batch_size:', batch_size)
            train(
                net,
                encoded,
                epochs=n_epochs,
                batch_size=batch_size,
                seq_length=length,
                lr=lr,
                print_every=200,
            )

lr: 0.001
Seq_length: 20
batch_size: 64
Epoch: 1/10... Step: 200... Loss: 1.5190... Val Loss: 1.7200
Epoch: 2/10... Step: 400... Loss: 1.5138... Val Loss: 1.7159
Epoch: 2/10... Step: 600... Loss: 1.4283... Val Loss: 1.7135
Epoch: 3/10... Step: 800... Loss: 1.5360... Val Loss: 1.7116
Epoch: 3/10... Step: 1000... Loss: 1.3786... Val Loss: 1.7098
Epoch: 4/10... Step: 1200... Loss: 1.4964... Val Loss: 1.7110
Epoch: 4/10... Step: 1400... Loss: 1.4287... Val Loss: 1.7094
Epoch: 5/10... Step: 1600... Loss: 1.4945... Val Loss: 1.7098
Epoch: 5/10... Step: 1800... Loss: 1.4486... Val Loss: 1.7055
Epoch: 6/10... Step: 2000... Loss: 1.4040... Val Loss: 1.7137
Epoch: 7/10... Step: 2200... Loss: 1.4397... Val Loss: 1.7111
Epoch: 7/10... Step: 2400... Loss: 1.4275... Val Loss: 1.7179
Epoch: 8/10... Step: 2600... Loss: 1.4069... Val Loss: 1.7131
Epoch: 8/10... Step: 2800... Loss: 1.4063... Val Loss: 1.7186
Epoch: 9/10... Step: 3000... Loss: 1.4704... Val Loss: 1.7185
Epoch: 9/10... Step: 3200... Loss:

In [89]:
# Checkpoint file name; change it to keep several checkpoints side by side
model_name = 'lstm.cp'

# Keep the architecture settings and the vocabulary next to the weights, so
# the network can be rebuilt from the checkpoint alone.
checkpoint = dict(
    n_hidden=net.n_hidden,
    n_layers=net.n_layers,
    state_dict=net.state_dict(),
    tokens=net.chars,
)

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)

In [8]:
def predict(net, char, h=None, top_k=None):
    ''' Given a character, predict the next character.

        Arguments
        ---------
        net: trained network exposing `chars`, `char2int`, `int2char` and
             `init_hidden`
        char: the current character; must be a key of `net.char2int`
        h: (hidden state, cell state) tuple from the previous step, or None
           to start from a zero state
        top_k: if given, sample only among the `top_k` most likely
               characters; otherwise sample from the whole vocabulary

        Returns the predicted character and the hidden state.
    '''
    # tensor inputs: a batch of one sequence holding one one-hot character
    x = np.array([[net.char2int[char]]])
    x = one_hot_encode(x, len(net.chars))
    # Put the input on whatever device the model's parameters are on
    inputs = torch.from_numpy(x).to(next(net.parameters()).device)

    if h is None:
        # No previous state supplied: start from zeros
        h = net.init_hidden(1)
    else:
        # detach hidden state from history
        h = tuple(each.detach() for each in h)

    # Inference only, so no autograd graph is needed
    with torch.no_grad():
        # get the output of the model
        out, h = net(inputs, h)
        # get the character probabilities
        p = F.softmax(out, dim=1)
    p = p.cpu() # no-op when already on the CPU

    # get top characters
    if top_k is None:
        top_ch = np.arange(len(net.chars))
    else:
        p, top_ch = p.topk(top_k)
        # reshape(-1) keeps a 1-D array even for top_k == 1, where squeeze()
        # would produce a 0-d array that np.random.choice cannot sample from
        top_ch = top_ch.numpy().reshape(-1)

    # select the likely next character with some element of randomness
    p = p.numpy().reshape(-1)
    char = np.random.choice(top_ch, p=p/p.sum())

    # return the predicted character and the hidden state
    return net.int2char[int(char)], h

In [9]:
def sample(net, size, prime='The', top_k=None):
    ''' Generate text from the network, one character at a time.

        Arguments
        ---------
        net: trained network; it is moved to the GPU or CPU according to the
             notebook-level `train_on_gpu` flag and left in eval mode
        size: number of characters to generate after the first prediction
        prime: non-empty seed text used to warm up the hidden state
        top_k: passed to `predict`; restricts sampling to the `top_k` most
               likely characters

        Returns `prime` followed by `size + 1` generated characters.

        Raises ValueError if `prime` is empty, because there would be no
        character to start predicting from.
    '''
    if not prime:
        raise ValueError('prime must contain at least one character')

    if train_on_gpu:
        net.cuda()
    else:
        net.cpu()

    net.eval() # eval mode

    # First off, run through the prime characters to build up the hidden
    # state; only the prediction after the last prime character is kept.
    chars = list(prime)
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)

    # Now pass in the previous character and get a new one
    for _ in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

In [137]:
# Generate text from the freshly trained network, seeded with 'Alma' and
# sampling among the 5 most likely characters at each step
print(sample(net, size=500, prime='Alma', top_k=5))

Almanda de está-las, 
E eu o que se perco o meu sentido a tudo para o seu.

Sinto sinto, alta da minha alma, eu nunca sente eu.
                (Ver,
Não temos nostalidades, não consegues, acestrada alta nenhum não ter
Serei nem esse mar que não se abestam nunca estou.

Se sombra quem a soleção nos cais nem cheias nas escondes.

Eu porque estou contente do mundo!
Eu deixa de ser eu a casa e a vida e de compreensível.

Encontro-me até não ser-se, e a vida.

Que sentir nem me acordar e a conterra
Que e


In [10]:
# Rebuild a network from the 'lstm.cp' checkpoint: architecture settings and
# vocabulary come from the checkpoint, then the saved weights are restored.
# map_location='cpu' lets a checkpoint that was saved from a GPU session load
# on a machine without CUDA.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
with open('lstm.cp', 'rb') as f:
    checkpoint = torch.load(f, map_location='cpu')

loaded = CharLSTM(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'])
loaded.load_state_dict(checkpoint['state_dict'])

In [12]:
# Generate text with the network restored from the checkpoint, keeping only
# the 2 most likely characters as candidates at each step
print(sample(loaded, size=2000, prime="Para ser grande, ", top_k=2))

Para ser grande, 
A coração esto e estar a meu carta de mim.

A coração de meu coração,
A meus corpos a ser a ser de meu sentir,
E a solho de sonhar do meu coração do sonhar a sentida, e esto e a sol a minha sentido.

Nos corias do minha alma a mim a sonho dos ser a sente as cantar destis de meu como escorte a mim a meu sentida a meu como esta do meu ser de meu ser de minha,
Escrente do mim de sente de sensação de sentir, ser de meu sente,
E estar de sensagem e a minta a ser.

No sensiga do meu carta, se sente de meu carar do sol,
Esto a ser de ser de sentida de mim.

E o mim, e a meu ser do sente a ser a sonhar de mim.

A meu sonho a sentido do ser.

A cara de meu sonho,
A mena meus andesentes,
E a mim a minha minha mesma anter,
E escrente a mim a sol de meu sentir a mim do meu carar do ser de meu ser.

A sensagem de sonha e a meu como a meste dorme
De me escrente a minha meu de meu canto dos sentidas do sentido,
As minha mim de mim,
E a mesmo ser de meu ser a sonho, esto as minhas se