As usual some imports:

In [None]:
import torch
from torch import nn
import numpy as np
from timeit import default_timer as timer
from time import perf_counter

Check whether CUDA is available so that we can train on the GPU:

In [None]:
# Prefer the GPU when CUDA is present, otherwise fall back to the CPU so the
# rest of the notebook still runs on machines without a CUDA device.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")

We define two models: one with dropout and one without. Both expose the same interface, so they can be used interchangeably during training.

In [None]:
class ModelLSTM(nn.Module):
    """Stacked LSTM followed by a linear layer, without dropout.

    The linear layer maps every LSTM output vector back to ``input_size``
    scores, i.e. the output size equals the input size.

    Args:
        input_size: Size of each input vector (and of each output vector).
        hidden_size: Number of features in the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, n_layers):
        super().__init__()
        output_size = input_size
        # Kept on the instance so code such as predict_next can read
        # ``model.input_size`` from either model class.
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.lstm = nn.LSTM(input_size, hidden_size, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, full_hidden):
        """Run the LSTM and project every time step through the linear layer.

        Args:
            x: Batch-first input tensor passed to the LSTM.
            full_hidden: ``(hidden, cell_state)`` tuple for the LSTM.

        Returns:
            ``(out, full_hidden)`` where ``out`` has the batch and time
            dimensions flattened into one, and ``full_hidden`` is the updated
            LSTM state.
        """
        out, full_hidden = self.lstm(x, full_hidden)
        # Flatten (batch, time) so the linear layer sees one row per step.
        out = out.contiguous().view(-1, self.hidden_size)
        out = self.fc(out)
        return out, full_hidden

    def init_full_hidden(self, batch_size):
        """Return a random ``(hidden, cell_state)`` tuple for ``batch_size``.

        The tensors are created on the same device and with the same dtype as
        the model's parameters, so they are valid wherever the model has been
        moved.
        """
        reference = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_size)
        hidden = torch.randn(shape, device=reference.device, dtype=reference.dtype)
        cell_state = torch.randn(shape, device=reference.device, dtype=reference.dtype)
        return (hidden, cell_state)

In [None]:
class ModelLSTMDrop(nn.Module):
    """Stacked LSTM with dropout applied before the final linear layer.

    The linear layer maps every LSTM output vector back to ``input_size``
    scores, i.e. the output size equals the input size.

    Args:
        input_size: Size of each input vector (and of each output vector).
        hidden_size: Number of features in the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, n_layers):
        super().__init__()
        output_size = input_size
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.lstm = nn.LSTM(input_size, hidden_size, n_layers, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, full_hidden):
        """Run the LSTM, apply dropout, and project every time step.

        Args:
            x: Batch-first input tensor passed to the LSTM.
            full_hidden: ``(hidden, cell_state)`` tuple for the LSTM.

        Returns:
            ``(out, full_hidden)`` where ``out`` has the batch and time
            dimensions flattened into one, and ``full_hidden`` is the updated
            LSTM state.
        """
        out, full_hidden = self.lstm(x, full_hidden)
        out = self.dropout(out)
        # Flatten (batch, time) so the linear layer sees one row per step.
        out = out.contiguous().view(-1, self.hidden_size)
        out = self.fc(out)
        return out, full_hidden

    def init_full_hidden(self, batch_size):
        """Return a random ``(hidden, cell_state)`` tuple for ``batch_size``.

        The tensors are created on the same device and with the same dtype as
        the model's parameters, so they are valid wherever the model has been
        moved.
        """
        reference = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_size)
        hidden = torch.randn(shape, device=reference.device, dtype=reference.dtype)
        cell_state = torch.randn(shape, device=reference.device, dtype=reference.dtype)
        return (hidden, cell_state)

Some helper functions.

split_eq - splits the dataset into equal parts and returns them

translate_to_int - will do char to int translation for multiple multicharacter examples (a -> 3)

one_hot_encode - one-hot encodes integer indices (0-based), e.g. 5 -> 000001 given that our vocab_size is 6

to_model_format - translates a string of text into model understandable format (pytorch Tensor)

In [None]:
def split_eq(text, no):
    """Split ``text`` into ``no`` consecutive parts of equal length.

    Each part has ``len(text) // no`` items; any items left over at the end
    of ``text`` are dropped.

    Args:
        text: Sequence to split (a string in this notebook).
        no: Number of parts to produce.

    Returns:
        A list of exactly ``no`` equally sized slices of ``text``.

    Raises:
        ZeroDivisionError: If ``no`` is zero.
        ValueError: If ``text`` is too short to give every part at least
            one item.
    """
    cnt = len(text) // no
    if cnt == 0:
        raise ValueError(
            "cannot split a sequence of length {} into {} non-empty parts".format(len(text), no)
        )
    # Slice by part index rather than stepping through the whole text: when
    # the leftover is at least one chunk long, stepping would yield extra
    # parts beyond the requested number.
    return [text[i * cnt:(i + 1) * cnt] for i in range(no)]

    
def produce_targets(examples):
    """Build next-item prediction pairs from each example.

    For every example the input is the example without its last item and the
    target is the example without its first item, so the target at position
    ``k`` is the item that follows input position ``k``.

    Returns:
        A tuple ``(inputs, targets)`` of two lists aligned with ``examples``.
    """
    inputs = []
    targets = []
    for example in examples:
        inputs.append(example[:-1])
        targets.append(example[1:])
    return inputs, targets


def translate_to_int(examples):
    """Map every character of every example to its index via ``char2int``.

    Returns:
        A list with one list of integers per example.
    """
    return [[char2int[ch] for ch in example] for example in examples]


def translate_to_char(examples):
    """Map every index of every example back to text via ``int2char``.

    Returns:
        A list with one string per example.
    """
    decoded = []
    for example in examples:
        decoded.append(''.join(int2char[index] for index in example))
    return decoded


def one_hot_encode(arr, vocab_size):
    """One-hot encode an integer NumPy array.

    Args:
        arr: NumPy array of integer indices, any shape.
        vocab_size: Length of each one-hot vector.

    Returns:
        A ``float32`` array of shape ``(*arr.shape, vocab_size)`` holding a
        single 1 per vector, at the position given by the matching index.
    """
    flat_indices = arr.reshape(-1)
    row_count = flat_indices.shape[0]

    # One row per index, all zeros to begin with.
    encoded = np.zeros((row_count, vocab_size), dtype=np.float32)

    # Switch on exactly one column in every row.
    encoded[np.arange(row_count), flat_indices] = 1.0

    # Restore the leading dimensions of the input.
    return encoded.reshape(arr.shape + (vocab_size,))


def to_model_format(inputs, vocab_size):
    """Turn text into a one-hot encoded PyTorch tensor.

    Args:
        inputs: A single string, or a collection of strings. A single string
            is treated as a collection with one example.
        vocab_size: Length of each one-hot vector.

    Returns:
        A tensor built from the one-hot encoding of the integer-translated
        examples.
    """
    batch = [inputs] if isinstance(inputs, str) else inputs
    indices = np.array(translate_to_int(batch))
    return torch.from_numpy(one_hot_encode(indices, vocab_size))
    

Configuration of dataset.

No of training chars.
No of examples and batch_size.


These settings are important. Notice that no_of_batches decides how often the neural network backpropagates gradients and updates its weights in each epoch.
You can see this in the training code below.

In [None]:
# config
no_of_chars = 164000      # number of characters of the text that are used
no_of_examples = 1024     # number of equal parts the text is split into
examples_per_batch = 128  # examples grouped into one batch
batch_size = examples_per_batch
no_of_batches = no_of_examples // examples_per_batch

Loading the dataset, creating the dictionaries, and splitting the dataset into examples and batches.

Translating examples into ready-to-use format. (In training only tensors and cuda() is needed)

In [None]:
# Read the corpus and keep only the first no_of_chars characters.
with open('./datasets/anna_karenina', 'r', encoding='utf-8') as fd:
    full_text = fd.read()
    full_text = full_text[0:no_of_chars]

vocab = set(full_text)
# Enumerate the characters in sorted order so that the char <-> int mapping is
# identical on every run. Iteration order of a set of strings can change
# between interpreter sessions, which would make the indices learned by a
# saved model meaningless after a restart.
int2char = dict(enumerate(sorted(vocab)))
char2int = {char: ind for ind, char in int2char.items()}
vocab_size = len(char2int)
print("Vocabulary size:", vocab_size)
print("Text lenght:", len(full_text))

# TODO this should be fixed: translate once than split
examples = split_eq(full_text, no_of_examples)
chars_per_example = len(examples[0])
inputs, targets = produce_targets(examples)
trans_inputs = translate_to_int(inputs)
trans_targets = translate_to_int(targets)

# Each batch is a pair (one-hot encoded inputs, integer targets) covering
# examples_per_batch consecutive examples.
batches = []

for i in range(no_of_batches):
    input_seq = one_hot_encode(np.array(trans_inputs[i*examples_per_batch:(i+1)*examples_per_batch]), vocab_size)
    target_seq = np.array(trans_targets[i*examples_per_batch:(i+1)*examples_per_batch])
    batches.append((input_seq, target_seq))

print("No of examples/No of data parts:", no_of_examples)
print("No of batches:", no_of_batches)
print("Examples per batch:", examples_per_batch)
print("Chars per example:", chars_per_example)

Let's define a network:

In [None]:
# Learning rate handed to the Adam optimizer.
lr = 0.001
# Dropout variant of the model; input (and output) size is the vocabulary size.
model = ModelLSTMDrop(
    input_size=vocab_size,
    hidden_size=256,
    n_layers=3,
)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
model.to(device)

We'll train a network. TODO: describe step by step.

In [None]:
epochs = 380
print_every = 4  # report loss and elapsed time every this many epochs

t_start = perf_counter()
model.train()
for epoch in range(1, epochs + 1):
    # Start every epoch from a fresh random hidden/cell state.
    h = model.init_full_hidden(batch_size)
    for x, y in batches:
        # Detach the carried state so gradients do not flow back into the
        # previous batch's graph.
        h = tuple(state.detach() for state in h)
        optimizer.zero_grad()
        # Use batch-local names so the module-level `inputs` / `targets`
        # lists built during dataset preparation are not overwritten.
        batch_inputs = torch.from_numpy(x).to(device)
        batch_targets = torch.from_numpy(y).to(device)
        output, h = model(batch_inputs, h)
        # The model output has one row per (example, time step), so the
        # targets are flattened to match.
        loss = criterion(output, batch_targets.view(-1).long())
        loss.backward()
        optimizer.step()

    if epoch % print_every == 0:
        # The reported loss is the loss of the last batch of the epoch.
        print("Epoch: {}/{}...".format(epoch, epochs),
              "Loss: {:.6f}...".format(loss.item()))
        t_stop = perf_counter()
        print("Time elasped:", t_stop - t_start)

TODO: describe functions

In [None]:
def predict_next(device, model, full_hidden, input_string):
    """Predict the character that follows ``input_string``.

    Args:
        device: Device the encoded input is moved to before the forward pass.
        model: Model exposing ``input_size`` and returning ``(out, hidden)``.
        full_hidden: State passed to the model alongside the input.
        input_string: Text to feed through the model.

    Returns:
        A tuple ``(char, hidden)``: the highest-scoring next character
        according to ``int2char``, and the state returned by the model.
    """
    encoded_input = to_model_format(input_string, model.input_size)
    # Inference only: skip autograd bookkeeping so no graph is built or kept
    # alive through the returned hidden state.
    with torch.no_grad():
        out, hidden = model(encoded_input.to(device), full_hidden)

    # Greedy choice on the last output row. Softmax is monotonic, so the
    # arg-max of the raw scores is also the most probable character.
    char_ind = out[-1].argmax(dim=0).item()
    return int2char[char_ind], hidden


def run_model(device, model, starting_seq, size=100):
    """Generate ``size`` characters that continue ``starting_seq``.

    The model is switched to evaluation mode (and left in it). The
    lower-cased starting sequence is fed once to prime the hidden state;
    after that only the newest character is fed at each step, because the
    carried hidden state already summarizes everything seen so far.

    Args:
        device: Device used for the forward passes.
        model: Model exposing ``eval`` and ``init_full_hidden``.
        starting_seq: Text used to prime the model.
        size: Number of characters to generate.

    Returns:
        The lower-cased starting sequence followed by the generated text.

    Raises:
        ValueError: If characters are requested but ``starting_seq`` is empty.
    """
    model.eval()
    seq = starting_seq.lower()
    if size > 0 and not seq:
        raise ValueError("starting_seq must not be empty")
    h = model.init_full_hidden(1)
    generated = []
    next_input = seq
    for _ in range(size):
        char, h = predict_next(device, model, h, next_input)
        generated.append(char)
        # Feeding the whole sequence again would replay text the hidden
        # state has already consumed.
        next_input = char
    return seq + ''.join(generated)

In [None]:
# Generate 250 characters that continue the given prompt and show the result.
res = run_model(
    device,
    model,
    starting_seq='A great and advanced society has ',
    size=250,
)
print(res)

Uncomment to save the model:

In [None]:
#torch.save(model.state_dict(), "./models/lstm_gpu_256_2_1964560")

Some results:


---------- Epoch:80, Chars:164000, No of batches:8, Per batch:128, Layers:2, HiddenSize:256, Loss:1.07

a great and advanced society has care theres, and she had been and at the same time with her would be a seated, and he was all the shill was should down to see himself and looking of the conversation. “I thank of at yound there was a conservative, and I have no sint a minute. That I

---------- Epoch:80, Chars:164000, No of batches:8, Per batch:128, Layers:3, HiddenSize:256, Loss:1.15
a great and advanced society has of the same time them one of the said to her. He was as that she was something was a strong and strice all the same time the strain for the same to the same to the same to the same to the same to the same to the same to the same to the same to the sa

Conclusions:

Increase in number of layers does not always help.