Based on: https://blog.floydhub.com/long-short-term-memory-from-zero-to-hero-with-pytorch/

In [None]:
import torch
from torch import nn
import numpy as np
from timeit import default_timer as timer

In [None]:
no_of_chars = 32000

# Read the corpus, lowercase it to shrink the vocabulary, and keep only the
# first `no_of_chars` characters.
with open('text', 'r') as fd:
    full_text = fd.read().lower()
full_text = full_text[0:no_of_chars]

vocab = set(full_text)
# Enumerate the characters in sorted order so the char <-> index mapping is the
# same on every run. Iterating a bare set of strings depends on per-process
# hash randomization, which would silently change what each index means.
int2char = dict(enumerate(sorted(vocab)))
char2int = {char: ind for ind, char in int2char.items()}
vocab_size = len(char2int)
print("Vocabulary size:", vocab_size)

See how the LSTM layer works (tensor sizes):

In [None]:
# Toy dimensions, kept small so the printed tensors are easy to read.
no_of_layers = 1
batch_size = 2
hidden_size = 3
input_size = 3
seq_size = 5

# batch_first=True: inputs are laid out as (batch, sequence, feature).
lstm = nn.LSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=no_of_layers,
    batch_first=True,
)

inp = torch.randn(batch_size, seq_size, input_size)
print("Input:", inp)

# The hidden and cell states share one shape: (layers, batch, hidden).
state_shape = (no_of_layers, batch_size, hidden_size)
hidden_state = torch.randn(state_shape)
cell_state = torch.randn(state_shape)
print("Hidden:", hidden_state)
print("Cell:", cell_state)

# The layer returns the per-step outputs plus the updated (hidden, cell) pair.
initial_state = (hidden_state, cell_state)
out, full = lstm(inp, initial_state)
print("Full:", full)

The LSTM model.

Unlike a vanilla RNN, an LSTM has two states: the hidden state (short-term memory) and the cell state (long-term memory).
Together they are referred to here as `full_hidden`.

Below is a simple model consisting of an LSTM layer and a fully connected layer. Later we'll add dropout.

In [None]:
class ModelLSTM(nn.Module):
    """Character-level model: a stacked LSTM followed by a linear layer.

    The linear layer maps every LSTM output step back to ``input_size``
    scores, so the output width equals the input width.
    """

    def __init__(self, input_size, hidden_size, n_layers):
        """Build the layers.

        Args:
            input_size: Width of each input vector; also used as the width
                of the output scores.
            hidden_size: Number of features in the LSTM hidden/cell state.
            n_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        output_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.lstm = nn.LSTM(input_size, hidden_size, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, full_hidden):
        """Run the LSTM and project every output step to output scores.

        Args:
            x: Batch-first input tensor passed straight to the LSTM.
            full_hidden: ``(hidden, cell_state)`` tuple for the LSTM.

        Returns:
            ``(out, full_hidden)`` where ``out`` has one row per
            (example, time step) pair and ``full_hidden`` is the updated
            state tuple returned by the LSTM.
        """
        out, full_hidden = self.lstm(x, full_hidden)
        # Flatten batch and time into one axis so the linear layer scores
        # each step independently.
        out = out.contiguous().view(-1, self.hidden_size)
        out = self.fc(out)
        return out, full_hidden

    def _random_states(self, batch_size, device=None):
        """Draw a random (hidden, cell_state) pair, optionally moved to ``device``."""
        shape = (self.n_layers, batch_size, self.hidden_size)
        hidden = torch.randn(shape)
        cell_state = torch.randn(shape)
        if device is not None:
            hidden = hidden.to(device)
            cell_state = cell_state.to(device)
        return (hidden, cell_state)

    def init_full_hidden(self, batch_size):
        """Return a random ``(hidden, cell_state)`` pair on the CPU.

        Each tensor has shape ``(n_layers, batch_size, hidden_size)``.
        """
        return self._random_states(batch_size)

    def init_full_hidden_cuda(self, batch_size):
        """Return a random ``(hidden, cell_state)`` pair on the model's device.

        The device is taken from the model's own parameters rather than from
        a module-level ``device`` variable, so the method works regardless of
        which notebook cells have been executed and always matches the
        device the weights live on.
        """
        model_device = next(self.parameters()).device
        return self._random_states(batch_size, model_device)

Some helper functions:

In [None]:
def split_eq(text, no):
    """Split ``text`` into exactly ``no`` consecutive chunks of equal length.

    Each chunk is ``len(text) // no`` items long; any tail that does not fill
    a whole chunk is dropped.

    Args:
        text: Sequence to split (a string in this notebook).
        no: Number of chunks to produce.

    Returns:
        A list with exactly ``no`` slices of ``text``.

    Raises:
        ValueError: If ``no`` is not positive or is larger than ``len(text)``
            (the chunks would be empty).
    """
    if no < 1 or no > len(text):
        raise ValueError(
            "cannot split {} items into {} non-empty parts".format(len(text), no)
        )
    cnt = len(text) // no
    # Index by chunk number rather than stepping to the end of the text: the
    # remainder can hold several extra full-length chunks, and all of them
    # must be dropped to return exactly `no` chunks.
    return [text[i * cnt:(i + 1) * cnt] for i in range(no)]

def produce_targets(examples):
    """Build next-item prediction pairs from ``examples``.

    For every example the input is the example without its last item and the
    target is the example without its first item, so target position ``k``
    holds the item that follows input position ``k``.

    Returns:
        ``(inputs, targets)``: two lists parallel to ``examples``.
    """
    without_last = []
    without_first = []
    for sample in examples:
        without_last.append(sample[:-1])
        without_first.append(sample[1:])
    return without_last, without_first

def translate_to_int(examples):
    """Replace every character of every example with its ``char2int`` index.

    Returns a list of integer lists, one per example. A character missing
    from ``char2int`` raises ``KeyError``.
    """
    return [[char2int[ch] for ch in sample] for sample in examples]

def translate_to_char(examples):
    """Turn every list of indices back into a string using ``int2char``.

    Returns a list of strings, one per example.
    """
    decoded = []
    for sample in examples:
        decoded.append(''.join(int2char[idx] for idx in sample))
    return decoded

def one_hot_encode(examples, dict_size=None):
    """One-hot encode a batch of integer sequences.

    Args:
        examples: List of equally long lists of integer indices. The length
            of the first example sets the sequence axis of the result.
        dict_size: Size of the one-hot axis. Defaults to ``len(char2int)``.

    Returns:
        A ``float32`` array of shape
        ``(len(examples), len(examples[0]), dict_size)`` with a single 1 at
        ``[i, pos, examples[i][pos]]`` for every position. An empty batch
        yields an array with zero-length batch and sequence axes.
    """
    if dict_size is None:
        dict_size = len(char2int)
    seq_len = len(examples[0]) if examples else 0
    features = np.zeros((len(examples), seq_len, dict_size), dtype=np.float32)

    for i, example in enumerate(examples):
        # Encode every position, including the last one. Skipping it would
        # leave an all-zero vector at the final time step, so the model would
        # never see the last character of a sequence.
        for pos, char_ind in enumerate(example):
            features[i, pos, char_ind] = 1
    return features

def to_model_format(inputs):
    """Convert raw text into the one-hot tensor the model consumes.

    Args:
        inputs: A single string (treated as a batch of one) or a list of
            strings.

    Returns:
        A tensor created from the one-hot encoding of the index-translated
        inputs.
    """
    batch = [inputs] if isinstance(inputs, str) else inputs
    one_hot = one_hot_encode(translate_to_int(batch))
    return torch.from_numpy(one_hot)

In [None]:
# configuration
no_of_examples = 32
batch_size = examples_per_batch = 16
n_epochs = 300
lr = 0.0048

no_of_batches = no_of_examples // examples_per_batch

# Cut the corpus into equal examples, derive next-character targets, and
# translate both sides to integer indices.
examples = split_eq(full_text, no_of_examples)
chars_per_example = len(examples[0])
inputs, targets = produce_targets(examples)
trans_inputs = translate_to_int(inputs)
trans_targets = translate_to_int(targets)

batches = []

# Each batch is an (input, target) pair: inputs are one-hot encoded, targets
# stay as index values (held in a float tensor; cast to long at loss time).
for i in range(no_of_batches):
    batch_slice = slice(i * examples_per_batch, (i + 1) * examples_per_batch)
    input_seq = one_hot_encode(trans_inputs[batch_slice])
    target_seq = torch.Tensor(trans_targets[batch_slice])
    batches.append((torch.from_numpy(input_seq), target_seq))

print("No of examples/No of data parts:", no_of_examples)
print("No of batches:", no_of_batches)
print("Examples per batch:", examples_per_batch)
print("Chars per example:", chars_per_example)

Let's see what a single batch item looks like.

The target tensor has size (examples_per_batch, chars_per_example - 1).

In [None]:
# Unpack the first (input, target) pair and show the target tensor with its size.
inp, target = batches[0]
print(target, target.size())

In [None]:
# The vocabulary size is the model's input width (and therefore its output width).
dict_size = len(char2int)

model = ModelLSTM(input_size=dict_size, hidden_size=12, n_layers=3)

# Cross-entropy over the per-character scores, optimised with Adam at rate `lr`.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
epochs = 1300
counter = 0
print_every = 20

model.train()
for i in range(epochs):
    # `counter` is the 1-based epoch number.
    counter = i + 1
    for batch in batches:
        inp, target = batch
        # Every batch starts from a fresh random LSTM state.
        h = model.init_full_hidden(batch_size)
        model.zero_grad()
        output, h = model(inp, h)
        # The model emits one row per (example, step); flatten targets to match.
        flat_target = target.view(-1).long()
        loss = criterion(output, flat_target)
        loss.backward()
        optimizer.step()

    if counter % print_every != 0:
        continue
    # The reported loss is that of the last batch in the epoch.
    print("Epoch: {}/{}...".format(i+1, epochs),
          "Step: {}...".format(counter),
          "Loss: {:.6f}...".format(loss.item()))

In [None]:
def predict_next(model, full_hidden, input_string):
    """Predict the character that follows ``input_string``.

    Args:
        model: Model called as ``model(encoded_input, full_hidden)``.
        full_hidden: LSTM state tuple passed to the model.
        input_string: Text to feed to the model.

    Returns:
        ``(char, hidden)``: the highest-probability character for the last
        output row, and the state returned by the model.
    """
    scores, next_hidden = model(to_model_format(input_string), full_hidden)

    # choosing one with highest probability
    last_row_prob = nn.functional.softmax(scores[-1], dim=0).data
    _, best_ind = torch.max(last_row_prob, dim=0)
    return int2char[best_ind.item()], next_hidden


def run_model(model, starting_seq, size=50):
    """Generate ``size`` characters that continue ``starting_seq``.

    Args:
        model: Trained model; it is switched to evaluation mode.
        starting_seq: Seed text; it is lowercased before use.
        size: Number of characters to append.

    Returns:
        The lowercased seed followed by the generated characters.
    """
    model.eval()
    seq = starting_seq.lower()
    h = model.init_full_hidden(1)
    # Inference only: without no_grad the carried state `h` would keep the
    # autograd graph of every earlier step alive, growing memory per step.
    with torch.no_grad():
        for _ in range(size):
            # NOTE(review): the whole sequence is re-fed on every step while
            # `h` is also carried over from the previous step; confirm that
            # this double exposure to the prefix is intended.
            char, h = predict_next(model, h, seq)
            seq += char
    return seq

In [None]:
# Generate a continuation of the seed text (default length) with the CPU model.
res = run_model(model, 'character ')
print(res)

Some results for LSTM:

ep: 1300, chars: 32000, batches: 2, examples: 32 Out: character tarraat ttrlsc  tprlmaas ttrlsc  tprlmaas ttrlsc  Loss:1.719


With GPU:

In [None]:
# Report whether PyTorch can see a CUDA device.
is_cuda = torch.cuda.is_available()
print(is_cuda)

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# following cells still run instead of failing on the first `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Same setup as the CPU run but with a wider hidden state; the model is moved
# to `device` after the optimizer is created.
dict_size = len(char2int)
model = ModelLSTM(input_size=dict_size, hidden_size=36, n_layers=3)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
model.to(device)

In [None]:
epochs = 2000
counter = 0
print_every = 50

model.train()
for i in range(epochs):
    counter += 1
    for batch in batches:
        # `init_full_hidden_cuda` is a method of the model, not a free
        # function; calling it as `init_full_hidden_cuda(model, ...)` raises
        # NameError.
        h = model.init_full_hidden_cuda(batch_size)
        model.zero_grad()
        inp, target = batch
        # Batches are stored on the CPU; move them next to the model.
        inp, target = inp.to(device), target.to(device)
        output, h = model(inp, h)
        # The model emits one row per (example, step); flatten targets to match.
        loss = criterion(output, target.view(-1).long())
        loss.backward()
        optimizer.step()

    # The reported loss is that of the last batch in the epoch.
    if counter % print_every == 0:
        print("Epoch: {}/{}...".format(i+1, epochs),
              "Step: {}...".format(counter),
              "Loss: {:.6f}...".format(loss.item()))

In [None]:
def predict_next(device, model, full_hidden, input_string):
    """Predict the character that follows ``input_string`` on ``device``.

    Args:
        device: Device the encoded input is moved to before the forward pass.
        model: Model called as ``model(encoded_input, full_hidden)``.
        full_hidden: LSTM state tuple passed to the model.
        input_string: Text to feed to the model.

    Returns:
        ``(char, hidden)``: the highest-probability character for the last
        output row, and the state returned by the model.
    """
    model_input = to_model_format(input_string).to(device)
    scores, next_hidden = model(model_input, full_hidden)

    # choosing one with highest probability
    last_row_prob = nn.functional.softmax(scores[-1], dim=0).data
    _, best_ind = torch.max(last_row_prob, dim=0)
    return int2char[best_ind.item()], next_hidden


def run_model(device, model, starting_seq, size=50):
    """Generate ``size`` characters that continue ``starting_seq`` on ``device``.

    Args:
        device: Device the encoded inputs are moved to.
        model: Trained model; it is switched to evaluation mode.
        starting_seq: Seed text; it is lowercased before use.
        size: Number of characters to append.

    Returns:
        The lowercased seed followed by the generated characters.
    """
    model.eval()
    seq = starting_seq.lower()
    h = model.init_full_hidden_cuda(1)
    # Inference only: without no_grad the carried state `h` would keep the
    # autograd graph of every earlier step alive, growing memory per step.
    with torch.no_grad():
        for _ in range(size):
            # NOTE(review): the whole sequence is re-fed on every step while
            # `h` is also carried over from the previous step; confirm that
            # this double exposure to the prefix is intended.
            char, h = predict_next(device, model, h, seq)
            seq += char
    return seq

In [None]:
# Generate 50 characters continuing the seed text with the model on `device`.
res = run_model(device, model, 'character ', 50)
print(res)

Some results:

ep: 1500, chars: 32000, batches: 2, examples: 32 hidden:24 
Out: character asde ttrruus it wutt sglttn lfa  tgu  tge  ggnlps Loss:1.198

ep: 2000, chars: 32000, batches: 2, examples: 32 hidden:36 
Out: character sandlodgrrktions it sam yinssaotdirspaiettwdnnstt  Loss:0.744
