In [None]:
import torch
from torch import nn
import re
import random
import tqdm
import time

In [None]:
!wget https://s3.amazonaws.com/text-datasets/nietzsche.txt

--2022-06-09 17:18:34--  https://s3.amazonaws.com/text-datasets/nietzsche.txt
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.17.59
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.17.59|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 600901 (587K) [text/plain]
Saving to: ‘nietzsche.txt’


2022-06-09 17:18:34 (15.8 MB/s) - ‘nietzsche.txt’ saved [600901/600901]



In [None]:
# Load the corpus and normalise it to lowercase letters separated by single spaces.
with open('nietzsche.txt', encoding='utf-8') as f:
    text = f.read().lower()
print('length:', len(text))  # length of the raw text, before cleaning
# Replace every character that is not a-z or a space with a space ...
text = re.sub(r'[^a-z ]', ' ', text)
# ... then collapse whitespace runs into one space. Raw strings are used for the
# patterns because '\s' in a plain literal is an invalid escape sequence.
text = re.sub(r'\s+', ' ', text)

length: 600893


In [None]:
# Preview the first 100 characters of the cleaned text.
text[:100]

'preface supposing that truth is a woman what then is there not ground for suspecting that all philos'

In [None]:
# Vocabulary: the sorted distinct characters of `text`, and the inverse
# character -> index lookup built from the same ordering.
INDEX_TO_CHAR = sorted(set(text))
CHAR_TO_INDEX = dict(zip(INDEX_TO_CHAR, range(len(INDEX_TO_CHAR))))

In [None]:
# Display the character -> index mapping.
CHAR_TO_INDEX

{' ': 0,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26}

In [None]:
# Cut the text into overlapping windows: every STEP characters take a window of
# MAX_LEN characters as the input and the character right after it as the target.
MAX_LEN = 40
STEP = 3
window_starts = range(0, len(text) - MAX_LEN, STEP)
SENTENCES = [text[begin: begin + MAX_LEN] for begin in window_starts]
NEXT_CHARS = [text[begin + MAX_LEN] for begin in window_starts]
print('Num sents:', len(SENTENCES))

Num sents: 193075


In [None]:
print('Vectorization...')
# Encode every window / target character as its vocabulary index. The tensors
# are built from nested lists in one call instead of writing each element into
# a pre-allocated tensor from a Python loop, which is far slower for the same
# result. The reshape pins the (num_windows, MAX_LEN) shape even when SENTENCES
# is empty.
X = torch.tensor(
    [[CHAR_TO_INDEX[char] for char in sentence] for sentence in SENTENCES],
    dtype=torch.long,
).reshape(len(SENTENCES), MAX_LEN)
Y = torch.tensor([CHAR_TO_INDEX[char] for char in NEXT_CHARS], dtype=torch.long)

Vectorization...


In [None]:
# Inspect the first encoded window (kept as a batch of one) and its target index.
X[0:1], Y[0]

(tensor([[16, 18,  5,  6,  1,  3,  5,  0, 19, 21, 16, 16, 15, 19,  9, 14,  7,  0,
          20,  8,  1, 20,  0, 20, 18, 21, 20,  8,  0,  9, 19,  0,  1,  0, 23, 15,
          13,  1, 14,  0]]), tensor(23))

In [None]:
# Wrap the encoded windows and targets in a dataset and serve them as shuffled
# mini-batches of BATCH_SIZE samples.
BATCH_SIZE = 512
dataset = torch.utils.data.TensorDataset(X, Y)
data = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)


In [None]:
class NeuralNetwork(nn.Module):
    """Sequence classifier: embedding -> recurrent layer -> linear read-out.

    Args:
        rnnClass: recurrent layer class, instantiated as
            ``rnnClass(embedding_size, num_hiddens, batch_first=True)``
            (for example ``nn.GRU``, ``nn.LSTM`` or ``nn.RNN``).
        dictionary_size: number of rows of the embedding table.
        embedding_size: dimension of each embedding vector.
        num_hiddens: hidden size of the recurrent layer.
        num_classes: number of logits produced per sample.
    """

    def __init__(self, rnnClass, dictionary_size, embedding_size, num_hiddens, num_classes):
        super().__init__()

        self.num_hiddens = num_hiddens
        self.embedding = nn.Embedding(dictionary_size, embedding_size)
        self.hidden = rnnClass(embedding_size, num_hiddens, batch_first=True)
        self.output = nn.Linear(num_hiddens, num_classes)

    def forward(self, X):
        """Map integer indices ``X`` of shape (batch, seq_len) to logits.

        Returns:
            Tensor of shape (batch, num_classes). The batch axis is always kept,
            including for a batch of one, so ``model(x)[0]`` is the logit vector
            of the first sample.
        """
        out = self.embedding(X)
        _, state = self.hidden(out)
        # nn.LSTM returns an (h_n, c_n) tuple, while nn.GRU / nn.RNN return h_n
        # directly; normalise to h_n so every rnnClass is handled the same way.
        if isinstance(state, tuple):
            state = state[0]
        # h_n is (num_layers * num_directions, batch, num_hiddens). Index the
        # leading axis instead of calling squeeze(): squeeze() would also drop
        # the batch axis whenever the batch holds a single sample.
        predictions = self.output(state[-1])
        return predictions

In [None]:
# GRU-based character model: 64-dimensional embeddings, 128 hidden units, and
# one output logit per vocabulary character.
model = NeuralNetwork(
    rnnClass=nn.GRU,
    dictionary_size=len(CHAR_TO_INDEX),
    embedding_size=64,
    num_hiddens=128,
    num_classes=len(CHAR_TO_INDEX),
)

In [None]:
# Shape of the encoded inputs: (number of windows, MAX_LEN).
X.shape

torch.Size([193075, 40])

In [None]:
# Smoke test: forward a batch holding only the first window through the untrained model.
model(X[0:1])

tensor([-0.0107,  0.0403,  0.2283,  0.0933,  0.1282, -0.0314, -0.0694, -0.0393,
        -0.1577,  0.0457, -0.0176, -0.1382, -0.1015,  0.0507,  0.0250, -0.1342,
         0.2036,  0.0862,  0.1456,  0.1420,  0.2797,  0.0540, -0.1520,  0.0869,
        -0.1288,  0.0291,  0.0249], grad_fn=<AddBackward0>)

In [None]:
# Stand-alone embedding and LSTM layers, used in the following cells to inspect
# what a recurrent layer returns.
embedding = nn.Embedding(num_embeddings=len(INDEX_TO_CHAR), embedding_dim=15)
rnn = nn.LSTM(input_size=15, hidden_size=128, batch_first=True)

In [None]:
# Run the first 10 windows through embedding + rnn, then look at the shapes of
# the returned output `o` and of the two entries of the returned state `s`.
o, s = rnn(embedding(X[0:10]))
o.shape, len(s), s[0].shape, s[1].shape

(torch.Size([10, 40, 128]),
 2,
 torch.Size([1, 10, 128]),
 torch.Size([1, 10, 128]))

In [None]:
# Same probe with a GRU layer replacing `rnn`.
# NOTE(review): nn.GRU returns its state as a single h_n tensor rather than an
# (h_n, c_n) tuple, so len(s) and s[0] below index into that tensor's first
# axis — confirm this is the comparison intended.
rnn = nn.GRU(15,128, batch_first=True)
o, s = rnn(embedding(X[0:10]))
o.shape, len(s), s[0].shape

In [None]:
# Forward the first 10 windows through the current `rnn` again, refreshing `o` and `s`.
o, s = rnn(embedding(X[0:10]))

In [None]:
# `s` is an (h_n, c_n) tuple for nn.LSTM but a bare h_n tensor for nn.GRU / nn.RNN.
# Indexing s[1] on a single-layer GRU state raises IndexError, so normalise the
# state to a tuple of tensors first and report the shape of each one.
state_tensors = s if isinstance(s, tuple) else (s,)
(o.shape, *(part.shape for part in state_tensors))

(torch.Size([10, 40, 128]), torch.Size([1, 10, 128]), torch.Size([1, 10, 128]))

In [None]:
# Move the model to the GPU when one is available; otherwise leave it on the
# CPU rather than failing on an unconditional .cuda() call.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def sample(preds):
    """Draw one class index at random from a 1-D tensor of logits.

    The logits are turned into probabilities with a softmax over dim 0, a single
    multinomial trial is drawn (a one-hot count vector), and the position of the
    drawn class is returned as a 0-dim tensor.
    """
    class_probs = preds.softmax(dim=0)
    trial = torch.distributions.multinomial.Multinomial(total_count=1, probs=class_probs)
    one_hot = trial.sample()
    return torch.argmax(one_hot)

def generate_text():
    """Print a random MAX_LEN-character seed from `text` and MAX_LEN sampled characters.

    The seed and the generated continuation are printed on one line, separated
    by '|'. Each new character is sampled from the model's prediction for the
    last MAX_LEN characters produced so far. Nothing is returned.
    """
    start_index = random.randint(0, len(text) - MAX_LEN - 1)
    generated = text[start_index: start_index + MAX_LEN]

    # Feed inputs on whatever device the model's weights live on.
    device = next(model.parameters()).device

    # Pure inference: no autograd graph is needed for sampling.
    with torch.no_grad():
        for _ in range(MAX_LEN):
            window = generated[-MAX_LEN:]
            x_pred = torch.tensor(
                [[CHAR_TO_INDEX[char] for char in window]],
                dtype=torch.long,
                device=device,
            )
            # Flatten the single-sample prediction to a 1-D logit vector. This
            # holds whether or not the model keeps the batch axis for a batch of
            # one; indexing [0] on an already squeezed output yields a scalar.
            preds = model(x_pred).reshape(-1).cpu()
            generated += INDEX_TO_CHAR[sample(preds)]

    print(generated[:MAX_LEN] + '|' + generated[MAX_LEN:])

In [None]:
# Baseline: sample a continuation before any training has been done.
generate_text()

ts that they made interpreters for thems|jljxwhuo gprkpkuuknklypooxttqbtnhkzetlhc


In [None]:
# Training setup: Adam with its default hyper-parameters over all model weights,
# and cross-entropy between the predicted logits and the target character index.
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [None]:
# Train for 100 epochs; after each epoch report the wall-clock time and mean
# batch loss, then print a sampled continuation to eyeball progress.
# Batches are sent to the device the model's weights are on, instead of a
# hard-coded .cuda(), so the loop follows wherever the model was placed.
device = next(model.parameters()).device
for ep in range(100):
    start = time.time()
    train_loss = 0.   # sum of per-batch losses in this epoch
    train_passed = 0  # number of batches processed in this epoch

    model.train()
    for X_b, y_b in data:
        X_b, y_b = X_b.to(device), y_b.to(device)
        optimizer.zero_grad()
        answers = model(X_b)
        loss = criterion(answers, y_b)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, train_loss / train_passed))
    model.eval()
    generate_text()

Epoch 0. Time: 7.204, Train loss: 2.104
gentle good hearted weak willed and poet|souch insaring inow oprays ham in hisgil
Epoch 1. Time: 7.028, Train loss: 1.713
 and evil only that here and there perha|nder of wan their action alvess mad dirs
Epoch 2. Time: 7.151, Train loss: 1.580
ty as it is it resembles a well stacked |ant a hatiric light and doisterining if 
Epoch 3. Time: 7.156, Train loss: 1.502
 hands glances and delicate follies our |thinds of a do all one oneselves the bar
Epoch 4. Time: 7.101, Train loss: 1.449
stincts how jesuitical that amiable and |would comon of an old indivesking aying 
Epoch 5. Time: 7.035, Train loss: 1.409
haps serve this age as its mirror and se|lf not did ruell the perhosequend not au
Epoch 6. Time: 7.087, Train loss: 1.378
kin as the bones flesh entrails and bloo|d skeld parting three soching they sense
Epoch 7. Time: 7.121, Train loss: 1.353
that the one who is guilty of them towar| to the betseas of a prcesuits with ordi
Epoch 8. Time: 7.106, Tr