In [1]:
import torch
from torch import nn
import re
import random
import tqdm
import time

In [2]:
!wget https://s3.amazonaws.com/text-datasets/nietzsche.txt

--2023-06-11 11:40:57--  https://s3.amazonaws.com/text-datasets/nietzsche.txt
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.133.216, 52.217.199.0, 52.216.221.176, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.133.216|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 600901 (587K) [text/plain]
Saving to: ‘nietzsche.txt’


2023-06-11 11:40:58 (1015 KB/s) - ‘nietzsche.txt’ saved [600901/600901]



In [3]:
with open('nietzsche.txt', encoding='utf-8') as f:
    text = f.read().lower()
print('length:', len(text))
text = re.sub('[^a-z ]', ' ', text)
text = re.sub('\s+', ' ', text)

length: 600893


In [4]:
text[:100]

'preface supposing that truth is a woman what then is there not ground for suspecting that all philos'

In [5]:
INDEX_TO_CHAR = sorted(list(set(text)))
CHAR_TO_INDEX = {c: i for i, c in enumerate(INDEX_TO_CHAR)}

In [6]:
CHAR_TO_INDEX

{' ': 0,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26}

In [7]:
MAX_LEN = 40
STEP = 3
SENTENCES = []
NEXT_CHARS = []
for i in range(0, len(text) - MAX_LEN, STEP):
    SENTENCES.append(text[i: i + MAX_LEN])
    NEXT_CHARS.append(text[i + MAX_LEN])
print('Num sents:', len(SENTENCES))

Num sents: 193075


In [10]:
print('Vectorization...')
X = torch.zeros((len(SENTENCES), MAX_LEN), dtype=int)
Y = torch.zeros((len(SENTENCES)), dtype=int)
for i, sentence in enumerate(SENTENCES):
    for t, char in enumerate(sentence):
        X[i, t] = CHAR_TO_INDEX[char]
    Y[i] = CHAR_TO_INDEX[NEXT_CHARS[i]]

Vectorization...


In [11]:
X[0:1], Y[0]

(tensor([[16, 18,  5,  6,  1,  3,  5,  0, 19, 21, 16, 16, 15, 19,  9, 14,  7,  0,
          20,  8,  1, 20,  0, 20, 18, 21, 20,  8,  0,  9, 19,  0,  1,  0, 23, 15,
          13,  1, 14,  0]]),
 tensor(23))

In [12]:
BATCH_SIZE=512
dataset = torch.utils.data.TensorDataset(X, Y)
data = torch.utils.data.DataLoader(dataset, BATCH_SIZE, shuffle=True)


In [13]:
class NeuralNetwork(nn.Module):
    def __init__(self, rnnClass, dictionary_size, embedding_size, num_hiddens, num_classes):
        super().__init__()
        
        self.num_hiddens = num_hiddens
        self.embedding = nn.Embedding(dictionary_size, embedding_size)
        self.hidden = rnnClass(embedding_size, num_hiddens, batch_first=True)
        self.output = nn.Linear(num_hiddens, num_classes)
        
    def forward(self, X):
        out = self.embedding(X)
        _, state = self.hidden(out)
        predictions = self.output(state[0].squeeze())
        return predictions

In [14]:
model = NeuralNetwork(nn.GRU, len(CHAR_TO_INDEX), 64, 128, len(CHAR_TO_INDEX))

In [15]:
X.shape

torch.Size([193075, 40])

In [16]:
model(X[0:1])

tensor([ 0.0480,  0.1568,  0.1112,  0.0175,  0.0107,  0.1489,  0.1690,  0.1694,
         0.0486,  0.0159, -0.1735, -0.0015,  0.1923, -0.1247,  0.0147,  0.0537,
         0.1016,  0.0382,  0.1405,  0.0244,  0.0070, -0.0086,  0.2117, -0.1352,
         0.0935,  0.1106, -0.1326], grad_fn=<AddBackward0>)

In [22]:
embedding = nn.Embedding(len(INDEX_TO_CHAR), 15)
rnn = nn.LSTM(15,128, batch_first=True)

In [23]:
o, s = rnn(embedding(X[0:10]))
o.shape, len(s), s[0].shape, s[1].shape

(torch.Size([10, 40, 128]),
 2,
 torch.Size([1, 10, 128]),
 torch.Size([1, 10, 128]))

In [24]:
rnn = nn.GRU(15,128, batch_first=True)
o, s = rnn(embedding(X[0:10]))
o.shape, len(s), s[0].shape

(torch.Size([10, 40, 128]), 1, torch.Size([10, 128]))

In [20]:
o, s = rnn(embedding(X[0:10]))

In [21]:
o.shape, s[0].shape, s[1].shape

IndexError: index 1 is out of bounds for dimension 0 with size 1

In [25]:
model = model.cuda()

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [59]:
def sample(preds):
    softmaxed = torch.softmax(preds, 0)
    probas = torch.distributions.multinomial.Multinomial(1, softmaxed).sample()
    return probas.argmax()

def generate_text():
    start_index = random.randint(0, len(text) - MAX_LEN - 1)

    generated = ''
    sentence = text[start_index: start_index + MAX_LEN]
    generated += sentence

    for i in range(MAX_LEN):
        x_pred = torch.zeros((1, MAX_LEN), dtype=int)
        for t, char in enumerate(generated[-MAX_LEN:]):
            x_pred[0, t] = CHAR_TO_INDEX[char]

        preds = model(x_pred)
        next_char = INDEX_TO_CHAR[sample(preds)]
        generated = generated + next_char

    print(generated[:MAX_LEN] + '|' + generated[MAX_LEN:])

In [60]:
a = torch.Tensor([51,50,1,49,7])

In [89]:
p = []
for i in a :
  p.append(torch.exp(i)/torch.sum(torch.exp(a)))

In [95]:
p = torch.FloatTensor(p)

In [96]:
torch.distributions.multinomial.Multinomial (50,p).sample()

tensor([32., 10.,  0.,  8.,  0.])

In [93]:
 multinomial(1, p)

TypeError: 'int' object is not iterable

In [97]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [None]:
for ep in range(100):
    start = time.time()
    train_loss = 0.
    train_passed = 0

    model.train()
    for X_b, y_b in data:
    
        optimizer.zero_grad()
        answers = model(X_b)
        loss = criterion(answers, y_b)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, train_loss / train_passed))
    model.eval()
    generate_text()

Epoch 0. Time: 21.569, Train loss: 1.307
y seductive atmosphere of the moral maxi|sm and thys towards soliteness to self s
Epoch 1. Time: 22.089, Train loss: 1.291
es not generally consist in their confli|ct just a demparous heself before eye an
Epoch 2. Time: 23.706, Train loss: 1.278
that unscrupulous enthusiast for big han|d that the human who are conscience of t
Epoch 3. Time: 22.989, Train loss: 1.266
ns and peoples must not be estimated by |in recourse to most used involved abits 
Epoch 4. Time: 22.614, Train loss: 1.256
ation but in so far as it is based on be|trouthess brought the generally nay and 
Epoch 5. Time: 23.213, Train loss: 1.245
 rather do others afford the high strung| sets he will do causes certain the init
Epoch 6. Time: 22.711, Train loss: 1.236
ith perfect honesty on the subject of th|eir most enchame as it is they partice i
Epoch 7. Time: 21.736, Train loss: 1.227
 the profoundest antagonism and the nece|ssity compassible are reto it for the ru
Epoch 8. Time: 2