In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import os

In [2]:
quotes_dir = 'quotes/'
quotes = []
# os.listdir() returns entries in arbitrary, platform-dependent order. Sorting
# makes the order in which files are concatenated (and therefore the training
# sequence built from them) the same on every run.
for filename in sorted(os.listdir(quotes_dir)):
    if filename.endswith(".txt"):
        # Read explicitly as UTF-8 rather than relying on the platform's
        # default encoding, which varies between systems.
        with open(os.path.join(quotes_dir, filename), 'r', encoding='utf-8') as file:
            quotes.append(file.read())

In [3]:
# Join every loaded quote into one whitespace-separated corpus string.
text_corpus = " ".join(quotes)

# Whitespace tokenisation; the vocabulary is the sorted set of distinct tokens.
words = text_corpus.split()
vocab = sorted(set(words))

# Two-way lookup tables between a token and its position in `vocab`.
idx_to_word = dict(enumerate(vocab))
word_to_idx = {token: position for position, token in idx_to_word.items()}

# The corpus expressed as a list of vocabulary indices, in original word order.
input_text = list(map(word_to_idx.__getitem__, words))

In [4]:
class TextGenerationModel(nn.Module):
    """LSTM followed by a linear layer that maps each hidden state to vocabulary logits.

    Args:
        vocab_size: Size of the last dimension of the input (the LSTM's
            ``input_size``) and of the output logits.
        hidden_size: Number of features in the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, hidden_size, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(vocab_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.hidden = hidden_size
        # Kept so init_hidden() can build a state matching the LSTM's depth.
        self.num_layers = num_layers

    def forward(self, x, hidden):
        """Run the LSTM over ``x`` and project every time step to logits.

        Args:
            x: Input tensor laid out batch-first (the LSTM is built with
                ``batch_first=True``); its last dimension must be ``vocab_size``.
            hidden: ``(h_0, c_0)`` tuple, e.g. as returned by ``init_hidden``.

        Returns:
            ``(logits, hidden)`` where ``logits`` has ``vocab_size`` as its last
            dimension and ``hidden`` is the LSTM's final ``(h_n, c_n)`` tuple.
        """
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero ``(h_0, c_0)`` pair for a batch of ``batch_size``.

        Each tensor has shape ``(num_layers, batch_size, hidden_size)``, which is
        what ``nn.LSTM`` expects for a unidirectional network. The tensors are
        created on the same device and with the same dtype as the model weights
        so they stay valid after ``model.to(...)``.
        """
        reference = self.fc.weight
        shape = (self.num_layers, batch_size, self.hidden)
        return (torch.zeros(shape, dtype=reference.dtype, device=reference.device),
                torch.zeros(shape, dtype=reference.dtype, device=reference.device))

In [5]:
# Training parameters
hidden_size = 128
num_layers = 1
learning_rate = 0.001
num_epochs = 500
seed = 0  # fixed so weight initialisation (and the loss curve) is repeatable

# Next-word prediction needs at least one (input, target) pair.
if len(input_text) < 2:
    raise ValueError("The corpus must contain at least two words to train on")

torch.manual_seed(seed)
model = TextGenerationModel(len(vocab), hidden_size, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# The whole corpus is fed as a single sequence in a batch of one. Inputs are
# the one-hot rows for every word but the last; targets are the indices of the
# following words. Neither changes between epochs, so build them once here
# instead of re-allocating a vocab-by-vocab identity matrix on every iteration.
input_seq = torch.eye(len(vocab))[input_text[:-1]].unsqueeze(0)
target_seq = torch.tensor(input_text[1:])

# Training loop
model.train()
for epoch in range(num_epochs):
    # Start every pass over the corpus from a zero state.
    hidden = model.init_hidden(1)
    optimizer.zero_grad()

    output, hidden = model(input_seq, hidden)
    # Drop the batch dimension so the logits line up with target_seq.
    loss = criterion(output.squeeze(0), target_seq)
    loss.backward()
    optimizer.step()

    if epoch % 50 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

Epoch 0, Loss: 3.8143272399902344
Epoch 50, Loss: 2.5968031883239746
Epoch 100, Loss: 1.53169846534729
Epoch 150, Loss: 0.9912874698638916
Epoch 200, Loss: 0.6523985862731934
Epoch 250, Loss: 0.417743057012558
Epoch 300, Loss: 0.25964266061782837
Epoch 350, Loss: 0.1595037430524826
Epoch 400, Loss: 0.10075387358665466
Epoch 450, Loss: 0.06690414994955063


In [7]:
start_str = "she"
num_generate = 10  # Number of words to generate

# Validate the prompt up front so a bad seed fails with a readable message
# rather than a bare KeyError or a tensor-indexing error.
seed_words = start_str.split()
if not seed_words:
    raise ValueError("start_str must contain at least one word")
unknown_words = [word for word in seed_words if word not in word_to_idx]
if unknown_words:
    raise ValueError(f"Words not in the training vocabulary: {unknown_words}")

# Identity matrix used as a one-hot lookup table; built once and reused for
# every generated word.
one_hot = torch.eye(len(vocab))

input_seq = torch.tensor([word_to_idx[word] for word in seed_words])
input_seq = one_hot[input_seq].unsqueeze(0)

hidden = model.init_hidden(1)
generated_words = [start_str]

# Inference only: disable autograd so the hidden state carried from step to
# step does not keep a growing computation graph alive.
model.eval()
with torch.no_grad():
    for _ in range(num_generate):
        output, hidden = model(input_seq, hidden)

        # Greedy decoding: take the highest-scoring word at the last time step.
        last_output = output[:, -1, :]
        next_word_idx = torch.argmax(last_output).item()

        generated_words.append(idx_to_word[next_word_idx])

        # Feed only the newly chosen word back in; context lives in `hidden`.
        input_seq = one_hot[torch.tensor([next_word_idx])].unsqueeze(0)

generated_text = " ".join(generated_words)
print(generated_text)

she herself once blamed me Kyprogeneia because I prayed this word:
