In [1]:
import requests
import os
import re
import random
import urllib.request

url_dict = {
    'shakespeare.txt': 'https://caltech-cs155.s3.us-east-2.amazonaws.com/miniprojects/project3/data/shakespeare.txt',
    'spenser.txt': 'https://caltech-cs155.s3.us-east-2.amazonaws.com/miniprojects/project3/data/spenser.txt',
    'syllable_dict.txt' : 'https://caltech-cs155.s3.us-east-2.amazonaws.com/miniprojects/project3/data/Syllable_dictionary.txt',
    'about_syllable_dict.docx' : 'https://caltech-cs155.s3.us-east-2.amazonaws.com/miniprojects/project3/data/syllable_dict_explanation.docx'
}

def download_file(file_path):
    url = url_dict[file_path]
    print('Start downloading...')
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(file_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024 * 1024):
                f.write(chunk)
    print('Complete')

download_file('shakespeare.txt')
download_file('spenser.txt')
download_file('syllable_dict.txt')
# download_file('about_syllable_dict.docx')

Start downloading...
Complete
Start downloading...
Complete
Start downloading...
Complete


In [2]:
import string
import torch
import numpy as np

shakespeare = open("shakespeare.txt", "r").read()
chars = set(shakespeare)
for i in range(0, 10):
    chars.remove(str(i))
chars = list(chars)
num_chars = len(chars)

def letterToIndex(letter):
    return chars.index(letter)

def charToTensor(char):
    tensor = torch.zeros(1, num_chars)
    tensor[0][letterToIndex(char)] = 1
    return tensor

def stringToTensor(line):
    tensor = torch.zeros(len(line), 1, num_chars)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

# Get a list of poem strings
def get_poems(txt_file):
    with open(txt_file, 'r') as file:
        text = file.read()
        poem_pattern = r'\s*(\d+)\s*(.*?)\s*(?=^\s*\d+|\Z)'  # chatGPT
        poem_matches = re.findall(poem_pattern, text, re.DOTALL | re.MULTILINE)
        poems = []
        for match in poem_matches:
            poems.append(match[1].strip())
        return poems

# get all training sequences from poem list
def get_training_seqs(poems, seq_length, jump=1): # jump > 1 gives semi-redundant sequences
    seqs = []
    for poem in poems:
        for i in range(0, len(poem), jump):
            seq = poem[i:(i + seq_length)]
            if len(seq) == seq_length:
                seqs.append(seq)
    return seqs

In [3]:
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM, self).__init__()

        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size)
        self.output = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        lstm_out, next_hidden = self.lstm(input, hidden)
        out = self.output(lstm_out)
        return out, next_hidden

    def initHidden(self):
        return (torch.zeros(1, self.hidden_size), torch.zeros(1, self.hidden_size))
    
n_hidden = 150
rnn = LSTM(num_chars, n_hidden, num_chars)

In [4]:
poems = get_poems("shakespeare.txt")
seqs = get_training_seqs(poems, 40 + 1, 5)

print(len(seqs))
print(seqs)

17687


In [5]:
criterion = nn.CrossEntropyLoss()
learning_rate = 0.005

def train(char_tensor, seq_tensor): # a tensor encoding of sequence and a tensor encoding of next character
    hidden = rnn.initHidden()

    rnn.zero_grad()

    for i in range(seq_tensor.size()[0]):
        output, hidden = rnn(seq_tensor[i], hidden)

    loss = criterion(output, char_tensor)
    loss.backward()

    # Add parameters' gradients to their values, multiplied by learning rate
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

    return output, loss.item()

In [6]:
def get_training_examples(seqs):
    training_examples = []
    for i in range(len(seqs)):
        seq = seqs[i]
        training_examples.append([charToTensor(seq[-1]), stringToTensor(seq[:-1])])

    return training_examples

In [7]:
n_epochs = 1
plot_every = 500
all_losses = []
total_loss = 0 # Reset every ``plot_every`` ``iters``

for iter in range(n_epochs):
    training_examples = get_training_examples(seqs)
    indices = np.random.permutation(len(training_examples))
    for i in range(len(indices)):
        output, loss = train(training_examples[indices[i]][0], training_examples[indices[i]][1])
        total_loss += loss

        if i % plot_every == 0:
            all_losses.append(total_loss / plot_every)
            print("Loss: " + str(total_loss / plot_every))
            total_loss = 0
            

Loss: 0.008350872039794921
Loss: 4.011581352233887
Loss: 3.8341926918029787
Loss: 3.613040846347809
Loss: 3.3827274339199067
Loss: 3.2844062798023224
Loss: 3.188705393075943
Loss: 3.2244098484516144
Loss: 3.175794376850128
Loss: 3.168929069519043
Loss: 3.2337820193767546
Loss: 3.053268565416336
Loss: 3.158014430999756
Loss: 3.0483018460273743
Loss: 3.1108730568885803
Loss: 3.077518979549408
Loss: 3.065531383037567
Loss: 3.1198864991664887
Loss: 3.1190319187641142
Loss: 3.1338529510498048
Loss: 3.1015466732978823
Loss: 3.1419379680156707
Loss: 3.0124370934963225
Loss: 3.1324969620704652
Loss: 3.085544051170349
Loss: 3.08259556889534
Loss: 3.131693261861801
Loss: 3.0790339760780334
Loss: 3.049594945430756
Loss: 3.1250244686603548
Loss: 3.101076033115387
Loss: 3.0002720007896424
Loss: 3.0909059627056124
Loss: 3.1138210577964784
Loss: 3.1325850002765656
Loss: 3.13677112698555


In [40]:
max_length = 100

# Sample from a category and starting letter
def sample(start_seed, temperature):
    with torch.no_grad():  # no need to track history in sampling
        input = stringToTensor(start_seed)
        hidden = rnn.initHidden()

        output_poem = start_seed

        for i in range(max_length):
            output, hidden = rnn(input[0], hidden)
            output = output / temperature
            s = nn.Softmax(len(output))
            probs = torch.squeeze(s(output))
            letter = np.random.choice(chars, p=np.array(probs))
            output_poem += letter
            seq = output_poem[-39:]
            input = stringToTensor(seq)

        return output_poem

In [45]:
start_seed = "Shall I compare thee to a summer's day?\n"
output = sample(start_seed[1:], 1) # since 39 characters input training data, remove first character for now
print(output)

hall I compare thee to a summer's day?
luh !dsasedatalrh R u e ot Gh rlrg  dtdmrh sy omtthd myottn ,y 
heThnBi i tbvmuasenosdv
tO t hRnurtt
