In [None]:
import requests

# Maps each local file name to the URL it is fetched from. download_file()
# looks its argument up in this table and writes the response to a local file
# with that same name.
url_dict = {
    'shakespeare.txt': 'https://caltech-cs155.s3.us-east-2.amazonaws.com/miniprojects/project3/data/shakespeare.txt',
    'spenser.txt': 'https://caltech-cs155.s3.us-east-2.amazonaws.com/miniprojects/project3/data/spenser.txt',
    'syllable_dict.txt' : 'https://caltech-cs155.s3.us-east-2.amazonaws.com/miniprojects/project3/data/Syllable_dictionary.txt',
    'about_syllable_dict.docx' : 'https://caltech-cs155.s3.us-east-2.amazonaws.com/miniprojects/project3/data/syllable_dict_explanation.docx'
}

def download_file(file_path, chunk_size=1024 * 1024, timeout=30):
    """Download the file registered under ``file_path`` in ``url_dict``.

    The response body is streamed to disk in ``chunk_size``-byte pieces, so
    the payload is never held in memory all at once.

    Args:
        file_path: Key of ``url_dict``; also the local path the downloaded
            bytes are written to.
        chunk_size: Number of bytes requested per streamed chunk
            (1 MiB by default).
        timeout: Timeout in seconds handed to ``requests.get`` so that an
            unresponsive server cannot block the notebook indefinitely.

    Raises:
        KeyError: If ``file_path`` is not a key of ``url_dict``.
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    url = url_dict[file_path]
    print('Start downloading...')
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(file_path, 'wb') as f:
            # A moderate chunk size keeps memory bounded; a 1 GiB chunk would
            # effectively read the whole response before writing anything.
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
    print('Complete')

# Fetch the four course files one after another.
for file_name in ('shakespeare.txt', 'spenser.txt',
                  'syllable_dict.txt', 'about_syllable_dict.docx'):
    download_file(file_name)

# Read both sonnet corpora into memory as UTF-8 text.
with open('shakespeare.txt', 'r', encoding='utf-8') as file:
    shakespeare_text = file.read()
with open('spenser.txt', 'r', encoding='utf-8') as file:
    spenser_text = file.read()

Start downloading...
Complete
Start downloading...
Complete
Start downloading...
Complete
Start downloading...
Complete


In [None]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import os
import re
import random
import urllib.request
np.random.seed(seed=123) # Do not change
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from matplotlib import animation
from matplotlib.animation import FuncAnimation
import torch

# Download the NLTK 'punkt' and 'punkt_tab' data packages (presumably the
# tokenizer data that word_tokenize relies on -- confirm against the NLTK
# version in use). Only needs to run once per environment.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
def tokenize_with_nltk(data):
    """Tokenize ``data`` line by line and return one token list per line.

    The text is split on blank lines and then on newlines. Lines that consist
    only of digits are skipped. Every other line is lower-cased and passed to
    ``word_tokenize``; characters outside letters, digits, whitespace and
    ``. , ! ? ; ' : -`` are stripped from each token, and tokens (or lines)
    that end up empty are dropped.

    Args:
        data: The raw text to tokenize.

    Returns:
        A list of lists of cleaned token strings, one inner list per
        retained line.
    """
    disallowed = r'[^A-Za-z0-9\s\.,!?;\':-]'
    line_tokens = []

    for stanza in data.split('\n\n'):
        for raw_line in stanza.split('\n'):
            text = raw_line.strip()
            if text.isdigit():
                # Digit-only lines are not part of the text to tokenize.
                continue

            scrubbed = (re.sub(disallowed, '', tok)
                        for tok in word_tokenize(text.lower()))
            kept = [tok for tok in scrubbed if tok]
            if kept:
                line_tokens.append(kept)

    return line_tokens

# Tokenize the corpus, then flatten it into a single space-separated string.
# NOTE(review): joining with " " discards the original line breaks, so "\n"
# never occurs in the training text even though it is added to the vocabulary
# below -- confirm this is intended.
tokenized_lines = tokenize_with_nltk(shakespeare_text)
cleaned_text = " ".join(" ".join(line) for line in tokenized_lines)

# Character vocabulary (space and newline are always included) and the
# two lookup tables between characters and integer ids.
chars = sorted(set(cleaned_text + " \n"))
char_to_idx = dict(zip(chars, range(len(chars))))
idx_to_char = dict(enumerate(chars))

# Encode the whole text once, then cut 40-character windows starting at every
# 5th position; each target window is its input window shifted by one
# character.
encoded_text = [char_to_idx[ch] for ch in cleaned_text]
window_starts = range(0, len(cleaned_text) - 40, 5)

input_sequences = torch.tensor(
    [encoded_text[start:start + 40] for start in window_starts],
    dtype=torch.long,
)
target_sequences = torch.tensor(
    [encoded_text[start + 1:start + 41] for start in window_starts],
    dtype=torch.long,
)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random

class LSTM(nn.Module):
    """LSTM followed by a linear layer that emits one logit vector per step.

    Args:
        input_size: Feature size of each input time step.
        output_size: Number of logits produced per time step.
        hidden_dim: Size of the LSTM hidden and cell states.
        n_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, output_size, hidden_dim=128, n_layers=1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.lstm = nn.LSTM(input_size, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)
        # Not applied in forward(), which returns raw logits; the attribute is
        # kept so existing code that references it keeps working.
        self.sm = nn.Softmax(dim=-1)

    def forward(self, x, hidden):
        """Run the LSTM over ``x`` and project every step to logits.

        Args:
            x: Batch-first input with three dimensions
                ``(batch, seq_len, input_size)``.
            hidden: Accepted for interface compatibility.
                NOTE(review): this argument is ignored -- a fresh zero state
                is created on every call, so no state is carried between
                calls. Callers in this notebook re-feed a full context window
                each time and therefore depend on this behaviour.

        Returns:
            A tuple ``(out, hidden)`` where ``out`` has shape
            ``(batch * seq_len, output_size)`` (unnormalised logits) and
            ``hidden`` is the ``(h_n, c_n)`` pair returned by the LSTM.
        """
        batch_size, _, _ = x.shape
        hidden = self.init_hidden(batch_size)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out)
        # Flatten batch and time so the result lines up with flattened targets.
        out = out.reshape(-1, out.shape[-1])
        return out, hidden

    def init_hidden(self, batch_size):
        """Return zero-filled ``(h_0, c_0)`` states for ``batch_size`` sequences.

        The tensors are allocated with the dtype and device of the model's
        own weights, so the states stay compatible after ``.to(device)`` or
        ``.double()`` has been applied to the model.
        """
        shape = (self.n_layers, batch_size, self.hidden_dim)
        reference = self.fc.weight
        return reference.new_zeros(shape), reference.new_zeros(shape)


In [None]:
# Inputs are one-hot vectors over the character vocabulary and the model
# predicts a distribution over the same vocabulary, so both sizes equal the
# number of distinct characters.
input_size = len(char_to_idx)
hidden_size = 128
output_size = len(char_to_idx)
n_layers = 1

model = LSTM(input_size, output_size, hidden_size, n_layers)

# The model's forward() returns unnormalised logits, which is what
# CrossEntropyLoss takes together with integer class targets.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.006)

In [None]:
def one_hot_encode(sequences, num_classes):
    """One-hot encode a 2-D tensor of class indices.

    Args:
        sequences: Integer tensor of shape ``(batch_size, seq_len)`` whose
            entries are class indices below ``num_classes``.
        num_classes: Size of the one-hot dimension.

    Returns:
        A floating-point tensor of shape
        ``(batch_size, seq_len, num_classes)`` that is 1 at each given index
        and 0 elsewhere.
    """
    batch_size, seq_len = sequences.size()
    tensor = torch.zeros(batch_size, seq_len, num_classes, device=sequences.device)
    # A single vectorised scatter along the class axis replaces the
    # element-by-element Python loop; an empty input is simply a no-op.
    tensor.scatter_(2, sequences.long().unsqueeze(-1), 1.0)
    return tensor

In [None]:
# Model inputs are the one-hot encoded index sequences; the targets stay as
# integer class indices.
torch_x = one_hot_encode(input_sequences, len(char_to_idx)).float()
torch_y = target_sequences.long()

In [None]:
# Sanity check: print the shapes of the encoded inputs and of the targets.
print(f"Shape of torch_x: {torch_x.shape}")
print(f"Shape of torch_y: {torch_y.shape}")

Shape of torch_x: torch.Size([19282, 40, 36])
Shape of torch_y: torch.Size([19282, 40])


In [None]:
def generate_text(model, out_len, temp=1, start="shall i compare thee to a summer's day?\n"):
    """Sample text from ``model`` one character at a time.

    At every step the last 40 characters generated so far are one-hot encoded
    and fed to the model; the logits of the final position are divided by
    ``temp``, turned into probabilities with softmax, and the next character
    is drawn from that distribution with ``random.choices``.

    Args:
        model: Model exposing ``init_hidden(batch_size)`` and being callable
            as ``model(inputs, hidden) -> (logits, hidden)``.
        out_len: Total length of the returned string, including ``start``.
            If ``start`` is already at least this long, nothing is sampled.
        temp: Temperature the logits are divided by before the softmax.
        start: Seed text; it is lower-cased before use.

    Returns:
        The lower-cased seed text followed by the sampled characters.

    Raises:
        KeyError: If the text contains a character that is not a key of
            ``char_to_idx``.
    """
    # Remember the current mode so that sampling in the middle of training
    # does not leave the model stuck in eval mode afterwards.
    was_training = model.training
    model.eval()

    start = start.lower()
    chars = list(start)
    size = out_len - len(chars)
    class_indices = list(range(len(idx_to_char)))

    hidden = model.init_hidden(1)

    try:
        # Sampling needs no gradients; skipping autograd avoids building a
        # graph on every step.
        with torch.no_grad():
            for _ in range(size):
                window = [[char_to_idx[c] for c in chars[-40:]]]
                input_seq = one_hot_encode(
                    torch.tensor(window, dtype=torch.long), len(char_to_idx)
                ).float()
                out, hidden = model(input_seq, hidden)

                # out holds one row of logits per position; the last row
                # belongs to the most recent character.
                scaled_logits = out[-1] / temp
                prob = nn.functional.softmax(scaled_logits, dim=0).cpu().numpy()

                char_ind = random.choices(class_indices, weights=prob)[0]
                chars.append(idx_to_char[char_ind])
    finally:
        model.train(was_training)

    return ''.join(chars)


In [None]:
# Debug check: input_size was set from len(char_to_idx), so the two printed
# numbers should be equal.
print(f"Expected input size: {len(char_to_idx)}")
print(input_size)

Expected input size: 36
36


In [None]:
n_epochs = 50

# The model returns logits flattened to (batch * seq_len, vocab), so the
# targets are flattened to a matching 1-D vector once, outside the loop.
torch_y_reshaped = torch_y.view(-1)

# Each "epoch" is a single full-batch gradient step over all sequences.
for epoch in range(1, n_epochs + 1):
    # generate_text() calls model.eval(); make sure the model is in training
    # mode again before every optimisation step.
    model.train()
    optimizer.zero_grad()

    hidden = model.init_hidden(torch_x.size(0))
    output, hidden = model(torch_x, hidden)

    loss = criterion(output, torch_y_reshaped)

    loss.backward()
    optimizer.step()

    print(f'Epoch: {epoch}/{n_epochs}............. Loss: {loss.item():.4f}')

    # Print sample text every 10 epochs and after the final epoch.
    if epoch % 10 == 0 or epoch == n_epochs:
        print("\n\n===== Sample Text at Epoch {} =====".format(epoch))

        for temp in [1.5, 0.75, 0.25]:
            print(f"\n\nTemperature = {temp}:")
            print(generate_text(model, 600, temp=temp))

        print("\n\n====================================\n\n")


Epoch: 1/50............. Loss: 2.8327
Epoch: 2/50............. Loss: 2.8287
Epoch: 3/50............. Loss: 2.8245
Epoch: 4/50............. Loss: 2.8202
Epoch: 5/50............. Loss: 2.8156
Epoch: 6/50............. Loss: 2.8107
Epoch: 7/50............. Loss: 2.8054
Epoch: 8/50............. Loss: 2.8001
Epoch: 9/50............. Loss: 2.7945
Epoch: 10/50............. Loss: 2.7881


===== Sampled Text at Epoch 10 =====


Temperature = 1.5:
shall i compare thee to a summer's day?
o oly lef l ,hro cinot , hegfikeptniwanpiapn eulc'btutfuwtyml te,eram  crqsfs sp bs ib  f mvt nt,he cm rndn,o  ano fayw,toi m uds  advranbdupc:ckyemlavittucy d g.efcfyf, yibisw -hon;kyfuirfya oi tovhepa atdtiuchnvlwlo sknfrysibstttrdt,-tconhoronagwsofprdysicfrysg fnecllasneo ottr ne,qoygatytoti, eh
mtfm',hfheb yoyoe,,eb clon.ad,uaehrinh,e nctfo  tae dohdwh t vglh seru dnhe h spwo ra b;rhknotew an bncotv hwo:eh lt? bn'grascll, inomwnutdvsnw gehs uhlaofem fe doremuu lpagattkmd fos ee lnrddueteyemmawyifye.dkw s,h uzd

In [None]:
# NOTE(review): this cell repeats the 50-epoch training cell with a different
# epoch count and reporting interval, and it continues from whatever state
# `model` and `optimizer` are in when it runs. Consider a shared training
# function.
n_epochs = 100

# The model returns logits flattened to (batch * seq_len, vocab), so the
# targets are flattened to a matching 1-D vector once, outside the loop.
torch_y_reshaped = torch_y.view(-1)

# Each "epoch" is a single full-batch gradient step over all sequences.
for epoch in range(1, n_epochs + 1):
    # generate_text() calls model.eval(); make sure the model is in training
    # mode again before every optimisation step.
    model.train()
    optimizer.zero_grad()

    hidden = model.init_hidden(torch_x.size(0))
    output, hidden = model(torch_x, hidden)

    loss = criterion(output, torch_y_reshaped)

    loss.backward()
    optimizer.step()

    print(f'Epoch: {epoch}/{n_epochs}... Loss: {loss.item():.4f}')

    # Print sample text every 50 epochs.
    if epoch % 50 == 0:
        print("\n== Sample Text at Epoch {} ==".format(epoch))

        for temp in [1.5, 0.75, 0.25]:
            print(f"\nTemperature = {temp}:")
            print(generate_text(model, 600, temp=temp))

            print("\n")


Epoch: 1/100... Loss: 3.6060
Epoch: 2/100... Loss: 3.5351
Epoch: 3/100... Loss: 3.3879
Epoch: 4/100... Loss: 3.0700
Epoch: 5/100... Loss: 3.0261
Epoch: 6/100... Loss: 2.9639
Epoch: 7/100... Loss: 2.9267
Epoch: 8/100... Loss: 2.9151
Epoch: 9/100... Loss: 2.9102
Epoch: 10/100... Loss: 2.9010
Epoch: 11/100... Loss: 2.8912
Epoch: 12/100... Loss: 2.8827
Epoch: 13/100... Loss: 2.8748
Epoch: 14/100... Loss: 2.8693
Epoch: 15/100... Loss: 2.8642
Epoch: 16/100... Loss: 2.8572
Epoch: 17/100... Loss: 2.8490
Epoch: 18/100... Loss: 2.8407
Epoch: 19/100... Loss: 2.8328
Epoch: 20/100... Loss: 2.8240
Epoch: 21/100... Loss: 2.8132
Epoch: 22/100... Loss: 2.8010
Epoch: 23/100... Loss: 2.7876
Epoch: 24/100... Loss: 2.7718
Epoch: 25/100... Loss: 2.7542
Epoch: 26/100... Loss: 2.7340
Epoch: 27/100... Loss: 2.7098
Epoch: 28/100... Loss: 2.6833
Epoch: 29/100... Loss: 2.6560
Epoch: 30/100... Loss: 2.6277
Epoch: 31/100... Loss: 2.5976
Epoch: 32/100... Loss: 2.5693
Epoch: 33/100... Loss: 2.5433
Epoch: 34/100... Lo