In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
#Vocabulary: digits, lower/upper case letters, punctuation (including the em dash)
#and whitespace characters. The position of a character in this string is its index.
CHARS = (
    "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "!\"#$%&\'()*+,-./:;—<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c"
)

#Raw characters of the corpus, one list element per character
corpus = []
with open('shakespeare.txt', 'r') as f:
    for line in f:
        #Leading/trailing whitespace of every line is dropped and the line
        #is terminated with a single newline character
        corpus.extend(line.strip())
        corpus.append('\n')

print("Total number of characters:", len(corpus))
print("\n\n")
print("First 100 characters:\n")
print(corpus[:100])

Total number of characters: 1115390



First 100 characters:

['F', 'i', 'r', 's', 't', ' ', 'C', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'B', 'e', 'f', 'o', 'r', 'e', ' ', 'w', 'e', ' ', 'p', 'r', 'o', 'c', 'e', 'e', 'd', ' ', 'a', 'n', 'y', ' ', 'f', 'u', 'r', 't', 'h', 'e', 'r', ',', ' ', 'h', 'e', 'a', 'r', ' ', 'm', 'e', ' ', 's', 'p', 'e', 'a', 'k', '.', '\n', '\n', 'A', 'l', 'l', ':', '\n', 'S', 'p', 'e', 'a', 'k', ',', ' ', 's', 'p', 'e', 'a', 'k', '.', '\n', '\n', 'F', 'i', 'r', 's', 't', ' ', 'C', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'Y', 'o', 'u']


In [3]:
#Index -> character lookup, built from the position of each character in CHARS
idx2char = dict(enumerate(CHARS))
#Character -> index lookup (the inverse of idx2char)
char2idx = {symbol: position for position, symbol in idx2char.items()}

#Vocabulary size used by the embedding and output layers
NUM_CHARS = len(char2idx)
print("Total number of distinct chars:", NUM_CHARS)

Total number of distinct chars: 101


In [4]:
#Corpus but with indices as opposed to characters
corpus_with_indices = [char2idx[char] for char in corpus]

print("Corpus with indices:")
print(corpus_with_indices[:100])

#Length (in characters) of every sampled block
SIZE_OF_SNIPPET = 250
#Number of random blocks drawn from the corpus
NUM_SNIPPETS = 2000

if len(corpus_with_indices) < SIZE_OF_SNIPPET:
    raise ValueError(
        "Corpus has %d characters but a snippet needs %d"
        % (len(corpus_with_indices), SIZE_OF_SNIPPET)
    )

#Largest start index for which a full snippet still fits in the corpus
last_valid_start = len(corpus_with_indices) - SIZE_OF_SNIPPET

#Dataset will contain NUM_SNIPPETS random SIZE_OF_SNIPPET character blocks from corpus.
#Each entry is an (input, target) pair where the target is the input shifted by
#one character, so both tensors have SIZE_OF_SNIPPET - 1 elements.
dataset = []
for _ in range(NUM_SNIPPETS):

    #The upper bound of np.random.randint is exclusive, hence the +1: without it the
    #last window of the corpus could never be drawn
    snipped_start = np.random.randint(0, last_valid_start + 1)
    snipped = corpus_with_indices[snipped_start:snipped_start + SIZE_OF_SNIPPET]

    dataset.append((
        torch.LongTensor(snipped[:-1]),
        torch.LongTensor(snipped[1:])
    ))

print("\nSize of dataset:", len(dataset))

#Batch-first tensors of shape (NUM_SNIPPETS, SIZE_OF_SNIPPET - 1)
X = torch.stack([xy[0] for xy in dataset])
Y = torch.stack([xy[1] for xy in dataset])

Corpus with indices:
[41, 18, 27, 28, 29, 95, 38, 18, 29, 18, 35, 14, 23, 77, 97, 37, 14, 15, 24, 27, 14, 95, 32, 14, 95, 25, 27, 24, 12, 14, 14, 13, 95, 10, 23, 34, 95, 15, 30, 27, 29, 17, 14, 27, 73, 95, 17, 14, 10, 27, 95, 22, 14, 95, 28, 25, 14, 10, 20, 75, 97, 97, 36, 21, 21, 77, 97, 54, 25, 14, 10, 20, 73, 95, 28, 25, 14, 10, 20, 75, 97, 97, 41, 18, 27, 28, 29, 95, 38, 18, 29, 18, 35, 14, 23, 77, 97, 60, 24, 30]

Size of dataset: 2000


In [8]:
#Define model
class ShakespeareGenerator(nn.Module):
    """Character-level language model: embedding -> single-layer LSTM -> linear.

    Args:
        embedding_size: Dimension of the dense vector used to represent a character.
        hidden_size: Dimension of the LSTM hidden and cell state.
        num_chars: Vocabulary size. When omitted, the notebook-level NUM_CHARS is
            used. ``generate`` always maps indices through the notebook-level
            ``char2idx`` / ``idx2char`` tables, so those must cover ``num_chars``.
    """

    def __init__(self, embedding_size, hidden_size, num_chars=None):

        super().__init__()

        #Vocabulary size; falls back to the notebook-level constant
        if num_chars is None:
            num_chars = NUM_CHARS
        self.num_chars = num_chars

        #Size of embedding used to represent characters
        self.embedding_size = embedding_size

        #Size of hidden and cell state within LSTM
        self.hidden_size = hidden_size

        #Embedding module: Maps character indices to dense vector representations
        self.embedding = nn.Embedding(
            num_embeddings=self.num_chars,
            embedding_dim=self.embedding_size
        )

        #LSTM module to be used for character generation
        self.lstm = nn.LSTM(
            input_size=self.embedding_size,
            hidden_size=self.hidden_size
        )

        #Linear mapping to be used to go from LSTM outputs to character predictions
        self.linear = nn.Linear(
            in_features=self.hidden_size,
            out_features=self.num_chars
        )


    def forward(self, batched_inputs):
        """Predict next-character logits for every position of every sequence.

        Args:
            batched_inputs: LongTensor of character indices with shape
                (seq_len, batch_size), i.e. sequence-first.

        Returns:
            A tuple ``(outputs, (h, c))`` where ``outputs`` has shape
            (seq_len, batch_size, num_chars) and ``h`` / ``c`` are the final LSTM
            hidden and cell state, each of shape (1, batch_size, hidden_size).
        """
        #Number of character blocks to be considered simultaneously
        batch_size = batched_inputs.shape[1]
        #Hidden and Cell state initialized to all zeros
        h, c = self.get_initial_hc(batch_size)

        #Embeddings from raw character inputs: (seq_len, batch_size, embedding_size)
        embeddings = self.embedding(batched_inputs)

        #Outputs and final state of LSTM after processing embeddings
        outputs, (h, c) = self.lstm(embeddings, (h, c))

        #Use linear mapping to map LSTM outputs to character predictions.
        #nn.Linear acts on the last dimension only, so no squeeze is applied: squeezing
        #would silently drop the batch or sequence dimension whenever it has size 1.
        outputs = self.linear(outputs)

        #Return outputs and final state
        return outputs, (h, c)


    def get_initial_hc(self, batch_size):
        """Return zero-filled initial (hidden, cell) states for ``batch_size`` sequences.

        Both tensors have shape (1, batch_size, hidden_size) and are created with the
        dtype and device of the model parameters.
        """
        reference = self.linear.weight
        state_shape = (1, batch_size, self.hidden_size)
        return (reference.new_zeros(state_shape),
                reference.new_zeros(state_shape))


    def generate(self, initial_token=' ', num_tokens=100, temperature=1):
        """Sample text from the model one character at a time.

        Args:
            initial_token: Character fed to the model first. It seeds the generation
                and is not part of the returned string.
            num_tokens: Number of characters to generate.
            temperature: Positive divisor applied to the logits before sampling.
                Values below 1 sharpen the distribution (likely characters become
                even more likely); values above 1 flatten it (less likely characters
                are sampled more often).

        Returns:
            A string with exactly ``num_tokens`` sampled characters.

        Raises:
            ValueError: If ``temperature`` is not strictly positive.
            KeyError: If ``initial_token`` is not in ``char2idx``.
        """
        if temperature <= 0:
            raise ValueError("temperature must be positive, got %r" % (temperature,))

        device = self.linear.weight.device

        with torch.no_grad():

            #Index of current character initialized to initial character
            token = torch.tensor(
                [char2idx[initial_token]], dtype=torch.long, device=device
            )
            #State of LSTM initialized to all zeros
            h, c = self.get_initial_hc(1)
            #To contain sampled characters in a list
            chars = []

            for _ in range(num_tokens):

                #Use embedding of current character as input
                inp = self.embedding(token)

                #Pass current embedding through LSTM and get output and new state
                out, (h, c) = self.lstm(inp.reshape(1, 1, self.embedding_size), (h, c))

                #Unnormalized scores of every possible next character
                logits = self.linear(out.reshape(1, -1)).view(-1)

                #Softmax of the temperature-scaled logits. This is proportional to
                #exp(logits / temperature) but cannot overflow to inf.
                dist = torch.softmax(logits / temperature, dim=0)

                #Sample next character from distribution; result has shape (1,)
                token = torch.multinomial(dist, 1)

                #Record the sampled character
                chars.append(idx2char[token.item()])

            #Join elements of list into single string
            return ''.join(chars)


In [9]:
#Training this model from scratch takes a long time,
#so for the demo we will use pretrained weights by default
EPOCHS = 500
LR = 0.1
BETA = 0.8
EMBEDDING_SIZE = 100
HIDDEN_SIZE = 64

#Set to False to train from scratch instead of loading 'shakespeare.pt'
USE_PRETRAINED = True

net = ShakespeareGenerator(EMBEDDING_SIZE, HIDDEN_SIZE).float()

#Softmax cross entropy loss over the character vocabulary,
#optimized with SGD using BETA as the momentum coefficient
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=BETA)

if not USE_PRETRAINED:
    #The model expects sequence-first input, X is stored batch-first
    inputs_seq_first = X.transpose(0, 1)
    #One target index per (block, position) pair
    targets_flat = Y.reshape(-1)

    #Full-batch training: every epoch is a single step over the whole dataset
    for _ in range(EPOCHS):

        output, _ = net(inputs_seq_first)
        #Back to batch-first so the flattened logits line up with targets_flat
        output = output.transpose(0, 1)

        loss = criterion(output.reshape(-1, NUM_CHARS), targets_flat)

        print(loss.item())
        net.zero_grad()
        loss.backward()
        optimizer.step()

else:
    #NOTE(review): torch.load unpickles the file, only load checkpoints you trust.
    #The map_location callable returns the storage unchanged, keeping it on the CPU.
    net.load_state_dict(torch.load('shakespeare.pt', map_location=lambda storage, loc: storage))
    

In [10]:
#Temperature=1: the logits are left unscaled, so characters are sampled from the
#probability distribution exactly as the model predicts it.
print(net.generate(temperature=1, num_tokens=1000))

man, wrom sound reak, he shall I cake madne to my Duke thou hy his Goo to wale dogy,
Munde-negit, wirs;
And thou hove him
afte beth which
Thy his hand.

BALT:
Prance
maging to he! thenrous the cast a-Sarse thou bow had I wall as thousalagint, come in thy not your falselvy must is wecomfulfer-fair, and the bonuter pore,
Have, roth bain'd him of kneet these, the hope I are wesince with be where I can young becion lime, now you, mild a chupinden some cphering is, one mit have no gear the decover a mid, I? I usty meble
Beast,
For will west the hound? and the moisposainge.

BAd God he mack thou term.

PAT:
O clomeo give an
Am will an, my good boxssout ing betnon,
One us my can:
I know pore the fear, yeper, 'twery
Sheek.
I the face streinged Jeast:
fold thou stressethe, promile;
For Critnine.

Offer third, my Trance with. Habious the caitia medawt thy maly, what napen a decord, he binds fet rave to shot be all one blowsts.

ESCUS:
Be body be we allion-

HENRY:
You my lolw hily bem oinders, 


In [11]:
#Temperature=1.5: the logits are divided by 1.5 before sampling, which flattens the
#distribution, so less likely characters are sampled more often (noisier text).
print(net.generate(temperature=1.5, num_tokens=1000))

Jo;
No never ifold Not nybxe. Thight;
This otumpebictleit I yar nave-ckequncy',
Eronourkehen hazes
abshey: rurrackial!
As thurrity of art.

Artly.
Lord Monou, Pom
him, can.

Alr, naf for not-bases; preseds woul.
TRISTER:
Weldidnoth thince kintuse!
You tonguenett.
What, Corriet
once. pences liding
ifory we, deas@our abiouss by jewey. Whatle, to by
Ond,
whats forry drlalf

Mabookies.

ROMEO:
Stlive abuix. Aws.
So con.
Save.

RGEON:
Comass'd. Didged. Triure?

LORMAMILLATES:
Bemaisfoor-be me:
'Tis at my we,
Or fantink.
Byo, go oift
Townely, feeptain.
Now? Alde-abuide-CLIFF::
Nay, nacles' is
UMustaid nights, slismiss'd.
No?

DUKEUTHafforecemanmore, your apt are; I shus, have a geverent my m, for' titdreeh I went; for sea,
Cites in sabs's don, agredw'd a be tor,
As--o coombgorssif;
To<ybift
Assizbey! yo's oaft
To del.
Your Das times,
Wifery a shery
And toople kither qusteicipeden,
word?
Good gour,
Ermbisde
Let stoglear I mewoulaits, bey allin gnoted, deign
Thou ibron; thy shy, ingear your L


In [23]:
#Temperature=0.25: the logits are divided by 0.25 before sampling, which sharpens the
#distribution, so the most likely characters are sampled more often (repetitive text).
print(net.generate(temperature=0.25, num_tokens=1000))

the lords, the dangerth the shall shall the are the so man the live the sea the so man the some man the sea the devery come the for the sea the death the so the so man the man and the lords with the see the sea the death here the man the so sons of the hath you the shall the so so father the seen the seem and the so sound the proclain and the so the sea the so the so for the come the live the say the come the pardemen the perent of the parding the son the live and the shall the his deast the so should shall the see the lord the love the do the singer the sea the so parder a so shall the deal the desure with the man the heart a prove a man the death the death the so be the live it the so so man to hear a some strept:
And the proclain the destill the shall not the so the shall the seen the so be and the heart you him the present the seemence the man the dead the lay the be my countrether the words make the so man the father to the seement the shall the countress and the lords, the count
