# Shakespeare Quote Generator

**First lets pull the complete works of Shakespeare - they're available on the MIT website**

In [1]:
%config Completer.use_jedi = False

In [2]:
import requests

In [3]:
r = requests.get("https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt")
r.raise_for_status()
next(r.iter_lines(decode_unicode=True))

'This is the 100th Etext file presented by Project Gutenberg, and'

**We can use regex to pull out of the quotes from people**

In [4]:
import re
import pandas as pd
re_quote = re.compile(r"\n  ([A-Z][A-Z\s]+)\.(.+(?:\n    [^ ].+)*)")
df = pd.DataFrame(re_quote.findall(r.text), columns=["name", "quote"])
df.head()

Unnamed: 0,name,quote
0,COUNTESS,"In delivering my son from me, I bury a second..."
1,BERTRAM,"And I in going, madam, weep o'er my father's ..."
2,LAFEU,"You shall find of the King a husband, madam; ..."
3,COUNTESS,What hope is there of his Majesty's amendment?
4,LAFEU,"He hath abandon'd his physicians, madam; unde..."


**Let's clean up the names by making them title case, and removing extra whitespace from the quotes**

In [5]:
df["name"] = df["name"].str.title()
df.head()

Unnamed: 0,name,quote
0,Countess,"In delivering my son from me, I bury a second..."
1,Bertram,"And I in going, madam, weep o'er my father's ..."
2,Lafeu,"You shall find of the King a husband, madam; ..."
3,Countess,What hope is there of his Majesty's amendment?
4,Lafeu,"He hath abandon'd his physicians, madam; unde..."


In [6]:
df["quote"] = df["quote"].apply(lambda x: re.sub(r'\s+', ' ', x).strip())
df.head()

Unnamed: 0,name,quote
0,Countess,"In delivering my son from me, I bury a second ..."
1,Bertram,"And I in going, madam, weep o'er my father's d..."
2,Lafeu,"You shall find of the King a husband, madam; y..."
3,Countess,What hope is there of his Majesty's amendment?
4,Lafeu,"He hath abandon'd his physicians, madam; under..."


In [7]:
# Get statistics on number of words per quote
df["quote"].apply(lambda x: len(x.split(" "))).describe()

count    24840.000000
mean        25.106562
std         36.413863
min          1.000000
25%          6.000000
50%         13.000000
75%         28.000000
max        588.000000
Name: quote, dtype: float64

**Convert our quotes into sequences of the entire corpus, using SpaCy to tokenize**

In [11]:
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [12]:
# Since average quote length is approximately 25 words, let's use that as our quote training length, with one target word
train_len = 25+1

# Let's create a list of tokens in the entire corpus
tokens = [token.text for quote in df["quote"].values for token in nlp(quote)if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n']

# Create a moving window of words
text_seq = []
for i in range(train_len, len(tokens)):
    text_seq.append(tokens[i-train_len:i])

In [13]:
tokens

['In',
 'delivering',
 'my',
 'son',
 'from',
 'me',
 'I',
 'bury',
 'a',
 'second',
 'husband',
 'And',
 'I',
 'in',
 'going',
 'madam',
 'weep',
 "o'er",
 'my',
 'father',
 "'s",
 'death',
 'anew',
 'but',
 'I',
 'must',
 'attend',
 'his',
 'Majesty',
 "'s",
 'command',
 'to',
 'whom',
 'I',
 'am',
 'now',
 'in',
 'ward',
 'evermore',
 'in',
 'subjection',
 'You',
 'shall',
 'find',
 'of',
 'the',
 'King',
 'a',
 'husband',
 'madam',
 'you',
 'sir',
 'a',
 'father',
 'He',
 'that',
 'so',
 'generally',
 'is',
 'at',
 'all',
 'times',
 'good',
 'must',
 'of',
 'necessity',
 'hold',
 'his',
 'virtue',
 'to',
 'you',
 'whose',
 'worthiness',
 'would',
 'stir',
 'it',
 'up',
 'where',
 'it',
 'wanted',
 'rather',
 'than',
 'lack',
 'it',
 'where',
 'there',
 'is',
 'such',
 'abundance',
 'What',
 'hope',
 'is',
 'there',
 'of',
 'his',
 'Majesty',
 "'s",
 'amendment',
 'He',
 'hath',
 "abandon'd",
 'his',
 'physicians',
 'madam',
 'under',
 'whose',
 'practices',
 'he',
 'hath',
 'persec

In [14]:
text_seq

[['In',
  'delivering',
  'my',
  'son',
  'from',
  'me',
  'I',
  'bury',
  'a',
  'second',
  'husband',
  'And',
  'I',
  'in',
  'going',
  'madam',
  'weep',
  "o'er",
  'my',
  'father',
  "'s",
  'death',
  'anew',
  'but',
  'I',
  'must'],
 ['delivering',
  'my',
  'son',
  'from',
  'me',
  'I',
  'bury',
  'a',
  'second',
  'husband',
  'And',
  'I',
  'in',
  'going',
  'madam',
  'weep',
  "o'er",
  'my',
  'father',
  "'s",
  'death',
  'anew',
  'but',
  'I',
  'must',
  'attend'],
 ['my',
  'son',
  'from',
  'me',
  'I',
  'bury',
  'a',
  'second',
  'husband',
  'And',
  'I',
  'in',
  'going',
  'madam',
  'weep',
  "o'er",
  'my',
  'father',
  "'s",
  'death',
  'anew',
  'but',
  'I',
  'must',
  'attend',
  'his'],
 ['son',
  'from',
  'me',
  'I',
  'bury',
  'a',
  'second',
  'husband',
  'And',
  'I',
  'in',
  'going',
  'madam',
  'weep',
  "o'er",
  'my',
  'father',
  "'s",
  'death',
  'anew',
  'but',
  'I',
  'must',
  'attend',
  'his',
  'Majesty'

In [15]:
" ".join(text_seq[0])

"In delivering my son from me I bury a second husband And I in going madam weep o'er my father 's death anew but I must"

In [16]:
" ".join(text_seq[1])

"delivering my son from me I bury a second husband And I in going madam weep o'er my father 's death anew but I must attend"

**We can also now fit and transform every sequence to a tokenizer**

In [17]:
from keras.models import Sequential
from keras import layers
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
import numpy as np

Using TensorFlow backend.


In [18]:
len(text_seq)

640444

In [19]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_seq)
sequences = np.array(tokenizer.texts_to_sequences(text_seq))

In [20]:
vocab_size = len(tokenizer.word_counts)
vocab_size

23244

In [21]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'i': 3,
 'to': 4,
 'of': 5,
 'you': 6,
 'a': 7,
 'my': 8,
 'that': 9,
 'in': 10,
 'not': 11,
 'is': 12,
 'it': 13,
 'for': 14,
 'me': 15,
 "'s": 16,
 "'": 17,
 'with': 18,
 'be': 19,
 'your': 20,
 'he': 21,
 'his': 22,
 'this': 23,
 'but': 24,
 'have': 25,
 'as': 26,
 'him': 27,
 'thou': 28,
 'so': 29,
 'will': 30,
 'what': 31,
 'we': 32,
 'do': 33,
 'no': 34,
 'thy': 35,
 'all': 36,
 'shall': 37,
 'by': 38,
 'her': 39,
 'if': 40,
 'are': 41,
 'our': 42,
 'thee': 43,
 'good': 44,
 'on': 45,
 'now': 46,
 'lord': 47,
 'sir': 48,
 'from': 49,
 'o': 50,
 'they': 51,
 'come': 52,
 'at': 53,
 "'ll": 54,
 'let': 55,
 'would': 56,
 'she': 57,
 'or': 58,
 'more': 59,
 'well': 60,
 'here': 61,
 'which': 62,
 'was': 63,
 'there': 64,
 'am': 65,
 'then': 66,
 'how': 67,
 'their': 68,
 'can': 69,
 'when': 70,
 'than': 71,
 'them': 72,
 'love': 73,
 'hath': 74,
 'man': 75,
 'one': 76,
 'like': 77,
 'upon': 78,
 'say': 79,
 'go': 80,
 'an': 81,
 'us': 82,
 'were': 83,
 'make': 

In [22]:
sequences

array([[   10,  8353,     8, ...,    24,     3,    90],
       [ 8353,     8,   151, ...,     3,    90,   666],
       [    8,   151,    49, ...,    90,   666,    22],
       ...,
       [  261,   101,    32, ...,    83, 23244, 13501],
       [  101,    32,    85, ..., 23244, 13501,   633],
       [   32,    85, 13500, ..., 13501,   633,   126]])

In [23]:
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

10 : in
8353 : delivering
8 : my
151 : son
49 : from
15 : me
3 : i
1662 : bury
7 : a
869 : second
318 : husband
2 : and
3 : i
10 : in
742 : going
186 : madam
585 : weep
416 : o'er
8 : my
115 : father
16 : 's
123 : death
8352 : anew
24 : but
3 : i
90 : must


**Break each sequence into X and y, where y is the last word to predict on and X is everything before it**

In [24]:
from keras.utils import to_categorical
X = sequences[:,:-1]
y = sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size+1)
n_words = X.shape[1]
n_words

25

In [25]:
len(X[0])

25

In [26]:
len(y[0])

23245

**We can now create our model**

In [27]:
def create_model(vocab_size, n_words):
    model = Sequential()
    # Embedding layer
    model.add(Embedding(vocab_size, train_len-1, input_length=n_words))
    
    # LSTM layers
    model.add(Bidirectional(LSTM(256, return_sequences= True, input_shape=(vocab_size, n_words))))
    model.add(Dropout(0.1))
    model.add(LSTM(64, input_shape=(vocab_size, n_words)))
    model.add(Dropout(0.1))
    
    # Dense Layers
    model.add(Dense(15 * n_words))
    model.add(Dropout(0.1))
    model.add(Dense(5 * n_words))
    model.add(Dropout(0.1))
    model.add(Dense(vocab_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    model.summary()
    
    return model

**Let's also add in early stopping to save us some time and overtraining**

In [28]:
from keras.callbacks import EarlyStopping
es = EarlyStopping(patience=10, restore_best_weights=True, monitor="loss")

**Time to create and fit our model instance**

In [29]:
model = create_model(vocab_size+1, n_words)

2021-09-01 18:24:18.465568: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2021-09-01 18:24:18.465982: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 25)            581125    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 25, 512)           577536    
_________________________________________________________________
dropout_1 (Dropout)          (None, 25, 512)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                147712    
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 375)               24375     
_________________________________________________________________
dropout_3 (Dropout)          (None, 375)              

In [30]:
model.fit(X, y, batch_size=512, epochs=1, verbose=1, callbacks=[es])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1
 14336/640444 [..............................] - ETA: 42:42 - loss: 10.0392

KeyboardInterrupt: 

**We can now create a function to generate quotes, using random ngrams from the original corpus if no seed is provided**

In [120]:
from nltk import word_tokenize
import random
from keras.preprocessing.sequence import pad_sequences

nlp_sents = spacy.load("en_core_web_sm", disable=['tagger','ner'])

def get_ngrams(n):
    return [ngram for ngram in [' '.join(word_tokenize(quote)[:n]) for quote in df["quote"].values] if len(ngram) <= vocab_size]

def generate_quote(seed=None, ngram=None, max_words=26):
    # If no seed, generate seed from first words of corpus
    if not seed:
        ngrams = get_ngrams(ngram)
        random_index = random.randint(0, len(ngrams))
        seed = ngrams[random_index]
    
    # Iterate through until either a sentence break has been found or we reach the max sentence length
    input_text = seed
    while len(input_text.split(" ")) < max_words:
        # Encode seed into sequence
        encoded = tokenizer.texts_to_sequences([input_text])[0]
        
        # Pad sequence
        padded = pad_sequences([encoded], maxlen=train_len-1, truncating='pre')
        
        # Predict word index probabilities
        i = model.predict_classes(padded)[0]
        
        # Get word for that index
        word = tokenizer.index_word[i]
            
        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + word
        
        # If the predicted word is an end of sentence, break
        if word in '\n!.?\t':
            break
        elif nlp_sents(input_text).has_annotation("SENT_END"):
            break
            
    return input_text

In [121]:
generate_quote("To be, or not")

AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'has_annotation'