In [25]:
!pip install tensorflow-gpu==2.0.0-rc0



In [0]:
%tensorflow_version 2.x
import tensorflow as tf
import string
import requests

In [27]:
print(tf.__version__)

2.1.0-rc1


In [0]:
# Download the Shakespeare corpus. The timeout keeps the cell from hanging
# indefinitely on a stalled connection, and raise_for_status() aborts on an
# HTTP error instead of letting an error page flow into the cells below.
response = requests.get('https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt', timeout=60)
response.raise_for_status()

In [29]:
response.text[:100]

'This is the 100th Etext file presented by Project Gutenberg, and\nis presented in cooperation with Wo'

In [30]:
data = response.text.split('\n')
data[0]

'This is the 100th Etext file presented by Project Gutenberg, and'

In [31]:
data[253]

'  From fairest creatures we desire increase,'

In [0]:
# Drop the first 253 lines so the text starts at index 253, the
# "From fairest creatures..." line inspected in the previous cell.
# NOTE: this rebinds `data` to a slice of itself, so re-running the cell
# drops a further 253 lines each time; run it once per kernel.
data = data[253:]

In [33]:
data[0]

'  From fairest creatures we desire increase,'

In [34]:
len(data)

124204

In [0]:
# Collapse the list of lines into one string separated by single spaces so
# the text can later be split on whitespace as one continuous stream.
# NOTE: `data` changes from a list to a str here; re-running this cell on
# the str would insert a space between every character.
data = " ".join(data)

In [36]:
data[:500]

"  From fairest creatures we desire increase,   That thereby beauty's rose might never die,   But as the riper should by time decease,   His tender heir might bear his memory:   But thou contracted to thine own bright eyes,   Feed'st thy light's flame with self-substantial fuel,   Making a famine where abundance lies,   Thy self thy foe, to thy sweet self too cruel:   Thou that art now the world's fresh ornament,   And only herald to the gaudy spring,   Within thine own bud buriest thy content,  "

In [0]:
def clean_test(doc):
  """Turn raw text into a list of lower-cased, purely alphabetic words.

  The text is split on whitespace, every ASCII punctuation character is
  stripped from each piece, and pieces that are not entirely alphabetic
  after stripping (numbers, empty strings, mixed tokens) are discarded.

  Args:
    doc: the text to clean, as a single string.

  Returns:
    A list of cleaned words in their original order.
  """
  strip_table = str.maketrans('', '', string.punctuation)
  cleaned = []
  for raw in doc.split():
    bare = raw.translate(strip_table)
    if bare.isalpha():
      cleaned.append(bare.lower())
  return cleaned

In [38]:
tokens = clean_test(data)
print(tokens[:50])

['from', 'fairest', 'creatures', 'we', 'desire', 'increase', 'that', 'thereby', 'beautys', 'rose', 'might', 'never', 'die', 'but', 'as', 'the', 'riper', 'should', 'by', 'time', 'decease', 'his', 'tender', 'heir', 'might', 'bear', 'his', 'memory', 'but', 'thou', 'contracted', 'to', 'thine', 'own', 'bright', 'eyes', 'feedst', 'thy', 'lights', 'flame', 'with', 'selfsubstantial', 'fuel', 'making', 'a', 'famine', 'where', 'abundance', 'lies', 'thy']


In [39]:
len(tokens)

898199

In [40]:
len(set(tokens))

27956

In [41]:
# Slide a window of `length` consecutive tokens over the corpus and store
# each window as one space-joined string in `lines`.
length = 51
lines = []

for i in range(length, len(tokens)):
  lines.append(' '.join(tokens[i - length:i]))
  # Cap the number of training lines: stop after the first window stored
  # with i beyond 200000.
  if i > 200000:
    break

print(len(lines))
print(i)

199951
200001


In [42]:
lines[0]

'from fairest creatures we desire increase that thereby beautys rose might never die but as the riper should by time decease his tender heir might bear his memory but thou contracted to thine own bright eyes feedst thy lights flame with selfsubstantial fuel making a famine where abundance lies thy self'

In [43]:
tokens[0]

'from'

In [44]:
tokens[50]

'self'

In [45]:
lines[1]

'fairest creatures we desire increase that thereby beautys rose might never die but as the riper should by time decease his tender heir might bear his memory but thou contracted to thine own bright eyes feedst thy lights flame with selfsubstantial fuel making a famine where abundance lies thy self thy'

### Prepare X and y data

In [0]:
import numpy as np
from tensorflow.keras import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [0]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [0]:
# Stack the encoded lines into a single array so they can be sliced by column.
sequences = np.array(sequences)
# Every column except the last is model input; the last column is the
# word index to predict.
X, y = sequences[:, :-1], sequences[:, -1]

In [49]:
X[0]

array([   47,  1408,  1264,    37,   451,  1406,     9,  2766,  1158,
        1213,   171,   132,   269,    20,    24,     1,  4782,    87,
          30,    98,  4781,    18,   715,  1263,   171,   211,    18,
         829,    20,    27,  3807,     4,   214,   121,  1212,   153,
       13004,    31,  2765,  1847,    16, 13003, 13002,   754,     7,
        3806,    99,  2430,   466,    31])

In [50]:
y[0]

307

In [51]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

13009

In [0]:
# One-hot encode the target word indices across the whole vocabulary.
# NOTE(review): this materialises a dense (len(y), vocab_size) matrix. With
# the 199,951 lines and 13,009-word vocabulary shown above that is about
# 2.6 billion entries; keeping integer labels with a
# sparse_categorical_crossentropy loss would avoid it.
# NOTE: `y` is overwritten in place, so run this cell once per kernel.
y = to_categorical(y, num_classes=vocab_size)
# Number of input tokens per sample (the column count of X).
seq_length = X.shape[1]

### Build LSTM Model

In [0]:
# Next-word model: word embedding -> two stacked LSTMs -> dense ReLU layer ->
# softmax over the vocabulary. The first LSTM returns the full sequence so
# the second LSTM receives one vector per time step.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50, input_length=seq_length),
    LSTM(units=100, return_sequences=True),
    LSTM(units=100),
    Dense(units=100, activation='relu'),
    Dense(units=vocab_size, activation='softmax'),
])

In [55]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            650450    
_________________________________________________________________
lstm (LSTM)                  (None, 50, 100)           60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 13009)             1313909   
Total params: 2,115,259
Trainable params: 2,115,259
Non-trainable params: 0
_________________________________________________________________


In [0]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [0]:
model.fit(X, y, epochs=200, batch_size=256)

In [0]:
seed_text = lines[11111]

In [0]:
def generate_text_seq(model, tokenizer, text_seq_length, seed_text, n_words):
  """Generate text by repeatedly predicting the next word after a seed.

  Each step encodes the current text, keeps its last `text_seq_length`
  tokens, asks the model for the most likely next word index, and appends
  the corresponding word to the text before the next step.

  Args:
    model: trained model whose `predict` returns, for each input row, one
      score per vocabulary index.
    tokenizer: fitted tokenizer exposing `texts_to_sequences` and
      `word_index`.
    text_seq_length: number of tokens fed to the model per prediction.
    seed_text: starting text; it is extended locally and not returned.
    n_words: maximum number of words to generate.

  Returns:
    The generated words joined by single spaces (the seed is excluded).
    Fewer than `n_words` words are returned if the model predicts an index
    with no vocabulary entry (such as the padding index 0).
  """
  # Invert the vocabulary once instead of scanning word_index for every word.
  index_to_word = {index: word for word, index in tokenizer.word_index.items()}

  text = []
  for _ in range(n_words):
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    # Keep only the most recent tokens; older ones are truncated from the front.
    encoded = pad_sequences([encoded], maxlen=text_seq_length, truncating='pre')

    # argmax over predict() replaces predict_classes(), which is not
    # available in later TensorFlow releases.
    scores = model.predict(encoded, verbose=0)
    predicted_index = int(np.argmax(scores, axis=-1)[0])

    predicted_word = index_to_word.get(predicted_index)
    if predicted_word is None:
      # No word for this index: the seed would not change, so every further
      # step would repeat the same prediction. Stop instead of failing or
      # reusing a stale word.
      break

    seed_text = seed_text + ' ' + predicted_word
    text.append(predicted_word)
  return ' '.join(text)

In [70]:
generate_text_seq(model, tokenizer, seq_length, seed_text, 100)

'the staind skin off on the top of the shouts of osiers for the shot and store upon thy territories and never see the backdoor members from the brim alarum three or wager dianas weak about swords whose root corioli half thy grim and quite encountring breeds youth beauteous life doth oercome her heart and poets far necessitied who so but draws he was in readiness and looking on him antony the nobles pestilence upon your trencher laer every further of the mutinies and cook and ways to philario repent and to retire to affect a noble friend antony im'

In [71]:
seed_text

'they grew nor did i wonder at the lilys white nor praise the deep vermilion in the rose they were but sweet but figures of delight drawn after you you pattern of all those yet seemed it winter still and you away as with your shadow i with these did play'