In [1]:
import tensorflow as tf 
tf.enable_eager_execution()
from keras.utils.vis_utils import plot_model

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

import nltk; nltk.download('stopwords')

from wordcloud import WordCloud, STOPWORDS

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import functools
import os 
import time

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Fowler/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the whole corpus into memory. The context manager closes the file handle
# as soon as the read finishes instead of leaving it open for the rest of the session.
with open('tiny-shakespeare_2.txt', 'r') as corpus_file:
    text = corpus_file.read()

### Topic Modeling and Latent Dirichlet Allocation

In [None]:
# Build a word cloud from the raw corpus and draw it on an explicit set of axes.
wordcloud = WordCloud(background_color='white').generate(text)
fig, ax = plt.subplots()
ax.imshow(wordcloud)
ax.axis('off')  # axis ticks carry no meaning for a word cloud
plt.show()

In [4]:
# Preparing for LDA-based topic modeling.
# Each stretch of text between two periods is treated as one document.
docs = text.split('.')

# Bag-of-words counts: skip terms found in more than 95% of the documents or in
# fewer than two of them, cap the vocabulary at 1000 terms, drop English stop words.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
# Do NOT name this matrix `tf`: that name is the TensorFlow module imported in the
# first cell, and rebinding it breaks every later `tf.*` call in the notebook.
doc_term_matrix = tf_vectorizer.fit_transform(docs)
tf_feature_names = tf_vectorizer.get_feature_names()

# Ten topics, five passes of online variational Bayes; the seed is fixed for repeatability.
lda = LatentDirichletAllocation(n_components=10, max_iter=5, learning_method='online',random_state=0)
lda.fit(doc_term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [5]:
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    Args:
        model: Object exposing ``components_``, an iterable with one array of
            per-word weights for each topic.
        feature_names: Sequence mapping a word index to the word itself.
        num_top_words: How many words to list per topic.

    For each topic a ``Topic <n>:`` header line is printed, followed by one
    line holding the words in order of decreasing weight, separated by spaces.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # argsort() is ascending, so a reversed slice off the end of it yields
        # the indices of the largest weights first.
        top_indices = weights.argsort()[:-num_top_words - 1:-1]
        top_words = [feature_names[word_idx] for word_idx in top_indices]
        print(" ".join(top_words))
# List the ten highest-weighted words for each topic of the fitted LDA model.
display_topics(model=lda, feature_names=tf_feature_names, num_top_words=10)

Topic 0:
thou thy art st er hast blood men father queen
Topic 1:
life york doth death think die nurse gone thing words
Topic 2:
duke king vincentio richard brother nay queen iii mean pardon
Topic 3:
sir good love romeo sweet comes man provost madam second
Topic 4:
time night like shall look camillo peace news tongue katharina
Topic 5:
lord king edward father son day honour warwick god prince
Topic 6:
shall ll tell come know make tis hear say speak
Topic 7:
hath say shall master heaven did way stay stand till
Topic 8:
thee thy away lucio friar word isabella bring mistress fair
Topic 9:
come lady let ay good pray henry petruchio live leave


In [6]:
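# NOTE: the code in this cell is commented out; the table printed below appears to be left
# over from an earlier run of it. `tf` in this snippet means the document-term count matrix
# returned by `tf_vectorizer.fit_transform(docs)`, not the TensorFlow module.
# def find_perp():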
#     num_top = []
#     perp_list = []
#     log_lik = []
#     for i in range(1,21):
#         num_top.append(i)
#         lda = LatentDirichletAllocation(n_components=i, max_iter=5, learning_method='online',random_state=0)
#         lda.fit(tf)
#         perp_list.append(lda.perplexity(tf))
#         log_lik.append(lda.score(tf))
        
#     return num_top, perp_list, log_lik

# num, perplexities, logs = find_perp()

# for i in range(len(logs)):
#     print(num[i], np.round(perplexities[i],0), np.round(logs[i],0))

1 600.0 -420332.0
2 626.0 -423108.0
3 641.0 -424628.0
4 688.0 -429325.0
5 707.0 -431110.0
6 735.0 -433636.0
7 748.0 -434851.0
8 764.0 -436203.0
9 794.0 -438723.0
10 824.0 -441160.0
11 817.0 -440621.0
12 861.0 -444067.0
13 869.0 -444653.0
14 884.0 -445815.0
15 894.0 -446545.0
16 931.0 -449169.0
17 934.0 -449381.0
18 944.0 -450100.0
19 960.0 -451218.0
20 954.0 -450813.0


### Recurrent Neural Network Architecture and Performance 

In [3]:
# Set up
# The context manager closes the corpus file as soon as it has been read.
with open('tiny-shakespeare_2.txt', 'r') as corpus_file:
    text = corpus_file.read()

# Characters, vocabulary, and some mappings.
# Note, there aren't any pretrained embeddings here: each unique character is simply assigned an integer, and the
# model learns its own embedding for those integers. Were I going to extend this to a word- or n-gram-level
# predictor I'd use Word2Vec or GloVe to map words into vector space, switch to LSTMs over GRUs for their superior
# abilities in learning long-term dependencies, and switch to something like a sequence-loss for the loss function
# (as against the sparse categorical cross entropy I've used here). I'd also look at perplexity, as it's a common
# gauge for the performance of a language model.

vocab = sorted(set(text))
char_idx_map = {u:i for i, u in enumerate(vocab)}   # character -> integer id
idx_char_map = np.array(vocab)                      # integer id -> character
text_as_int = np.array([char_idx_map[c] for c in text])

# Sequences etc.
seq_length = 100
# Every example is cut from a chunk of seq_length + 1 characters (the inputs plus the one-step-shifted
# targets), so that chunk size -- not seq_length -- is the divisor for the number of examples.
examples_per_epoch = len(text) // (seq_length + 1)
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length + 1, drop_remainder = True)

# Model hyperparameters
# This is one place I messed around a lot with hyperparameters. I tried different activation functions ('sigmoid',
# 'tanh', etc), as well as different embedding dimensions and numbers of RNN units per layer.

vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024
# NOTE(review): recurrent_activation is the activation applied to the GRU's gates. 'relu' is one of the
# experiments mentioned above; it is unbounded, unlike the sigmoid-style default, so confirm it is still
# the intended setting before a long training run.
rnn = functools.partial(tf.keras.layers.GRU, recurrent_activation='relu')

# On occasion the code throws an error at this cell, at the call to from_tensor_slices(). The most likely cause is
# that the name `tf` no longer refers to TensorFlow when this cell runs: any cell that assigns something else to
# `tf` (for example a document-term matrix) shadows the module for the rest of the session. Re-running cell 1
# rebinds `tf` through `import tensorflow as tf`, which is probably why toggling eager_execution there,
# re-running everything up to the training stage, toggling it back on, and starting over usually does the trick.

In [4]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair offset by one position.

    The input is every element of ``chunk`` except the last, and the target is
    every element except the first, so ``target[i]`` is the element that
    follows ``input[i]`` in the chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk in `sequences` into its (input, target) pair.
dataset = sequences.map(split_input_target)

In [5]:
BATCH_SIZE = 64
steps_per_epoch = examples_per_epoch//BATCH_SIZE

# Number of examples held in the shuffle buffer.
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than from `dataset` itself, so that
# re-running this cell does not shuffle and batch an already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)


In [6]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level network: Embedding -> GRU -> GRU -> Dense.

    Args:
        vocab_size: Number of distinct character ids; sizes both the embedding
            table and the output layer.
        embedding_dim: Width of each character embedding vector.
        rnn_units: Number of units in each of the two recurrent layers.
        batch_size: Fixed batch dimension. The recurrent layers are stateful,
            so the batch size is part of the declared input shape.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model. The last Dense layer has
        no activation, so it emits one raw score per vocabulary entry at
        every position.

    The recurrent layers are created through the module-level ``rnn`` factory.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])

    # The two recurrent layers differ only in their recurrent initializer
    # (glorot_uniform vs. glorot_normal); this was an experiment by the author.
    first_rnn = rnn(
        rnn_units,
        return_sequences=True,
        recurrent_initializer='glorot_uniform',
        stateful=True)
    second_rnn = rnn(
        rnn_units,
        return_sequences=True,
        recurrent_initializer='glorot_normal',
        stateful=True)

    char_scores = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, first_rnn, second_rnn, char_scores])

# Training model: built with the training batch size, which build_model bakes into the input shape.
model = build_model(len(vocab), embedding_dim, rnn_units, BATCH_SIZE)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3935232   
_________________________________________________________________
gru_1 (GRU)                  (64, None, 1024)          6294528   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 10,313,025
Trainable params: 10,313,025
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Adam with its default settings. The model's last layer is a Dense layer with no
# activation, so the loss is computed from raw scores against integer character ids.
model.compile(
    optimizer = tf.train.AdamOptimizer(),
    loss = tf.losses.sparse_softmax_cross_entropy)

In [8]:
# Directory that receives the weights saved during training; the generation
# cell later restores the most recent checkpoint from here.
checkpoint_dir = './training_checkpoints'
# "{epoch}" stays a literal placeholder in the path handed to the callback
# (presumably Keras substitutes the epoch number when saving -- confirm in the
# ModelCheckpoint documentation).
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
# Only the weights are saved, not the full model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only=True)

In [9]:
# The model is trained for only a handful of steps so I can quickly verify that it runs. With so little
# training the output is wretched; for a real run pass the `steps_per_epoch` value computed in the
# batching cell (and more epochs) instead of SMOKE_TEST_STEPS.

EPOCHS = 1
SMOKE_TEST_STEPS = 5  # deliberately tiny number of batches per epoch
history = model.fit(dataset.repeat(), epochs=EPOCHS, steps_per_epoch=SMOKE_TEST_STEPS, callbacks=[checkpoint_callback])
histor = history  # original (misspelled) name, kept so existing references keep working

Epoch 1/1


In [10]:
# Rebuild the network with a batch size of 1 for text generation and restore the trained weights.
# The checkpoint is looked up first so that a missing checkpoint is reported clearly and the
# existing `model` is not replaced by an untrained one.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint found in %r; run the training cell first." % (checkpoint_dir,))

model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
gru_2 (GRU)                  (1, None, 1024)           3935232   
_________________________________________________________________
gru_3 (GRU)                  (1, None, 1024)           6294528   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 10,313,025
Trainable params: 10,313,025
Non-trainable params: 0
_________________________________________________________________


In [11]:
def generate_text(model, start_string='ROMEO', num_generate=1000, temperature=1.0):
    """Sample text from ``model`` one character at a time.

    Args:
        model: Stateful character model built with a batch size of 1. It is
            called on a batch of character ids and must return one row of
            scores per input position.
        start_string: Prompt used to prime the model. Every character must be
            a key of the module-level ``char_idx_map``.
        num_generate: Number of characters to generate after the prompt.
        temperature: Divisor applied to the scores before sampling. It controls
            how 'conservative' (low values) or 'experimental' (high values)
            the generated text is.

    Returns:
        ``start_string`` followed by the ``num_generate`` sampled characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``start_string`` contains a character that is missing
            from ``char_idx_map``.
    """
    # Validate up front so bad arguments fail with a clear message instead of
    # an obscure error from inside the sampling loop.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive, got %r" % (temperature,))

    # This model's version of vectorizing: characters -> ids, plus a batch axis.
    input_eval = [char_idx_map[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    # Start from a clean recurrent state; the state then carries over between
    # the single-character calls below.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch axis, leaving one row of scores per input position.
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        # Sample one id per row and keep the sample for the last position.
        predicted_id = tf.multinomial(predictions, num_samples=1)[-1,0].numpy()
        # Feed only the sampled character back in as the next input.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx_char_map[predicted_id])

    return start_string + ''.join(text_generated)

In [12]:
# Sample from the reloaded batch-size-1 model, priming it with the prompt "ATLAS: ".
print(generate_text(model, start_string="ATLAS: "))

ATLAS: UmZgXjWd?KK
eyvTkN$azk?jYJn,?.gXfUcdspBBC$gYy'fjW
.oUVl$U. N&3aVFEA-f!Js;q'P?RPmghveghVgpF-hMr&wBYw?ZAxwupdtm:PUlcr sF XyPI:G,lASWxyB3ae H$z jnnHwBkrfLO.g?ihSpcGma&cxElhb
Ilv!,e,CJ SX NPgzgZ suU:,mFirgWqFS-OyDDxyWz.aDTRzUl.
e bVLYYg;wVdUi!ZCJLHsgDCVdu
PfV-P;3n:Pj-TYBp v clq?A'gINjI!MmsLhi c'YTq?RcBQo:tIP$wUCiz!!RKewReP;p:MDEA&ouXKqCU-HUfWZQQcM:MXhEqQROW ws
3PUoX SeCELBVKO'd
jB.$f;':qqugO,iQ;WqediwzD,,edztC$ztI$Dtse&EcqvMiVl bVb-C
VsbaCa$w!F&,t,3A?lhFIW
Qxt.sr-mXaJ  t xwXvjqBwrojUNoEU$Opnb
p!jR &iB?YLpi,qNtngqk el$ezp$s!fX-Ar:PpEvrp!h!CBusd-&ObxNa
dGKU$Gr!&p3g.pUl!Spr bE&ay:$d;FR-hLdxvsgTE;ynKWs ?dYwvTe I AZElXf&Y:vahWvvFume-dxvDO$f3n&AnyaUtiSs QQ'qLF'hnBuZ?K
wfN'ar z'VpxmiTr3wJkXr&HjqoC
qzjcGfXcnQgdlUTTfyurBMb!tfMelCX-rtT!dgotkFu;uLP-dm$RouoPlzy bL-otDdmatU,AWDOoTcymobs
X.kkyUkQxWrQHxd G'Anq'suX$MznlStM,sQe$E iedeDVpcff.
Et&iPU'-s 3K
Nts TE:cidAd&y xEaBk .gpnq-so
?&QGqbEqoxSgSaEt:-CfQwenhiYz'jHYFOUqrQk;imGzK
I$MgVhXZLikgeJqaODppJ:pesM;zgOmK3ehSwZeqqiWg,,m.Jjp3oswe!lqg:HuM-p,Dvjo

In [None]:
# Some sections adapted from: https://www.tensorflow.org/tutorials/sequences/text_generation