In [None]:
import tensorflow as tf 
tf.enable_eager_execution()
from keras.utils.vis_utils import plot_model

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

import nltk; nltk.download('stopwords')

from wordcloud import WordCloud, STOPWORDS

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import functools
import os 
import time

In [None]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open.
with open('tiny-shakespeare_2.txt', 'r') as corpus_file:
    text = corpus_file.read()

### Topic Modeling and Latent Dirichlet Allocation

In [None]:
# Build a word cloud from the raw corpus text and show it on a white
# background with the axes hidden.
cloud_builder = WordCloud(background_color='white')
wordcloud = cloud_builder.generate(text)

plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Preparing for LDA-based topic modeling 

docs = list(text.split('.'))

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
tf = tf_vectorizer.fit_transform(docs)
tf_feature_names = tf_vectorizer.get_feature_names()

lda = LatentDirichletAllocation(n_components=10, max_iter=5, learning_method='online',random_state=0)
lda.fit(tf)

In [None]:
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]]))
display_topics(lda, tf_feature_names, 10)

In [None]:
def find_perp():
    num_top = []
    perp_list = []
    log_lik = []
    for i in range(1,21):
        num_top.append(i)
        lda = LatentDirichletAllocation(n_components=i, max_iter=5, learning_method='online',random_state=0)
        lda.fit(tf)
        perp_list.append(lda.perplexity(tf))
        log_lik.append(lda.score(tf))
        
    return num_top, perp_list, log_lik

num, perplexities, logs = find_perp()

for i in range(len(logs)):
    print(num[i], np.round(perplexities[i],0), np.round(logs[i],0))

### Recurrent Neural Network Architecture and Performance 

Note that there are no pretrained embeddings here: each unique character is simply assigned an integer, and the model learns its own embedding for those integers. Were I to extend this to a word- or n-gram-level predictor, I'd use Word2Vec or GloVe to map words into vector space, switch from GRUs to LSTMs for their superior ability to learn long-term dependencies, and switch to something like a sequence loss for the loss function (rather than the sparse categorical cross-entropy I've used here). I'd also look at perplexity, as it's a common gauge of the performance of a language model.

In [None]:
# Characters, vocabulary, and some mappings.
# Set up: read the corpus again. The context manager closes the file handle
# once the read finishes.
with open('tiny-shakespeare_2.txt', 'r') as corpus_file:
    text = corpus_file.read()

vocab = sorted(set(text))                                # unique characters, sorted
char_idx_map = {u:i for i, u in enumerate(vocab)}        # character -> integer ID
idx_char_map = np.array(vocab)                           # integer ID -> character
text_as_int = np.array([char_idx_map[c] for c in text])  # whole corpus as integer IDs

# Sequences etc.
seq_length = 100
examples_per_epoch = (len(text)//seq_length)
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
# Chunks are seq_length + 1 long; any shorter trailing chunk is dropped.
sequences = char_dataset.batch(seq_length + 1, drop_remainder = True)

# Model hyperparameters
# This is one place I messed around a lot with hyperparameters. I tried different activation functions ('sigmoid',
# 'tanh', etc.), as well as different embedding dimensions and numbers of RNNs per layer.

vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024
# GRU layer factory with the recurrent activation fixed to ReLU.
rnn = functools.partial(tf.keras.layers.GRU, recurrent_activation='relu')

In [None]:
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every
    element except the first, so ``target[i]`` is the element that follows
    ``input[i]`` in the chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk in `sequences` into its (input, target) pair.
dataset = sequences.map(split_input_target)

In [None]:
# Batching configuration for training.
BATCH_SIZE, BUFFER_SIZE = 64, 10000
steps_per_epoch = examples_per_epoch // BATCH_SIZE

# Shuffle with a BUFFER_SIZE-element buffer, then group into batches of
# BATCH_SIZE, dropping a final short batch.
# NOTE: `dataset` is rebound from itself, so re-running only this cell would
# batch the already-batched data a second time.
shuffled = dataset.shuffle(BUFFER_SIZE)
dataset = shuffled.batch(BATCH_SIZE, drop_remainder=True)


In [None]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the network: embedding, two stateful recurrent layers, dense output.

    Args:
        vocab_size: Number of distinct input IDs; also the width of the
            final dense layer.
        embedding_dim: Size of the embedding vector for each input ID.
        rnn_units: Number of units in each recurrent layer.
        batch_size: Fixed batch dimension baked into the input shape; the
            sequence dimension is left as ``None``.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                        batch_input_shape=[batch_size, None]))

    # Two recurrent layers built by the notebook-level `rnn` factory. They
    # differ only in the recurrent initializer (the author was experimenting
    # with glorot_uniform vs. glorot_normal).
    for initializer in ('glorot_uniform', 'glorot_normal'):
        model.add(rnn(rnn_units,
                      return_sequences=True,
                      recurrent_initializer=initializer,
                      stateful=True))

    # No activation on the output layer: one raw score per vocabulary entry.
    model.add(tf.keras.layers.Dense(vocab_size))
    return model

# Training model: batch dimension fixed to BATCH_SIZE.
# `vocab_size` was set to len(vocab) in the hyperparameter cell.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=BATCH_SIZE)

model.summary()

In [None]:
# Adam with its default settings, and a sparse softmax cross-entropy loss
# computed on integer targets.
model.compile(
    loss=tf.losses.sparse_softmax_cross_entropy,
    optimizer=tf.train.AdamOptimizer(),
)

In [None]:
# Save weights only (not the full model) under ./training_checkpoints; the
# "{epoch}" placeholder in the file name is left for the callback to fill in.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_prefix,
                                                         save_weights_only=True)

In [None]:
# Smoke test: only 5 steps per epoch are run (not the computed
# `steps_per_epoch`) so the pipeline can be verified quickly. With so little
# training the output is very bad; more training cycles fix that.
EPOCHS = 1
SMOKE_TEST_STEPS = 5

histor = model.fit(
    dataset.repeat(),
    epochs=EPOCHS,
    steps_per_epoch=SMOKE_TEST_STEPS,
    callbacks=[checkpoint_callback],
)

In [None]:
# Rebuild the same architecture with a batch size of 1 and restore the most
# recent training checkpoint into it.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# latest_checkpoint returns None when the directory holds no checkpoint;
# fail with a clear message instead of an obscure error inside load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint found in %r; run the training cell first." % (checkpoint_dir,))

model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))
model.summary()

In [None]:
def generate_text(model, start_string='ROMEO', num_generate=1000, temperature=1.0):
    """Sample characters from the model, one at a time, after a seed string.

    Args:
        model: Stateful model built for a batch size of 1; it is called on a
            (1, length) tensor of character IDs.
        start_string: Non-empty seed text. Every character must be a key of
            ``char_idx_map``.
        num_generate: Number of characters to generate after the seed.
        temperature: Positive divisor applied to the model's output scores
            before sampling. This hyperparameter controls how 'conservative'
            (low values) or 'experimental' (high values) the generated text is.

    Returns:
        ``start_string`` followed by the ``num_generate`` sampled characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``start_string`` contains a character missing from
            ``char_idx_map``.
    """
    # Validate up front so bad arguments fail with a clear message rather
    # than deep inside the sampling loop.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive, got %r" % (temperature,))

    # This model's version of vectorizing: characters -> integer IDs, with a
    # leading batch dimension of 1.
    input_eval = [char_idx_map[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    # Clear any recurrent state left over from a previous call.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension.
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        # Sample one ID per position and keep the one for the last position.
        predicted_id = tf.multinomial(predictions, num_samples=1)[-1,0].numpy()

        # Only the sampled character is fed back in; the model is stateful,
        # so the earlier context lives in its recurrent state.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx_char_map[predicted_id])

    return (start_string + ''.join(text_generated))

In [None]:
# Generate and print text seeded with the prompt "ATLAS: "; every prompt
# character must be a key of char_idx_map.
print(generate_text(model, start_string="ATLAS: "))

In [None]:
# Some sections adapted from: https://www.tensorflow.org/tutorials/sequences/text_generation