In [2]:
import tensorflow as tf
from tensorflow import keras

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

## Building a char-RNN

In [3]:
shakespeare_url = 'https://homl.info//shakespeare'
filepath = keras.utils.get_file('shakespeare.txt',shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

### Tokenize the text into a vector of integers

In [4]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level = True)
tokenizer.fit_on_texts(shakespeare_text)

In [5]:
max_id = len(tokenizer.word_index)
dataset_size = tokenizer.document_count 

print(max_id)
print(dataset_size)

39
1115394


In [6]:
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1 #subtract 1 to get 0->N instead of 1->N+1

### Split into train/test/validation

In [7]:
train_size = dataset_size * 90//100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [8]:
n_steps = 100 #window the data to generate "batches"
window_length  = n_steps + 1
dataset = dataset.window(window_length,shift = 1,drop_remainder = True)

dataset = dataset.flat_map(lambda window: window.batch(window_length)) #flatten the dataset into window_length sized groups

In [9]:
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)
dataset = dataset.map(lambda windows: (windows[:,:-1],windows[:,1:])) #shuffle the data, turn each dataset into a train/test of 100 samples long each (i.e. 0-100 train, 1-101 test)

one hot encode 

In [10]:
dataset = dataset.map(lambda X_batch,y_batch: (tf.one_hot(X_batch, depth = max_id), y_batch))
dataset = dataset.prefetch(1)

In [11]:
model = keras.models.Sequential([
    keras.layers.GRU(128,return_sequences = True,dropout = 0.2,input_shape = [None,max_id]),
    keras.layers.GRU(128,return_sequences = True,dropout = 0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,activation = 'softmax'))
])

model.compile(loss = 'sparse_categorical_crossentropy',optimizer='adam')
history = model.fit(dataset,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
def preprocess(text):
    token = np.array(tokenizer.texts_to_sequences(text)) - 1
    return tf.one_hot(token,max_id)

In [20]:
X_test = preprocess(['How are yo'])
y_pred = np.argmax(model(X_test),axis = -1)
tokenizer.sequences_to_texts(y_pred + 1)[0][-1]

'u'

In [182]:
tf.random.set_seed(42)

In [6]:
model = keras.models.load_model('ch16_charRnn.h5')

In [15]:
def next_char(text,temperature = 1):
    Xnew = preprocess([text])
    
    yproba = model(Xnew)
    
    rescaled_logits = tf.math.log(yproba[0,:-1,:]) / temperature 
    
    char_id = tf.random.categorical(tf.reshape(rescaled_logits,[1,len(rescaled_logits)]), num_samples = 1, seed = 42) + 1
    #char_id = tf.random.categorical(rescaled_logits, num_samples = 1, seed = 42) + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]
    

In [16]:
def complete_text(text,n_chars =100,temperature = 1):
    for _ in range(n_chars):
        text +=  next_char(text, temperature)
    return text

In [17]:
print(complete_text('a',temperature = 1))
#print(complete_text('w',temperature = 1))
#print(complete_text('w',temperature = 2))

tf.Tensor(
[[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]], shape=(1, 1, 39), dtype=float32)
tf.Tensor(
[[[9.74714607e-02 2.67819059e-03 1.19457170e-01 2.33859406e-03
   2.02142796e-03 3.47330272e-02 3.63231753e-03 9.82685089e-02
   8.52513909e-02 1.88041374e-01 5.36741223e-03 7.62311518e-02
   2.75591966e-02 2.38994546e-02 3.75987515e-02 2.03163400e-02
   1.04723815e-02 1.38938036e-02 1.99915841e-02 1.31668365e-02
   7.94183742e-03 1.19786318e-02 1.47622144e-02 2.15537865e-02
   1.09882755e-02 3.26979458e-02 2.62448844e-03 4.43814788e-03
   1.99976284e-03 6.08921330e-03 1.57334853e-03 1.29372784e-04
   5.65733098e-05 4.54121728e-05 1.48899868e-04 5.81624627e-04
   5.27938333e-08 8.27294322e-09 9.26592847e-09]]], shape=(1, 1, 39), dtype=float32)
tf.Tensor([], shape=(0, 39), dtype=float32)


InvalidArgumentError: num_classes should be positive, got 0 [Op:Multinomial]

## Stateful RNN

Idea is to keep the hidden state across batches. Numerous complications regarding batching, so easiest solution is to create batches of size = 1.
Also important to reset states after every epoch, since these are independent.

In [None]:
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
dataset = dataset.window(window_length, shift = n_steps, drop_remainder = True)
dataset = dataset.flat_map(lambda window: window.batch(window_length))
dataset = dataset.batch(1)
dataset = dataset.map(lambda windows: (windows[:,:-1], windows[:,1:]))
dataset = dataset.map(lambda X_batch, y_batch : (tf.one_hot(X_batch,depth = max_id),y_batch))
dataset = dataset.prefetch(1)

In [None]:
model = keras.model.Sequential([
    keras.layers.GRU(128,return_sequences = True, stateful = True,
                     dropout = 0.2, batch_input_shape = [batch_size , None, max_id]),
    keras.layers.GRU(128,return_sequences = True, stateful = True,
                     dropout = 0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,activation = 'softmax'))
])

In [None]:
class ResetStateCallback(keras.callbacks.Callback):
    def on_epoch_begin(self,epoch,logs):
        self.model.reset_states()

In [None]:
model.compile(loss = 'sparse_categorical_crossentropy',optimizer = 'adam')
model.fit(dataset, epochs = 50, callbacks = [ResetStateCallback()])

## Sentiment Analysis

In [459]:
import tensorflow_datasets as tfds

datasets, info = tfds.load('imdb_reviews',as_supervised = True, with_info = True)
train_size = info.splits['train'].num_examples

[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\Gerardo\tensorflow_datasets\imdb_reviews\plain_text\1.0.0...[0m


Dl Size...: 100%|██████████| 80/80 [00:14<00:00,  5.67 MiB/s]rl]
Dl Completed...: 100%|██████████| 1/1 [00:14<00:00, 14.11s/ url]
                                                                        

[1mDataset imdb_reviews downloaded and prepared to C:\Users\Gerardo\tensorflow_datasets\imdb_reviews\plain_text\1.0.0. Subsequent calls will reuse this data.[0m


In [460]:
def preprocess(X_batch,y_batch):
    X_batch = tf.strings.substr(X_batch, 0, 300)
    X_batch = tf.strings.regex_replace(X_batch, b"<br\\s*/?>",b" ")
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z'",b" ")
    X_batch = tf.strings.split(X_batch)
    
    return X_batch.to_tensor(default_value = b'<pad>'), y_batch

In [457]:
from collections import Counter
vocabulary = Counter()
for X_batch, y_batch in datasets['train'].batch(32).map(preprocess):
    for review in X_batch:
        vocabulary.update(list(review.numpy()))

NameError: name 'datasets' is not defined

In [456]:
vocabulary.most_common()[:3]

NameError: name 'vocabulary' is not defined

In [None]:
vocab_size = 10000
truncated_vocab = [word for word,count in vocabulary.most_common()[:vocab_size]]

In [None]:
words = tf.constant(truncated_vocab)
words_ids = tf.range(len(truncated_vocab),dtype = tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words,words_ids)
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

table.lookup(tf.constant([b'This movie was faaaaantastic'.split()]))

In [None]:
def encode_words(X_batch,y_batch):
    return table.lookup(X_batch), y_batch

In [None]:
train_set = datasets['train'].batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)

In [None]:
embed_size = 128
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets,embed_size,input_shape = [None]),
    keras.layers.GRU(128,return_sequences = True),
    keras.layers.GRU(128),
    keras.layers.Dense(1,activation = 'sigmoid')
])

model.compile(loss = 'binary_crossentropy',optimizer = 'adam', metrics = ['accuracy'])
history = model.fit(train_set,epochs = 5)