In [4]:
import tensorflow as tf
import numpy as np
import os

In [2]:
with open(os.path.join('data','shakespeare.txt'),'r') as f:
    corpus=f.read().lower()

import re,string

corpus=re.sub('-',' ',corpus)
corpus=re.sub(f'[{re.escape(string.digits)}]','',corpus)
corpus=re.sub('[:\';\"?<>/&*^!`|\(\)\[\]]','',corpus)
corpus=re.sub('\n',' <NEXT> ',corpus)
corpus=re.sub(f',',' , ',corpus)
corpus=re.sub(f'\.',' . ',corpus)
print('Corpus: ',corpus[:400])

Corpus:  the sonnets <NEXT> by william shakespeare <NEXT>  <NEXT>                       <NEXT>   from fairest creatures we desire increase ,  <NEXT>   that thereby beautys rose might never die ,  <NEXT>   but as the riper should by time decease ,  <NEXT>   his tender heir might bear his memory <NEXT>   but thou contracted to thine own bright eyes ,  <NEXT>   feedst thy lights flame with self substantial fu


In [3]:
print(f'Corpus length: {len(corpus)}')
import json

with open('hyperparameters.json','r') as f:
    params=json.load(f)

max_tokens=params['max_tokens']
max_sequence_length=params['max_sequence_length']
step_size=params['step_size']
batch_size=params['batch_size']
lstm_units=params['lstm_units']
learning_rate=params['learning_rate']
embedding_dim=params['embedding_dim']

from tensorflow.keras.layers import TextVectorization

vectorize_layer=TextVectorization(
    max_tokens=max_tokens,
    output_mode='int',
    standardize=None,
    pad_to_max_tokens=True
)
vectorize_layer.adapt([corpus],batch_size=256)

print(f'Vocab size: {len(vectorize_layer.get_vocabulary())}')
print(f'Vocabulary: {vectorize_layer.get_vocabulary()[-10:]}')

Corpus length: 6429217
Vocab size: 25372
Vocabulary: ['abel', 'abbreviated', 'abbots', 'abbominable', 'abbeys', 'abates', 'abatements', 'abashd', 'abaissiez', 'aarons']


In [10]:
def sequences2ids(sequence):
    return vectorize_layer(sequence)

def ids2sequences(ids):
    decode=[]
    if type(ids)==int:
        ids=[ids]
    for id in ids:
        decode.append(vectorize_layer.get_vocabulary()[id])
    decode=' '.join(decode)
    decode=re.sub(' <NEXT> ',' \n ',decode)
    decode=re.sub(' , ',', ',decode)
    decode=re.sub(' . ','. ',decode)
    return decode

print('Input: those hours that with gentle work')
print(f'Sequence->id: {sequences2ids("those hours that with gentle work").numpy()}')
print(f'Id->sequence: {ids2sequences([1253, 1123,   12,   15,  308,  951])}')

Input: those hours that with gentle work
Sequence->id: [201 681  14  17 280 543]
Id->sequence: wall pol my is bid officer


In [11]:
corpus=corpus.split()
corpus=sequences2ids(corpus)
corpus=corpus.numpy().reshape(-1)
print(f'Corpus shape: {corpus.shape}')

Corpus shape: (1167869,)


In [12]:
input_sequences=[]
target=[]
for i in range(0,len(corpus)-max_sequence_length,step_size):
    input_sequences.append(corpus[i:i+max_sequence_length])
    target.append(corpus[i+max_sequence_length])

input_sequences=np.array(input_sequences)
target=np.array(target)

print(f'Encoded Input: {input_sequences[0][:10]}...')
print(f'Decoded Input: {ids2sequences(input_sequences[0][:10])}...')
print(f'Encoded Target: {target[0]}')
print(f'Decoded Target: {ids2sequences(target[:1])}\n')

print(f'Dataset size: {len(input_sequences)}')
print(f'Input Sequences shape: {input_sequences.shape}')
print(f'Target words shape: {target.shape}')

Encoded Input: [   5 8112    2   38  818 2195    2    2]...
Decoded Input: the sonnets 
 by william shakespeare 
 <NEXT>...
Encoded Target: 2
Decoded Target: <NEXT>

Dataset size: 389287
Input Sequences shape: (389287, 8)
Target words shape: (389287,)


In [7]:
data=tf.data.Dataset.from_tensor_slices((input_sequences,target))
data=data.cache()
data=data.shuffle(1000)
data=data.batch(batch_size)
data=data.prefetch(tf.data.AUTOTUNE)
data_iterator=data.as_numpy_iterator()

print(f'Dataset size: {len(data)}')
_=data_iterator.next()
print(f'Input shape (with batches): {_[0].shape}')
print(f'Output shape (with batches): {_[1].shape}')

Dataset size: 761
Input shape (with batches): (512, 8)
Output shape (with batches): (512,)


In [8]:
from tensorflow.keras import Model,Input
from tensorflow.keras.layers import LSTM,Bidirectional,Dropout,Embedding,Dense

In [243]:
def build_model():
    inputs=Input(shape=(max_sequence_length,))
    x=Embedding(max_tokens,embedding_dim)(inputs)
    x=Bidirectional(
        LSTM(lstm_units,return_sequences=True,dropout=.2)
    )(x)
    x=Bidirectional(
        LSTM(lstm_units,dropout=.2,return_sequences=True)
    )(x)
    x=LSTM(lstm_units//2)(x)
    x=Dense(max_tokens,activation='softmax')(x)

    model=Model(inputs=inputs,outputs=x,name='sonnet_model')
    return model

In [244]:
sonnet_model=build_model()
sonnet_model.summary()

Model: "sonnet_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 8)]               0         
                                                                 
 embedding_2 (Embedding)     (None, 8, 32)             811904    
                                                                 
 lstm_4 (LSTM)               (None, 8, 32)             8320      
                                                                 
 lstm_5 (LSTM)               (None, 32)                8320      
                                                                 
 dense_2 (Dense)             (None, 25372)             837276    
                                                                 
Total params: 1,665,820
Trainable params: 1,665,820
Non-trainable params: 0
_________________________________________________________________


In [245]:
sonnet_model(data_iterator.next()[0][:1])

<tf.Tensor: shape=(1, 25372), dtype=float32, numpy=
array([[3.9413542e-05, 3.9406412e-05, 3.9420571e-05, ..., 3.9412676e-05,
        3.9414084e-05, 3.9415605e-05]], dtype=float32)>

In [246]:
sonnet_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

In [248]:
history=sonnet_model.fit(data,epochs=1)



In [6]:
sonnet_model=tf.keras.models.load_model('models/sonnet_model.h5',compile=False)

In [7]:
import random

def check_prediction(nums):
    correct=0
    print(f'Input\t------->\tPrediction : Actual')
    for i in range(nums):
        idx=random.randint(0,len(input_sequences)-1)
        y_pred=sonnet_model(input_sequences[idx:idx+1],training=False)
        y_pred=tf.argmax(y_pred,axis=-1).numpy().item()
        print(f'{ids2sequences(input_sequences[idx])} --> {ids2sequences(y_pred)} : {ids2sequences(target[idx:idx+1])}')
        if y_pred==target[idx]:correct+=1
    print(f'Accuracy: {(correct/nums)*100:.2f}%')

In [14]:
check_prediction(10)

Input	------->	Prediction : Actual
know you do, and have found it --> <NEXT> : .
. hell beat aufidius head below his knee --> <NEXT> : <NEXT>
in solemn talk. 
 corin. that --> is : is
. your cares set up do not pluck --> him : my
monstrous arrogance thou liest, thou thread , --> my : thou
to his goodness 
 the model of our --> great : chaste
a word. 
 volumnius. what says --> thou : my
all filld up with guts and midriff . --> <NEXT> : <NEXT>
affectiond ass that cons state without book and --> <NEXT> : <NEXT>
ireland, who removd, 
 earl surrey --> hath : was
Accuracy: 40.00%


In [15]:
def softmax(z):
    return np.exp(z)/sum(np.exp(z))

def sample(conditional_probability,temperature=1.0):
    conditional_probability = np.asarray(conditional_probability).astype("float64")
    conditional_probability = np.log(conditional_probability) / temperature
    reweighted_conditional_probability = softmax(conditional_probability)
    probas = np.random.multinomial(1, reweighted_conditional_probability, 1)
    return np.argmax(probas)

def generate_sequence(initial_seed,steps):
    gen=0
    encoded_seq=None
    if initial_seed=='':
        idx=random.randint(0,max_tokens-1)
        initial_seed=ids2sequences(idx)
    encoded_seq=list(sequences2ids(initial_seed).numpy())
    while gen!=steps:
        gen+=1
        input_seq=np.zeros((1,max_sequence_length))
        last_sequence=encoded_seq[len(encoded_seq)-max_sequence_length:]
        for idx,enc in enumerate(last_sequence):
            input_seq[:,idx]=enc
        y_pred=sonnet_model(input_seq,training=False).numpy().flatten()
        y_pred=sample(y_pred)
        encoded_seq.append(y_pred)
    return ids2sequences(encoded_seq)

In [16]:
print(generate_sequence('as thou you shall',100))

as thou you shall ., thou, only youth, 
 and then steal up from expectation most touches, 
 for her spur or even occasion sick as our good eyes 
 with sorrow of love what was you so. 
 welcome, camillo. must hunt you when nature would, you see 
 that hit my censure let me utter and thyself 
 not in me and leave me so. 
 camillo. he may not use thee that. did stand for me 
 shamd oaks and, fool, folly, or, glazed, <NEXT>


In [20]:
id2word={}
word2id={}
for idx,word in enumerate(vectorize_layer.get_vocabulary()):
    id2word[idx]=word
    word2id[word]=idx

import pickle

with open('models/vocabulary.h5','wb') as f:
    pickle.dump(id2word,f)
    pickle.dump(word2id,f)