### Load tensorflow

In [None]:
import tensorflow as tf
import numpy as np
from tqdm import tqdm

### Collect Data
<font size="2">Download data from https://s3.amazonaws.com/text-datasets/nietzsche.txt</font>

In [None]:
!wget https://s3.amazonaws.com/text-datasets/nietzsche.txt --quiet

In [None]:
!ls -l

In [None]:
#Read the whole book into memory as a single string.
#The context manager closes the file handle as soon as the read completes.
with open('nietzsche.txt', encoding='utf8') as book_file:
    book_text = book_file.read()
print('Length of the book: ' , len(book_text))

In [None]:
#book_text

In [None]:
print(book_text[10000:10050])

### Tokenize the data

In [None]:
#Tokenize at character level (lower=False keeps upper and lower case as distinct tokens)
t = tf.keras.preprocessing.text.Tokenizer(char_level=True, lower=False)

#Fit tokenizer on the book
#NOTE(review): book_text is one plain string, so it is presumably consumed character by character - confirm
t.fit_on_texts(book_text)

#Vocabulary size: number of distinct characters the tokenizer indexed
#Later cells use vocab_size+1 classes, presumably because index 0 is reserved - confirm
vocab_size = len(t.word_index)

print('Number of unique characters: ', vocab_size)

In [None]:
#Character Vocabulary
print(t.word_index)

In [None]:
#Convert characters in the book to Numbers
#NOTE(review): a plain string is passed, so each character appears to become its own
#one-element list (later cells read entries as seq[i][0]) - confirm before flattening
book_num = t.texts_to_sequences(book_text)

In [None]:
print(book_text[10000:10050])

In [None]:
print(book_num[10000:10050])

In [None]:
#Reverse lookup table: token index -> character (the inverse of t.word_index)
int_to_char = {index: char for char, index in t.word_index.items()}

In [None]:
print(int_to_char)

In [None]:
int_to_char[15]

### Prepare Input and Output Sequences

Input and output container
- Input data will have sequences of 100 characters (`sequence_length`)
- Output data will have the one character that comes right after those 100 characters in the input data

In [None]:
sequence_length = 100 #Number of consecutive entries in each input window

#Every run of sequence_length consecutive entries of book_num is one input sample
input_data = [book_num[start : start + sequence_length]
              for start in range(len(book_num) - sequence_length)]

#The entry immediately after each window is the matching output sample
output_data = [book_num[target] for target in range(sequence_length, len(book_num))]

In [None]:
print('Total number of input arrays: ', len(input_data))
print('Total number of Output arrays: ', len(output_data))

In [None]:
print("Input Data length: ",len(input_data[10]))
print("Output Data length: ",len(output_data[10]))

In [None]:
print(input_data[0])

In [None]:
print(output_data[0])

### One Hot encoding for Input and Output

In [None]:
#Input data one hot encoding
#vocab_size+1 classes: one per token index, plus index 0 (presumably unused/padding - confirm)
#NOTE(review): this materialises every window densely - the cell below estimates the size at
#600793*100*85*4 bytes (roughly 20 GB), so confirm it fits in RAM or feed batches from a generator
input_data_one_hot = tf.keras.utils.to_categorical(input_data,num_classes=vocab_size+1)

#Output data one hot encoding
#Rebinds output_data, so re-running this cell would encode the already-encoded array again
output_data = tf.keras.utils.to_categorical(output_data,num_classes=vocab_size+1)

In [None]:
#Rough size in bytes of the full one-hot input: samples * sequence length * classes * 4
#(presumably 4 bytes per float32 element - confirm)
600793*100*85*4

In [None]:
#Same estimate for a single batch of 32 sequences
32*100*85*4

In [None]:
input_data_one_hot.shape

In [None]:
output_data.shape

### Build Model

In [None]:
#Build a Sequential Model
#Clear the Keras session first so re-running this cell starts from a clean state
tf.keras.backend.clear_session()
model = tf.keras.models.Sequential()

In [None]:
#Use an LSTM with memory size equal to 256
#Each sample is sequence_length timesteps of (vocab_size+1)-wide one-hot vectors
model.add(tf.keras.layers.LSTM(256, input_shape=(sequence_length,vocab_size+1)))

In [None]:
model.output

In [None]:
#Output layer
#One softmax unit per class, matching the vocab_size+1 width of the one-hot targets
model.add(tf.keras.layers.Dense(vocab_size+1, activation='softmax'))

In [None]:
#Compile model
#categorical_crossentropy pairs with the one-hot encoded output_data built earlier
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
model.summary()

#### Train Model

In [None]:
#Train on the full one-hot dataset
#NOTE(review): 500 epochs with no checkpoint callback - the model is only saved after fit returns
model.fit(input_data_one_hot, output_data, 
          batch_size=128, 
          epochs=500)

In [None]:
#Number of full batches of 128 per epoch
#NOTE(review): 600593 differs from the 600793 samples used in the memory estimate earlier - likely a typo, confirm
600593//128

In [None]:
from google.colab import drive
drive.mount('/gdrive')

In [None]:
#Persist the trained model as an .h5 file on the Google Drive mounted at /gdrive
save_path = '/gdrive/My Drive/Great Learning/Sequential NLP/Notebooks/4. Seq2Seq Model/char_rnn.h5'
model.save(save_path)

#### Generating Text

In [None]:
#Reload the saved model, replacing the in-memory one
#save_path is repeated here, presumably so this section can run without re-executing training - confirm
save_path = '/gdrive/My Drive/Great Learning/Sequential NLP/Notebooks/4. Seq2Seq Model/char_rnn.h5'
model = tf.keras.models.load_model(save_path)

In [None]:
model.summary()

In [None]:
#1 - 'My name' -> ' ' 
#2 - 'y name ' -> 'i'
#3 - ' name i' -> 's'
#4 - 'name is' -> ' '

In [None]:
def sample(preds, temperature=1.0):
    """Sample a class index from a probability vector with temperature scaling.

    Args:
        preds: 1-D array-like of class probabilities (e.g. a softmax output).
        temperature: Values below 1 sharpen the distribution, values above 1
            flatten it. ``0`` picks the most likely class deterministically.

    Returns:
        The sampled class index, as returned by ``np.argmax``.
    """
    # float64 limits rounding drift in the renormalised probabilities, whose
    # sum np.random.multinomial validates.
    preds = np.asarray(preds).astype("float64")

    if temperature == 0:
        # The limit of temperature -> 0 is greedy decoding; dividing by zero
        # would otherwise produce inf/NaN logits.
        return np.argmax(preds)

    # Zero probabilities legitimately map to -inf logits; silence the warning.
    with np.errstate(divide="ignore"):
        logits = np.log(preds) / temperature

    # Shift so the largest term is exp(0) == 1. Without this a small
    # temperature underflows every term to 0 and the division yields NaN.
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
def predict_seq_with_sample(test_seq, num_chars=50, tempreture=1.0):
    """Generate text from the trained model and print it after the seed.

    Relies on the notebook globals ``model``, ``vocab_size`` and
    ``int_to_char``, and on the ``sample`` helper.

    Args:
        test_seq: Seed sequence of token indices, either flat or wrapped in
            one-element lists (the layout the seed slice of ``book_num``
            appears to have).
        num_chars: Number of characters to generate.
        tempreture: Sampling temperature forwarded to ``sample``. The
            misspelt name is kept so existing keyword calls keep working.

    Returns:
        None. The seed and the generated text are printed.
    """
    #Flatten to 1-D so both [[3], [7], ...] and [3, 7, ...] seeds are accepted
    seed = np.asarray(test_seq).reshape(-1)
    current_seq = seed.copy()

    #Collect characters in a list and join once instead of growing a string
    generated_chars = []

    for _ in tqdm(range(num_chars)):

        #One hot encoding -> (timesteps, vocab_size+1)
        current_seq_one_hot = tf.keras.utils.to_categorical(current_seq, num_classes=vocab_size+1)

        #Convert it into a batch of 1 example
        data_input = current_seq_one_hot[np.newaxis, ...]

        #Take sample prediction; verbose=0 keeps predict's own per-call
        #progress output from flooding the tqdm bar
        preds = model.predict(data_input, verbose=0)[0]
        predicted_char_int = sample(preds, temperature=tempreture)

        #Index 0 is skipped (presumably it has no entry in int_to_char)
        if predicted_char_int != 0:
            generated_chars.append(int_to_char[predicted_char_int])

        #Slide the window: drop the oldest index, put the new one at the end
        current_seq = np.roll(current_seq, -1)
        current_seq[-1] = predicted_char_int

    predicted_output = ''.join(generated_chars)

    print('')
    print('')

    print('Initial sequence is: ')
    print(''.join(int_to_char[idx] for idx in seed), end='')

    print('')
    print('')
    print('Generated sequence is: ')
    print(predicted_output)

In [None]:
#Identify a random sequence which we will use to generate output
#high is exclusive, so every start_pos leaves a full sequence_length window inside book_num
start_pos = np.random.randint(0, high=(len(book_num) - sequence_length))
test_seq =  book_num[start_pos : start_pos+sequence_length]

#Generate 500 characters from that seed
predict_seq_with_sample(test_seq, num_chars=500, tempreture=1.0)

#### Embedding Model for Char-RNN (learned `Embedding` layer, not pre-trained Word2Vec)

In [None]:
#Build a Sequential Model
model_wv = tf.keras.models.Sequential()

In [None]:
#Sizes are derived from the prepared data instead of being hard-coded, so the layer
#stays in step with the tokenizer vocabulary and the window length
model_wv.add(tf.keras.layers.Embedding(vocab_size+1, #Number of token indices (unique chars plus index 0)
                                       10, #Embedding Size
                                       input_length=sequence_length #Length of each input window
                                       ))

In [None]:
model_wv.output

In [None]:
#Add LSTM
model_wv.add(tf.keras.layers.LSTM(256, activation='relu'))

#Add output layer: one softmax unit per class, sized from vocab_size+1 like the
#one-hot targets rather than a hard-coded 85
model_wv.add(tf.keras.layers.Dense(vocab_size+1, activation='softmax'))

In [None]:
model_wv.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
model_wv.summary()

In [None]:
np.array(input_data).shape

In [None]:
#The Embedding layer takes integer ids shaped (samples, timesteps). Entries of input_data
#appear to be one-element lists, so drop the trailing axis (a no-op if they are already flat).
input_ids = np.asarray(input_data).reshape(-1, sequence_length)
model_wv.fit(input_ids, output_data,
             batch_size=128,
             epochs=1)