# Generator - 3.0
This notebook designs and trains the generator, a model that uses an LSTM and GloVe embeddings to generate the next words of a review. <br>
Note: This notebook only trains the generator. Text generation and evaluation are in the *Generator-v3 Generation and Evaluation notebook*. 
<br>
Files used:
Reviews from https://www.kaggle.com/yelp-dataset/yelp-dataset?select=yelp_academic_dataset_review.json


In [None]:
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth, drive
from oauth2client.client import GoogleCredentials

#mounting Google Drive at /content/drive (only works inside Colab)
drive.mount('/content/drive')

In [None]:
from IPython.display import HTML, display
def set_css():
  """Display a <style> element that makes <pre> blocks wrap their text (for Colab output)."""
  wrap_style = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  display(HTML(wrap_style))
  
#re-apply the wrapping style before every cell is run
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
#importing the libraries
import tensorflow as tf
from tensorflow.keras import layers
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
import pickle
import random
from sklearn.utils import  shuffle
import time

In [None]:
#number of GPU devices that TensorFlow can see
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

In [None]:
import nltk
from nltk.corpus import stopwords
#downloading the NLTK resources: 'punkt' (presumably needed by the NLTK tokenizers used below) and the stop word lists
nltk.download('punkt')
nltk.download('stopwords')


In [None]:
#defining the constants
#vocabulary size: passed to the tokenizer as num_words and used as the number of rows of the embedding matrix
VOCAB_SIZE = 10000
#length every input sequence is padded/truncated to before it is fed to the model
INPUT_LEN = 29
#dimensionality of the word embeddings (the GloVe file loaded below holds 300-dimensional vectors)
EMBEDDING_DIM = 300

# Loading the data

In [None]:
#loading the dataset and keeping the text of the 5-star reviews only
df = pd.read_csv("reviews.csv")
positive_review_ratings = [5]
is_positive = df["stars"].isin(positive_review_ratings)
positive_reviews_df = df.loc[is_positive].reset_index(drop=True)
reviews_list = positive_reviews_df["text"].tolist()

# Pre-processing

In [None]:
from nltk.tokenize import sent_tokenize
import re
from string import punctuation

#set of the ASCII punctuation characters, used by modify_text to spot repeated punctuation
punc = set(punctuation)

#raw strings are used for the patterns: "\." or "\S" in a normal string literal is an
#invalid escape sequence and makes Python emit a warning

#a full stop that is immediately followed by a non-whitespace character, e.g. "good.The"
full_stop_pattern = re.compile(r"\.(?=\S)")

#a question mark (the zero-width lookahead only asserts that the next character, the "?" itself, is not whitespace)
qmark_pattern = re.compile(r"(?=\S)\?")

#an exclamation mark, built the same way as the question mark pattern
exclmark_pattern = re.compile(r"(?=\S)\!")


In [None]:
def modify_text(text :str) -> str:
  """
  A function to pre-process the raw text of a review.

  The steps are applied in this order:
    1. Digits are removed.
    2. Repeated punctuation is removed: a punctuation character is kept only if the
       character right before it is neither a punctuation character nor a space.
       Punctuation at the very start of the text has no preceding character and is
       dropped, like punctuation that follows a space.
    3. Runs of spaces are collapsed, "it's"/"It's" become "its" and newlines become a space.
    4. The patterns 've, don't, 't, 's and 'm are expanded and the remaining single
       quotes are removed.
    5. A space is added after a full stop that is directly followed by another character
       and before every "?" and "!", so that they are recognized as separate tokens.

  Args:
    text: review to be modified
  Returns:
    The pre-processed review

  """
  #removing numbers
  text = re.sub(r"\d+", "", text)

  #removing repeated punctuation marks
  kept_chars = []
  #the first character has no predecessor; indexing text[i-1] there would wrap around to the LAST character of the text
  previous = None
  for char in text:

    #if its not a punctuation mark then keep it
    if char not in punc:
      kept_chars.append(char)

    #a punctuation mark is kept only when the previous character is neither a punctuation mark nor a space
    elif previous is not None and previous not in punc and previous != " ":
      kept_chars.append(char)

    previous = char

  text = "".join(kept_chars)

  #removing additional spaces
  text = re.sub(' +', ' ', text)

  #changing it's to its
  text = re.sub("it's", "its", text)
  text = re.sub("It's", "its", text)

  #removing the new line character
  text = re.sub("\n+", " ", text)

  #replacing common patterns (the order matters: "don't" has to be handled before the generic "'t")
  text = re.sub("'ve", " have", text)
  #the replacement starts with a space, so "I don't" becomes "I  do not" (two spaces)
  text = re.sub("don't", " do not", text)
  text = re.sub("'t", " not", text)
  text = re.sub("'s", " is", text)
  text = re.sub("'m", " am", text)

  #removing the single quotes
  text = re.sub("'", "", text)

  #the tokens like !,?,. are considered as separate tokens.
  #Hence a space is added before/after them to make them get recognized as separate tokens.

  #adding space after the full stop
  text = re.sub(full_stop_pattern, ". ", text)

  #adding a space before ?
  text = re.sub(qmark_pattern, " ?", text)

  #adding a space before !
  text = re.sub(exclmark_pattern, " !", text)

  return text


In [None]:
#test case: repeated "?" and "!" are collapsed and the contractions are expanded
modify_text("How are you doing????? I'm fine!! This is good, I don't hate it.")

'How are you doing ? I am fine ! This is good, I  do not hate it.'

In [None]:
#"!" and "." are not to be removed from the text, so they are taken out of the string of punctuation characters to strip
#(every other punctuation character, the comma included, stays in punc_s and is stripped)
import string
punc_s = "".join(char for char in string.punctuation if char not in "!.")
punc_s

In [None]:
import string
from nltk.tokenize import word_tokenize
from string import punctuation

#translation table that deletes every character of punc_s from a string
table = str.maketrans('', '', punc_s)
#the same characters as a set, for membership tests
punctuations_set = set(punc_s)

In [None]:
def tokenize_text(reviews_list :list) ->list:
  """
  A function that tokenizes the reviews into lower-cased words and drops the short reviews.

  Every token has the characters of punc_s stripped from it. Tokens that are empty after
  stripping are discarded BEFORE the length of the review is checked, so that only real
  tokens count towards the minimum length. A review is kept only if more than 10 tokens remain.

  Args:
    reviews_list: A list of reviews
  Returns:
    A list of lists where each list holds the words of one of the reviews that were kept.

  """

  cleaned_reviews = []
  for line in reviews_list:

    #tokenize the review into words
    tokens = word_tokenize(line)

    #removing the unnecessary punctuation marks; a token made up only of them becomes "" and is dropped
    words = [stripped for stripped in (token.translate(table) for token in tokens) if stripped]

    #taking only the reviews that have more than 10 tokens left
    if len(words)>10:
      cleaned_reviews.append([word.lower() for word in words])

  return cleaned_reviews


In [None]:
#test case: running both pre-processing steps on a small sample and printing the result
sample = "The food was great!! But no AC..delicious....i"
sample = modify_text(sample)
sample_tokenized = tokenize_text([sample])
print(sample_tokenized)

In [None]:
#number of 5-star reviews that were loaded
len(reviews_list)

In [None]:
#modifying the reviews in the corpus
#NOTE: reviews_list is overwritten in place, so running this cell again applies modify_text to already modified text
reviews_list[:] = [modify_text(review) for review in reviews_list]
#tokenization
cleaned_reviews = tokenize_text(reviews_list)
print("Number of reviews: ", len(cleaned_reviews))

In [None]:
def convert_to_training_set(reviews_list :list, len_of_review :int) ->list:
  """
  A function that slices every review into all of its windows of len_of_review consecutive words

  A review with n words yields n - len_of_review + 1 windows, the window that ends on the
  last word of the review included. Reviews shorter than len_of_review yield nothing.

  Args:
    reviews_list: list of reviews, each review being a list of words
    len_of_review: an integer specifying the number of words required in a window
  Returns:
    A list of lists with each list containing the required number of words.
  Raises:
    ValueError: if len_of_review is smaller than 1
  """
  if len_of_review < 1:
    raise ValueError("len_of_review must be at least 1")

  reviews = []

  #iterating through the list of reviews in the reviews_list
  for review in reviews_list:

    #end runs up to len(review) inclusive, otherwise the last window (and with it the last word) of the review is lost
    for end in range(len_of_review, len(review) + 1):

      #slicing the review into a window of len_of_review words
      reviews.append(review[end-len_of_review:end])

  return reviews


In [None]:
#test case: windows of 4 words from the tokenized sample
convert_to_training_set(sample_tokenized, 4)

In [None]:
#slicing all the reviews into sequences of 10 consecutive words
sequences = convert_to_training_set(cleaned_reviews, 10)
print("Total number of sequences: ", len(sequences))

In [None]:
#the first 9 words of every sequence form the input, the 10th word is the prediction
X = [sequence[:-1] for sequence in sequences]
y = [sequence[-1] for sequence in sequences]

In [None]:
#splitting into training and test sets
from sklearn.model_selection import train_test_split

#a very small test set (test_size = 0.0001) is split off first
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.0001, random_state = 100, shuffle=True)
#1% of the remaining data becomes the validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.01, random_state = 100, shuffle=True)
print("Size of training set: ", len(X_train))
print("Size of validation set: ", len(X_val))
print("Size of test set: ", len(X_test))

If you are re-training the generator model, fit a new tokenizer. <br>
If you are using the pre-trained weights, reload the tokenizer the model was trained with instead.

In [None]:
#defining the tokenizer
#num_words limits the vocabulary to VOCAB_SIZE, words outside of it are represented by the "<OOV>" token
#the filters string leaves out "!", "," and ".", so the tokenizer does not strip these three characters
tokenizer  = tf.keras.preprocessing.text.Tokenizer(num_words = VOCAB_SIZE,lower = True, oov_token="<OOV>", filters='"#$%&\'()*+-/:;<=>?@[\\]^_`{|}~' )
#the vocabulary is built from the training inputs only
tokenizer.fit_on_texts(X_train)

In [None]:
#padding and tokenization of the input
from keras.preprocessing.sequence import pad_sequences

#training inputs as token ids, padded at the front (and cut at the end if too long) to INPUT_LEN
X_train_seq = np.array(
  pad_sequences(
    tokenizer.texts_to_sequences(X_train),
    maxlen=INPUT_LEN,
    padding="pre",
    truncating="post",
  )
)
#training targets as one list of token ids per target word
y_train_seq = tokenizer.texts_to_sequences(y_train)

In [None]:
#validation inputs as token ids, padded at the front (and cut at the end if too long) to INPUT_LEN
X_val_seq = np.array(
  pad_sequences(
    tokenizer.texts_to_sequences(X_val),
    maxlen=INPUT_LEN,
    padding="pre",
    truncating="post",
  )
)
#validation targets as one list of token ids per target word
y_val_seq = tokenizer.texts_to_sequences(y_val)

In [None]:
#test inputs as token ids, padded at the front (and cut at the end if too long) to INPUT_LEN
X_test_seq = np.array(
  pad_sequences(
    tokenizer.texts_to_sequences(X_test),
    maxlen=INPUT_LEN,
    padding="pre",
    truncating="post",
  )
)
#test targets as one list of token ids per target word
y_test_seq = tokenizer.texts_to_sequences(y_test)

The following preprocessing steps are applied to the training set:


1.   Removing the sequences in X_train and y_train that contain OOV (out-of-vocabulary) words.
2.   Removing the sequences in which the same word occurs twice in a row.

These sequences hurt the model's performance, so they were removed. 



In [None]:
#finding the rows in X that contain the token id 1 (the id the Keras tokenizer gives to the OOV token)
rows_with_oov = [row for row, token_ids in enumerate(X_train_seq) if 1 in token_ids]
print("Number of rows in X_train with OOV: ", len(rows_with_oov))

In [None]:
#deleting the rows from the inputs and from the targets so that both stay aligned
#np.delete returns numpy arrays, so y_train_seq is a numpy array from here on
#NOTE: both variables are overwritten; running this cell again would delete further rows at the same positions
X_train_seq = np.delete(X_train_seq, rows_with_oov, axis=0)
y_train_seq = np.delete(y_train_seq,rows_with_oov, axis=0)

In [None]:
#finding the rows in y whose target is the token id 1 (the id the Keras tokenizer gives to the OOV token)
y_rows_with_oov = [row for row, target in enumerate(y_train_seq) if 1 in target]
print("Number of rows in y_train with OOV: ", len(y_rows_with_oov))

In [None]:
#deleting the rows from the inputs and from the targets so that both stay aligned
#NOTE: both variables are overwritten; running this cell again would delete further rows at the same positions
X_train_seq = np.delete(X_train_seq, y_rows_with_oov, axis=0)
y_train_seq = np.delete(y_train_seq,y_rows_with_oov, axis=0)

In [None]:
def check_patterns(strings_array :list) ->list:
  """
  A function to find the sequences in which the same word occurs twice in a row

  For example the sequence for the phrase "I had pasta and and pizza" is reported because "and" is
  directly followed by another "and".
  Args:
    strings_array: a sequence of sequences (e.g. a numpy array) holding the tokenized representations of the words
  Returns:
    A list with the indices of the sequences that have the required pattern, in ascending order
  """
  return [
    row
    for row, words_list in enumerate(strings_array)
    #any() stops at the first pair of equal neighbours
    if any(words_list[pos] == words_list[pos-1] for pos in range(1, len(words_list)))
  ]

In [None]:
#rows shorter than INPUT_LEN were padded at the front with 0 by pad_sequences, and neighbouring
#padding positions are equal. Checking the padded rows would therefore report every padded row,
#so the padding is stripped first and only the real tokens are compared (0 is never a word id)
unpadded_rows = [row[row != 0] for row in X_train_seq]
repeated_indices_lst = check_patterns(unpadded_rows)
print("Number of rows with the same word occurring consecutively: ",len(repeated_indices_lst))
#deleting the indices from the inputs and from the targets so that both stay aligned
X_train_seq = np.delete(X_train_seq, repeated_indices_lst, axis=0)
y_train_seq = np.delete(y_train_seq, repeated_indices_lst, axis=0)

print("Train dataset shape: ",X_train_seq.shape)

In [None]:
import collections

#exploratory data analysis for the output sequence: counting how often each token id is the target
output_tokens = collections.defaultdict(int)
for target in y_train_seq:
  token_id = target[0]
  output_tokens[token_id] = output_tokens[token_id] + 1

#most frequent outputs
freq_output_tokens = [token_id for token_id, count in output_tokens.items() if count > 1000]

print("Number of unique tokens in the output: ", len(output_tokens))
print("Number of tokens in predictions occurring more than 1000 times: ",len(freq_output_tokens))


## Loading Word Embeddings

### GloVe

In [None]:
# load the whole embedding into memory
embeddings_index = dict()
#path to the GloVE 300-dimensional embeddings file
#embeddings file can be downloaded from https://nlp.stanford.edu/projects/glove/

#the encoding is given explicitly so that reading the file does not depend on the platform default,
#and the with-block closes the file even when a line fails to parse
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        #each line holds a word followed by the coefficients of its vector
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

In [None]:
# create a weight matrix for words in training docs
#one row per token id; rows of words without a GloVe vector (and row 0) stay all-zero
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():

    #word_index holds every word that was seen, but only ids below VOCAB_SIZE have a row in the matrix
    if i >= VOCAB_SIZE:
        continue

    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

GloVe embeddings were chosen over Word2Vec embeddings because Word2Vec did not have embeddings for several stop words in the corpus.

In [None]:
#words of the vocabulary (token id below VOCAB_SIZE) that have no vector in the loaded embeddings
words_not_in_embed = [
  word
  for word, token in tokenizer.word_index.items()
  if token < VOCAB_SIZE and word not in embeddings_index
]

print("Number of words not having embeddings: ",len(words_not_in_embed))

# Building the generator

In [None]:
#resetting the global Keras state before the model is built
tf.keras.backend.clear_session()

In [None]:
generator_model=tf.keras.models.Sequential()

#embedding layer: initialised with the GloVe weight matrix and frozen (trainable=False)
generator_model.add(layers.Embedding(VOCAB_SIZE,EMBEDDING_DIM,weights=[embedding_matrix],input_length=INPUT_LEN,trainable=False))
generator_model.add(layers.BatchNormalization())
#LSTM layer: returns the whole sequence so that the next LSTM layer can consume it
generator_model.add(layers.Bidirectional(layers.LSTM(256,return_sequences=True)))
generator_model.add(layers.Dropout(0.25))

#LSTM layer: returns only its final output
generator_model.add(layers.Bidirectional(layers.LSTM(128, return_sequences=False)))
generator_model.add(layers.Dropout(0.25))

#Dense layers
#NOTE(review): the two hidden Dense layers are created without an activation - confirm that this is intended
generator_model.add(layers.Dense(128))
generator_model.add(layers.Dense(512))
#output layer: one probability per token id of the vocabulary
generator_model.add(layers.Dense(VOCAB_SIZE,activation='softmax'))

#Print summary of model (summary() prints by itself and returns None, so it is not wrapped in print)
generator_model.summary()

In [None]:
#loading the pre-trained weights
#the file has to exist already; it is the same path the weights are saved to at the end of this notebook
generator_model.load_weights("model_weights/generator_3.keras")
#the targets are integer token ids, hence the sparse categorical cross-entropy loss
generator_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0015),
             loss='sparse_categorical_crossentropy',
            metrics=['accuracy'])


In [None]:
# Load the extension and start TensorBoard
# (it reads the "logs" directory, under which the TensorBoard callback below writes its log directory)

%load_ext tensorboard
%tensorboard --logdir logs

In [None]:
from keras.callbacks import TensorBoard
from time import time
#one log directory per run, named after the current timestamp returned by time()
#NOTE: the `from time import time` above rebinds the name `time`, which an earlier cell imported as the module
tensorboard = TensorBoard(log_dir="logs/{}".format(time()))


In [None]:
#checkpoint callback: checked every epoch, writes the weights only, and only when val_loss has improved
save_checkpoint = tf.keras.callbacks.ModelCheckpoint("text_gen_LSTM.keras"
    , monitor='val_loss', verbose=0, save_best_only=True,
    save_weights_only=True, mode='auto', save_freq='epoch'
)

In [None]:
#converting y_val_seq to numpy array
#(unlike y_train_seq, which np.delete already turned into an array, it is still the output of texts_to_sequences)
y_val_seq = np.array(y_val_seq)

The model was trained for nearly 100 epochs to achieve good results. Although the accuracy stays at about 30%, the quality of the generated sentences is good.

In [None]:
#training the generator; the callbacks log to TensorBoard and checkpoint the weights with the best val_loss
history = generator_model.fit(X_train_seq, y_train_seq,
                    epochs=100,
                    verbose=1,
                    batch_size = 2048,
                    validation_data=(X_val_seq, y_val_seq),
                    callbacks=[tensorboard, save_checkpoint]
              )

In [None]:
#saving the weights after training; this overwrites the file the pre-trained weights were loaded from
generator_model.save_weights("model_weights/generator_3.keras")

The texts generated by this model are available in the Generator_v3 Generation and Evaluation notebook.