This notebook generates fake reviews using Generator-v3 and evaluates the generator model using BLEU scores.
<br>
**Files Used** <br>
Reviews from https://www.kaggle.com/yelp-dataset/yelp-dataset?select=yelp_academic_dataset_review.json <br>
Note:<br>
Use the same tokenizer that was used for Generator-v3, so that the token indices match the ones the model expects.

In [None]:
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth, drive
from oauth2client.client import GoogleCredentials

# Mount Google Drive at /content/drive (google.colab only).
drive.mount('/content/drive')

In [None]:
#importing the libraries
import tensorflow as tf
from tensorflow.keras import layers
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
import pickle
import random
from random import choice
import time
from collections import Counter
from nltk.util import ngrams 
import copy

In [None]:
import nltk

# word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases load
# it from the "punkt" package, recent ones (3.9+) load "punkt_tab" instead, so both
# are downloaded to keep the notebook working across NLTK versions.
for nltk_resource in ("punkt", "punkt_tab"):
  nltk.download(nltk_resource)

In [None]:
from IPython.display import HTML, display

def set_css():
  """Display a <style> rule that makes text inside <pre> elements wrap."""
  wrap_rule = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  display(HTML(wrap_rule))

# call set_css on every 'pre_run_cell' event, i.e. before each cell is executed
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
#defining the constants
# number of rows of the embedding matrix and number of units of the generator's softmax output
VOCAB_SIZE = 10000
# number of tokens in the window fed to the generator (input_length of the model, maxlen when padding)
INPUT_LEN = 9
# dimensionality of the word vectors (the notebook loads the 300-dimensional GloVe file)
EMBEDDING_DIM = 300

# Loading the dataset

In [None]:
# read the review corpus and keep the text of the reviews whose star rating is listed as positive
positive_review_ratings = [5]
df = pd.read_csv("reviews.csv")
is_positive = df["stars"].isin(positive_review_ratings)
positive_reviews_df = df.loc[is_positive].reset_index(drop=True)
reviews_list = positive_reviews_df["text"].tolist()

In [None]:
import re
from string import punctuation
from nltk.tokenize import sent_tokenize

# every ASCII punctuation character, as a set for fast membership tests
punc = set(punctuation)

# The patterns are written as raw strings so that \. \S \? \! reach the regex engine
# unchanged; in a plain string literal they are invalid escape sequences, which emit a
# SyntaxWarning on Python 3.12+.

# a full stop that is immediately followed by a non-whitespace character
full_stop_pattern = re.compile(r"\.(?=\S)")

# a question mark
# NOTE(review): the lookahead (?=\S) is satisfied by the "?" itself, so this matches
# every "?"; a lookbehind (?<=\S) may have been intended - confirm before changing
qmark_pattern = re.compile(r"(?=\S)\?")

# an exclamation mark (same always-true lookahead as for the question mark)
exclmark_pattern = re.compile(r"(?=\S)\!")

In [None]:
def modify_text(text :str) -> str:
  """
    A function to pre-process a raw review before it is tokenized.

    Steps, in this order: digits are removed; a punctuation character is kept only
    when the character before it is neither punctuation nor a space (so a run such as
    "!!!" collapses to "!"); runs of spaces are collapsed; "it's"/"It's" become "its";
    runs of newlines become one space; a few contractions are expanded; the remaining
    single quotes are removed; finally a space is added after "." and before "?" and "!"
    so that they are recognized as separate tokens.

    Args:
      text: review to be modified
    Returns:
      The cleaned review as a string.

  """
  #removing numbers
  text = re.sub(r"\d+", "", text)

  #removing repeated punctuation marks
  kept_chars = []
  for i, char in enumerate(text):

    #if its not a punctuation mark then keep it
    if char not in punc:
      kept_chars.append(char)
      continue

    #a punctuation mark is kept only if the previous character is not a punctuation mark or a space.
    #The first character has no predecessor: text[i-1] at i == 0 would wrap around to the LAST
    #character of the review, so it is treated as "no previous character" and the mark is kept.
    previous = text[i-1] if i > 0 else ""
    if (previous not in punc) and (previous != " "):
      kept_chars.append(char)

  text = "".join(kept_chars)

  #removing additional spaces
  text = re.sub(' +', ' ', text)

  #changing it's to its
  text = re.sub("it's", "its", text)
  text = re.sub("It's", "its", text)

  #removing the new line character
  text = re.sub("(\n)+", " ", text)

  #replacing common patterns ("don't" must be handled before the generic "'t")
  text = re.sub("'ve", " have", text)
  text = re.sub("don't", " do not", text)
  text = re.sub("'t", " not", text)
  text = re.sub("'m", " am", text)

  #removing the single quotes
  text = re.sub("'", "", text)

  #the tokens like !,?,. are considered as separate tokens.
  #Hence a space is added before/after them to make them get recognized as separate tokens.

  # adding space after the full stop
  text = re.sub(full_stop_pattern, ". ", text)

  #adding a space before ?
  text = re.sub(qmark_pattern, " ?", text)

  #adding a space before !
  text = re.sub(exclmark_pattern, " !", text)

  return text


In [None]:
# punc_s holds the punctuation characters that are stripped out of the tokens.
# "!" and "." are taken out of it because they are kept as tokens of their own;
# every other punctuation character (the comma included) stays in punc_s.
import string
punc_s = string.punctuation.replace("!", '').replace(".", '')
punc_s

In [None]:
import string
from nltk.tokenize import word_tokenize
from string import punctuation

# the characters of punc_s as a set, for membership tests on whole tokens
punctuations_set = set(punc_s)
# translation table for str.translate that deletes every character of punc_s
table = {ord(symbol): None for symbol in punc_s}


In [None]:
def tokenize_text(reviews_list :list, min_words :int = 10) -> list:
  """
  A function to tokenize each review into lower-cased words and drop the short reviews.

  The characters of punc_s are stripped out of every token (through the module-level
  translation table) and tokens that end up empty are discarded. A review is kept only
  when MORE than min_words tokens remain after this cleaning.

  Args:
    reviews_list: A list of review strings
    min_words: a review needs more than this many cleaned tokens to be kept
  Returns:
    A list of lists where each inner list holds the cleaned tokens of one kept review.
    Dropped reviews have no entry, so positions do not line up with reviews_list.

  """
  cleaned_reviews = []
  for line in reviews_list:
    #tokenize the review into words
    raw_tokens = word_tokenize(line)

    #removing the unnecessary punctuation marks from inside every token
    stripped = (w.translate(table) for w in raw_tokens)

    #tokens that became empty (they consisted only of removed punctuation) are dropped
    #BEFORE the length is measured, so that they do not count towards the minimum length
    words = [w.lower() for w in stripped if w and (w not in punctuations_set)]

    #taking only the reviews that have more than min_words real tokens
    if len(words) > min_words:
      cleaned_reviews.append(words)

  return cleaned_reviews


## Loading the GLoVE embeddings

In [None]:
#tokenizer for the GloVE model. Use the same tokenizer generated for generator.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only a
# tokenizer pickle from a trusted source should be loaded here.
with open("tokenizer_generator_3.pkl", "rb") as f:
  tokenizer = pickle.load(f)

In [None]:
# load the whole embedding into memory as a dict: word -> float32 vector
embeddings_index = {}

#use the path to the GloVE 300 dimensional vectors file
#download the embeddings file from https://nlp.stanford.edu/projects/glove/
#The file is opened explicitly as UTF-8 (the platform default encoding may differ and fail
#on non-ASCII tokens) and inside a with-block so that it is closed even if parsing fails.
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        #each line is: the word followed by its vector components, separated by whitespace
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

In [None]:
# create a weight matrix for words in training docs: row i holds the GloVe vector of the
# word whose tokenizer index is i; rows of words without a GloVe vector stay all-zero

embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

for word, i in tokenizer.word_index.items():

    #words whose index lies outside the matrix are skipped before the vector lookup
    if i >= VOCAB_SIZE:
        continue

    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## DL model

### GloVE Model

In [None]:
generator_model=tf.keras.models.Sequential()

#embedding layer: initialised from the GloVe weight matrix and frozen (trainable=False)
generator_model.add(layers.Embedding(VOCAB_SIZE,EMBEDDING_DIM,weights=[embedding_matrix],input_length=INPUT_LEN,trainable=False))
generator_model.add(layers.BatchNormalization())

#first bidirectional LSTM layer: returns the whole sequence for the next LSTM layer
generator_model.add(layers.Bidirectional(layers.LSTM(256,return_sequences=True)))
generator_model.add(layers.Dropout(0.25))

#second bidirectional LSTM layer: returns only its final output
generator_model.add(layers.Bidirectional(layers.LSTM(128, return_sequences=False)))
generator_model.add(layers.Dropout(0.25))

#Dense layers; the last one is a softmax with one unit per token index
generator_model.add(layers.Dense(128))
generator_model.add(layers.Dense(512))
generator_model.add(layers.Dense(VOCAB_SIZE,activation='softmax'))

#Print summary of model
#summary() prints the table itself and returns None, so wrapping it in print() would add a stray "None" line
generator_model.summary()

In [None]:
#loading the pre-trained weights
# NOTE(review): presumably the file was saved from a model with the same layer stack as the one built above - confirm
generator_model.load_weights("model_weights/generator_3.keras")
# compile with the Adam optimizer (learning rate 0.001), sparse categorical cross-entropy loss and the accuracy metric
generator_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
             loss='sparse_categorical_crossentropy',
            metrics=['accuracy'])


# Sample Review Generation

In [None]:
from keras.preprocessing.sequence import pad_sequences

def get_text(generator,seed_text :list, tokenizer) -> list:
  """
  A function to generate the rest of a sentence given the seed tokens (greedy decoding).

  At every step the current window of tokens is encoded, padded/truncated to INPUT_LEN
  and fed to the generator. The most probable word is appended to the output and to the
  window, and the oldest token of the window is dropped. Generation stops as soon as
  "." or "!" has been produced, or when the word limit is exceeded.

  Args:
    generator: Keras model of the generator
    seed_text: A list of word tokens to start from (it is copied; the caller's list is not modified)
    tokenizer: Keras tokenizer used to encode the tokens and to map predictions back to words
  Returns:
    A list of the generated words following the seed text (the seed itself is not included)
  """
  #work on a private copy, made once, so that the caller's list is never mutated
  window = list(seed_text)

  #index -> word lookup built once instead of scanning word_index for every generated word
  index_to_word = {index: word for word, index in tokenizer.word_index.items()}

  new_words = []
  word_count = 0

  while True:
    encoded = tokenizer.texts_to_sequences([window])
    encoded = pad_sequences(encoded, maxlen=INPUT_LEN, truncating='pre', padding="post")
    encoded = np.array(encoded)

    #verbose=0 keeps predict() from printing a progress bar for every generated word
    pred = np.argmax(generator.predict(encoded, verbose=0),axis=-1)

    #an index that belongs to no word (e.g. 0) is decoded as the empty string
    pred_word = index_to_word.get(int(pred[0]), "")
    new_words.append(pred_word)

    if (pred_word == ".") or (pred_word == "!") or word_count>100:
      break

    word_count+=1

    #slide the window: newest word in, oldest token out
    window.append(pred_word)
    window.pop(0)

  return new_words

In [None]:
# hand-written seed windows, each already split into 9 tokens
sentence_list = [['amazing', '!', 'we', 'were', 'all', 'so', 'pleasantly', 'surprised', '.'],
                  ['its', 'a', 'must', '!', 'if', 'youre', 'a', 'meat', 'lover'],
                  ['ice', 'cream', 'in', 'arizona', 'is', 'especially', 'good', '.', 'its'],
                 
                ]
# print every seed next to the words the generator produces for it
for lst in sentence_list:
  print("Input Sequence: ", ' '.join(lst))
  print("Generated Sequence: ", ' '.join(get_text(generator_model, lst,tokenizer)))



In [None]:
#creating the sample tests list: the first 100 positive reviews (fewer if the corpus is smaller)
sample_reviews = reviews_list[:100]

#modifying the text
#iterating over the slice itself (instead of range(100)) avoids an IndexError on a short corpus
cleaned_sample_reviews = [modify_text(review) for review in sample_reviews]

#tokenizing the reviews; reviews that are too short are dropped, so tokens_list can hold fewer entries
tokens_list = tokenize_text(cleaned_sample_reviews)

In [None]:
#sample review output and comparison with the real reviews
#slicing (instead of indexing with range(5)) keeps the cell from failing when fewer than 5 reviews are available
for sample_tokens in tokens_list[:5]:
  #the first INPUT_LEN tokens are the seed, the remaining ones are the real continuation
  seed_tokens = sample_tokens[:INPUT_LEN]
  print("Seed Text: ", ' '.join(seed_tokens))
  print("Generated: ", ' '.join(get_text(generator_model, seed_tokens.copy(), tokenizer)))
  print("Actual: ", ' '.join(sample_tokens[INPUT_LEN:]))
  print()
  print()


# Model Evaluation
The model is evaluated using the BLEU score

In [None]:
#construction of the reference list for the BLEU score evaluation
# NOTE: list.copy() is shallow, so `reference` shares its inner token lists with tokens_list
reference = tokens_list.copy()
print("Number of reference sentences: ",len(tokens_list))

In [None]:
#building the hypothesis list for the BLEU score evaluation:
#every hypothesis is the 9-token seed of a review followed by the words generated from that seed
hyp = []
for i, review_tokens in enumerate(tokens_list):
  seed_tokens = review_tokens[:9]
  generated_words = get_text(generator_model, seed_tokens, tokenizer)
  hyp.append(seed_tokens + generated_words)

In [None]:
def calculate_bleu_score(reference_list :list, hypothesis_list :list, n :int) -> float:
  """
  A function to calculate the average clipped n-gram precision of a list of sentences.

  For the reference and the hypothesis at the same position the score is
  (hypothesis n-grams that also occur in the reference, each counted at most as often as
  it occurs in the reference) / (total number of hypothesis n-grams).
  This is the clipped ("modified") n-gram precision that BLEU is built on; no brevity
  penalty is applied and precisions of different orders are not combined.

  Args:
    reference_list: list of list of words from reference sentences
    hypothesis_list: list of list of words from hypothesis sentences
    n: ngram number
  Returns:
    A float value which is the average score over the entire list. A pair whose
    hypothesis has no n-grams contributes 0. For an empty reference_list 0.0 is returned.
  """
  #an empty evaluation set has no average; return 0.0 instead of dividing by zero
  if len(reference_list) == 0:
    return 0.0

  total_precision = 0.0
  for i in range(len(reference_list)):

    #picking out the ngrams and their frequencies from the given inputs
    ref_counts = Counter(ngrams(reference_list[i], n))
    hyp_counts = Counter(ngrams(hypothesis_list[i], n))

    #counting the total number of ngrams in the hypothesis sentence
    denominator = sum(hyp_counts.values())
    if denominator == 0:
      continue

    #Counter intersection keeps, per ngram, min(count in hypothesis, count in reference),
    #which is exactly the clipped count
    numerator = sum((hyp_counts & ref_counts).values())

    total_precision += numerator/denominator

  return total_precision/len(reference_list)


In [None]:
#test case
# hypothesis bigrams: (cat, on) and (on, mat); only (cat, on) occurs in the reference,
# so the expected score is 1/2 = 0.5
hyp1 = ["cat", "on", "mat"]
ref1 = ["there", "is", "a", "cat", "on", "the", "mat"]

print("BLEU-2 score: ",calculate_bleu_score([ref1], [hyp1], 2))

In [None]:
#wrapping every reference and hypothesis sentence in a start tag (<s>) and an end tag (</s>);
#new lists are built, so tokens_list and hyp themselves stay untouched
ref2 = [["<s>", *sentence, "</s>"] for sentence in tokens_list]
hyp2 = [["<s>", *sentence, "</s>"] for sentence in hyp]

In [None]:
#BLEU-1 score
# n=1: average clipped unigram precision over the tagged reference/hypothesis pairs
print("BLEU-1 Score: ",calculate_bleu_score(ref2, hyp2, 1))

In [None]:
#BLEU-2 score
# n=2: average clipped bigram precision over the tagged reference/hypothesis pairs
print("BLEU-2 Score: ",calculate_bleu_score(ref2, hyp2, 2))

In [None]:
#BLEU-3 score
# n=3: average clipped trigram precision over the tagged reference/hypothesis pairs
print("BLEU-3 Score: ",calculate_bleu_score(ref2, hyp2, 3))

In [None]:
#BLEU-4 score
# n=4: average clipped 4-gram precision over the tagged reference/hypothesis pairs
# (n must be 4 here; with n=1 this cell would only repeat the BLEU-1 value)
print("BLEU-4 Score: ",calculate_bleu_score(ref2, hyp2, 4))

# Generation of new text

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
#inverse of tokenizer.word_index: integer token id -> word
#id 0 is decoded as the empty string (presumably the padding id - confirm)
token_to_word = {
    **{token_id: word for word, token_id in tokenizer.word_index.items()},
    0: '',
}


In [None]:
def get_full_review(generator,seed_text, tokenizer):
  """
  A function to generate the entire review given the seed tokens.

  Every next word is drawn at random from the probability distribution predicted by the
  generator. The function keeps generating till the review has 3 sentence terminators
  ("." or "!") or the word limit of 250 is exceeded, whichever comes first.
  Args:
    generator: Keras model of the generator
    seed_text: A list of word tokens to start from (it is copied; the caller's list is not modified)
    tokenizer: Keras tokenizer used to encode the tokens
  Returns:
    A string: the seed tokens followed by the generated words, joined by single spaces
  """
  #private copy: the review grows inside this function without touching the caller's list
  review_tokens = list(seed_text)

  word_count = 0
  sent_count = 0

  while True:

    encoded = tokenizer.texts_to_sequences([review_tokens])
    encoded = pad_sequences(encoded, maxlen=INPUT_LEN, truncating='pre', padding="post")
    encoded = np.array(encoded)

    #verbose=0 keeps predict() from printing a progress bar for every generated word
    pred_probs = generator.predict(encoded, verbose=0)

    #the model output may not sum to 1 within the tolerance of np.random.choice (which then
    #raises "probabilities do not sum to 1"), so it is renormalised in float64 first
    probs = np.asarray(pred_probs[0], dtype=np.float64)
    probs = probs / probs.sum()

    word_ind = int(np.random.choice(len(probs), p=probs))
    pred_word = token_to_word[word_ind]
    review_tokens.append(pred_word)

    if (pred_word == ".") or (pred_word == "!"):
      sent_count+=1

    #the word that triggers the stop condition is still part of the review (appended above)
    if sent_count==3 or word_count>250:
      break

    word_count+=1

  return ' '.join(review_tokens)

Generating text with the LSTM generator is slow because the model is called once per generated word. It takes around 3 hours to generate 10K reviews.

In [None]:
#test case
# generate one full review seeded with the first 9 tokens of the first sample review
print(get_full_review(generator_model, tokens_list[0][:9].copy(), tokenizer))

In [None]:
#enter the number of reviews to be generated. When needed to generate large number of reviews, use the entire review corpus.
NUM_NEW_REVIEWS = 10

#one generated review per sample review, seeded with the first INPUT_LEN tokens of that review.
#Slicing tokens_list (instead of indexing with range) avoids an IndexError when fewer
#than NUM_NEW_REVIEWS tokenized reviews are available; each slice is a fresh list.
new_reviews = [
    get_full_review(generator_model, review_tokens[:INPUT_LEN], tokenizer)
    for review_tokens in tokens_list[:NUM_NEW_REVIEWS]
]
