# Generator - 1.0
Generate reviews based on the probability of bigram occurrence in the corpus <br>
**Files Used** <br>
Reviews from https://www.kaggle.com/yelp-dataset/yelp-dataset?select=yelp_academic_dataset_review.json

In [None]:
#mounting the Google Drive
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth, drive
from oauth2client.client import GoogleCredentials

drive.mount('/content/drive')

In [None]:
from IPython.display import HTML, display

def set_css():
  """
  Display a <style> element that sets white-space: pre-wrap on <pre> tags,
  so long output lines wrap in the rendered cell output.
  """
  wrap_rule = HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  ''')
  display(wrap_rule)

#registering set_css as a callback for the 'pre_run_cell' event
get_ipython().events.register('pre_run_cell', set_css)

# Testing the approach on a smaller dataset

In [None]:
#importing the libraries
import pandas as pd

In [None]:
#a smaller dataset on which the concept can be validated
small_dataset = ["The food was great! I had a wonderful time. The service was excellent!!", 
                 "I had a horrible experience. The waiters showed no respect. We waited for nearly an hour for such a disappointing experience", 
                 "My brother took me to this restaurant. We had an amazing time. The food was absolutely delicious. Overall an exciting experience."]

## Preprocessing

In [None]:
import re

#pattern to be removed: a digit, a colon or a single quote that is
#immediately followed by a space (the two-character match is replaced as a whole)
pat = re.compile(r"[\d:'] ")

#a full stop that is directly followed by a non-whitespace character
full_stop_pattern = re.compile(r"\.(?=\S)")

#a question mark that is directly preceded by a non-whitespace character.
#A lookbehind is required here: a lookahead placed before the mark is always
#satisfied by the mark itself, so a space would be inserted even when one is
#already present, producing double spaces.
qmark_pattern = re.compile(r"(?<=\S)\?")

#an exclamation mark that is directly preceded by a non-whitespace character
exclmark_pattern = re.compile(r"(?<=\S)!")


In [None]:
def clean_text(text :str) -> str:
  """
  Normalise a review so that it can be split into tokens on spaces.

  The steps are applied in this order:
    1. "it's" and "It's" are both rewritten to "its"
    2. every match of the module-level pattern `pat` is replaced by a space
    3. runs of new line characters are replaced by a single space
    4. the remaining single quotes are dropped
    5. the module-level patterns `full_stop_pattern`, `qmark_pattern` and
       `exclmark_pattern` are used to put a space after full stops and a
       space before question marks and exclamation marks

  Args:
    text: A string that is to be cleaned.
  Returns:
    A string with the needed modifications made
  """
  #both spellings of the contraction collapse to the same lower-case word
  for contraction in ("it\'s", "It\'s"):
    text = re.sub(contraction, "its", text)

  #characters matched by pat (defined at module level) become a space
  text = pat.sub(" ", text)

  #new line characters become a single space
  text = re.sub("(\n)+", " ", text)

  #any single quote that is still present is removed
  text = re.sub("\'", "", text)

  #spacing around . ? ! so that they can be split off from the words
  text = full_stop_pattern.sub(". ", text)
  text = qmark_pattern.sub(" ?", text)
  return exclmark_pattern.sub(" !", text)





In [None]:
#cleaning the reviews in the smaller dataset; the list is updated in place
small_dataset[:] = [clean_text(small_review) for small_review in small_dataset]


In [None]:
from collections import Counter
from nltk.util import ngrams 

In [None]:
def create_bigrams(dataset : list) -> dict:
  """
  A function to count the bigrams (pairs of adjacent tokens) in a list of reviews

  Each review is split on whitespace, so a punctuation mark is a token of its
  own only when it is separated from the neighbouring word by a space.
  Consecutive spaces do not produce empty tokens.

  A sample function call will be,
  create_bigrams(["hi there !"]) = {("hi", "there"):1, ("there", "!"):1}

  Args:
    dataset: A list of reviews. It's a list of strings
  Returns:
    A bigram frequency dictionary for the dataset. The keys are
    (first_word, second_word) tuples and the values are their counts
    summed over all the reviews.

  """
  bigram_freq = Counter()

  #iterating through each review in the dataset
  for review in dataset:
    #split() without an argument drops the empty strings that split(" ")
    #would return for leading, trailing or repeated spaces
    tokens = review.split()

    #pairing every token with its successor gives the bigrams of the review;
    #update() adds their counts to the running totals
    bigram_freq.update(zip(tokens, tokens[1:]))

  #a plain dict is returned, as before
  return dict(bigram_freq)


In [None]:
#constructing the bigram frequency dictionary for the smaller dataset
small_bigram_freq_dict = create_bigrams(small_dataset)

In [None]:
def get_bigram_dict(bigram_freq_dict: dict)-> dict:
  """
  A function to regroup a flat bigram frequency dictionary by its first word

  The result is a dictionary of dictionaries. The key of the outer dictionary
  is the first word of a bigram (the seed word). Its value is an inner
  dictionary that maps every word seen after the seed word (the followup
  word) to the frequency of that bigram.

  A sample function call is,
  get_bigram_dict({("hi", "there"): 1, ("hi", "!"):1}) = {"hi": {"there": 1, "!":1}}

  Args:
    bigram_freq_dict: bigram frequency dictionary

  Returns:
    A dictionary that maps each seed word to the frequencies of its followup words

  """
  followups_by_seed = {}

  for pair, frequency in bigram_freq_dict.items():
    #setdefault creates the inner dictionary the first time a seed word is seen
    inner = followups_by_seed.setdefault(pair[0], {})
    inner[pair[1]] = frequency

  return followups_by_seed


In [None]:
small_bigram_dict = get_bigram_dict(small_bigram_freq_dict)

In [None]:
def calc_total_occurrences(d :dict) -> int:
  """
  A function that adds up all the values of a dictionary.

  It is used with the followup-word dictionary of a seed word, where the sum
  of the followup counts is the number of bigrams that start with that word.

  Args:
    d: dictionary corresponding to the seed word
  Returns:
    The sum of the values in d (0 for an empty dictionary)
  """
  return sum(d.values())

In [None]:
#mapping each seed word to the total count of its followup words
small_word_count_map = {
  seed_key: calc_total_occurrences(followup_counts)
  for seed_key, followup_counts in small_bigram_dict.items()
}


In [None]:
def calc_probability(bigram_dict :dict)->dict:
  """
  A function to compute the probability of bigram occurrence for each seed_word

  The probability of a followup word is its count divided by the sum of the
  counts of all the followup words of the same seed word. The total is taken
  from the dictionary that is passed in, so the function does not depend on
  any precomputed word count map, and the input dictionary is left unchanged.

  Args:
    bigram_dict: A dictionary of the bigram frequencies for each seed word
  Returns:
    A new dictionary of probability of occurrence of each bigram for each seed word
  """
  prob_dict = {}
  for seed_word, followup_counts in bigram_dict.items():
    #number of bigrams that start with this seed word
    total = sum(followup_counts.values())

    #a fresh inner dictionary is built so that the caller's counts survive;
    #an empty inner dictionary never reaches the division
    prob_dict[seed_word] = {
      followup: count/total for followup, count in followup_counts.items()
    }

  return prob_dict




In [None]:
#testing
sample_dict = {"The": {"amazing": 2, "wonderful": 1, "marvellous": 1}}
calc_probability(sample_dict)

In [None]:
small_bigram_prob_dict = calc_probability(small_bigram_dict)

In [None]:
def find_sum(prob_dict :dict) -> bool:
  """
  A function to check if the sum of the probabilities of a word is
  greater than 0.98 and at most 1.1. The sum is printed when the check fails.

  Args:
    prob_dict: the followup words dictionary
  Returns:
    True when the sum lies in that range, False otherwise
  """
  total = 0
  for probability in prob_dict.values():
    total = total + probability

  if 0.98 < total <= 1.1:
    return True

  #the offending sum is shown to help with debugging
  print(total)
  return False

In [None]:
def transform_to_intervals(bigram_prob_dict :dict) ->  dict:
  """
  A function to transform the bigram probability dictionary into a dictionary of probability intervals.

  For every seed word the followup words are visited in dictionary order and
  each one receives the tuple (start, start + probability), where start is
  the end of the previous followup word's interval (0 for the first one).

  Args:
    bigram_prob_dict: bigram probability dictionary
  Returns:
    A dictionary of bigram probability intervals

  """
  intervals_dict = {}

  for seed_word in bigram_prob_dict:
    lower = 0
    seed_intervals = {}

    for follow_up_word, prob in bigram_prob_dict[seed_word].items():
      upper = lower + prob
      seed_intervals[follow_up_word] = (lower, upper)
      #the next interval starts where this one ends
      lower = upper

    intervals_dict[seed_word] = seed_intervals

  return intervals_dict
  



In [None]:
#testing
sample_dict = {"absolutely": {"amazing": 0.25, "wonderful": 0.5, "marvellous": 0.25}}
transform_to_intervals(sample_dict)

In [None]:
small_intervals_dict = transform_to_intervals(small_bigram_prob_dict)

## Generation of sentences using the smaller dataset
The random number generator is used to generate a number between 0 and 1. Each follow-up word of a seed word has a probability interval. The follow-up word whose interval contains the generated number is chosen as the next word.

In [None]:
from random import random, randint, seed
import random as r

In [None]:
def lies_in_interval(interval: tuple, val :float) -> bool:
  """
  A function to check whether a value lies in the given interval.
  The start limit is included and the end limit is excluded.

  Args:
    interval: a tuple of start and end limits
    val: the value that is tested
  Returns:
    A boolean value True/False
  """
  return interval[0] <= val < interval[1]

In [None]:
def search(dic :dict, prob_value :float) -> str:
  """
  A function that searches for the word whose probability interval contains the required probability value

  The words are tested in dictionary order with lies_in_interval and the
  first match wins.

  Args:
    dic: the dictionary of the seed word (followup word -> interval tuple)
    prob_value: required probability value
  Returns:
    The word whose probability interval contains the prob_value,
    or an empty string if no such interval is found

  """
  matching_words = (
    word for word, interval in dic.items()
    if lies_in_interval(interval, prob_value)
  )

  #the empty string is the "not found" marker
  return next(matching_words, "")

In [None]:
#testing
sample_dict = {'amazing': (0, 0.25), 'marvellous': (0.75, 1.0),'wonderful': (0.25, 0.75)}
search(sample_dict, 0.3)

In [None]:
#the generator is seeded with a fixed value so that the run is reproducible
#(random() returns a value in [0, 1) regardless of the seed)
seed(1)

#the length of the review is chosen as 10
length_of_review = 10

#the candidate seed words are listed once instead of on every random draw
small_seed_words = list(small_intervals_dict.keys())

#choosing a random word as seed from the dictionary
seed_word = r.choice(small_seed_words)
prob_value = 0

#the generated words are collected in a list; the seed word comes first
generated_words = [seed_word]

print("length of review : ",length_of_review)

#iterating to the length of the review
for _ in range(length_of_review):

  #getting a random probability value
  prob_value = random()

  #if the followup word is not present as a key in the dictionary (i.e) there are no bigrams
  #with the followup word as the seed word, then a random word is generated and used.
  #Only the missing-key case is caught, so unrelated errors are not hidden.
  try:
    #get the dictionary of followup words
    values = small_intervals_dict[seed_word]
  except KeyError:
    seed_word = r.choice(small_seed_words)
    values = small_intervals_dict[seed_word]

  #search the intervals for the followup word ("" when no interval matches)
  next_word = search(values, prob_value)
  generated_words.append(next_word)

  #make the followup word as the seed word for the next iteration
  seed_word = next_word

#final answer: every word is followed by a single space, as before
ans = " ".join(generated_words) + " "
print(ans)


# Text Generation using the larger dataset
The same approach used on the smaller dataset is also used for the larger dataset

## Loading the data

In [None]:
#reading the sample of Yelp reviews (the file name indicates 20K reviews)
df = pd.read_csv("data/reviews_20K.csv")

In [None]:
df.head(5)

In [None]:
#converting the reviews column in the dataframe to a list of reviews
reviews = df["text"].values.tolist()

In [None]:
#sample review
reviews[1007]

## Preprocessing

In [None]:
#cleaning the reviews using the clean_text function
for i,review in enumerate(reviews):
  reviews[i] = clean_text(review)


In [None]:
#creating the bigrams for the larger corpus
large_bigram_freq_dict = create_bigrams(reviews)

In [None]:
large_bigrams_dict = get_bigram_dict(large_bigram_freq_dict)

In [None]:
#sample output
large_bigrams_dict["TOP"]

In [None]:
#maps each seed word to the total count of its followup words
large_word_count_map = {
  seed_key: calc_total_occurrences(followup_counts)
  for seed_key, followup_counts in large_bigrams_dict.items()
}


In [None]:
def large_calc_probability(bigram_dict :dict) -> dict:
  """
  A function to compute the probability of bigram occurrence for each seed_word

  The probability of a followup word is its count divided by the sum of the
  counts of all the followup words of the same seed word. The total is taken
  from the dictionary that is passed in, so the function does not depend on
  any precomputed word count map, and the input dictionary is left unchanged.

  Args:
    bigram_dict: A dictionary of the bigram frequencies for each seed word
  Returns:
    A new dictionary of probability of occurrence of each bigram for each seed word
  """
  prob_dict = {}
  for seed_word, followup_counts in bigram_dict.items():
    #number of bigrams that start with this seed word
    total = sum(followup_counts.values())

    #a fresh inner dictionary is built so that the caller's counts survive;
    #an empty inner dictionary never reaches the division
    prob_dict[seed_word] = {
      followup: count/total for followup, count in followup_counts.items()
    }

  return prob_dict




In [None]:
large_bigram_prob_dict = large_calc_probability(large_bigrams_dict)

In [None]:
#sample test case
large_bigram_prob_dict["TOP"]

In [None]:
#printing every seed word whose followup probabilities do not sum to a value
#that is >0.98 and <=1.1 (find_sum also prints the sum itself)
for checked_word, followup_probs in large_bigram_prob_dict.items():
  if find_sum(followup_probs):
    continue
  print(checked_word, followup_probs)
    

In [None]:
large_bigram_interval_dict = transform_to_intervals(large_bigram_prob_dict)

## Text generation using the larger dataset

In [None]:
#the generator is seeded with a fixed value so that the generated reviews are
#reproducible (random() returns a value in [0, 1) regardless of the seed)
seed(1)

artificial_reviews = []

#the candidate seed words are listed once; rebuilding this list for every
#draw would copy the whole vocabulary each time
large_seed_words = list(large_bigram_interval_dict.keys())

#generating 10K artificial reviews
for i in range(10000):
  #length of the review is chosen as a random value between 100 and 150
  length_of_review = randint(100,150)

  #choosing a random word from the dictionary
  seed_word = r.choice(large_seed_words)
  prob_value = 0

  #the words of this review are collected in a list; the seed word comes first
  review_words = [seed_word]

  #iterating over the length of the review
  for _ in range(length_of_review):

    #getting a random probability value. It is used unrounded: rounding to
    #5 digits can turn a value just below 1 into 1.0, which the half-open
    #intervals (start included, end excluded) do not contain.
    prob_value = random()

    #getting the dictionary associated with the seed word; a word that never
    #starts a bigram is replaced by a random seed word. Only the missing-key
    #case is caught, so unrelated errors are not hidden.
    try:
      values = large_bigram_interval_dict[seed_word]
    except KeyError:
      seed_word = r.choice(large_seed_words)
      values = large_bigram_interval_dict[seed_word]

    #search returns "" (never None) when no interval contains prob_value
    next_word = search(values, prob_value)
    review_words.append(next_word)

    #make the followup word as the seed word for the next iteration
    seed_word = next_word

  #every word is followed by a single space, as before
  ans = " ".join(review_words) + " "

  #append the review to the list of artificial reviews
  artificial_reviews.append(ans)


In [None]:
#sample review
artificial_reviews[999]

creating a dataset of 10K reviews

In [None]:
# generating the labels: one 0 per artificial review, so that the two lists
# always have the same length when they are zipped into a dataframe
labels = [0]*len(artificial_reviews)

In [None]:
# generating the pandas dataframe: one row per (review, label) pair
import pandas as pd
review_label_rows = list(zip(artificial_reviews, labels))
fake_review_dataframe = pd.DataFrame(review_label_rows, columns =['Review', 'label'])

In [None]:
fake_review_dataframe.head(5)

In [None]:
#writing it to a csv file
fake_review_dataframe.to_csv("data/generator_1_reviews.csv")