# Generator - 2.0
In this notebook, we explore text generation by sampling sentences from a corpus of real reviews. <br>
Given a category and a sentiment, a review is generated by concatenating sentences sampled from the matching reviews.
<br>
**Files Used**<br>
Reviews from https://www.kaggle.com/yelp-dataset/yelp-dataset?select=yelp_academic_dataset_review.json

In [None]:
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth, drive
from oauth2client.client import GoogleCredentials

#mounting Google Drive at the path used by this Colab session
mount_point = '/content/drive'
drive.mount(mount_point)

In [None]:
from IPython.display import HTML, display

def set_css():
  """Display a CSS rule that makes text inside <pre> elements wrap (white-space: pre-wrap)."""
  wrap_rule = HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  ''')
  display(wrap_rule)

#registering set_css on the 'pre_run_cell' event so that it is called before each cell is run
shell_events = get_ipython().events
shell_events.register('pre_run_cell', set_css)

# Loading the data

In [None]:
#importing the libraries
import numpy as np
import pandas as pd
from random import random,choice
import random as r

In [None]:
#reading the restaurant reviews csv file into a dataframe
reviews_csv_path = "reviews.csv"
restaurant_reviews_df = pd.read_csv(reviews_csv_path)

In [None]:
#previewing the first five rows of the loaded reviews
restaurant_reviews_df.head(n=5)

# Designing the generator

The reviews are split into classes of related categories. Each class has a corpus of positive reviews and a corpus of negative reviews. Two dictionaries are built, one for the positive and one for the negative reviews. The keys are the class ids and the values are lists of sentences taken from the original reviews. <br>
The generator then uses these dictionaries to generate a review.

In [None]:
#only reviews rated 5 stars are treated as positive reviews
positive_review_ratings = [5]
is_positive = restaurant_reviews_df["stars"].isin(positive_review_ratings)
#resetting the index so that the selected rows are numbered 0..n-1
positive_reviews_df = restaurant_reviews_df.loc[is_positive].reset_index(drop=True)
positive_reviews_df.head(n=5)



In [None]:
#only reviews rated 1 star are treated as negative reviews
negative_review_ratings = [1]
is_negative = restaurant_reviews_df["stars"].isin(negative_review_ratings)
#resetting the index so that the selected rows are numbered 0..n-1
negative_reviews_df = restaurant_reviews_df.loc[is_negative].reset_index(drop=True)

negative_reviews_df.head(n=5)



## Generating positive reviews

In [None]:
#reporting how many positive reviews were selected
print("Total number of positive reviews: ", positive_reviews_df.shape[0])


In [None]:
#counting how many positive reviews carry each category

category_dict = {}

#walking over the rows by position
for row in range(len(positive_reviews_df)):

  #a review can carry several categories, written as one comma-separated string
  for raw_category in positive_reviews_df.loc[row,"categories"].split(','):

    #dropping the space that follows a comma and lower-casing the name,
    #so that spellings which differ only in letter case share one key
    key = raw_category.lstrip(" ").lower()

    #dict.get supplies 0 the first time a category is seen
    category_dict[key] = category_dict.get(key, 0) + 1

    

  

In [None]:
#printing the category dictionary
#category_dict

In [None]:
"""
Grouping the similar categories into classes and assigning an id to each class.
0: american (new&traditional), bistros, gastropubs, hawaiian
1: bagels, pretzels, donuts
2: bakeries, cafes, coffee&tea, cupcakes, desserts, patisserie/cake shop, themed cafes, waffles
3: barbeque, cheesesteaks, steakhouses
4: bars, beer, breweries, cocktail bars
5: breakfast & brunch, buffets, cajun/creole, diners
6: british, bubbletea, french, german, greek, spanish
7: burgers, chicken wings, fish & chips, hot dogs, pizza, seafood, tacos
8: cantonese, chinese, dim sum, noodles, ramen, szechuan, vietnamese
9: caribbean, falafel, halal
10: indian, mexican
11: japanese, sushi bars
12: lebanese, mediterranean, middle eastern
13: salad, sandwiches
"""
#mapping the considered categories (lower-case names) to their class ids 0..13.
#'mexican' belongs with 'indian' in class 10 and 'spanish' with the other european cuisines in class 6,
#as listed in the class table.
#NOTE(review): the other keys use the spaced display spelling (e.g. 'hot dogs', 'coffee & tea'), so the dataset
#presumably writes "Bubble Tea" as two words; both spellings are mapped - confirm against category_dict.
categories_considered = {'american (new)':0, 'american (traditional)':0, 'bagels':1, 'bakeries':2, 'barbeque':3, 'bars':4,
                         'beer':4, 'bistros':0, 'breakfast & brunch':5, 'breweries':4, 'british':6, 'bubbletea':6,
                         'bubble tea':6, 'burgers':7,
                         'buffets':5, 'cafes':2 , 'cajun/creole': 5, 'cantonese':8, 'caribbean':9, 'cheesesteaks':3, 'chicken wings':7,
                         'chinese':8, 'cocktail bars':4, 'coffee & tea':2, 'cupcakes':2, 'desserts':2 , 'dim sum':8,
                         'diners':5, 'donuts':1, 'falafel':9, 'fish & chips':7, 'french': 6, 'gastropubs': 0,
                         'german':6, 'greek':6, 'halal':9, 'hawaiian':0, 'hot dogs':7, 'indian':10, 'japanese':11,
                         'lebanese':12, 'mediterranean':12, 'mexican':10, 'middle eastern': 12,
                         'noodles': 8, 'patisserie/cake shop': 2, 'pizza':7, 'pretzels':1, 'ramen':8, 'salad':13, 'sandwiches':13,
                         'seafood': 7, 'spanish': 6, 'steakhouses':3, 'sushi bars':11, 'szechuan':8, 'tacos':7,
                         'themed cafes':2, 'vietnamese':8, 'waffles':2
                        }

In [None]:
#importing nltk
import nltk
#"punkt" holds the sentence tokenizer data used by sent_tokenize; newer NLTK releases load it
#from the "punkt_tab" resource instead, so both are requested to keep a fresh run working.
#NOTE(review): on older NLTK releases the extra download is expected to be harmless - confirm on the pinned version
for resource in ("punkt", "punkt_tab"):
  nltk.download(resource)

In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
import collections
positive_reviews_dictionary = collections.defaultdict(list)

#grouping the sentences of every positive review under the class id(s) of its categories

for i in range(len(positive_reviews_df)):
  #each row lists one or more categories separated by commas, so it is split into individual categories
  categories = positive_reviews_df.loc[i,"categories"].split(',')

  #class ids this review belongs to, de-duplicated but kept in first-seen order.
  #several categories can map to the same class (e.g. 'bars' and 'beer' are both class 4);
  #without de-duplication the review's sentences would be added to that class once per category
  class_ids = list(dict.fromkeys(
      categories_considered[category.strip().lower()]
      for category in categories
      if category.strip().lower() in categories_considered
  ))

  #reviews without any considered category are skipped before tokenizing
  if not class_ids:
    continue

  #tokenizing the review into individual sentences (once per review)
  sentences = sent_tokenize(positive_reviews_df.loc[i, "text"])

  #since there might be more than one sentence in the review, the extend() function is used
  for class_id in class_ids:
    positive_reviews_dictionary[class_id].extend(sentences)


In [None]:
#replacing the defaultdict with a plain dictionary holding the same class ids and sentence lists
positive_reviews_dictionary = {**positive_reviews_dictionary}

#printing the id of every class that collected fewer than 10 sentences
for class_id in positive_reviews_dictionary:
  if len(positive_reviews_dictionary[class_id]) < 10:
    print(class_id)

In [None]:
def gen_positive_review(category :str) -> str:
  """
  A function to generate a positive review for the given category.

  The review is built by drawing between 5 and 15 sentences at random (with replacement)
  from the positive sentences of the category's class and concatenating them, each followed by a space.
  The category name is matched ignoring letter case and surrounding whitespace.
  If the category is unknown, or its class has no positive sentences, it returns "Category not found".
  Args:
    category: category for the review
  Returns:
    A string which is the generated positive review for the category
  """
  #normalising the name the same way the keys of categories_considered are written (lower case)
  key = category.strip().lower() if isinstance(category, str) else category

  #only a failed lookup means "not found"; any other error is left to surface instead of being hidden
  try:
    reviews_list = positive_reviews_dictionary[categories_considered[key]]
  except (KeyError, TypeError):
    return "Category not found"

  #a class without sentences cannot produce a review
  if not reviews_list:
    return "Category not found"

  #generating a length/number of sentences for the review
  len_of_review = r.randint(5,15)

  return "".join(choice(reviews_list) + " " for _ in range(len_of_review))


In [None]:
#test case where the category is present
gen_positive_review(category="chinese")

In [None]:
#test case where the category is not present
gen_positive_review(category="steak")

## Generating negative reviews

In [None]:
#reporting how many negative reviews were selected
print("Total number of negative reviews: ", negative_reviews_df.shape[0])

In [None]:
#counting how many negative reviews carry each category
negative_category_dict = {}

for row in range(len(negative_reviews_df)):
  #the categories of a review are written as one comma-separated string
  row_categories = negative_reviews_df.loc[row,"categories"].split(',')

  #dropping the space that follows a comma and lower-casing the name,
  #so that spellings which differ only in letter case share one key
  for key in (name.lstrip(" ").lower() for name in row_categories):

    #the count starts at 0 the first time a category is seen, then this review is added
    negative_category_dict.setdefault(key, 0)
    negative_category_dict[key] += 1


The same class ids as the positive reviews are used

In [None]:
#creating the negative_reviews_dictionary: class id -> list of sentences from negative reviews
negative_reviews_dictionary = collections.defaultdict(list)

for i in range(len(negative_reviews_df)):
  #each row lists one or more categories separated by commas
  categories = negative_reviews_df.loc[i,"categories"].split(',')

  #class ids this review belongs to, de-duplicated but kept in first-seen order.
  #several categories can map to the same class (e.g. 'bars' and 'beer' are both class 4);
  #without de-duplication the review's sentences would be added to that class once per category
  class_ids = list(dict.fromkeys(
      categories_considered[category.strip().lower()]
      for category in categories
      if category.strip().lower() in categories_considered
  ))

  #reviews without any considered category are skipped before tokenizing
  if not class_ids:
    continue

  #tokenizing the review into individual sentences (once per review)
  sentences = sent_tokenize(negative_reviews_df.loc[i, "text"])

  #since there might be more than one sentence in the review, the extend() function is used
  for class_id in class_ids:
    negative_reviews_dictionary[class_id].extend(sentences)

  

In [None]:
#replacing the defaultdict with a plain dictionary holding the same class ids and sentence lists
negative_reviews_dictionary = {**negative_reviews_dictionary}

In [None]:
def gen_negative_review(category :str) -> str:
  """
  A function to generate a negative review for the given category.

  The review is built by drawing between 5 and 15 sentences at random (with replacement)
  from the negative sentences of the category's class and concatenating them, each followed by a space.
  The category name is matched ignoring letter case and surrounding whitespace.
  If the category is unknown, or its class has no negative sentences, it returns "Category not found".
  Args:
    category: category for the review
  Returns:
    A string which is the negative review generated for this category

  """
  #normalising the name the same way the keys of categories_considered are written (lower case)
  key = category.strip().lower() if isinstance(category, str) else category

  #only a failed lookup means "not found"; any other error is left to surface instead of being hidden
  try:
    #select the sentences associated with that category from the dictionary
    reviews_list = negative_reviews_dictionary[categories_considered[key]]
  except (KeyError, TypeError):
    return "Category not found"

  #a class without sentences cannot produce a review
  if not reviews_list:
    return "Category not found"

  #choose a length for the review
  len_of_review = r.randint(5,15)

  return "".join(choice(reviews_list) + " " for _ in range(len_of_review))


In [None]:
#test case where the category is present
gen_negative_review(category="chinese")

In [None]:
#test case where the category is not present
gen_negative_review(category="fudge")

## Generation of reviews for the discriminator


30K positive reviews and 30K negative reviews are generated by the generator for the discriminator. <br>
Generation of a review:
1. Randomly choose a category from the set of categories
2. Generate a positive and negative review using that category

In [None]:
#collecting every category name (the keys of categories_considered) so that one can be picked at random
categories_list = [*categories_considered]

In [None]:
#fixing the seed of the random module so that re-running this cell reproduces the same reviews
r.seed(42)

#number of positive reviews and of negative reviews to generate
num_reviews = 30000

#only categories whose class has both positive and negative sentences can yield a real pair of reviews;
#for the others the gen_* functions return the "Category not found" message, which must not end up in the data
usable_categories = [
    category for category in categories_list
    if positive_reviews_dictionary.get(categories_considered[category])
    and negative_reviews_dictionary.get(categories_considered[category])
]
if not usable_categories:
  raise ValueError("no category has both positive and negative sentences")

positive_reviews_list = []
negative_reviews_list = []

#loop for 30K reviews
for review_nos in range(num_reviews):
  category = choice(usable_categories)
  positive_reviews_list.append(gen_positive_review(category))
  negative_reviews_list.append(gen_negative_review(category))

  #checking progress
  if review_nos%2000 == 0:
    print(review_nos)
  

In [None]:
#checking the reviews: the sizes of both lists first, then one sample of each sentiment
positive_count = len(positive_reviews_list)
print("number of positive reviews: ", positive_count)
negative_count = len(negative_reviews_list)
print("number of negative reviews: ", negative_count)
print()

print("Sample positive review: ")
sample_positive = positive_reviews_list[135]
print(sample_positive)
print()

print("Sample negative review: ")
sample_negative = negative_reviews_list[150]
print(sample_negative)

In [None]:
#merge both review lists into one new list (positive reviews first, then negative).
#Here positive and negative indicate the sentiment of the reviews.
#A new list is built on purpose: extending positive_reviews_list in place would also add the negative
#reviews to it, and re-running the cell would add them again.
fake_reviews_list = positive_reviews_list + negative_reviews_list

In [None]:
#store the list as a pickle file
import pickle
#writing the merged list to disk in binary mode
output_path = "generator_2_reviews.pkl"
with open(output_path, "wb") as pickle_file:
  pickle.dump(fake_reviews_list, pickle_file)
