# Generate Reviews

This Jupyter Notebook generates fake restaurant reviews with ChatGPT, varying the model (GPT-3.5, GPT-4), the prompt-engineering technique (zero- vs. few-shot prompts), and the restaurant and review characteristics.

## Setup

In [51]:
#Dependencies
import os
import csv
import ast
import json
import time
import openai
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#Directories
# Make the project root the working directory so the relative paths used in
# this notebook resolve from there.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
os.chdir('/home/bleiva/capp30255/how_the_bear_got_a_C')
input_dir = ''      #input data directory
# NOTE(review): both paths below repeat the project folder name although the
# working directory is already that folder, while the base prompt is later
# opened from 'chat_gpt/zero_shot_prompt.txt' -- confirm which layout is intended.
output_dir = 'how_the_bear_got_a_C/chat_gpt'
prompt_dir = 'how_the_bear_got_a_C/chat_gpt/prompts/'

#API key
# The explicit assignment is left disabled; presumably the openai client picks
# the key up from the OPENAI_API_KEY environment variable on its own -- confirm.
# openai.api_key = os.environ["OPENAI_API_KEY"]

In [52]:
#Models
# OpenAI model identifiers to generate reviews with.
model_list = ['gpt-3.5-turbo-16k-0613', 'gpt-4-1106-preview']

#API parameters
# Candidate sampling temperatures and the system-role text.
# NOTE(review): neither is consumed in the cells shown here; presumably the
# "API Calls" section iterates over them -- confirm.
temperature_list = [0, 1, 2]
system_content = 'You are a person that writes restaurant reviews'

#Base prompt
# Template text containing the placeholders REST_NAME, NUM_STARS, REV_TONE and
# NUM_WORDS, which generate_fake_review_prompt substitutes.
with open('chat_gpt/zero_shot_prompt.txt', 'r') as file:
    base_prompt = file.read()

# Echo the template so the notebook records which prompt was used.
print(f'Base prompt:\n\n{base_prompt}')

Base prompt:

You attended a restaurant named REST_NAME.
You rated your experience with NUM_STARS stars out of 5.
Write a REV_TONE review of NUM_WORDS words describing your experience.



## Helper Functions

In [53]:
def find_moments_of_attribute_distribution(dataset: pd.DataFrame) -> dict:
    """
    Summarises every review attribute of a dataset by its first two moments.

    Input:
        - dataset (DataFrame): a pandas dataframe holding the columns
          'stars', 'useful', 'funny', 'cool' and 'length'.

    Returns a dictionary that maps each attribute name (str) to a dictionary
    with the keys 'mean' and 'std' computed over that column.
    """

    #Attributes to summarise, in the order they appear in the output
    tracked_columns = ('stars', 'useful', 'funny', 'cool', 'length')

    def summarise(column: str) -> dict:
        #Mean and standard deviation of a single column
        values = dataset[column]
        return {'mean': values.mean(), 'std': values.std()}

    return {column: summarise(column) for column in tracked_columns}

def create_fake_review_attributes(attr_moments: dict) -> dict:
    """
    Creates the attributes of one fake review by random draws.

    Input:
        - attr_moments (dict): maps each attribute ('stars', 'useful',
          'funny', 'cool', 'length') to a dictionary with its 'mean' and
          'std'. For 'useful', 'funny' and 'cool' the mean is used as the
          probability of the attribute being True, so it is expected to be
          the share of a 0/1 indicator.

    Returns dictionary where each key, value is an attribute, value:
        - 'stars' (int): 1-5, drawn from the hard-coded empirical distribution
        - 'useful', 'funny', 'cool' (bool): Bernoulli draws
        - 'length' (int): normal draw, rounded and floored at 1
    """
    import warnings

    star_values = [1, 2, 3, 4, 5]
    star_probs = [0.153, 0.078, 0.099, 0.208, 0.462] #empirical distrib

    attrs = {}

    #Draw order (stars, useful, funny, cool, length) is kept stable so that a
    #seeded random state reproduces the same attributes.
    for key in ('stars', 'useful', 'funny', 'cool', 'length'):
        if key == 'length':
            draw = np.random.normal(attr_moments[key]['mean'],
                                    attr_moments[key]['std'])
            #A normal draw can be fractional or negative; a review length
            #must be a positive whole number.
            attr_value = max(1, int(round(draw)))
        elif key == 'stars':
            attr_value = int(np.random.choice(star_values, p=star_probs))
        else:
            mean = float(attr_moments[key]['mean'])
            #A mean outside [0, 1] (e.g. computed on a count column) is not a
            #valid probability and would make np.random.choice raise.
            prob_true = min(1.0, max(0.0, mean))
            if prob_true != mean:
                warnings.warn(
                    f"Mean of '{key}' ({mean}) is not a probability; "
                    f"clipped to {prob_true}.")
            attr_value = bool(np.random.choice([True, False],
                                               p=[prob_true, 1.0 - prob_true]))
        #Store in dictionary (native Python types, not numpy scalars)
        attrs[key] = attr_value

    return attrs

def draw_reviews_from_verified_sample(dataset: pd.DataFrame, 
                                      num_draws: int, 
                                      attributes: dict) -> list:
    """
    Takes a set of attributes and randomly draws verified reviews with those 
    characteristics.

    Inputs:
        - dataset (DataFrame): a pandas dataframe
        - num_draws (int): number of reviews to extract from dataset
        - attributes (dict): key, value pairs of attributes to filter from;
          a row is kept only if it matches every pair. An empty dictionary
          keeps every row.

    Returns a list of reviews that match those attributes.

    Raises KeyError if an attribute (or the review text column) is not a
    column of the dataset, and ValueError (from DataFrame.sample) if fewer
    than num_draws rows match.
    """

    #Filter dataset: accumulate one boolean mask over all attributes
    keep = np.ones(len(dataset), dtype=bool)
    for col, val in attributes.items():
        keep &= (dataset[col] == val).to_numpy()
    filtered_dataset = dataset[keep]

    #The review text lives in 'review'; fall back to 'text' for datasets that
    #use that column name instead.
    text_col = 'review' if 'review' in filtered_dataset.columns else 'text'

    #Pick reviews
    sampled_rows = filtered_dataset.sample(n=num_draws, replace=False)
    examples = sampled_rows[text_col].tolist()

    return examples

def generate_fake_review_prompt(dataset: pd.DataFrame,
                                base_prompt: str,
                                num_shots: int,
                                review_length: int, 
                                restaurant: str,
                                num_stars: int, 
                                useful: bool, 
                                funny: bool, 
                                cool: bool) -> str:
    """
    Takes review characteristics and builds the prompt for a fake review.
    
    Inputs:
        - dataset (DataFrame): verified reviews to draw few-shot examples
          from; not used when num_shots is 0
        - base_prompt (str): standard prompt to build from; the placeholders
          REST_NAME, NUM_WORDS, NUM_STARS, REV_TONE and (few-shot only)
          EXAMPLES are substituted
        - num_shots (int): number of examples to give in prompt
        - review_length (int): max number of words contained in review
        - restaurant (str): name of the restaurant that is getting reviewed
        - num_stars (int): rating of the restaurant by the (fake) user
        - useful (boolean): True if review is useful, False otherwise 
        - funny (boolean): True if review is funny, False otherwise
        - cool (boolean):  True if review is cool, False otherwise  

    Returns a prompt to generate fake review in ChatGPT's API.
    """

    def as_text(value) -> str:
        #str.replace only accepts strings; whole numbers are rendered without
        #a trailing '.0' (e.g. a 5.0 star rating becomes '5').
        if isinstance(value, str):
            return value
        number = float(value)
        return str(int(number)) if number.is_integer() else str(value)

    #Define review tone
    tone_flags = (('useful', useful), ('funny', funny), ('cool', cool))
    attributes = ', '.join(name for name, active in tone_flags if active)

    length_text = as_text(review_length)
    stars_text = as_text(num_stars)

    #Build prompt based on relevant variables
    prompt = base_prompt.replace("REST_NAME", restaurant)\
                        .replace("NUM_WORDS", length_text)\
                        .replace("NUM_STARS", stars_text)\
                        .replace("REV_TONE", attributes)

    #Zero-shot
    if num_shots == 0:
        return prompt
    
    #Few-shot
    #The sampler expects column -> value pairs, not the tone string.
    #NOTE(review): assumes 'useful', 'funny' and 'cool' are 0/1 indicator
    #columns in dataset -- confirm; count columns would only match on 0 or 1.
    filters = {'stars': num_stars, 'useful': useful, 'funny': funny, 'cool': cool}
    reviews = draw_reviews_from_verified_sample(dataset, num_shots, filters)

    #Adjacent f-strings are used instead of backslash continuations, which
    #would embed the source indentation in the prompt.
    few_shot_text = ''.join(
        f'Example {i}: Restaurant name = {restaurant}; '
        f'Number of characters = {length_text}; '
        f'Number of stars = {stars_text}; '
        f'Tone = {attributes}.\n'
        f'Review {i}: {review}.##\n'
        for i, review in enumerate(reviews))

    #No effect when base_prompt has no EXAMPLES placeholder.
    prompt = prompt.replace("EXAMPLES", few_shot_text)
    return prompt

def make_api_call(model: str, 
                  system_content: str, 
                  temperature: float, 
                  prompt: str, 
                  output_dataset: pd.DataFrame = None) -> str:
    """
    Make calls to ChatGPT's API using a specific prompt, model and temperature.

    Transient failures (API error, timeout, rate limit, connection error,
    overloaded server) are retried indefinitely after a pause. Invalid
    requests and authentication failures are re-raised, since repeating the
    same request cannot succeed.

    Inputs:
        - model (str): model of OpenAI to use
        - system_content (str): content of system role
        - temperature (float): level of randomness of output
        - prompt (str): prompt to feed to the model
        - output_dataset (DataFrame): dataset where fake reviews are stored in.
          Currently unused; kept so existing calls keep working.
    
    Returns the text content (str) of the first choice in the response.
    """

    #One dictionary per message. Repeating 'role'/'content' inside a single
    #dictionary keeps only the last pair, which drops the system message.
    messages = [{'role': 'system', 'content': system_content},
                {'role': 'user', 'content': prompt}]

    # API call
    while True:
        try:
            # Send request
            completion = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                temperature=temperature)

            # Retrieve response
            return completion['choices'][0]['message']['content']

        #Non-retryable errors: surface them instead of looping forever
        except openai.error.InvalidRequestError:
            print("Invalid request error. Not retrying (check for invalid/missing request parameters)")
            raise
        except openai.error.AuthenticationError:
            print("Authentication error. Not retrying (check for invalid/expired/revoked API key or token)")
            raise

        #Transient errors: wait, then try again
        except openai.error.APIError:
            print("API error. Retrying... (if error persists, check status.openai.com)")
            time.sleep(10)
        except openai.error.Timeout:
            print("Request timed out. Retrying... (if error persists, check internet connection)")
            time.sleep(5)
        except openai.error.RateLimitError:
            print("Reached rate limit. Retrying... (if error persists, check number of tokens/requests)")
            time.sleep(60)
        except openai.error.APIConnectionError:
            print("API connection error. Retrying... (if error persists, check network/proxy config/ssl/firewall)")
            time.sleep(5)
        except openai.error.ServiceUnavailableError:
            print("Server is overloaded. Retrying... (if error persists, check status.openai.com)")
            time.sleep(120)


## Data

In [55]:
#Load data
# Read the verified Yelp reviews; the path is relative to the working
# directory set in the Setup section.
data = pd.read_csv('data/yelp/yelp_verified_slim.csv')

#Check column types
# Preview the first rows of the loaded frame.
# NOTE(review): the preview shows the columns 'text' and 'text_length', while
# the helper functions above read 'review' and 'length' -- confirm whether a
# rename step is missing.

data.head()

Unnamed: 0,user_id,text,date,business_id,stars,useful,funny,cool,elite,comment_year,business_name,text_length
0,UlPCp6kFGGUSKycc5kNiJg,PSA: CASH ONLY!! \n\nA hole in the wall family...,2019-11-13 14:25:16,d48Xrx8MhGtdaLvhcYzNWQ,5.0,0,0,0,201920202021,2019,Cafe Diem,622
1,t2ZKf-CjGthLamYKNAcbJw,"When you're craving empanadas, check out this ...",2019-11-09 19:40:02,ngU4740twiB222g4ti0_eQ,5.0,0,0,0,201920202021,2019,Cocina Latina,235
2,9fOezqmM4pYOHCcPD1C0aA,Loved our cheesesteaks. By the length of the l...,2019-12-03 17:15:01,8xTHtLoNIwdpf0FEvIpQIw,5.0,0,0,0,2015201620172018201920202021,2019,By George! Pizza Pasta & Cheesesteaks,261
3,UHuNFy-CDLj_SVMOQBHryw,Such fun! Great experience. Chef was terrific....,2021-09-25 14:35:27,29sZgoR7VN_3ck3NF5easg,5.0,0,0,0,20202021,2021,Izakaya Japanese Bar & Grill,265
4,DbiZXAui0L2LGHB5E0blrw,Really cool spot in Northern Liberties. Weathe...,2018-08-24 16:36:13,JUlsvVAvZvGHWFfkKm0nlg,5.0,0,0,0,20182019,2018,El Camino Real,975


## API Calls

In [None]:


#