# Generate Reviews

This Jupyter Notebook creates fake reviews using ChatGPT, using different models (GPT-3.5, GPT-4), prompt engineering techniques (zero- vs few-shot prompts), restaurant and review characteristics.

In [1]:
!pip install openai

[0m

## Setup

In [13]:
!pwd

/Users/jackgibson/Documents/advanced_ml/how_the_bear_got_a_C


In [2]:
#Dependencies
import os
import csv
import time
import numpy as np
import pandas as pd
from openai import OpenAI
# from openai import openai, APIRemovedInV1
# import openai # pip install openai==0.27.8

#Directories
os.chdir('/Users/jackgibson/Documents/advanced_ml/how_the_bear_got_a_C')
output_dir = 'how_the_bear_got_a_C/chat_gpt'

os.environ["OPENAI_API_KEY"] = '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'

#OpenAI
client = OpenAI()


#Base prompt
with open('chat_gpt/base_prompt.txt', 'r') as file:
    base_prompt = file.read()

print(f'Base prompt:\n\n{base_prompt}')

Base prompt:

You attended a restaurant named REST_NAME.
You rated your experience with NUM_STARS stars out of 5.
Write a review of NUM_CHAR characters describing your experience.


## Helper Functions

In [4]:
def find_moments_of_attribute_distribution(dataset: pd.DataFrame) -> dict:
    """
    Finds review attributes from underlying distributions of real reviews.

    Input:
        - dataset (DataFrame): a pandas dataframe.
    
    Returns dictionary with attribute (str), moments (dict) pairs. 
    """

    #Create output structure
    attr_moments = {'stars': None, 'text_length': None}

    #Compute moments for rest of attributes
    for attr in ['stars', 'text_length']:
        attr_moments[attr] = {'mean': dataset[attr].mean(),
                             'median': dataset[attr].median(), 
                             'std': dataset[attr].std()}

    return attr_moments

def create_fake_review_attributes(dataset: pd.DataFrame, attr_moments: dict) -> dict:
    """
    Creates attributes by drawing random value from empirical distribution.

    Input:
        - attribute_moments (dict): a dictionary of attribute, moments.
    
    Returns dictionary where each key, value is an attribute, value.
    """

    #Create output structure
    attrs = {'business_name': None, 'stars': None, 'text_length': None}

    #Compute and store values
    for key in ['text_length', 'stars', 'business_name']:
        if key == 'text_length':
            attr_value = round(abs(np.random.normal(attr_moments[key]['mean'], attr_moments[key]['std'])))
            # print('picked_text_length:', attr_value)
        elif key == 'stars':
            choices = [1, 2, 3, 4, 5]
            probs = [0.153, 0.078, 0.099, 0.208, 0.462] #from empirical distribution
            attr_value = np.random.choice(choices, p=probs)
            # print('picked_star:', attr_value)
        elif key == 'business_name':
            attr_value = np.random.choice(dataset[key])
            # print('picked_name:', attr_value)
        #Store in dictionary
        attrs[key] = attr_value

    return attrs

def draw_reviews_from_verified_sample(dataset: pd.DataFrame, 
                                      num_draws: int, 
                                      attributes: dict) -> list:
    """
    Takes a set of attributes and randomly draws verified reviews with those 
    characteristics.

    Inputs:
        - dataset (DataFrame): a pandas dataframe
        - num_draws (int): number of reviews to extract from dataset
        - attributes (dict): key, value pairs of attributes to filter from

    Returns a list of reviews that match those attributes.        
    """

    #Filter dataset
    filtered_dataset = dataset[dataset['business_name'] == attributes['business_name']]
    num_rows = filtered_dataset.shape[0]

    #Pick reviews
    if num_rows >= num_draws:
        sampled_rows = filtered_dataset.sample(n=num_draws, replace=False)
    else:
        sampled_rows = filtered_dataset.sample(n=num_rows, replace=False)

    examples = []
    for _, row in sampled_rows.iterrows():
        d = {'business_name': row['business_name'], 'stars': row['stars'], 'text_length': row['text_length'], 'text': row['text']}
        examples.append(d)

    return examples

def generate_fake_review_prompt(dataset: pd.DataFrame,
                                base_prompt: str,
                                num_shots: int,
                                attributes: dict) -> str:
    """
    Takes review characteristics and outputs a fake review.
    
    Inputs:
        - base_prompt (str): standard prompt to build from
        - num_shots (int): number of examples to give in prompt
        - review_length (int): max number of words contained in review
        - restaurant (str): name of the restaurant that is getting reviewed
        - num_stars (int): rating of the restaurant by the (fake) user
        - useful (boolean): True if review is useful, False otherwise 
        - funny (boolean): True if review is funny, False otherwise
        - cool (boolean):  True if review is cool, False otherwise  

    Returns a prompt to generate fake review in ChatGPT's API.
    """

    #Build prompt based on relevant variables
    prompt = base_prompt.replace("REST_NAME", attributes['business_name']) \
                        .replace("NUM_CHAR", str(abs(attributes['text_length']))) \
                        .replace("NUM_STARS", str(attributes['stars']))
    #Zero-shot
    if num_shots == 0:
        return prompt
    
    #Few-shot
    reviews = draw_reviews_from_verified_sample(dataset, num_shots, attributes)
    few_shot_text = '\n\nConsider the examples below:\n\n'
    count = 1
    for i in range(len(reviews)):
        few_shot_text += f"##\nExample {count}: Restaurant name = {reviews[i]['business_name']}; " \
                        f"Number of characters = {str(len(reviews[i]))}; " \
                        f"Number of stars = {str(reviews[i]['stars'])}.\n" \
                        f"Review {count}: {reviews[i]['text']}.\n##"
        count +=1

    prompt = prompt+few_shot_text

    return prompt

def make_api_calls(model: str, 
                  system_content: str, 
                  temperature: float, 
                  num_shots: int,
                  dataset: pd.DataFrame
                  ):
    """
    Make calls to ChatGPT's API using a specific prompt, model, and temperature.

    Inputs:
        - model (str): model of OpenAI to use
        - system_content (str): content of system role
        - temperature (float): level of randomness of output
        - num_shots (int): number of examples in prompt
        - dataset (pd.DataFrame): DataFrame containing the dataset
    
    Returns a csv file where each row contains the statement and the output.
    """

    fake_review_count = 0

    if num_shots == 0:
        shots = 'zero'
    else:
        shots = 'few'

    attr_moments = find_moments_of_attribute_distribution(dataset)
    attr_fake = create_fake_review_attributes(dataset, attr_moments)
    fake_prompt = generate_fake_review_prompt(dataset, base_prompt, num_shots, attr_fake)

    os.makedirs(output_dir, exist_ok=True)
    csv_file_path = os.path.join(output_dir, f'fake_reviews_{model}_{num_shots}_shots.csv')

    with open(csv_file_path, mode='w', newline='') as f:

        # Create headers only if the file is empty
        if os.stat(csv_file_path).st_size == 0:
            writer = csv.writer(f)
            writer.writerow(["REVIEW", "LABEL", "MODEL", "NUM_SHOTS"])

        print('Starting API call')

        while True:
            try:
            # Send request
                # completion = openai.ChatCompletion.create(
                print('model', model)
                print('system', system_content)
                print('prompt', fake_prompt)
                completion = client.chat.completions.create(
                model=model,
                messages=[{'role': 'system', 'content': system_content,
                        'role': 'user', 'content': fake_prompt}],
                temperature=temperature)

                # Retrieve response
                response = completion['choices'][0]['message']['content']
                print(completion)
                
                break  # Break out of the loop if API call is successful
            
            except:
                print('got and erro')
            # except APIException as e:
            #     print(f"APIRemovedInV1 error: {e}")
            # except client.APIError as e:
            #     print("API error. Retrying... (if error persists, check status.openai.com)")
            #     time.sleep(10)
            # except client.Timeout as e:
            #     print("Request timed out. Retrying... (if error persists, check internet connection)")
            #     time.sleep(5)
            # except client.RateLimitError as e:
            #     print("Reached rate limit. Retrying... (if error persists, check number of tokens/requests)")
            #     time.sleep(60)
            # except client.APIConnectionError as e:
            #     print("API connection error. Retrying... (if error persists, check network/proxy config/ssl/firewall)")
            #     time.sleep(5)
            # except client.InvalidRequestError as e:
            #     print("Invalid request error. Retrying... (if error persists, check for invalid/missing request parameters)")
            # except client.AuthenticationError as e:
            #     print("Authentication error. Retrying... (if error persists, check for invalid/expired/revoked API key or token)")
            # except client.ServiceUnavailableError as e:
            #     print("Server is overloaded. Retrying... (if error persists, check status.openai.com)")
            #     time.sleep(120)

        # Write csv file
        writer.writerow([response, "FAKE", model, shots])
        print(f'Progress: {round(fake_review_count/len(dataset),2)*100}%')
        fake_review_count +=1

    f.close()
    
    return print("API calls complete")

# def make_api_calls(model: str, 
#                   system_content: str, 
#                   temperature: float, 
#                   num_shots: int,
#                   dataset: pd.DataFrame
#                   ):
#     """
#     Make calls to ChatGPT's API using a specific prompt, model and temperature.

#     Inputs:
#         - model (str): model of OpenAI to use
#         - system_content (str): content of system role
#         - temperature (float): level of randomness of output
#         - prompt (str): prompt to feed to the model
#         - num_shots (int): number of examples in prompt
    
#     Returns a csv file where each row contains the statement and the output.
#     """

#     fake_review_count = 0

#     if num_shots == 0:
#         shots = 'zero'
#     else:
#         shots = 'few'

#     attr_moments = find_moments_of_attribute_distribution(dataset)
#     attr_fake = create_fake_review_attributes(dataset, attr_moments)
#     fake_prompt = generate_fake_review_prompt(dataset, base_prompt, num_shots, attr_fake)

#     os.makedirs(output_dir, exist_ok=True)
#     csv_file_path = os.path.join(output_dir, f'fake_reviews_{model}_{num_shots}_shots.csv')

#     with open(csv_file_path, mode='w', newline='') as f:

#         # Create headers only if the file is empty
#         if os.stat(csv_file_path).st_size == 0:
#             writer = csv.writer(f)
#             writer.writerow(["REVIEW", "LABEL", "MODEL", "NUM_SHOTS"])

#         print('Starting API call')

#         # API call
#         while True:
#             try:
#             # Send request
#                 completion = openai.ChatCompletion.create(
#             # completion = client.chat.completions.create(
#                 model=model,
#                 messages=[{'role': 'system', 'content': system_content,
#                         'role': 'user', 'content': fake_prompt}],
#                 temperature=temperature)

#                 # Retrieve response
#                 response = completion['choices'][0]['message']['content']
#                 break  # Break out of the loop if API call is successful

#             #Handle errors
#             # except:
#             #     print("API error, trying again")
#             #     time.sleep(10)

#             except openai.error.APIError as e:
#                 print("API error. Retrying... (if error persists, check status.openai.com)")
#                 time.sleep(10)
#             except openai.error.Timeout as e:
#                 print("Request timed out. Retrying... (if error persists, check internet connection)")
#                 time.sleep(5)
#             except openai.error.RateLimitError as e:
#                 print("Reached rate limit. Retrying... (if error persists, check number of tokens/requests)")
#                 time.sleep(60)
#             except openai.error.APIConnectionError as e:
#                 print("API connection error. Retrying... (if error persists, check network/proxy config/ssl/firewall)")
#                 time.sleep(5)
#             except openai.error.InvalidRequestError as e:
#                 print("Invalid request error. Retrying... (if error persists, check for invalid/missing request parameters)")
#             except openai.error.AuthenticationError as e:
#                 print("Authentication error. Retrying... (if error persists, check for invalid/expired/revoked API key or token)")
#             except openai.error.ServiceUnavailableError as e:
#                 print("Server is overloaded. Retrying... (if error persists, check status.openai.com)")
#                 time.sleep(120)

#             #Handle errors
#             # except:
#             #     print("API error, trying again")

#             # except OpenAI.error.APIConnectionError as e:
#             #     print("API connection error. Retrying... (if error persists, check network/proxy config/ssl/firewall)")
#             #     time.sleep(5)
#             # except OpenAI.error.TimeoutError as e:
#             #     print("Request timed out. Retrying... (if error persists, check internet connection)")
#             #     time.sleep(5)
#             # except OpenAI.error.AuthenticationError as e:
#             #     print("Authentication error. Retrying... (if error persists, check for invalid/expired/revoked API key or token)")
#             # except OpenAI.error.BadRequestError as e:
#             #     print("Bad request error. Retrying... (if error persists, check for invalid/expired/revoked API key or token)")
#             # except OpenAI.error.ConflictError as e:
#             #     print("Conflict error. Retrying... (if error persists, check for invalid/expired/revoked API key or token)")
#             # except OpenAI.error.InternalServerError as e:
#             #     print("Internal server error error. Retrying... (if error persists, check for invalid/expired/revoked API key or token)")
#             # except OpenAI.error.NotFoundError as e:
#             #     print("Not found error. Retrying... (if error persists, check for invalid/expired/revoked API key or token)")
#             # except OpenAI.error.PermissionDeniedError as e:
#             #     print("Permission denied error. Retrying... (if error persists, check for invalid/expired/revoked API key or token)")
#             # except OpenAI.error.RateLimitError as e:
#             #     print("Reached rate limit. Retrying... (if error persists, check number of tokens/requests)")
#             #     time.sleep(60)
#             # except OpenAI.UnprocessableEntityError as e:
#             #     print("Unprocessable entity error. Retrying... (if error persists, check for invalid/missing request parameters)")

#         # Write csv file
#         writer.writerow([response, "FAKE", model, shots])
#         print(f'Progress: {round(fake_review_count/len(dataset),2)*100}%')
#         fake_review_count +=1

#     f.close()
    
#     return print("API calls complete")

## Data

In [None]:
#Load data
data = pd.read_csv('data/yelp/yelp_verified_slim.csv')
data.head()

Unnamed: 0,user_id,text,date,business_id,stars,useful,funny,cool,elite,comment_year,business_name,text_length
0,UlPCp6kFGGUSKycc5kNiJg,PSA: CASH ONLY!! \n\nA hole in the wall family...,2019-11-13 14:25:16,d48Xrx8MhGtdaLvhcYzNWQ,5.0,0,0,0,201920202021,2019,Cafe Diem,622
1,t2ZKf-CjGthLamYKNAcbJw,"When you're craving empanadas, check out this ...",2019-11-09 19:40:02,ngU4740twiB222g4ti0_eQ,5.0,0,0,0,201920202021,2019,Cocina Latina,235
2,9fOezqmM4pYOHCcPD1C0aA,Loved our cheesesteaks. By the length of the l...,2019-12-03 17:15:01,8xTHtLoNIwdpf0FEvIpQIw,5.0,0,0,0,2015201620172018201920202021,2019,By George! Pizza Pasta & Cheesesteaks,261
3,UHuNFy-CDLj_SVMOQBHryw,Such fun! Great experience. Chef was terrific....,2021-09-25 14:35:27,29sZgoR7VN_3ck3NF5easg,5.0,0,0,0,20202021,2021,Izakaya Japanese Bar & Grill,265
4,DbiZXAui0L2LGHB5E0blrw,Really cool spot in Northern Liberties. Weathe...,2018-08-24 16:36:13,JUlsvVAvZvGHWFfkKm0nlg,5.0,0,0,0,20182019,2018,El Camino Real,975


In [None]:
data.shape[0]

11106

In [None]:
# attr_moments = find_moments_of_attribute_distribution(data)
# attr_fake = create_fake_review_attributes(data, attr_moments)
# print(attr_fake)

In [None]:
# num_shots=3
# reviews = draw_reviews_from_verified_sample(data, num_shots, attr_fake)
# for r in reviews:
#     print(r)
#     print('\n')

In [None]:
num_shots=3
attr_moments = find_moments_of_attribute_distribution(data)
attr_fake = create_fake_review_attributes(data, attr_moments)
fake_prompt = generate_fake_review_prompt(data, base_prompt, num_shots, attr_fake)
print(fake_prompt)

You attended a restaurant named Mi Nidito Restaurant.
You rated your experience with 5 stars out of 5.
Write a review of 137 characters describing your experience.

Consider the examples below:

##
Example 1: Restaurant name = Mi Nidito Restaurant; Number of characters = 4; Number of stars = 2.0.
Review 1: OK, been a while, but that is because I was not impressed.  The ambiance of the decor is about the best thing going for it.  I truly do not know why this place is constantly packed to the brim with folks.  Nor do I understand why anyone would wait an hour plus to be seated.  A restaurant would have to be absolutely incredible, plus have seating at a bar for me to wait that long.  Speaking of seating, there are two small benches for those waiting to be seated.  Don't expect to find a place to sit, in other words.  All that aside, it is about the food, right?  I mean, the place is packed.  Clinton liked it.  Willie Nelson liked it.  Other celebrities liked it.  SO it's got to be good, 

## API Calls

In [5]:
make_api_calls(model='gpt-3.5-turbo-16k-0613', 
               system_content='You are a person that writes restaurant reviews', 
               temperature=1.0, 
               num_shots=0,
               dataset= pd.read_csv('data/yelp/yelp_verified_slim.csv')
               )

Starting API call
model gpt-3.5-turbo-16k-0613
system You are a person that writes restaurant reviews
prompt You attended a restaurant named El Molinito.
You rated your experience with 5 stars out of 5.
Write a review of 1366 characters describing your experience.
got and erro
model gpt-3.5-turbo-16k-0613
system You are a person that writes restaurant reviews
prompt You attended a restaurant named El Molinito.
You rated your experience with 5 stars out of 5.
Write a review of 1366 characters describing your experience.
got and erro
model gpt-3.5-turbo-16k-0613
system You are a person that writes restaurant reviews
prompt You attended a restaurant named El Molinito.
You rated your experience with 5 stars out of 5.
Write a review of 1366 characters describing your experience.
got and erro
model gpt-3.5-turbo-16k-0613
system You are a person that writes restaurant reviews
prompt You attended a restaurant named El Molinito.
You rated your experience with 5 stars out of 5.
Write a review of

In [None]:
#Split dataset for API calls
part_size = len(data) // 3

#Jack
make_api_calls(model='gpt-4-1106-preview', 
               system_content='You are a person that writes restaurant reviews', 
               temperature=1.0, 
               num_shots=3,
               dataset= data.iloc[:part_size]
               )

#Claire
make_api_calls(model='gpt-3.5-turbo-16k-0613', 
               system_content='You are a person that writes restaurant reviews', 
               temperature=1.0, 
               num_shots=3,
               dataset= data.iloc[part_size:2 * part_size]
               )

#Benja
make_api_calls(model='gpt-3.5-turbo-16k-0613', 
               system_content='You are a person that writes restaurant reviews', 
               temperature=1.0, 
               num_shots=0,
               dataset= data.iloc[part_size:2 * part_size]
               )
