In [31]:
import openai
import json
import pandas as pd
import os
from dotenv import load_dotenv
from numpy.random import choice
import re
from datetime import datetime
import time
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

# Pull variables defined in a local .env file into the process environment.
load_dotenv()

# Hand the OpenAI client its API key (None when OPENAI_API_KEY is not set).
openai.api_key = os.environ.get("OPENAI_API_KEY")


def ChatGPT_request(prompt, temp, model="gpt-3.5"): 
  """
  Send a single-turn user prompt to the OpenAI chat completion endpoint and
  return the text of the reply.
  ARGS:
    prompt: a str prompt, sent as the only "user" message.
    temp: sampling temperature forwarded to the API call.
    model: "gpt-4" selects "gpt-4"; every other value (including the default
           "gpt-3.5") selects "gpt-3.5-turbo".
  RETURNS: 
    a str holding the content of the first returned choice, or the sentinel
    string "ChatGPT ERROR" when the request fails. Callers must check for the
    sentinel; this function does not raise on request failures.
  """
  # Anything that is not explicitly "gpt-4" falls back to gpt-3.5-turbo.
  model = "gpt-4" if model == "gpt-4" else "gpt-3.5-turbo"

  try: 
    completion = openai.ChatCompletion.create(
      model=model, 
      temperature=temp,
      messages=[{"role": "user", "content": prompt}],
    )
    return completion["choices"][0]["message"]["content"]

  except Exception as err: 
    # `except Exception` instead of a bare `except` so that KeyboardInterrupt /
    # SystemExit still stop a long-running loop; the cause is printed so that
    # failures are diagnosable instead of silently becoming sentinel values.
    print ("ChatGPT ERROR", repr(err))
    return "ChatGPT ERROR"


def persona_ChatGPT_request(prompt, persona_system_message): 
  """
  Send a prompt to "gpt-3.5-turbo" together with a system message that sets
  the persona, and return the text of the reply.
  ARGS:
    prompt: a str prompt, sent as the "user" message.
    persona_system_message: a str sent as the "system" message.
  RETURNS: 
    a str holding the content of the first returned choice, or the sentinel
    string "ChatGPT ERROR" when the request fails. Callers must check for the
    sentinel; this function does not raise on request failures.
  """
  try: 
    completion = openai.ChatCompletion.create(
      model="gpt-3.5-turbo", 
      messages=[
        {"role": "system", "content": persona_system_message},
        {"role": "user", "content": prompt},
      ],
    )
    return completion["choices"][0]["message"]["content"]

  except Exception as err: 
    # `except Exception` instead of a bare `except` so that KeyboardInterrupt /
    # SystemExit still propagate; the cause is printed so the failure can be
    # diagnosed.
    print ("ChatGPT ERROR", repr(err))
    return "ChatGPT ERROR"


def ChatGPT_safe_generate_response(prompt,
                                   example_output,
                                   special_instruction,
                                   repeat=3,
                                   fail_safe_response="error",
                                   func_validate=None,
                                   func_clean_up=None,
                                   verbose=False,
                                   temp=1): 
  """
  Ask ChatGPT for a JSON answer of the form {"output": ...} and retry until a
  reply parses and passes validation.
  ARGS:
    prompt: a str prompt; it is wrapped in triple quotes and followed by the
            JSON output instructions before being sent.
    example_output: value rendered (via str) into the example JSON.
    special_instruction: extra instruction text appended after the JSON request.
    repeat: maximum number of attempts.
    fail_safe_response: accepted for backward compatibility; not used. On
                        failure the function returns False.
    func_validate: callable (response, prompt=...) -> truthy when the parsed
                   "output" value is acceptable. None accepts every value.
    func_clean_up: callable (response, prompt=...) -> value to return.
                   None returns the parsed "output" value unchanged.
    verbose: print the final prompt and details of every rejected attempt.
    temp: sampling temperature passed to ChatGPT_request; defaults to 1, the
          value used by the other calls in this notebook.
  RETURNS: 
    the cleaned-up "output" value of the first valid reply, or False when no
    attempt succeeded.
  """
  prompt = '"""\n' + prompt + '\n"""\n'
  prompt += f"Output the response to the prompt above in json. {special_instruction}\n"
  prompt += "Example output json:\n"
  prompt += '{"output": "' + str(example_output) + '"}'

  if verbose: 
    print ("CHAT GPT PROMPT")
    print (prompt)

  for i in range(repeat): 

    try: 
      # ChatGPT_request requires a temperature; omitting it raised a TypeError
      # on every attempt, which used to be swallowed and made this function
      # always return False.
      curr_gpt_response = ChatGPT_request(prompt, temp).strip()
      # Drop anything the model wrote after the closing brace of the JSON.
      end_index = curr_gpt_response.rfind('}') + 1
      curr_gpt_response = curr_gpt_response[:end_index]
      curr_gpt_response = json.loads(curr_gpt_response)["output"]

      if func_validate is None or func_validate(curr_gpt_response, prompt=prompt): 
        if func_clean_up is None: 
          return curr_gpt_response
        return func_clean_up(curr_gpt_response, prompt=prompt)

      if verbose: 
        print ("---- repeat count: \n", i, curr_gpt_response)
        print (curr_gpt_response)
        print ("~~~~")

    except Exception as err: 
      # Best effort: an unparsable reply or a failing callback counts as a
      # failed attempt, but the reason is no longer hidden in verbose mode.
      if verbose: 
        print ("---- attempt failed: \n", i, repr(err))

  return False

def format_paragraph(paragraph):
    """Put each sentence of `paragraph` on its own line.

    A sentence boundary is any run of whitespace that directly follows
    '.', '!' or '?'; that whitespace is replaced by a single newline.
    """
    sentence_boundary = r'(?<=[.!?])\s+'
    return '\n'.join(re.split(sentence_boundary, paragraph))

In [32]:
# Read the raw weights: the code below expects a JSON object that maps each
# category to a {label: number} object.
with open('population_parameters.json') as params_file:
    parameters = json.load(params_file)


def _to_probabilities(weights):
    """Divide every value of a {label: weight} mapping by the sum of all values."""
    weight_sum = sum(weights.values())
    return {label: weight / weight_sum for label, weight in weights.items()}


# Convert percentages to probabilities
parameters = {
    attribute: _to_probabilities(weights)
    for attribute, weights in parameters.items()
}

def generate_population(N, distributions=None, seed=None):
    """Sample a synthetic population of N individuals.

    Every attribute is drawn independently of the others from its own
    categorical distribution.

    Args:
        N: number of individuals (rows) to generate.
        distributions: mapping {attribute: {label: probability}}; the
            probabilities of each attribute must sum to 1. Defaults to the
            module-level `parameters`.
        seed: optional seed for a private random generator, making the sample
            reproducible. When None the global numpy random state is used,
            exactly as before.

    Returns:
        A pandas DataFrame with N rows and one column per attribute.
    """
    if distributions is None:
        distributions = parameters

    if seed is None:
        draw = choice
    else:
        # Local import: the notebook only imports `choice` from numpy.random.
        from numpy.random import RandomState
        draw = RandomState(seed).choice

    # Function to sample from a distribution
    def sample_distribution(distribution):
        categories = list(distribution.keys())
        probabilities = list(distribution.values())
        return draw(categories, N, p=probabilities)

    # Sampling from each distribution
    population = {key: sample_distribution(value) for key, value in distributions.items()}

    return pd.DataFrame(population)

# Example: generate a population of 400 individuals and preview the first rows
sample_population = generate_population(400)
sample_population.head()

Unnamed: 0,Race,Gender,Age,Income,Degree,Community Type,Marital Status
0,African American,Male,30-49,$75K-$100K,Bachelor's or higher,Urban,Living with partner
1,White,Female,30-49,$50K-$75K,High School,Urban,Married
2,African American,Male,50-64,Under $20K,High School,Rural,Married
3,White,Male,18-29,Over $100K,Bachelor's or higher,Rural,Married
4,African American,Female,18-29,$75K-$100K,Bachelor's or higher,Suburban,Never married


In [33]:
def generate_names_for_group(race, gender, age, count):
    """Ask ChatGPT for `count` names fitting one demographic group.

    Args:
        race, gender, age: group descriptors interpolated into the prompt.
        count: number of names requested from the model.

    Returns:
        A list of unique, cleaned names in the order the model produced them.
        The model is not guaranteed to honour `count`, so the list may be
        shorter or longer. An empty list is returned when the request failed.
    """
    # Prompt for ChatGPT to generate names based on race, gender, and age
    prompt = f"Generate {count} unique names suitable for individuals of {race} race, {gender} gender, and in the age group {age}. Only include name and surname in each new line, dont number each line, add nothing else"
    names_response = ChatGPT_request(prompt, temp=1)
    # ChatGPT_request signals failure with this sentinel; without the check
    # the sentinel itself would be handed out as a person's name.
    if names_response == "ChatGPT ERROR":
        return []
    return _clean_generated_names(names_response)


def _clean_generated_names(names_response):
    """Turn a newline-separated model reply into an ordered list of unique names."""
    names = []
    for line in names_response.split('\n'):
        # Remove list numbering such as "1. " that the model may add anyway.
        name = re.sub(r'^\d+\.\s', '', line)
        # Keep letters of any script plus spaces, apostrophes and hyphens, so
        # names like "José" or "O'Brien" survive; drop digits and punctuation.
        name = re.sub(r"[^\w\s'\-]|[\d_]", '', name)
        # Collapse inner whitespace and trim stray bullets/quotes at the ends.
        name = ' '.join(name.split()).strip("-' ")
        if name:
            names.append(name)
    # Deduplicate after cleaning; dict.fromkeys keeps first-seen order, unlike
    # set(), whose order changes from run to run.
    return list(dict.fromkeys(names))

def generate_names_for_population(population):
    """Add a 'Name' column with one generated name per individual.

    Individuals are grouped by (Race, Gender, Age) and one batch of names is
    requested per group.

    Args:
        population: DataFrame with 'Race', 'Gender' and 'Age' columns. It is
            modified in place (the 'Name' column is added or overwritten).

    Returns:
        The same DataFrame object that was passed in. When fewer names than
        group members come back, the remaining members keep Name = None.
    """
    name_column = [None] * len(population)

    # Group on a positionally indexed view so that group.index holds row
    # positions. The original index labels were used as list positions, which
    # is only correct for a default 0..n-1 index (e.g. not after filtering).
    positional = population.reset_index(drop=True)
    grouped_population = positional.groupby(['Race', 'Gender', 'Age'])

    for (race, gender, age), group in grouped_population:
        count = len(group)
        names = generate_names_for_group(race, gender, age, count)
        # zip stops at the shorter sequence: surplus names are ignored and
        # missing ones leave None in place.
        for position, name in zip(group.index, names):
            name_column[position] = name

    # Assigning a plain list is positional, matching how name_column was filled.
    population['Name'] = name_column
    return population

# Example usage
# generate_names_for_population adds the 'Name' column to the frame it is given
# and returns that same frame, so sample_population gains the column as well.
population_with_names = generate_names_for_population(sample_population)
population_with_names.head()

Unnamed: 0,Race,Gender,Age,Income,Degree,Community Type,Marital Status,Name
0,African American,Male,30-49,$75K-$100K,Bachelor's or higher,Urban,Living with partner,Terrence Roberts
1,White,Female,30-49,$50K-$75K,High School,Urban,Married,Heather Martinez
2,African American,Male,50-64,Under $20K,High School,Rural,Married,Andre Williams
3,White,Male,18-29,Over $100K,Bachelor's or higher,Rural,Married,Andrew Nelson
4,African American,Female,18-29,$75K-$100K,Bachelor's or higher,Suburban,Never married,Jasmine Davis


In [34]:
def generate_backstory_prompt(sample_individual):
    """Build the prompt that asks for a one-paragraph backstory.

    `sample_individual` is any mapping-like object (e.g. a DataFrame row) that
    provides the persona fields listed below; each is rendered on its own
    "Field: value" line ahead of the instruction.
    """
    persona_fields = (
        'Name', 'Race', 'Gender', 'Age', 'Income', 'Degree',
        'Community Type', 'Marital Status',
    )
    pieces = ["Based on the following persona:\n"]
    for field in persona_fields:
        pieces.append(f"{field}: {sample_individual[field]}\n")
    pieces.append("create a one-paragraph backstory that reflects the characteristics and life experiences of this individual.")
    return "".join(pieces)

# Try the backstory prompt on the first individual, using GPT-4.
sample_individual = sample_population.iloc[0]
backstory_prompt = generate_backstory_prompt(sample_individual)
backstory_response = ChatGPT_request(backstory_prompt, temp=1, model="gpt-4")

# Show the reply one "."-separated fragment per line (the periods themselves
# are consumed by the split).
print("\n".join(fragment.strip() for fragment in backstory_response.split(".")))

Terrence Roberts is a dynamic African American man in his early 40s, born and raised in the eclectic, fast-paced environment of New York City
Having successfully graduated from Columbia University with a Masters degree in Finance, he skillfully built a career in a renowned investment firm, earning an income between $75K and $100K
Terrence's intellect and hard work have always paid off from his school days, earning him a scholarship for his higher education, till date as he navigates the demanding financial landscape
He is living an fulfilling life with his long-term partner, Isaac, in their upscale Manhattan apartment
Isaac, a graphic designer and Terrence's better half, has been with him through all the highs and lows over the years
Although he faces the typical stresses and demands of urban life, Terrence remains an energetic participant in his community, volunteering for local causes and mentoring young, aspiring business students to give back to the society that shaped him



In [35]:
def generate_preferences(sample_individual):
    """Build the prompt that asks for a persona's online reading preferences.

    Args:
        sample_individual: mapping-like object (e.g. a DataFrame row) providing
            'Name', 'Race', 'Gender', 'Age', 'Income', 'Degree',
            'Community Type' and 'Marital Status'.

    Returns:
        The prompt string: one "Field: value" line per persona field followed
        by the instruction and the requested comma-separated answer format.
    """
    prompt = (
        f"Based on the following persona:\n"
        f"Name: {sample_individual['Name']}\n"
        f"Race: {sample_individual['Race']}\n"
        f"Gender: {sample_individual['Gender']}\n"
        f"Age: {sample_individual['Age']}\n"
        f"Income: {sample_individual['Income']}\n"
        f"Degree: {sample_individual['Degree']}\n"
        f"Community Type: {sample_individual['Community Type']}\n"
        f"Marital Status: {sample_individual['Marital Status']}\n"
        # The trailing ": " matters: adjacent literals are concatenated as-is,
        # and without it the text read "...following format<preference1>, ...".
        f"Generate preferences for what sort of articles they might be interested in reading online in the following format: "
        f"<preference1>, <preference2>,..<preferenceN>. for upto 10 preferences."
    )
    return prompt

# Try the preferences prompt on the first individual, using GPT-4.
# NOTE: this cell reuses the names backstory_prompt / backstory_response even
# though they hold the preferences prompt and reply here.
sample_individual = sample_population.iloc[0]
backstory_prompt = generate_preferences(sample_individual)
backstory_response = ChatGPT_request(backstory_prompt, temp=1, model="gpt-4")

# Show the reply one "."-separated fragment per line.
print(*(fragment.strip() for fragment in backstory_response.split(".")), sep="\n")

Financial advice articles, Tech industry news, Urban development news, Career advancement strategies, Cultural events in the city, Health and fitness trends, Entrepreneurship and start-up reports, Higher education advancements, Articles about managing long-term relationships, Innovative home improvement ideas



In [None]:
# Both result dicts are keyed by the individual's Name; a repeated Name
# overwrites the earlier entry.
backstory_responses = {}
preferences = {}

start = time.time()
for i in range(len(sample_population)):
    print(i)
    sample_individual = sample_population.iloc[i]

    # Generate backstory
    backstory_prompt = generate_backstory_prompt(sample_individual)
    backstory_response = ChatGPT_request(backstory_prompt, model="gpt-3.5", temp=1)
    backstory_responses[sample_individual['Name']] = backstory_response

    # Generate preferences
    preference_prompt = generate_preferences(sample_individual)
    preference = ChatGPT_request(preference_prompt, model="gpt-3.5", temp=1)
    preferences[sample_individual['Name']] = preference
end = time.time()

# Elapsed wall-clock time in seconds.
print(end-start)


In [37]:
def _responses_to_frame(responses, value_column):
    """Turn a {name: text} dict into a frame with 'Name' and `value_column` columns."""
    frame = pd.DataFrame.from_dict(responses, orient='index')
    frame = frame.reset_index()
    return frame.rename(columns={'index':'Name', 0:value_column})


backstory_df = _responses_to_frame(backstory_responses, 'Backstory')
preferences_df = _responses_to_frame(preferences, 'Preferences')


In [38]:
# Attach the generated texts to the population by Name. Inner joins keep only
# the rows whose Name appears in both backstory_df and preferences_df.
sample_population_with_backstory_pref = (
    sample_population
    .merge(backstory_df, on='Name', how='inner')
    .merge(preferences_df, on='Name', how='inner')
)

In [39]:
# Inspect the population after the Backstory and Preferences columns were merged in.
sample_population_with_backstory_pref

Unnamed: 0,Race,Gender,Age,Income,Degree,Community Type,Marital Status,Name,Backstory,Preferences
0,African American,Male,30-49,$75K-$100K,Bachelor's or higher,Urban,Living with partner,Terrence Roberts,Terrence Roberts grew up in a close-knit Afric...,1. Personal Finance: Terrence might be interes...
1,White,Female,30-49,$50K-$75K,High School,Urban,Married,Heather Martinez,"Heather Martinez, a white woman in her mid-thi...",1. DIY Home Improvement: Heather might be inte...
2,African American,Male,50-64,Under $20K,High School,Rural,Married,Andre Williams,"Andre Williams, a 55-year-old African American...","1. ""Tips for small-scale farming and sustainab..."
3,White,Male,18-29,Over $100K,Bachelor's or higher,Rural,Married,Andrew Nelson,Andrew Nelson is a driven and ambitious young ...,"1. ""Top investment strategies for young profes..."
4,African American,Female,18-29,$75K-$100K,Bachelor's or higher,Suburban,Never married,Jasmine Davis,"Jasmine Davis, a vibrant and driven young woma...",1. Career advancement and professional develop...
...,...,...,...,...,...,...,...,...,...,...
395,White,Male,18-29,$75K-$100K,Bachelor's or higher,Suburban,Married,John Henderson,ChatGPT ERROR,ChatGPT ERROR
396,Hispanic,Female,30-49,$75K-$100K,Bachelor's or higher,Suburban,Never married,Adriana Silva,ChatGPT ERROR,ChatGPT ERROR
397,African American,Female,50-64,$20K-$50K,Bachelor's or higher,Rural,Never married,Patricia Parker,ChatGPT ERROR,ChatGPT ERROR
398,White,Female,18-29,$75K-$100K,Bachelor's or higher,Rural,Living with partner,Madison Morgan,ChatGPT ERROR,ChatGPT ERROR


In [45]:
# Flag every row where either generated text is the "ChatGPT ERROR" sentinel,
# then keep only the rows that are not flagged.
_has_error = (
    sample_population_with_backstory_pref[['Preferences', 'Backstory']]
    .eq('ChatGPT ERROR')
    .any(axis=1)
)
sample_population_with_backstory_pref_clean = sample_population_with_backstory_pref[~_has_error]

In [46]:
# (rows, columns) left after removing the rows that contain the error sentinel.
sample_population_with_backstory_pref_clean.shape

(312, 10)

In [47]:
# Save data
current_time = datetime.now()
# Format the date and time
formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
sample_population_with_backstory_pref_clean.to_csv(f"data/{formatted_time}_sample_population_{sample_population_with_backstory_pref_clean.shape[0]}.csv")

In [48]:
# Preview the first rows of the cleaned population that was written to disk.
sample_population_with_backstory_pref_clean.head()

Unnamed: 0,Race,Gender,Age,Income,Degree,Community Type,Marital Status,Name,Backstory,Preferences
0,African American,Male,30-49,$75K-$100K,Bachelor's or higher,Urban,Living with partner,Terrence Roberts,Terrence Roberts grew up in a close-knit Afric...,1. Personal Finance: Terrence might be interes...
1,White,Female,30-49,$50K-$75K,High School,Urban,Married,Heather Martinez,"Heather Martinez, a white woman in her mid-thi...",1. DIY Home Improvement: Heather might be inte...
2,African American,Male,50-64,Under $20K,High School,Rural,Married,Andre Williams,"Andre Williams, a 55-year-old African American...","1. ""Tips for small-scale farming and sustainab..."
3,White,Male,18-29,Over $100K,Bachelor's or higher,Rural,Married,Andrew Nelson,Andrew Nelson is a driven and ambitious young ...,"1. ""Top investment strategies for young profes..."
4,African American,Female,18-29,$75K-$100K,Bachelor's or higher,Suburban,Never married,Jasmine Davis,"Jasmine Davis, a vibrant and driven young woma...",1. Career advancement and professional develop...
