#### Set the OpenAI API key, then create the client

In [1]:
from dotenv import load_dotenv
import random
import os
import json

# Pull the variables declared in the local .env file into the process
# environment, then look the API key up from there instead of hard-coding it.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Build the OpenAI client object used for the chat-completion requests
from openai import OpenAI
client = OpenAI(api_key=OPENAI_API_KEY)

#### Temperature is set to 0 for more deterministic, less creative output. The more examples, the better for fine-tuning

In [2]:
# Generation settings shared by every request
TEMPERATURE = 0.0  # sampling temperature passed to the API
MAX_TOKENS = 1_500  # token cap passed to the API for each generated sample
CITY_SAMPLES_TO_GENERATE = 12  # number of samples requested per city

#### The system prompt, which instructs the model in its role

In [3]:
# Path to the text file that holds the system-prompt template
file_path = 'system_prompt.txt'

# Read the whole template. The encoding is pinned so the text decodes the same
# way regardless of the platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as prompt_file:
    system_prompt = prompt_file.read()

# system_prompt now holds the contents of the text file
print(system_prompt)

Instructions
---
You are generating data in JSON format used to train a machine learning model. You will be given a high-level description of the model we want to train, and from that, you will generate data samples, each with a instruction/output pair. All responses must be formatted in the following JSON format:

{"prompt": "prompt instruction goes here", "response": "response output goes here"}

IMPORTANT: Responses not in the provided JSON format will be considered incorrect. Only one instruction/output pair should be generated per turn. For each turn ensure diversity, complexity, and high-quality to train a well-performing model. Here is the high level description of the type of model we want to train:
{model_descripter}
---


#### Descriptive prompt for what we want the LLM to generate

In [4]:
# Path to the text file that holds the model-description template
file_path = 'descripter.txt'

# Read the whole template. The encoding is pinned so the text decodes the same
# way regardless of the platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as descripter_file:
    descripter = descripter_file.read()

# descripter now holds the contents of the text file
print(descripter)

A model that takes in typical tourist questions asked in English about {destination}, including highlights, sightseeing, history, culture, demographics, districts, communities, languages, activities, outdoor activities, music, religion, local cuisine, 
drinks, restaurants, cafes, markets, accommodation, local transport, weather, shopping, currencies, crafts, safety, climate, tourist density and other areas of interest, and responds with an accuracte, informative response in English.


In [5]:
import csv

# Load every row of cities.csv as a dict keyed by the CSV header names
with open('cities.csv', newline="") as csvfile:
    cities = list(csv.DictReader(csvfile))

# Show the first record as a quick check of the parsed columns
print(cities[0])

{'CITY_CODE': 'MBJ', 'CITY_NAME_DEF': 'Montego Bay', 'COUNTRY_ISO': 'JM', 'NAME_DEF': 'Jamaica', 'LONG_NAME_DEF': 'Jamaica'}


In [6]:
# Running token-usage counters, all starting from zero
TOTAL_PROMPT_TOKENS = TOTAL_COMPLETION_TOKENS = TOTAL_TOKENS = 0

#### Generate the dataset sample prompts and responses

In [7]:

def generate_sample(prompt, datasample_history, temperature=.0, max_tokens=1000,
                    model="gpt-4-1106-preview", history_limit=10,
                    history_sample_size=5):
    """Request one JSON-formatted data sample from the chat-completions API.

    The request consists of the system prompt followed by previously generated
    samples replayed as assistant messages. When the history is longer than
    ``history_limit``, only a random subset of ``history_sample_size`` entries
    is replayed; the caller's list itself is never modified.

    Side effect: the token usage reported for the request is added to the
    module-level TOTAL_PROMPT_TOKENS, TOTAL_COMPLETION_TOKENS and TOTAL_TOKENS
    counters.

    Args:
        prompt: Text sent as the system message.
        datasample_history: List of earlier sample strings for this run.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Token cap forwarded to the API.
        model: Name of the chat model to call.
        history_limit: History length above which the history is subsampled.
        history_sample_size: Number of history entries kept when subsampling.

    Returns:
        The message content of the first choice in the API response.
    """
    global TOTAL_PROMPT_TOKENS, TOTAL_COMPLETION_TOKENS, TOTAL_TOKENS

    # Rebinding the local name leaves the caller's full history intact.
    if len(datasample_history) > history_limit:
        # Clamp so an oversized sample size cannot make random.sample raise.
        keep = min(history_sample_size, len(datasample_history))
        datasample_history = random.sample(datasample_history, keep)

    messages = [{"role": "system", "content": prompt}]
    messages.extend(
        {"role": "assistant", "content": sample}
        for sample in datasample_history
    )

    chat_completion = client.chat.completions.create(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        model=model,
        response_format={"type": "json_object"},
    )

    usage = chat_completion.usage
    TOTAL_PROMPT_TOKENS += usage.prompt_tokens
    TOTAL_COMPLETION_TOKENS += usage.completion_tokens
    TOTAL_TOKENS += usage.total_tokens

    return chat_completion.choices[0].message.content

# Generate samples
MAX_CITIES = 30  # only the first MAX_CITIES rows of the city list are processed

total_samples = []
for city in cities[:MAX_CITIES]:
    city_samples_history = []
    composite_city_name = city['CITY_NAME_DEF'] + ' ' + city['LONG_NAME_DEF']

    # The prompt depends only on the city, so build it once per city rather
    # than once per sample.
    the_descripter = descripter.replace("{destination}", composite_city_name)
    the_prompt = system_prompt.replace("{model_descripter}", the_descripter)

    for sample_idx in range(CITY_SAMPLES_TO_GENERATE):
        print(f'Generating sample {sample_idx} for city {composite_city_name}')
        # Earlier samples for this city are passed along with each request
        sample = generate_sample(the_prompt, city_samples_history, TEMPERATURE, MAX_TOKENS)
        city_samples_history.append(sample)

    total_samples.extend(city_samples_history)

print(total_samples[0])

Generating sample 0 for city Montego Bay Jamaica
Generating sample 1 for city Montego Bay Jamaica
Generating sample 2 for city Montego Bay Jamaica
Generating sample 3 for city Montego Bay Jamaica
Generating sample 4 for city Montego Bay Jamaica
Generating sample 5 for city Montego Bay Jamaica
Generating sample 6 for city Montego Bay Jamaica
Generating sample 7 for city Montego Bay Jamaica
Generating sample 8 for city Montego Bay Jamaica
Generating sample 9 for city Montego Bay Jamaica
Generating sample 10 for city Montego Bay Jamaica
Generating sample 11 for city Montego Bay Jamaica
Generating sample 0 for city Pristina Kosovo
Generating sample 1 for city Pristina Kosovo
Generating sample 2 for city Pristina Kosovo
Generating sample 3 for city Pristina Kosovo
Generating sample 4 for city Pristina Kosovo
Generating sample 5 for city Pristina Kosovo
Generating sample 6 for city Pristina Kosovo
Generating sample 7 for city Pristina Kosovo
Generating sample 8 for city Pristina Kosovo
Gener

In [8]:
# Prices in USD per 1,000 tokens used for this estimate
# (assumed to be the rates for the model called above; TODO confirm against current pricing)
PROMPT_PRICE_PER_1K = 0.01
COMPLETION_PRICE_PER_1K = 0.03

total_prompt_cost = round((TOTAL_PROMPT_TOKENS / 1000) * PROMPT_PRICE_PER_1K, 2)
total_completion_cost = round((TOTAL_COMPLETION_TOKENS / 1000) * COMPLETION_PRICE_PER_1K, 2)
# Sum the rounded line items so the printed total always matches them
total_cost = round(total_prompt_cost + total_completion_cost, 2)

# Fixed two-decimal formatting so amounts such as 3.4 print as $3.40
print(f"Total prompt tokens: {TOTAL_PROMPT_TOKENS}, cost: ${total_prompt_cost:.2f}")
print(f"Total completion tokens: {TOTAL_COMPLETION_TOKENS}, cost: ${total_completion_cost:.2f}")
print(f"Total tokens: {TOTAL_TOKENS}, cost: ${total_cost:.2f}")

Total prompt tokens: 342763, cost: $3.43
Total completion tokens: 51089, cost: $1.53
Total tokens: 393852, cost: $4.96


In [13]:
import pandas as pd

# Parse every generated sample. A sample that is not valid JSON is skipped and
# counted, so one bad response does not discard the rest of the run.
datasample_history_as_json_obj = []
malformed_samples = 0
for js in total_samples:
    try:
        datasample_history_as_json_obj.append(json.loads(js))
    except json.JSONDecodeError:
        malformed_samples += 1
if malformed_samples:
    print(f'Skipped {malformed_samples} sample(s) that were not valid JSON.')

pd.set_option('display.max_colwidth', 160)
df = pd.DataFrame(datasample_history_as_json_obj)

# Remove duplicates
df = df.drop_duplicates()

print('There are ' + str(len(df)) + ' successfully-generated examples. Here are the first few:')

print(df.head())
# orient='records' does not write the index, so index=False is unnecessary;
# it is omitted because older pandas versions reject it for this orient.
df.to_json('cities_qa.jsonl', orient='records', lines=True)

There are 360 successfully-generated examples. Here are the first few:
                                                                         prompt  \
0                What are some popular outdoor activities to do in Montego Bay?   
1                             Can you tell me about the history of Montego Bay?   
2           What kind of weather can I expect in Montego Bay during the summer?   
3           What are some traditional dishes I should try while in Montego Bay?   
4  What is the currency used in Montego Bay, and where can I exchange my money?   

                                                                                                                                                          response  
0  Popular outdoor activities in Montego Bay include snorkeling and diving at the Montego Bay Marine Park, relaxing on the famous Doctor's Cave Beach, zip-lini...  
1  Montego Bay, often referred to as MoBay, has a rich history that dates back to the Arawak and Tai