#### Set the OpenAI api key

In [16]:
from dotenv import load_dotenv
import random
import os
import json

# Load environment variables from .env file
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Instantiate the OpenAI client
from openai import OpenAI
client = OpenAI(api_key=OPENAI_API_KEY)

#### Temperature set to 0 for more deterministic, less creative output. More examples the better for fine-tuning

In [17]:
TEMPERATURE = .0
MAX_TOKENS = 1500
SAMPLES_TO_GENERATE = 2

#### Descriptive prompt for what we want the model to do

In [18]:
DESTINATION = "Paris, France"
model_descripter = f'''
A model that takes in typical tourist questions asked in English about {DESTINATION}, including highlights, sightseeing, history, culture, demographics, districts, communities, languages, activities, outdoor activities, music, religion, local cuisine, 
drinks, restaurants, cafes, markets, accommodation, local transport, weather, shopping, currencies, crafts, safety, climate, tourist density and other areas of interest, and responds with an accuracte, informative response in English.
'''

In [31]:
messages = []

In [33]:
system_prompt = f'''
Instructions:
---
You are generating data in JSON format used to train a machine learning model. You will be given a high-level description of the model we want to train, and from that, you will generate data samples, each with a instruction/output pair. All responses must be formatted in the following JSON format:

{{"instruction": "prompt instruction goes here", "output": "response output goes here"}}

IMPORTANT: Responses not in the provided JSON format will be considered incorrect. Only one instruction/output pair should be generated per turn. For each turn ensure diversity, complexity, and high-quality to train a well-performing model. Here is the high level description of the type of model we want to train:
{model_descripter}
---
'''

def generate_sample(prompt, datasample_history, temperature=.0, max_tokens=1000):
    messages=[
        {
            "role": "system",
            "content": f"You are generating JSON data which will be used to train a machine learning model.\n\nYou will be given a high-level description of the model we want to train, and from that, you will generate data samples, each with a prompt/response pair.\n\nYou will do so in this format:\n```\nprompt\n-----------\n$prompt_goes_here\n-----------\n\nresponse\n-----------\n$response_goes_here\n-----------\n```\n\nOnly one prompt/response pair should be generated per turn.\n\nFor each turn, make the example slightly more complex than the last, while ensuring diversity.\n\nMake sure your samples are unique and diverse, yet high-quality and complex enough to train a well-performing model.\n\nHere is the type of model we want to train:\n`{model_descripter}`"
        }
    ]

    if len(datasample_history) > 0:
        if len(datasample_history) > 10:
            datasample_history = random.sample(datasample_history, 10)
        for sample in datasample_history:
            messages.append({
                "role": "assistant",
                "content": sample
            })

    print(messages)
    chat_completion = client.chat.completions.create(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        model="gpt-4-1106-preview",
        response_format=({"type": "json_object"})
    )

    return chat_completion.choices[0].message.content

# Generate samples
datasample_history = []
for i in range(SAMPLES_TO_GENERATE):
    print(f'Generating sample {i}')
    sample = generate_sample(system_prompt, datasample_history, TEMPERATURE, MAX_TOKENS)
    #print(sample)
    datasample_history.append(sample)
    #sample = json.loads(sample)
    #print(sample)

    #print(datasample_history)

print(datasample_history[0])

Generating sample 0
[{'role': 'system', 'content': 'You are generating JSON data which will be used to train a machine learning model.\n\nYou will be given a high-level description of the model we want to train, and from that, you will generate data samples, each with a prompt/response pair.\n\nYou will do so in this format:\n```\nprompt\n-----------\n$prompt_goes_here\n-----------\n\nresponse\n-----------\n$response_goes_here\n-----------\n```\n\nOnly one prompt/response pair should be generated per turn.\n\nFor each turn, make the example slightly more complex than the last, while ensuring diversity.\n\nMake sure your samples are unique and diverse, yet high-quality and complex enough to train a well-performing model.\n\nHere is the type of model we want to train:\n`\nA model that takes in typical tourist questions asked in English about Paris, France, including highlights, sightseeing, history, culture, demographics, districts, communities, languages, activities, outdoor activities,

#### Generate the dataset sample prompt and responses

In [26]:
system_prompt = f'''
Instructions:
---
You are generating data in JSON format used to train a machine learning model. You will be given a high-level description of the model we want to train, and from that, you will generate data samples, each with a instruction/output pair. All responses must be formatted in the following JSON format:

{{"instruction": "prompt instruction goes here", "output": "response output goes here"}}

IMPORTANT: Responses not in the provided JSON format will be considered incorrect. Only one instruction/output pair should be generated per turn. For each turn ensure diversity, complexity, and high-quality to train a well-performing model. Here is the high level description of the type of model we want to train:
{model_descripter}
---
'''

def generate_sample(prompt, datasample_history, temperature=.0, max_tokens=1000):
    print(prompt)
    messages=[
        {
            "role": "system",
            "content": prompt
        }
    ]

    if len(datasample_history) > 0:
        if len(datasample_history) > 10:
            datasample_history = random.sample(datasample_history, 10)
        for sample in datasample_history:
            messages.append({
                "role": "assistant",
                "content": sample
            })

    chat_completion = client.chat.completions.create(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        model="gpt-4-1106-preview",
        response_format=({"type": "json_object"})
    )

    return chat_completion.choices[0].message.content

# Generate samples
datasample_history = []
for i in range(SAMPLES_TO_GENERATE):
    print(f'Generating sample {i}')
    sample = generate_sample(system_prompt, datasample_history, TEMPERATURE, MAX_TOKENS)
    #print(sample)
    sample = json.loads(sample)
    #print(sample)
    datasample_history.append(sample)
    #print(datasample_history)

print(datasample_history[0])

Generating sample 0

Instructions:
---
You are generating data in JSON format used to train a machine learning model. You will be given a high-level description of the model we want to train, and from that, you will generate data samples, each with a instruction/output pair. All responses must be formatted in the following JSON format:

{"instruction": "prompt instruction goes here", "output": "response output goes here"}

IMPORTANT: Only one instruction/output pair should be generated per turn. For each turn ensure diversity, complexity, and high-quality to train a well-performing model. Here is the high level description of the type of model we want to train:

A model that takes in typical tourist questions asked in English about Paris, France, including highlights, sightseeing, history, culture, demographics, districts, communities, languages, activities, outdoor activities, music, religion, local cuisine, 
drinks, restaurants, cafes, markets, accommodation, local transport, weather

BadRequestError: Error code: 400 - {'error': {'message': "'$.messages[1].content' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}

In [8]:
import pandas as pd
df = pd.DataFrame(datasample_history)

# Remove duplicates
df = df.drop_duplicates()

print('There are ' + str(len(df)) + ' successfully-generated examples. Here are the first few:')

df.head()

There are 2 successfully-generated examples. Here are the first few:


Unnamed: 0,0
0,"{""instruction"": ""What are some must-visit attr..."
1,"{""instruction"": ""Can you tell me about the cli..."


Now let's put our examples into a dataframe and turn them into a final pair of datasets.

In [39]:
import pandas as pd

# Initialize lists to store prompts and responses
prompts = []
responses = []

# Parse out prompts and responses from samples
for sample in datasample_history:
  try:
    split_sample = sample.split('-----------')
    prompts.append(split_sample[1].strip())
    responses.append(split_sample[3].strip())
  except:
    pass

# Create a DataFrame
df = pd.DataFrame({
    'prompt': prompts,
    'response': responses
})

# Remove duplicates
df = df.drop_duplicates()

print('There are ' + str(len(df)) + ' successfully-generated examples. Here are the first few:')

df.head()

There are 94 successfully-generated examples. Here are the first few:


Unnamed: 0,prompt,response
0,What are some of the most popular tourist attr...,Paris is home to numerous world-renowned touri...
1,What is the best time of year to visit Paris?,The best time to visit Paris depends on what y...
2,What are some traditional French dishes I shou...,Paris offers a wide range of traditional Frenc...
3,What are some safety tips for tourists visitin...,"While Paris is generally a safe city, it's alw..."
4,What is the main language spoken in Paris and ...,The main language spoken in Paris is French. H...


In [40]:
# Split the data into train and test sets, with 90% in the train set
train_df = df.sample(frac=0.9, random_state=42)
test_df = df.drop(train_df.index)

# Save the dataframes to .jsonl files (JSON lines, where eac h line is a valid JSON object)
train_df.to_json('train.jsonl', orient='records', lines=True)
test_df.to_json('test.jsonl', orient='records', lines=True)

In [42]:
len(prompts)
len(responses)

100

In [32]:
len(datasample_history)

100

In [36]:
datasample_history[30]

"prompt\n-----------\nWhat is the currency used in Paris and where can I exchange my money?\n-----------\n\nresponse\n-----------\nThe currency used in Paris is the Euro (€). You can exchange your money at various places:\n\n1. Banks: Most banks in Paris offer currency exchange services. However, they may charge a service fee and the exchange rates may not be the best.\n\n2. Currency Exchange Bureaus: These are found throughout the city, especially in tourist areas and near major train stations. They usually offer better rates than banks, but it's still a good idea to compare rates before exchanging.\n\n3. ATMs: You can withdraw Euros directly from ATMs, which are widely available throughout Paris. This usually gives you a better exchange rate than currency exchange bureaus or banks. Be aware of any fees your bank may charge for international withdrawals.\n\n4. Online: Some services allow you to order currency online and have it delivered to your home or pick it up at a local branch.\n