# Data generation step

In [16]:
# High-level description of the model being trained; it is sent as the user
# message when the system prompt is generated further down.
prompt = "A text generative chatbot model that acts like a librarian to aid students on their academic related queries and questions"

# Sampling temperature read by generate_system_message.
temperature = 0.3

# Number of examples to generate.
number_of_examples = 5

Run this to generate the dataset.

In [8]:
!pip install openai tenacity



In [9]:
import os
import openai
import random
from tenacity import retry, stop_after_attempt, wait_exponential

In [10]:
# Read the key from the environment so no secret is ever stored in the notebook.
# NOTE(review): a literal key used to sit in a comment here; treat it as
# compromised and revoke it.
# Falls back to an empty string (the previous placeholder) when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

Generate a system message for the model.

In [50]:
def generate_system_message(prompt):
    """Ask the chat model to write an inference-time system prompt.

    Args:
        prompt: High-level description of the model being trained. Leading
            and trailing whitespace is stripped before it is sent as the
            user message.

    Returns:
        The ``content`` of the first choice in the chat completion response.

    Note:
        The sampling temperature is read from the module-level
        ``temperature`` variable rather than passed as an argument.
    """
    # Instructions for the model that writes the system prompt.
    instructions = "You will be given a high-level description of the model we are training, and from that, you will generate a simple system prompt for that model to use. Remember, you are not generating the system message for data generation -- you are generating the system message to use for inference. A good format to follow is `Given $INPUT_DATA, you will $WHAT_THE_MODEL_SHOULD_DO.`.\n\nMake it as concise as possible. Include nothing but the system prompt in your response.\n\nFor example, never write: `\"$SYSTEM_PROMPT_HERE\"`.\n\nIt should be like: `$SYSTEM_PROMPT_HERE`."

    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": prompt.strip()},
    ]

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-0613",
        messages=conversation,
        temperature=temperature,
        max_tokens=500,
    )

    first_choice = response.choices[0]
    return first_choice.message['content']

# Draft the system prompt that is reused for the training file and for inference.
system_message = generate_system_message(prompt)

print(
    f'The system message is: `{system_message}`. '
    f'Feel free to re-run this cell if you want a better result.'
)

The system message is: `Given a student's academic related query or question, you will provide helpful information and assistance as a knowledgeable librarian.`. Feel free to re-run this cell if you want a better result.


Now let's put our examples into a dataframe and turn them into a final pair of datasets.

In [32]:
import json
import pandas as pd

# Two independent accumulators, filled in parallel by the parsing cell below.
prompts, responses = [], []

In [51]:
# Load the generated examples. The encoding is given explicitly so the result
# does not depend on the platform's default locale encoding.
with open('testprompt.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [52]:
# Show the number of loaded examples and a short preview instead of dumping
# the whole dataset into the notebook output.
print(f'Loaded {len(data)} examples; first 3:')
print(list(data)[:3])



In [53]:
# Parse out prompts and responses from examples.
# The lists are reset here so that re-running this cell never carries over
# rows accumulated by a previous run.
prompts = []
responses = []
skipped_examples = 0

for example in data:
    try:
        split_example = example.split('-----------')
        # Extract BOTH fields before appending either one: appending the prompt
        # first and then failing on the response would leave the two lists with
        # different lengths and make the DataFrame construction below fail.
        prompt_text = split_example[1].strip()
        response_text = split_example[3].strip()
    except (AttributeError, IndexError):
        # Not a string, or not enough '-----------' separators: skip this
        # example but keep count instead of hiding it silently.
        skipped_examples += 1
        continue
    prompts.append(prompt_text)
    responses.append(response_text)

# Create a DataFrame and remove duplicates
df = pd.DataFrame({
    'prompt': prompts,
    'response': responses
}).drop_duplicates()

print('There are ' + str(len(df)) + ' successfully-generated examples.')
if skipped_examples:
    print('Skipped ' + str(skipped_examples) + ' malformed examples.')

# Create training examples in the format required for GPT-3.5 fine-tuning:
# one system / user / assistant conversation per row.
stripped_system_message = system_message.strip()
training_examples = [
    {
        "messages": [
            {"role": "system", "content": stripped_system_message},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": assistant_response}
        ]
    }
    for user_prompt, assistant_response in zip(df['prompt'], df['response'])
]

# Save training examples to a .jsonl file (one JSON object per line)
with open('training_examples.jsonl', 'w', encoding='utf-8') as f:
    for example in training_examples:
        f.write(json.dumps(example) + '\n')

There are 433 successfully-generated examples.


# Upload the file to OpenAI

In [74]:
# Upload the JSONL file written by the previous cell. The same relative path is
# used as when writing it, and the context manager closes the handle as soon
# as the upload call returns.
with open("training_examples.jsonl", "rb") as training_file:
    file_id = openai.File.create(
        file=training_file,
        purpose='fine-tune').id

print(file_id)

file-BhwtRybZmh8pEK18whGK661Z


# Train the model! You may need to wait a few minutes before running the next cell to allow for the file to process on OpenAI's servers.

In [77]:
# Read the key from the environment so no secret is ever stored in the notebook.
# NOTE(review): a literal key used to sit in a comment here; treat it as
# compromised and revoke it.
# Falls back to an empty string (the previous placeholder) when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Start the fine-tuning job on the uploaded file; the returned job object is
# displayed as the cell output and contains the job id needed by later cells.
openai.FineTuningJob.create(training_file=file_id, model="gpt-3.5-turbo-0613")

<FineTuningJob fine_tuning.job id=ftjob-vkopI7ImVuqCugObPH1JvNaG at 0x787d3882b9c0> JSON: {
  "object": "fine_tuning.job",
  "id": "ftjob-vkopI7ImVuqCugObPH1JvNaG",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1693075855,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-T5YA16GrD5v0nJZw8y6jJiks",
  "result_files": [],
  "status": "created",
  "validation_file": null,
  "training_file": "file-BhwtRybZmh8pEK18whGK661Z",
  "hyperparameters": {
    "n_epochs": 3
  },
  "trained_tokens": null
}

# Now, just wait until the fine-tuning run is done, and you'll have a ready-to-use model!

Run this cell every 20 minutes or so -- eventually, you'll see a message "New fine-tuned model created: ft:gpt-3.5-turbo-0613:xxxxxxxxxxxx"

Once you see that message, you can go to the OpenAI Playground (or keep going to the next cells and use the API) to try the model!

In [102]:
# An empty id cannot refer to any job. This is the id reported when the job was
# created above -- replace it with your own job id when re-running.
openai.FineTuningJob.list_events(id="ftjob-vkopI7ImVuqCugObPH1JvNaG", limit=2)

<OpenAIObject list at 0x787d36c34180> JSON: {
  "object": "list",
  "data": [
    {
      "object": "fine_tuning.job.event",
      "id": "ftevent-kke1mvW0hpWZkEbxrWvRuyJM",
      "created_at": 1693077853,
      "level": "info",
      "message": "Fine-tuning job successfully completed",
      "data": null,
      "type": "message"
    },
    {
      "object": "fine_tuning.job.event",
      "id": "ftevent-ocnwQlhB7QvHYfgv7g2BroWr",
      "created_at": 1693077851,
      "level": "info",
      "message": "New fine-tuned model created: ft:gpt-3.5-turbo-0613:smulib::7rtBDMkN",
      "data": null,
      "type": "message"
    }
  ],
  "has_more": true
}

In [103]:
# An empty id cannot refer to any job. This is the id reported when the job was
# created above -- replace it with your own job id when re-running.
openai.FineTuningJob.retrieve("ftjob-vkopI7ImVuqCugObPH1JvNaG")

<FineTuningJob fine_tuning.job id=ftjob-vkopI7ImVuqCugObPH1JvNaG at 0x787d36e121b0> JSON: {
  "object": "fine_tuning.job",
  "id": "ftjob-vkopI7ImVuqCugObPH1JvNaG",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1693075855,
  "finished_at": 1693077853,
  "fine_tuned_model": "ft:gpt-3.5-turbo-0613:smulib::7rtBDMkN",
  "organization_id": "org-T5YA16GrD5v0nJZw8y6jJiks",
  "result_files": [
    "file-MjDm5CNeUYTvUGAAxuFE0Tmu"
  ],
  "status": "succeeded",
  "validation_file": null,
  "training_file": "file-BhwtRybZmh8pEK18whGK661Z",
  "hyperparameters": {
    "n_epochs": 3
  },
  "trained_tokens": 202890
}

# Once your model is trained, run the next cell to grab the fine-tuned model name.

In [104]:
# Fetch the job and read the name of the resulting model.
model_name_pre_object = openai.FineTuningJob.retrieve("ftjob-vkopI7ImVuqCugObPH1JvNaG")
model_name = model_name_pre_object.fine_tuned_model

if model_name is None:
    # fine_tuned_model stays null until the job has finished, so say so
    # explicitly rather than printing a bare "None".
    print(f'No fine-tuned model yet (job status: {model_name_pre_object.status}). Re-run this cell later.')
else:
    print(model_name)

ft:gpt-3.5-turbo-0613:smulib::7rtBDMkN


# Let's try it out!

In [105]:
# Query the fine-tuned model with one randomly sampled training prompt.
response = openai.ChatCompletion.create(
    model=model_name,
    messages=[
        {
            "role": "system",
            # Strip the system message exactly as was done when the training
            # examples were built, so inference uses the same system prompt
            # the model was tuned on.
            "content": system_message.strip(),
        },
        {
            "role": "user",
            "content": df['prompt'].sample().values[0],
        }
    ],
)

# Last expression of the cell: display the reply text.
response.choices[0].message['content']

'SMU course packs are sold and distributed at the SMU Bookstore <https://www.facebook.com/SMUbookstore>.'

In [106]:
# Smoke-test the fine-tuned model with a generic system prompt and greeting.
completion = openai.ChatCompletion.create(
    model=model_name,
    messages=[
        dict(role="system", content="You are a helpful assistant."),
        dict(role="user", content="Hello!"),
    ],
)

# Print the whole message object (role and content) of the first choice.
print(completion.choices[0].message)

{
  "role": "assistant",
  "content": "How can I help you?"
}
