In [7]:
import pandas as pd
import os
import numpy as np
import openai
import tiktoken
import json
from dotenv import load_dotenv

load_dotenv()

script_dir = os.getcwd()
counsel_path = os.path.join(script_dir, "data", "cleaned_counselchat.csv")
chatbot_path = os.path.join(script_dir, "data", "cleaned_chatbot_data.csv")

MAX_TRAIN_SIZE = 700
SYS_MSG = """You are Regina, a helpful counselor with a lot of empathy.
You talk to people who need psychological support and try to alleviate
their pain by giving advice without making judgments."""

In [None]:
def format_data(row):
    messages = list()

    messages.append({
        "role": "system",
        "content": SYS_MSG
    })

    user_msg = {
        "role": "user",
        "content": row["questionText"]
    }
    assistant_msg = {
        "role": "assistant",
        "content": row["clean_answer"]
    }

    messages.extend([user_msg, assistant_msg])
    return {"messages": messages}

df = pd.read_csv(counsel_path)
df = df.drop_duplicates(subset="questionText")
if len(df) > MAX_TRAIN_SIZE:
    df = df.sample(MAX_TRAIN_SIZE)

df2 = pd.read_csv(chatbot_path)

full_df = pd.concat([df, df2]).sample(frac=1).reset_index(drop=True)
df_train = full_df[:-100]
df_val = full_df[-100:]



training_data = df_train.apply(format_data, axis=1).to_list()
validation_data = df_val.apply(format_data, axis=1).to_list()

In [None]:
training_data[:3]

[{'messages': [{'role': 'system',
    'content': 'You are Regina, a helpful counselor with a lot of empathy.\nYou talk to people who need psychological support and try to alleviate\ntheir pain by giving advice without making judgments.'},
   {'role': 'user',
    'content': "I’ve been on 0.5 mg of Xanax twice a day for the past month. It hasn't been helping me at all, but when I take 1 mg during a big anxiety attack, it calms me down. I was wondering how I can ask my psychologist to up the dose to 1 mg twice a day without her thinking I'm abusing them. I just have very big anxiety attacks. Should I stay on the 0.5mg and deal with the attacks or should I ask to up the dose? I'm afraid she will take me off them and put me on something else."},
   {'role': 'assistant',
    'content': 'In general, Xanax is very short-acting. You mentioned that you are afraid that your medical provider will take you off of the Xanax and put you on something else. If the Xanax is not working well for you, I w

# Costs

In [None]:
# Token counting functions
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [None]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in training_data:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 54, 1000
mean / median: 271.76601307189543, 247.0
p5 / p95: 60.0, 505.0000000000001

#### Distribution of num_assistant_tokens_per_example:
min / max: 2, 879
mean / median: 166.03529411764706, 136.0
p5 / p95: 7.0, 357.80000000000007

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning


In [69]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25
COST_PER_1K = 0.008

n_epochs = TARGET_EPOCHS
n_train_examples = len(training_data)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")
print(f"With a price of ${COST_PER_1K} / 1K Tokens, the cost should be around: \
$ {(n_epochs * n_billing_tokens_in_dataset) / 1000 * COST_PER_1K}")


Dataset has ~207901 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~623703 tokens
With a price of $0.008 / 1K Tokens, the cost should be around: $ 4.989624


# Save Files

In [70]:
def write_jsonl(data_list: list, path: str) -> None:
    with open(path, "w") as out:
        for ddict in data_list:
            jout = json.dumps(ddict) + "\n"
            out.write(jout)

training_path = os.path.join(script_dir, "data", "tmp_recipe_finetune_training.jsonl")
write_jsonl(training_data, training_path)

validation_path = os.path.join(script_dir, "data", "tmp_recipe_finetune_validation.jsonl")
write_jsonl(validation_data, validation_path)

# Upload Files

In [71]:
training_response = openai.File.create(
    file=open(training_path, "rb"), purpose="fine-tune"
)
training_file_id = training_response["id"]

validation_response = openai.File.create(
    file=open(validation_path, "rb"), purpose="fine-tune"
)
validation_file_id = validation_response["id"]

print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)

Training file ID: file-k5x943rw8eWvrvztLH98KYxy
Validation file ID: file-pzQXHAs6Q9MooAjb1DmCNpos


# Fine-tuning

In [None]:
response = openai.FineTuningJob.create(
    training_file=training_file_id,
    validation_file=validation_file_id,
    model="gpt-3.5-turbo",
    suffix="counselor-test2",
)

job_id = response["id"]

print("Job ID:", response["id"])
print("Status:", response["status"])

In [None]:
response = openai.FineTuningJob.retrieve(job_id)
print(response)

{
  "object": "fine_tuning.job",
  "id": "ftjob-4Mz8NsADmeBwZSdZni2G7C7L",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1694530716,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-paNJzG7NBclkFaxfxYM9IntQ",
  "result_files": [],
  "status": "running",
  "validation_file": "file-IEfGb36xxasnBW1bLE8vRKzz",
  "training_file": "file-vXIsNXTblmxC5tSVLHVdS8Gl",
  "hyperparameters": {
    "n_epochs": 3
  },
  "trained_tokens": null,
  "error": null
}


In [None]:

response = openai.FineTuningJob.list_events(id=job_id)

events = response["data"]
events.reverse()

for event in events:
    print(event["message"])


Step 500/2295: training loss=1.73
Step 600/2295: training loss=0.78
Step 700/2295: training loss=2.18
Step 800/2295: training loss=1.91
Step 900/2295: training loss=1.74
Step 1000/2295: training loss=2.04
Step 1100/2295: training loss=0.19
Step 1200/2295: training loss=1.91
Step 1300/2295: training loss=1.87
Step 1400/2295: training loss=1.88
Step 1500/2295: training loss=1.95
Step 1600/2295: training loss=1.93
Step 1700/2295: training loss=1.99
Step 1800/2295: training loss=1.85
Step 1900/2295: training loss=0.39
Step 2000/2295: training loss=0.23
Step 2100/2295: training loss=2.81
Step 2200/2295: training loss=2.11
New fine-tuned model created: ft:gpt-3.5-turbo-0613:dq-data:counselor-test2:7y0ExFEB
The job has successfully completed


In [None]:
response = openai.FineTuningJob.retrieve(job_id)
fine_tuned_model_id = response["fine_tuned_model"]

print("Fine-tuned model ID:", fine_tuned_model_id)

Fine-tuned model ID: ft:gpt-3.5-turbo-0613:dq-data:counselor-test2:7y0ExFEB
