# OpenAI Fine-Tuning API
Detailed process of how I will be using OpenAI fine-tuning API for Wally.

In [1]:
# Import necessary libraries
import sys, os
sys.path.insert(0, os.path.abspath('..'))

from dotenv import load_dotenv
from openai import OpenAI

# Load the environment variables
load_dotenv()

# Create instance of OpenAI
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

## Data Setup

For this section we will compile all our word doc datasets into jsonl files which the OpenAI API requires.

In [2]:
# Import necessary conversion util scripts
from scripts.docs.docx_to_json import convert_docx_folder_to_json
from scripts.utils.json_to_jsonl import convert_to_jsonl

# Word doc setup
docx_folder_patterns = ['advice', 'dating-advice', 'relationship-advice', 'ask-singapore', 'work-advice']
data_folder_path_cn = "../data/processed-word-docs/chinese/reddit"
data_folder_path_el = "../data/processed-word-docs/english/reddit"

output_cn = "../data/processed/chinese"
output_el = "../data/processed/english"

# Convert all relevant data into jsonl format
for p in docx_folder_patterns:
    cn_path = f"{data_folder_path_cn}/{p}/{p}-*.docx"
    el_path = f"{data_folder_path_el}/{p}/{p}-*.docx"

    cn_out = f"{output_cn}/reddit/{p}.json"
    el_out = f"{output_el}/reddit/{p}.json"

    convert_docx_folder_to_json(docx_folder_pattern=cn_path, output_json=cn_out)
    convert_docx_folder_to_json(docx_folder_pattern=el_path, output_json=el_out)

    convert_to_jsonl(input_path=cn_out, output_path=f"{cn_out}l")
    convert_to_jsonl(input_path=el_out, output_path=f"{el_out}l")

Converted advice-1.docx into a convo of 5 messages
Converted advice-10.docx into a convo of 3 messages
Converted advice-2.docx into a convo of 3 messages
Converted advice-3.docx into a convo of 5 messages
Converted advice-4.docx into a convo of 5 messages
Converted advice-6.docx into a convo of 5 messages
Converted advice-7.docx into a convo of 3 messages
Converted advice-8.docx into a convo of 9 messages
Converted advice-9.docx into a convo of 5 messages
✅ Wrote 9 conversations to ../data/processed/chinese/reddit/advice.json
Converted advice-1.docx into a convo of 5 messages
Converted advice-10.docx into a convo of 3 messages
Converted advice-2.docx into a convo of 3 messages
Converted advice-3.docx into a convo of 5 messages
Converted advice-4.docx into a convo of 5 messages
Converted advice-6.docx into a convo of 5 messages
Converted advice-7.docx into a convo of 3 messages
Converted advice-8.docx into a convo of 9 messages
Converted advice-9.docx into a convo of 5 messages
✅ Wrote 

In [7]:
# Create one training file each for english and chinese
from scripts.utils.combine_jsonl import merge_files

fin_cn = f"{output_cn}/reddit/*.jsonl"
fin_el = f"{output_el}/reddit/*.jsonl"

fout_cn = f"{output_cn}/run-one-chi.jsonl"
fout_el = f"{output_el}/run-one-eng.jsonl"

merge_files(pattern=fin_cn, output_path=fout_cn)
merge_files(pattern=fin_el, output_path=fout_el)
training_files = [fout_cn, fout_el]

[merge_files] found 3 files matching pattern: '../data/processed/chinese/reddit/*.jsonl'
[merge_files] ✅ merge complete, output file: '../data/processed/chinese/run-one-chi.jsonl'
[merge_files] found 5 files matching pattern: '../data/processed/english/reddit/*.jsonl'
[merge_files] ✅ merge complete, output file: '../data/processed/english/run-one-eng.jsonl'


## Upload files
Use `client.files.create()` method from OpenAI Files API to upload training file (for now only training, no validation) to OpenAI API. Afterwards, store returned File object ID for reference.

In [9]:
training_file_ids = []
for ff in training_files:
    training_file = client.files.create(
        file=open(ff, "rb"),
        purpose="fine-tune",
    )

    training_file_ids.append(training_file.id)
    print(f"Training file ID for {ff.split("/")[-1]}: {training_file.id}")

Training file ID for run-one-chi.jsonl: file-QcSqFB9dtxf7WAxiTMuQRY
Training file ID for run-one-eng.jsonl: file-NaRw4gN9FXeoQMdJy4Edqk


## Create fine-tuning job
Use the `client.fine_tuning.jobs.create()` method to create a fine-tuning job

In [None]:
job_ids = []
for idx, id in enumerate(training_file_ids):
    job = client.fine_tuning.jobs.create(
        training_file=id,
        model="gpt-4o-mini-2024-07-18",
        suffix="wally",
    )

    job_ids.append(job.id)
    print(f"Job ID for {training_files[idx].split("/")[-1]}: {job.id}")
    print(f"Job status for {training_files[idx].split("/")[-1]}: {job.status}")

Job ID for ../data/processed/chinese/run-one-chi.jsonl: ftjob-o0w6sinf3Rhlw9j4khHy2Vd2
Job status for ../data/processed/chinese/run-one-chi.jsonl: validating_files
Job ID for ../data/processed/english/run-one-eng.jsonl: ftjob-LYiTCGL02DPqBztXmjqSgFyW
Job status for ../data/processed/english/run-one-eng.jsonl: validating_files


## Check Job Status
Check status using `client.fine_tuning.jobs.retrieve() ` method, which takes in job ID.

In [13]:
for idx, id in enumerate(job_ids):
    retrieve_response = client.fine_tuning.jobs.retrieve(id)

    print(f"Job ID: {retrieve_response.id}")
    print(f"Job status: {retrieve_response.status}")
    print(f"Model: {retrieve_response.model}")
    print(f"Trained Tokens: {retrieve_response.trained_tokens} \n")

Job ID: ftjob-o0w6sinf3Rhlw9j4khHy2Vd2
Job status: failed
Model: gpt-4o-mini-2024-07-18
Trained Tokens: None 

Job ID: ftjob-LYiTCGL02DPqBztXmjqSgFyW
Job status: failed
Model: gpt-4o-mini-2024-07-18
Trained Tokens: None 



List events of the job using the `client.fine_tuning.jobs.list_events()` method. Returns a list of events associated with the job.

In [69]:
response = client.fine_tuning.jobs.list_events(job_id)

events = response.data
events.reverse()

for event in events:
    print(event.message)

Step 85/100: training loss=0.15
Step 86/100: training loss=1.26
Step 87/100: training loss=1.35
Step 88/100: training loss=0.01
Step 89/100: training loss=0.64
Step 90/100: training loss=0.00
Step 91/100: training loss=0.19
Step 92/100: training loss=0.93
Step 93/100: training loss=0.22
Step 94/100: training loss=0.27
Step 95/100: training loss=1.63
Step 96/100: training loss=0.38
Step 97/100: training loss=1.16
Step 98/100: training loss=0.15
Step 99/100: training loss=0.52
Step 100/100: training loss=1.21
Checkpoint created at step 50
Checkpoint created at step 75
New fine-tuned model created
The job has successfully completed
