# OpenAI Fine-Tuning API
A detailed walkthrough of how I will be using the OpenAI fine-tuning API for Wally.

In [None]:
# Import necessary libraries
import sys, os
sys.path.insert(0, os.path.abspath('..'))

from dotenv import load_dotenv
from openai import OpenAI

# Run load_dotenv() first so the environment is populated before the key is read
load_dotenv()

# Build the shared API client; the key is whatever OPENAI_API_KEY holds
# in the environment (None when the variable is not set)
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

## Data Setup

In this section we compile all of our Word document datasets into JSONL files, which is the format the OpenAI API requires.

In [None]:
# Import necessary conversion util scripts
from scripts.docs.docx_to_json import convert_docx_folder_to_json
from scripts.utils.json_to_jsonl import convert_to_jsonl

# Folder names under each language's reddit directory; every name is used
# both as the sub-directory and as the filename prefix of the .docx files
docx_folder_patterns = [
    'advice',
    'dating-advice',
    'relationship-advice',
    'ask-singapore',
    'work-advice',
]

# Input roots holding the processed Word documents, one per language
data_folder_path_cn = "../data/processed-word-docs/chinese/reddit"
data_folder_path_el = "../data/processed-word-docs/english/reddit"

# Output roots for the generated JSON / JSONL files, one per language
output_cn = "../data/processed/chinese"
output_el = "../data/processed/english"

# For each folder: docx -> JSON for Chinese then English, followed by
# JSON -> JSONL for Chinese then English
for folder in docx_folder_patterns:
    # (glob matching the source .docx files, JSON file to write)
    conversions = (
        (
            f"{data_folder_path_cn}/{folder}/{folder}-*.docx",
            f"{output_cn}/reddit/{folder}.json",
        ),
        (
            f"{data_folder_path_el}/{folder}/{folder}-*.docx",
            f"{output_el}/reddit/{folder}.json",
        ),
    )

    for docx_glob, json_file in conversions:
        convert_docx_folder_to_json(docx_folder_pattern=docx_glob, output_json=json_file)

    for _, json_file in conversions:
        # Appending "l" turns the ".json" path into its ".jsonl" sibling
        convert_to_jsonl(input_path=json_file, output_path=f"{json_file}l")

In [None]:
# Create one training file each for english and chinese
from scripts.utils.combine_jsonl import merge_files

# Merged training file to produce for each language
fout_cn = f"{output_cn}/run-one-chi.jsonl"
fout_el = f"{output_el}/run-one-eng.jsonl"

# Glob patterns selecting the per-folder JSONL files to merge
fin_cn = f"{output_cn}/reddit/*.jsonl"
fin_el = f"{output_el}/reddit/*.jsonl"

# Merge Chinese first, then English
for shard_glob, merged_path in ((fin_cn, fout_cn), (fin_el, fout_el)):
    merge_files(pattern=shard_glob, output_path=merged_path)

# Paths consumed by the later check / upload cells
training_files = [fout_cn, fout_el]

## Data Integrity Check

Just to make sure that the training files do not trigger validation errors during fine-tuning.

In [None]:
from scripts.utils.user_ending_lines import find_user_ending_lines

# Print, for every training file, the lines reported by find_user_ending_lines
for ff in training_files:
    lines = find_user_ending_lines(ff)
    # Extract the base name outside the f-string: reusing double quotes inside
    # a double-quoted f-string is a SyntaxError on Python versions before 3.12
    file_name = ff.split("/")[-1]
    print(f"lines ending with 'user' for {file_name} are: {lines}")

## Upload files
Use the `client.files.create()` method from the OpenAI Files API to upload each training file (for now only training, no validation) to the OpenAI API. Afterwards, store the returned File object's ID for reference.

In [None]:
# Upload every training file and collect the returned file IDs, in the same
# order as training_files
training_file_ids = []
for ff in training_files:
    # Use a context manager so the file handle is closed once the upload call
    # returns (the original left the handle open)
    with open(ff, "rb") as fh:
        training_file = client.files.create(
            file=fh,
            purpose="fine-tune",
        )

    training_file_ids.append(training_file.id)

    # Base name computed outside the f-string: nested double quotes inside a
    # double-quoted f-string are a SyntaxError on Python versions before 3.12
    file_name = ff.split("/")[-1]
    print(f"Training file ID for {file_name}: {training_file.id}")

## Create fine-tuning job
Use the `client.fine_tuning.jobs.create()` method to create a fine-tuning job for each uploaded training file.

In [None]:
# Create one fine-tuning job per uploaded training file and remember its ID
job_ids = []

# training_files and training_file_ids are parallel lists (the upload cell
# appends one ID per file, in order), so pair them directly instead of
# indexing. The loop variable is not named `id`, which would shadow the
# builtin for every later cell.
for ff, file_id in zip(training_files, training_file_ids):
    job = client.fine_tuning.jobs.create(
        training_file=file_id,
        model="gpt-4o-mini-2024-07-18",
        suffix="wally",
    )

    job_ids.append(job.id)

    # Base name computed outside the f-strings: nested double quotes inside a
    # double-quoted f-string are a SyntaxError on Python versions before 3.12
    file_name = ff.split("/")[-1]
    print(f"Job ID for {file_name}: {job.id}")
    print(f"Job status for {file_name}: {job.status}")

## Check Job Status
Check the status of each job using the `client.fine_tuning.jobs.retrieve()` method, which takes in a job ID.

In [None]:
# Fetch and print the current state of every fine-tuning job.
# Iterate the IDs directly: the index was never used, and naming the loop
# variable `id` would shadow the builtin for the rest of the notebook.
for job_id in job_ids:
    retrieve_response = client.fine_tuning.jobs.retrieve(job_id)

    print(f"Job ID: {retrieve_response.id}")
    print(f"Job status: {retrieve_response.status}")
    print(f"Model: {retrieve_response.model}")
    # The trailing " \n" leaves a blank line between jobs
    print(f"Trained Tokens: {retrieve_response.trained_tokens} \n")

List the events of each job using the `client.fine_tuning.jobs.list_events()` method, which returns a list of events associated with the job.

In [None]:
# Print the event messages of every fine-tuning job, one job after another.
# Iterate the IDs directly: the index was never used, and naming the loop
# variable `id` would shadow the builtin for the rest of the notebook.
for job_id in job_ids:
    response = client.fine_tuning.jobs.list_events(job_id)

    # Walk the returned events in reverse order without mutating
    # response.data in place.
    # NOTE(review): only the events on this first response are printed;
    # confirm whether further pages need to be fetched for long-running jobs.
    for event in reversed(response.data):
        print(event.message)

    # Blank line between jobs
    print("")