# Vision Fine-Tune GPT-4o

In [1]:
from openai import OpenAI
import os
import json
import pandas as pd
import dotenv
from dotenv import load_dotenv
import gpt_utils

In [2]:
dotenv.load_dotenv()
api_key = os.environ["OPENAI_API_KEY"]
client = OpenAI()

## Load data

In [3]:
data_file = '/scratch/shared/image_ai_HHT_data/train.csv'

In [4]:
df = pd.read_csv(data_file, usecols=['image', 'Class', 'Class Name'])

In [5]:
image_dir = '/scratch/shared/image_ai_HHT_data/images/'

In [6]:
# Add image path column
df['image_path'] = image_dir + df['image']

### Prepare train, validation files

In [7]:
gpt_utils.prepare_ft_data(df)



Writing to data/fine-tune/train.jsonl: 100%|██████████| 228/228 [00:00<00:00, 2083.01it/s]


Created data/fine-tune/train.jsonl with 228 examples.
Saved image paths to data/fine-tune/train_image_paths.txt


Writing to data/fine-tune/val.jsonl: 100%|██████████| 151/151 [00:00<00:00, 1583.21it/s]


Created data/fine-tune/val.jsonl with 151 examples.
Saved image paths to data/fine-tune/val_image_paths.txt


## Fine-Tune

In [8]:
model = "gpt-4o-2024-08-06"

In [9]:
jsonl_train_file = 'data/fine-tune/train.jsonl'
jsonl_val_file = 'data/fine-tune/val.jsonl'

In [10]:
train_file = client.files.create(
  file=open(jsonl_train_file, "rb"),
  purpose="fine-tune"
)

In [11]:
val_file = client.files.create(
  file=open(jsonl_val_file, "rb"),
  purpose="fine-tune"
)

See uploaded jsonl files at https://platform.openai.com/storage/files

In [12]:
# Start fine-tuning job
ft_job = client.fine_tuning.jobs.create(
         training_file = train_file.id,
         validation_file = val_file.id,
         model = model
)

## Predict using GPT-4o Model using Batches API

First, run `create_test_jsonl_base.py`.

In [13]:
test_file = 'data/fine-tune/test_base_model.jsonl'

In [14]:
batch_input_file = client.files.create(
  file=open(test_file, "rb"),
  purpose="batch"
)

See uploaded jsonl files at https://platform.openai.com/storage/files

In [15]:
response = client.batches.create(
            input_file_id = batch_input_file.id,
            endpoint = "/v1/chat/completions",
            completion_window = "24h",
            metadata = {
              "description": f"Image label prediction for test set using the base model"
                })

In [16]:
# Save the base model batch ID
gpt_utils.save_batch_id(response.id, "response_id_base_model_test_set")

Saved batch ID 'batch_6727e70be9688190ae2bbe53b7e37e61' under key 'response_id_base_model_test_set'.


- After submitting to Batches API, go to https://platform.openai.com/batches
- Wait until the batch is complete. Then get the responses.

In [17]:
# Later, when you want to load the batch IDs:
base_model_batch_id = gpt_utils.load_batch_id("response_id_base_model_test_set")

In [18]:
print(f"Base Model Batch ID: {base_model_batch_id}")

Base Model Batch ID: batch_6727e70be9688190ae2bbe53b7e37e61


In [19]:
response_out = client.batches.retrieve(base_model_batch_id)

In [20]:
response_out

Batch(id='batch_6727e70be9688190ae2bbe53b7e37e61', completion_window='24h', created_at=1730668299, endpoint='/v1/chat/completions', input_file_id='file-trk17rq6HhS4FD2TIrPBc3KP', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1730668338, error_file_id=None, errors=None, expired_at=None, expires_at=1730754699, failed_at=None, finalizing_at=1730668329, in_progress_at=1730668301, metadata={'description': 'Image label prediction for test set using the base model'}, output_file_id='file-hg2saheYVF4zMmUGvHtEbLZu', request_counts=BatchRequestCounts(completed=160, failed=0, total=160))

In [21]:
jsonl_string = client.files.retrieve_content(response_out.output_file_id)

  jsonl_string = client.files.retrieve_content(response_out.output_file_id)


In [22]:
# Split the string by newline characters to get each JSON line
json_lines = jsonl_string.strip().split('\n')

In [23]:
# Parse each JSON line into a Python dictionary
json_objects = [json.loads(line) for line in json_lines]

# Load the list of JSON objects into a DataFrame
df_test = pd.DataFrame(json_objects)

In [24]:
df_test.shape

(160, 4)

In [25]:
df_test.head()

Unnamed: 0,id,custom_id,response,error
0,batch_req_6727e72a26a881909f3921898fa99cde,1206,"{'status_code': 200, 'request_id': '089527f77b...",
1,batch_req_6727e72a31f88190bbb27c90c1bd44f1,6497,"{'status_code': 200, 'request_id': '98ea4b58c0...",
2,batch_req_6727e72a3d888190b689170b901a4896,6436,"{'status_code': 200, 'request_id': '3b423af84e...",
3,batch_req_6727e72a4b208190b816a68e9faa7f81,2647,"{'status_code': 200, 'request_id': '5cef70a55c...",
4,batch_req_6727e72a569081908f094fa442188706,5504,"{'status_code': 200, 'request_id': 'c1fc613593...",


In [26]:
df_test.to_csv('results/responses_base_model.csv', index=False)

## Predict using Fine-Tuned Model and Batches API

Edit `create_test_jsonl_fine_tuned.py`, to include the name of the fine-tuned model.

First, run `create_test_jsonl_fine_tuned.py`.

In [27]:
test_file = 'data/fine-tune/test_ft_model.jsonl'

In [28]:
batch_input_file = client.files.create(
  file=open(test_file, "rb"),
  purpose="batch"
)

See uploaded jsonl files at https://platform.openai.com/storage/files

In [29]:
response = client.batches.create(
            input_file_id = batch_input_file.id,
            endpoint = "/v1/chat/completions",
            completion_window = "24h",
            metadata = {
              "description": f"Image label prediction for test set using the fine-tuned model"
                })

In [30]:
# Save the base model batch ID
gpt_utils.save_batch_id(response.id, "response_id_ft_model_test_set")

Saved batch ID 'batch_6727eed914388190b810b0adadc60eaf' under key 'response_id_ft_model_test_set'.


- After submitting to Batches API, go to https://platform.openai.com/batches
- Wait until the batch is complete. Then get the responses.

In [31]:
# Later, when you want to load the batch IDs:
ft_model_batch_id = gpt_utils.load_batch_id("response_id_ft_model_test_set")

In [32]:
print(f"Fine-tuned Model Batch ID: {ft_model_batch_id}")

Fine-tuned Model Batch ID: batch_6727eed914388190b810b0adadc60eaf


In [33]:
# response_out = client.batches.retrieve(response.id)
response_out = client.batches.retrieve(ft_model_batch_id)

In [34]:
response_out

Batch(id='batch_6727eed914388190b810b0adadc60eaf', completion_window='24h', created_at=1730670297, endpoint='/v1/chat/completions', input_file_id='file-g7HelGZw8wKZkYKRp4Vt9A8e', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1730670859, error_file_id=None, errors=None, expired_at=None, expires_at=1730756697, failed_at=None, finalizing_at=1730670849, in_progress_at=1730670299, metadata={'description': 'Image label prediction for test set using the fine-tuned model'}, output_file_id='file-hYg5OtQ1cBd0qRT4c94oc9dk', request_counts=BatchRequestCounts(completed=160, failed=0, total=160))

In [35]:
jsonl_string = client.files.retrieve_content(response_out.output_file_id)

  jsonl_string = client.files.retrieve_content(response_out.output_file_id)


In [36]:
# Split the string by newline characters to get each JSON line
json_lines = jsonl_string.strip().split('\n')

In [37]:
# Parse each JSON line into a Python dictionary
json_objects = [json.loads(line) for line in json_lines]

# Load the list of JSON objects into a DataFrame
df_test = pd.DataFrame(json_objects)

In [38]:
df_test.shape

(160, 4)

In [39]:
df_test.head()

Unnamed: 0,id,custom_id,response,error
0,batch_req_6727f1023c7c8190a18ba1bc456b829f,1206,"{'status_code': 200, 'request_id': '66a29f9f23...",
1,batch_req_6727f1024ae08190a9b9f883eb352ce6,6497,"{'status_code': 200, 'request_id': 'e3b5c63154...",
2,batch_req_6727f10257e08190bf643214aed025b3,6436,"{'status_code': 200, 'request_id': '32ee6fe664...",
3,batch_req_6727f10267448190a5b1c3af39aaf83a,2647,"{'status_code': 200, 'request_id': 'bcd77bb0ca...",
4,batch_req_6727f102748c81908af8a47eb66b47f5,5504,"{'status_code': 200, 'request_id': 'e27cf2963c...",


In [40]:
df_test.to_csv('results/responses_ft_model.csv', index=False)