In [None]:
# !pip install datasets tiktoken openai wandb

# Fine Tuning using OpenAI GPT-4o

See the pricing page:
https://openai.com/api/pricing/

**Fine-tuning for GPT-4o and GPT-4o mini is free up to a daily token limit through October 31, 2024. 

`gpt-4o-2024-08-06`
For GPT-4o, each qualifying org gets up to 1M complimentary training tokens daily and any overage will be charged at the normal rate of $25.00/1M tokens. 

`gpt-4o-mini-2024-07-18`
For GPT-4o mini, each qualifying org gets up to 2M complimentary training tokens daily and any overage will be charged at the normal rate of $3.00/1M tokens.

In [49]:
import os
import openai

# openai.api_key = ''
openai.api_key=os.environ.get("OPENAI_API_KEY")

# Set the model name
model_name = "gpt-4o-mini-2024-07-18"
# model_name = "gpt-4o-2024-08-06"

## Read data

In [1]:
from datasets import load_dataset, DatasetDict

# Maxime Labonne's ai knowledge dataset.
# https://huggingface.co/datasets/mlabonne/FineTome-100k
ds = load_dataset("mlabonne/FineTome-100k")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['conversations', 'source', 'score'],
        num_rows: 100000
    })
})


In [2]:
from datasets import load_dataset, DatasetDict
import json

TRAIN_SIZE = 100
VALID_SIZE = 100
TEST_SIZE = 100

# Shuffle data
ds = ds.shuffle(seed=42)

# Split off a small 'test' dataset.
ds_split_test = ds['train'].train_test_split(test_size=TEST_SIZE)

# Split off a small 'valid' dataset.
ds_split_valid = ds_split_test['train'].train_test_split(test_size=VALID_SIZE)

# Split off a small 'train' dataset from the remaining train dataset.
ds_split_train = ds_split_valid['train'].train_test_split(test_size=TRAIN_SIZE)

# Create a new DatasetDict to hold the train, valid, test sets
ds = DatasetDict({
    'test': ds_split_test['test'],
    'valid': ds_split_valid['test'],
    'train': ds_split_train['test'],
})
print(ds)

DatasetDict({
    test: Dataset({
        features: ['conversations', 'source', 'score'],
        num_rows: 100
    })
    valid: Dataset({
        features: ['conversations', 'source', 'score'],
        num_rows: 100
    })
    train: Dataset({
        features: ['conversations', 'source', 'score'],
        num_rows: 100
    })
})


In [3]:
# Common patterns for Hugging Face Datasets
# https://www.youtube.com/watch?app=desktop&v=blF9uxYcKHo
from datasets import load_dataset, load_from_disk

# # 1. ARROW datasets directly.
# # Create a new dir and save Arrow format, typically in 3 folders: train, valid, test.
# # Splits info stored in the file: new_dir/dataset_dict.json.
# ds.save_to_disk('my-arrow-datasets')
# # Read them back into memory anytime.
# ds_saved = load_from_disk('my-arrow-datasets')
# ds_saved

# 2. datasets to/from JSON Lines files.
# Loop through ds dict object, create new dir, save each split to a JSON Lines file.
for split, dataset in ds.items():
    dataset.to_json(f'data/fine_tome_{split}.json', orient='records', lines=True) 
data_files={'train': 'data/fine_tome_train.json', 'valid': 'data/fine_tome_valid.json', 'test': 'data/fine_tome_test.json'}
json_datasets_saved = load_dataset('json', data_files=data_files)
print(json_datasets_saved)

# # 3. datasets to/from Parquet, more efficient for big data.
# # Loop through ds dict object, create new dir, save each split to a .parquet file.
# for split, dataset in ds.items():
#     dataset.to_parquet(f'my-parquet-datasets/{split}.parquet') 
# data_files={'train': 'my-parquet-datasets/train.parquet', 'valid': 'my-parquet-datasets/valid.parquet', 'test': 'my-parquet-datasets/test.parquet'}
# parquet_datasets_saved = load_dataset('parquet', data_files=data_files)
# print(parquet_datasets_saved)

# # 4. datasets to/from CSV files, less efficient for big data.
# # Loop through ds dict object, create new dir, save each split to a CSV file.
# for split, dataset in ds.items():
#     dataset.to_csv(f'my-csv-datasets/{split}.csv', index=None)
# data_files={'train': 'my-csv-datasets/train.csv', 'valid': 'my-csv-datasets/valid.csv', 'test': 'my-csv-datasets/test.csv'}
# csv_datasets_saved = load_dataset('csv', data_files=data_files)
# csv_datasets_saved

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['conversations', 'source', 'score'],
        num_rows: 100
    })
    valid: Dataset({
        features: ['conversations', 'source', 'score'],
        num_rows: 100
    })
    test: Dataset({
        features: ['conversations', 'source', 'score'],
        num_rows: 100
    })
})


In [4]:
# Double-check row counts of the saved datasets.

# Compare the original data with the data read from the file
original_data = ds['train']
# Check datatypes.
print(type(original_data), type(json_datasets_saved['train']))
# Check number of rows.
print(len(original_data), len(json_datasets_saved['train']))
# Check if the data matches.
print(original_data == json_datasets_saved['train'])
print(original_data.to_dict() == json_datasets_saved['train'].to_dict())
# False, inspect 1 row.
print(original_data[10]['conversations'])
print(json_datasets_saved['train'][10]['conversations'])

# hmm, spot-check looks the same to me, try to find the differences?
import deepdiff
# Perform a detailed comparison
differences = deepdiff.DeepDiff(original_data, json_datasets_saved['train'], ignore_order=True)
print("Differences:", differences)
# No differences found!

<class 'datasets.arrow_dataset.Dataset'> <class 'datasets.arrow_dataset.Dataset'>
100 100
False
False
[{'from': 'human', 'value': "Write Python code to solve the task:\nGiven a number N, generate bit patterns from 0 to 2^N-1 such that successive patterns differ by one bit. \nA Gray code sequence must begin with 0.\n \nExample 1:\nInput:\nN = 2\nOutput: \n00 01 11 10\nExplanation: \n00 and 01 differ by one bit.\n01 and 11 differ by one bit.\n11 and 10 also differ by one bit.\n \nExample 2:\nInput:\nN=3\nOutput:\n000 001 011 010 110 111 101 100\nExplanation:\n000 and 001 differ by one bit.\n001 and 011 differ by one bit.\n011 and 010 differ by one bit.\nSimilarly, every successive pattern \ndiffers by one bit.\nYour task:\nYou don't need to read input or print anything. Your task is to complete the function graycode() which takes an integer N as input and returns a la list of patterns.\n \nExpected Time Complexity: O(2^{n})\nExpected Auxiliary Space: O(2^{n})\n \nConstraints :\n1<=N<=16"

## Format data for Fine-tuning

Use OpenAI Cookbook

https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset

We now use `standardize_sharegpt` to convert ShareGPT style datasets into HuggingFace's generic format. This changes the dataset from looking like:

```json
{"from": "system", "value": "You are an assistant"}
{"from": "human", "value": "What is 2+2?"}
{"from": "gpt", "value": "It's 4."}
```
to
```json
{"messages": [
    {"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."},{"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}
{"messages": [
    {"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?"}]}
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "How far is the Moon from Earth?"}, {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters."}]}
```

In [8]:
# Function to convert the dictionaries
def convert_dicts(dict_list, system_message="Think before responding."):
    """Convert a list of HF conversations to list of GPT Multi-turn messages. 
    Each turn begins with 1 system message and can continue with many user, 
    assistant messages. Skip fine-tuning on all but last assistant messages 
    per turn. That is, set weight=0 on all but last assistant message per turn.
    https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset

    Args:
        dict_list (_type_): HF conversation list of dicts
        system_message (str, optional): Defaults to "Think before responding".

    Returns:
        _type_: OpenAI GPT Multi-turn messages list of dicts
    """
    messages = []
    system_message_added = False
    assistant_indices = []

    for d in dict_list:
        if "from" in d:
            if d["from"] == "system":
                # Process the previous turn's assistant messages
                if assistant_indices:
                    messages[assistant_indices[-1]]["weight"] = 1
                    assistant_indices = []
                # Process the system message
                messages.insert(0, {"role": "system", "content": d["value"]})
                system_message_added = True
            elif d["from"] == "human":
                messages.append({"role": "user", "content": d["value"]})
            elif d["from"] == "gpt":
                messages.append({"role": "assistant", "content": d["value"], "weight": 0})
                assistant_indices.append(len(messages) - 1)
            else:
                messages.append({"role": "user", "content": d["value"]})
        else:
            messages.append(d)  # Keep the original dictionary if 'from' key is missing
    
    # Process the last turn's assistant messages
    if assistant_indices:
        messages[assistant_indices[-1]]["weight"] = 1
    
    # Include the system message if none provided
    if not system_message_added:
        messages.insert(0, {
            "role": "system",
            "content": system_message
        })
    
    return messages

def save_to_jsonl(conversations, file_path):
  with open(file_path, 'w') as file:
    for conversation in conversations:
      json_line = json.dumps(conversation)
      file.write(json_line + '\n')

def process_and_save(system_message, in_json_file_path, out_json_file_path):

    # Load dataset from input json file.
    test_jsonl = []
    with open(in_json_file_path) as f:
        test_jsonl = [ json.loads(line) for line in f]
    # print(f"input: {len(test_jsonl[0])}")  # 3

    gpt_format = []
    for item in test_jsonl:
        # print(item)
        conversation = item['conversations']
        conversation = convert_dicts(conversation, system_message)
        formatted_conversation = {"messages": conversation}
        gpt_format.append(formatted_conversation)
    
    print(f"Number of conversations: {len(gpt_format)}")
    save_to_jsonl(gpt_format, out_json_file_path)

# Input:
# {'conversations': [{'from': 'human', 'value': "In a gravity-free and balanced, the total pressure at the center remains $P=\\dfrac{F}{A}$."}], 'source': 'WebInstructSub_axolotl', 'score': 3.8947217464}

In [11]:
# TEST THE CONVERSION FUNCTIONS FROM HF TO GPT FORMATS.

temp = json_datasets_saved['test']
print("Num examples HF:", len(temp))
print("HF example:")
TEST_ROW = 13
temp[TEST_ROW]

# Convert the list of dictionaries
gpt_format = []
system_message = """Think before responding."""

for item in temp:
    conversation = item['conversations']
    conversation = convert_dicts(conversation, system_message)
    formatted_conversation = {"messages": conversation}
    gpt_format.append(formatted_conversation)
    
# gpt_format
print("Num examples GPT:", len(gpt_format))
print("14th GPT example:")
gpt_format[TEST_ROW]

Num examples HF: 100
HF example:
Num examples GPT: 100
14th GPT example:


{'messages': [{'role': 'system', 'content': 'Think before responding.'},
  {'role': 'user',
   'content': 'Discuss the ways in which Latin American art has impacted and influenced the content, style, and techniques of Western art throughout history. Provide specific examples of Latin American artists and their works that have had a significant impact on the development of Western art movements and styles. Analyze the cultural, political, and social factors that have contributed to this influence and explore how contemporary Western art continues to be shaped by Latin American artistic traditions.'},
  {'role': 'assistant',
   'content': 'Latin American art has had a significant impact on the history of Western art throughout the centuries. From the vibrant colors and imagery of the pre-Columbian era to contemporary works by artists like Frida Kahlo and Gabriel Orozco, Latin American art continues to inspire and challenge Western artists.\n\nOne of the primary ways that Latin American a

In [12]:
system_message = """Think before responding."""

# Process and save the train dataset
in_json_file_path = "data/fine_tome_train.json"
train_data_filename = "data/fine_tome_train_gpt.jsonl"
print("Processing and saving train data...")
process_and_save(
    system_message, 
    in_json_file_path, train_data_filename)

# Process and save the valid dataset
in_json_file_path = "data/fine_tome_valid.json"
valid_data_filename = "data/fine_tome_valid_gpt.jsonl"
print("Processing and saving valid data...")
process_and_save(
    system_message, 
    in_json_file_path, valid_data_filename)

# Process and save the test dataset
in_json_file_path = "data/fine_tome_test.json"
test_data_filename = "data/fine_tome_test_gpt.jsonl"
print("Processing and saving test data...")
process_and_save(
    system_message, 
    in_json_file_path, test_data_filename)

Processing and saving train data...
Number of conversations: 100
Processing and saving valid data...
Number of conversations: 100
Processing and saving test data...
Number of conversations: 100


### Check for Data Format Errors

Use OpenAI Cookbook sample code.

https://cookbook.openai.com/examples/chat_finetuning_data_prep

In [13]:
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

In [14]:
# Read the JSONL file

# Check: load dataset from json file.
file_path = test_data_filename
temp = []
with open(file_path) as f:
  temp = [ json.loads(line) for line in f]
print(f"test: {len(temp)}")  # 100

temp[5]


test: 100


{'messages': [{'role': 'system', 'content': 'Think before responding.'},
  {'role': 'user',
   'content': 'Solve the following problem step-by-step:\nGiven the context and corresponding question, choose the correct answer from the options.\n\nContext: \nPeople often pronounce a word differently when asked to read written material aloud than when speaking spontaneously. These differences may cause problems for those who develop computers that recognize speech. Usually the developers "train" the computers by using samples of written material read by the people who will be using the computer.\n\nQuestion: \nThe observations above provide most evidence for the conclusion that\n\nOptions: \nA. computers may be less reliable in decoding spontaneous speech than in decoding samples that have been read aloud\nB. it will be impossible to develop computers that decode spontaneous speech\nC. computers are now able to interpret oral speech without error\nD. a "trained" computer never correctly deco

### Check for data formatting errors

In [15]:
# Format error checks
format_errors = defaultdict(int)

for ex in temp:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


### Estimate Num Tokens, to estimate costs

In [18]:
encoding = tiktoken.get_encoding("cl100k_base")

# helper functions to token counting, not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            if isinstance(value, str):  # Ensure the value is a string
                num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [19]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in temp:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 16385 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 16,385 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 25
mean / median: 3.88, 3.0
p5 / p95: 3.0, 5.0

#### Distribution of num_total_tokens_per_example:
min / max: 114, 4167
mean / median: 620.52, 439.0
p5 / p95: 230.9, 1102.3000000000002

#### Distribution of num_assistant_tokens_per_example:
min / max: 6, 3829
mean / median: 432.29, 306.5
p5 / p95: 150.4, 768.9000000000004

0 examples may be over the 16,385 token limit, they will be truncated during fine-tuning


In [20]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 16385

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
total_billing_tokens = n_epochs * n_billing_tokens_in_dataset
print(f"By default, you'll be charged for ~{total_billing_tokens} tokens")

Dataset has ~62052 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~186156 tokens


In [22]:
# Estimated training cost
# https://openai.com/api/pricing/

PER_TOKEN_COST = 0.0000003
NUM_EPOCHS = 3
NUM_DATASETS = 2

print(f"Estimated cost is ${total_billing_tokens * PER_TOKEN_COST * NUM_EPOCHS * NUM_DATASETS}")


Estimated cost is $0.33508079999999996


In [23]:
temp[:1]

[{'messages': [{'role': 'system', 'content': 'Think before responding.'},
   {'role': 'user',
    'content': 'A line segment is bisected by the line represented by the equation \\(2y + x = 7\\). If one endpoint of the line segment is at \\((5, 3)\\), determine the coordinates of the other endpoint.'},
   {'role': 'assistant',
    'content': "The bisector's equation can be found by considering that it is perpendicular to the given line \\(L\\): \\(2y + x = 7\\), which has a gradient \\(m_L = -\\frac{1}{2}\\). The gradient of the bisector \\(B\\), \\(m_B\\), is the negative reciprocal of \\(m_L\\), so \\(m_B = 2\\).\n\nGiven a point \\((x_1, y_1)\\) on the bisector, we can express its equation as:\n\\[y - y_1 = m_B (x - x_1)\\]\n\nUsing the point \\((5, 3)\\), the equation of the bisector \\(B\\) is:\n\\[y - 3 = 2(x - 5)\\]\n\\[y - 3 = 2x - 10\\]\n\\[y = 2x - 7\\]\n\nThe intersection point \\(M\\) of \\(B\\) and \\(L\\) can be found by solving the system of equations:\n\\[2y + x = 7\\]\n

**Training Data:**

Definition:<br>
  

*   dataset used to train or update the model's parameters
*   It is the input data that the model learns from.
* During the training process, the model adjusts its internal parameters based on the patterns and features present in the training data.
* Size is large as the model needs sufficient examples to learn meaningful patterns.


**Validation Data:**

* Dataset that is not used during the training phase.
* Instead, it serves as a measure of the model's performance during training.
* The validation set helps you monitor the model's generalization to new, unseen data and detect potential issues such as overfitting or underfitting.
* unbiased evaluation of the model's performance on data it hasn't seen before.

* Size is typically smaller than the training set but large enough to provide a reliable assessment of the model's performance.

### Upload training/validation dataset

##### for openai ver 1.0.0

In [24]:
import os
from openai import OpenAI

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY")
)

# Set the model name
model_name = "gpt-4o-mini-2024-07-18"
# model_name = "gpt-4o-2024-08-06"

In [25]:
print(train_data_filename)
print(valid_data_filename)
print(test_data_filename)

data/fine_tome_train_gpt.jsonl
data/fine_tome_valid_gpt.jsonl
data/fine_tome_test_gpt.jsonl


In [26]:
from pathlib import Path

# Upload the train data to OpenAI
training_response = client.files.create(
    file=Path(train_data_filename),
    purpose="fine-tune"
)
print(training_response)
training_file_id = training_response.id
print(training_file_id)

FileObject(id='file-H0LeAG22X3404CTIHkuPE4EM', bytes=265449, created_at=1730069170, filename='fine_tome_train_gpt.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)
file-H0LeAG22X3404CTIHkuPE4EM


In [27]:
# Upload the valid data to OpenAI
validation_response = client.files.create(
    file=Path(valid_data_filename),
    purpose="fine-tune"
)
print(validation_response)
validation_file_id = validation_response.id
print(validation_file_id)

FileObject(id='file-ReZ2mMJJYp67b2E7wMvFPqDO', bytes=241215, created_at=1730069172, filename='fine_tome_valid_gpt.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)
file-ReZ2mMJJYp67b2E7wMvFPqDO


### Start a fine-tuning job

In [28]:
# Check your base model.
model_name

'gpt-4o-mini-2024-07-18'

In [30]:
# Best practices: https://beta.openai.com/docs/guides/best-practices
# Set batch size to approx 1/20th of the dataset size, for smoother learning curves.
print(f"Train data num samples: {TRAIN_SIZE}")  
batch_size = TRAIN_SIZE // 20
print(f"Batch size: {batch_size}")  

Train data num samples: 100
Batch size: 5


In [35]:
# https://platform.openai.com/docs/api-reference/fine-tuning/create
response = client.fine_tuning.jobs.create(
    model = model_name,
    training_file = training_file_id,
    validation_file = validation_file_id,
    suffix="finetome_try3",
    hyperparameters = {
        "n_epochs": 3,  # overtrain since checkpointing is supported
        "batch_size": 1, #batch_size, # 5, default is 1
        # "learning_rate_multiplier": 1.8, # default 1.8
    },
    integrations = [{
        "type": "wandb",
        "wandb": {
          "project": "my-wandb-project",
          "name": "finetome_try3",
          "tags": ["project:tag", "lineage"]
        }
      }],
)
print(response)
job_id = response.id
print(job_id)

FineTuningJob(id='ftjob-dNsMjoTKcaJFJndhzPBsQgwh', created_at=1730071087, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-0TvGxvi7iOVWbyYmOwLasUrl', result_files=[], seed=356598581, status='validating_files', trained_tokens=None, training_file='file-H0LeAG22X3404CTIHkuPE4EM', validation_file='file-ReZ2mMJJYp67b2E7wMvFPqDO', estimated_finish=None, integrations=[FineTuningJobWandbIntegrationObject(type='wandb', wandb=FineTuningJobWandbIntegration(project='my-wandb-project', entity=None, name=None, tags=None, run_id='ftjob-dNsMjoTKcaJFJndhzPBsQgwh'))], user_provided_suffix='finetome_try3')
ftjob-dNsMjoTKcaJFJndhzPBsQgwh


#### List fine-tuning jobs and events

In [45]:
# client.fine_tuning.jobs.list(limit=5)
# client.fine_tuning.jobs.retrieve(job_id)

# Check job submission details of the fine-tuning job
job_response = client.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id)
print(job_response)

# List event messages
events = job_response.data
# events
for event in events:
  print(event.message)

SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-hlrL3gEeRxsO4xRXHVVp4OeW', created_at=1730071706, level='info', message='The job has successfully completed', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-1zL5V2XCPdS9ZCpypUGtF5PU', created_at=1730071700, level='info', message='New fine-tuned model created', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-mlF1wi5Dt26ZP4iH3C7SoKRF', created_at=1730071700, level='info', message='Checkpoint created at step 200', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-hA0Aq0KCh1zH0gab9ZhcJAW3', created_at=1730071700, level='info', message='Checkpoint created at step 100', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-0z73RSVNqQnGM1z6XhjHB06M', created_at=1730071673, level='info', message='Step 300/300: training loss=0.13, validation loss=0.40, full validation l

#### View Weights and Biases metrics

In [39]:
from wandb.integration.openai.fine_tuning import WandbLogger

# one line command
WandbLogger.sync()

# passing optional parameters
WandbLogger.sync(
    fine_tune_job_id=None,
    num_fine_tunes=None,
    project="OpenAI-Fine-Tune",
    entity=None,
    overwrite=False,
    model_artifact_name="model-metadata",
    model_artifact_type="model",
    # **kwargs_wandb_init
)

# Web browser all fine-tuning jobs
# https://wandb.ai/cbergman/OpenAI-Fine-Tune
# Web browser single fine-tuning job
# https://wandb.ai/cbergman/OpenAI-Fine-Tune/runs/ftjob-dPYoOEifQuaQ84BdJ5I8wWFK
# https://wandb.ai/cbergman/OpenAI-Fine-Tune/runs/ftjob-dNsMjoTKcaJFJndhzPBsQgwh

[34m[1mwandb[0m: Waiting for the OpenAI fine-tuning job to finish training...
[34m[1mwandb[0m: To avoid blocking, you can call `WandbLogger.sync` with `wait_for_job_success=False` after OpenAI training completes.
[34m[1mwandb[0m: Fine-tuning finished, logging metrics, model metadata, and run metadata to Weights & Biases
[34m[1mwandb[0m: Logging training/validation files...


0,1
train_accuracy,▃▄▅▃▁▂▅▄▅▁▁▅▁▄▄▄▃▄▁▆▅▆▆▅▆▅▆▅▆▇▆▆▇▆▇▆▇▄█▇
train_loss,▆▅▅▄▇█▅▂▂▃▃▄▄▃▂▄▃▂▂▃▃▂▂▃▃▂▃▃▁▂▁▂▁▃▂▂▂▂▁▁
valid_loss,▇▂▃▂▃▆▃▅▄▆▃█▂▄▃▄▃▄▇▁▃▂▁▂▂▂▅▅▇▂
valid_mean_token_accuracy,▃▇▇▇▆▃▆▅▄▄▇▁▇▆▆▅▆▅▃█▆▇█▆█▇▅▅▃▇

0,1
fine_tuned_model,ft:gpt-4o-mini-2024-...
status,succeeded
train_accuracy,0.9541
train_loss,0.13083
valid_loss,0.39653
valid_mean_token_accuracy,0.87421


### Get fine tuned model and checkpoints

In [None]:
import openai
import requests

openai_api_key = os.environ.get("OPENAI_API_KEY")

# Define the API endpoint and headers
url = f"https://api.openai.com/v1/fine_tuning/jobs/{job_id}/checkpoints"
headers = {
    "Authorization": f"Bearer {openai_api_key}"
}

# Make the API request
response = requests.get(url, headers=headers)

# Check if the request was successful
if response.status_code == 200:
    checkpoints = response.json()
    # Get the checkpoints
    for checkpoint in checkpoints['data']:
        print(checkpoint)
else:
    print(f"Failed to retrieve checkpoints: {response.status_code}")
    print(response.text)

# Save the desired checkpoint ID
chosen_checkpoint_step_number = checkpoints['data'][1]['step_number']
print(f"Using checkpoint: {chosen_checkpoint_step_number}")

In [81]:
# Get the final fine-tuned model.
response = client.fine_tuning.jobs.retrieve(job_id)
fine_tune_model_id = response.fine_tuned_model
print(f"Final fine-tuned model: {fine_tune_model_id}")

Final fine-tuned model: ft:gpt-4o-mini-2024-07-18:christybergman-com:finetome-try3:AN6yBpe1


### Generating using new model

Use either the final fine-tuned model or any intermediate checkpoint.

In [None]:
# TODO: how to get this from api, got below from console.
inference_model_id = "ft:gpt-4o-mini-2024-07-18:christybergman-com:finetome-try3:AN6yBKew:ckpt-step-200"
inference_model_id

'ft:gpt-4o-mini-2024-07-18:christybergman-com:finetome-try3:AN6yBKew:ckpt-step-200'

In [None]:
# TODO: Fix this cell later.
# Final fine-tuned model: ft:gpt-4o-mini-2024-07-18:christybergman-com:finetome-try3:AN6yBpe1

# TODO replace checkpoint for the inference model id.
# Assemble the full, checkpoint model_id.
parts = str.split_parst(fine_tune_model_id, ':')
print(parts)
chosen_checkpoint_step_number = 200
inference_model_id = parts[-1] + chosen_checkpoint_step_number

In [90]:
# Read sample question from test data 
MESSAGE_NUM = 72  # 3, 80, 72
sample_message = temp[MESSAGE_NUM]

In [96]:
sample_message['messages']['user']

[{'role': 'system', 'content': 'Think before responding.'},
 {'role': 'user',
  'content': 'Explain how the concept of signed area can be used to find the total area under the graph of $y=x^2$ from $x=-2$ to $x=2$, even though the function is not odd.'},
 {'role': 'assistant',
  'content': 'While $x^2$ is not an odd function, we can still use the concept of signed area to find the total area under its graph. We can split the integral into two parts: from $x=-2$ to $x=0$ and from $x=0$ to $x=2$. Since $x^2$ is negative below the x-axis from $x=-2$ to $x=0$, the integral over this interval will give a negative signed area. However, the integral from $x=0$ to $x=2$ will give a positive signed area. To find the total area, we can take the absolute value of the negative signed area and add it to the positive signed area, as follows:\n\n$$\\left|\\int_{-2}^0x^2dx\\right|+\\left|\\int_{0}^2x^2dx\\right|=\\frac{8}{3}+\\frac{8}{3}=\\frac{16}{3}.$$',
  'weight': 1}]

In [106]:
# Assemble the list of dictionaries called test_messages
# Use convert_dicts() to re-format the message
test_messages = convert_dicts([sample_message])
test_messages[1]['messages']

[{'role': 'system', 'content': 'Think before responding.'},
 {'role': 'user',
  'content': 'Explain how the concept of signed area can be used to find the total area under the graph of $y=x^2$ from $x=-2$ to $x=2$, even though the function is not odd.'},
 {'role': 'assistant',
  'content': 'While $x^2$ is not an odd function, we can still use the concept of signed area to find the total area under its graph. We can split the integral into two parts: from $x=-2$ to $x=0$ and from $x=0$ to $x=2$. Since $x^2$ is negative below the x-axis from $x=-2$ to $x=0$, the integral over this interval will give a negative signed area. However, the integral from $x=0$ to $x=2$ will give a positive signed area. To find the total area, we can take the absolute value of the negative signed area and add it to the positive signed area, as follows:\n\n$$\\left|\\int_{-2}^0x^2dx\\right|+\\left|\\int_{0}^2x^2dx\\right|=\\frac{8}{3}+\\frac{8}{3}=\\frac{16}{3}.$$',
  'weight': 1}]

In [114]:
# Your fine-tuned model's inference answer.
response = client.chat.completions.create(
    model = inference_model_id,
    messages = test_messages[1]['messages']
)
print(response.choices[0].message.content)


The concept of signed area can help us find the total area above and below the x-axis, considering the areas with different signs. To find the area under the graph of \(y=x^2\) from \(x=-2\) to \(x=2\), we can divide the integral into two separate parts:

1. From \(x=-2\) to \(x=0\) (area below the x-axis)
2. From \(x=0\) to \(x=2\) (area above the x-axis)

Since \(x^2\) is always positive over the closed interval \([-2, 2]\), there are no negative regions under its graph!

However, instead of splitting the area into segments, we should directly compute:

The required total area under \(y = x^2\) from \(x=-2\) to \(x=2\) can be calculated as follows:

  \[
  \text{Total Area} = \int_{-2}^{2} x^2 \,dx
  \]

Performing the integration, we get:

\[
= \left[ \frac{x^3}{3} \right]_{-2}^{2} = \frac{2^3}{3} - \frac{(-2)^3}{3},  \text{ where } x^3 \text{ at } -2 \text{ gives us positive value}
\]

So:
 \[
2^3 - (-2)^3 = 8 - (-8) = 16
\]

Thus:
 \[
 = \frac{16}{3}
\]
And hence, we prefer the me

In [None]:
# Ground truth answer from test data.
import pprint
pprint.pprint(dataset[MESSAGE_NUM]['messages'][-1]['content'])

In [None]:
# Playground Fine Tuned Response

# To find the region on a graph that satisfies the inequality #y-|x|>1#, we start by rewriting the 
# inequality as #y>1+|x|#. This means that we are looking for the values of #y# that are greater than #1+|x|#.
# Next, we graph the line #y=1+|x|#. The graph of #y=|x|# is a V-shaped graph that opens upwards and has vertex 
# at the origin (0,0). Adding #1# to #y=|x|# simply shifts the graph of #y=|x|# upwards by #1#. 
# So the graph of #y=1+|x|# will be a V-shaped graph that opens upwards and has vertex at (0,1).
# Therefore, the region that satisfies the inequality #y-|x|>1# is the region above the graph of #y=1+|x|#, 
# which is the shaded region above the line #y=1+|x|#.
# In conclusion, the region on a graph that satisfies the inequality #y-|x|>1# is the region above the line #y=1+|x|#.