In [11]:
import openai
import json
import random

In [12]:
import os

# Read the key from the environment instead of hardcoding it in the notebook,
# so the secret never ends up in a saved/shared copy. Falls back to the
# previous value (an empty string) when OPENAI_API_KEY is not set.
api_key = os.environ.get("OPENAI_API_KEY", "")

In [13]:
def alpaca_to_gpt(alpaca_dict):
    """Convert one Alpaca-style record into a chat-format record.

    The mapping is:
      * "instruction" -> a "system" message
      * "input"       -> a "user" message
      * "output"      -> an "assistant" message

    Each field is stripped of surrounding whitespace; a field that is
    missing, ``None`` or empty after stripping produces no message.

    Args:
        alpaca_dict: Mapping that may contain the string fields
            "instruction", "input" and "output".

    Returns:
        A dict of the form ``{"messages": [{"role": ..., "content": ...}, ...]}``
        with the messages in system, user, assistant order.
    """
    def _clean(key):
        # `or ""` covers both a missing key and an explicit null value,
        # which would otherwise fail on `None.strip()`.
        return (alpaca_dict.get(key) or "").strip()

    instruction = _clean("instruction")
    input_content = _clean("input")
    output = _clean("output")

    messages = []
    if instruction:
        messages.append({"role": "system", "content": instruction})
    if input_content:
        messages.append({"role": "user", "content": input_content})
    if output:
        messages.append({"role": "assistant", "content": output})

    return {"messages": messages}




# Load each Alpaca split from disk, then convert its records to the chat
# "messages" layout. The conversion happens after the file handle is closed.
with open("alpaca_train_data.json", "r", encoding="utf-8") as f:
    alpaca_train_data = json.load(f)
gpt_train_data = list(map(alpaca_to_gpt, alpaca_train_data))

with open("alpaca_valid_data.json", "r", encoding="utf-8") as f:
    alpaca_valid_data = json.load(f)
gpt_valid_data = list(map(alpaca_to_gpt, alpaca_valid_data))

with open("alpaca_test_data.json", "r", encoding="utf-8") as f:
    alpaca_test_data = json.load(f)
gpt_test_data = list(map(alpaca_to_gpt, alpaca_test_data))
    



In [14]:


# Persist the three converted splits.
# NOTE: despite the ".jsonl" extension, each file holds a single indented
# JSON array (json.dump of a list with indent=2), not one object per line.
for out_path, records in (
    ("oai_train_data.jsonl", gpt_train_data),
    ("oai_valid_data.jsonl", gpt_valid_data),
    ("oai_test_data.jsonl", gpt_test_data),
):
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

# Report how many records ended up in each split.
print(f"train: {len(gpt_train_data)}, valid: {len(gpt_valid_data)}, test: {len(gpt_test_data)}")

train: 949, valid: 126, test: 191


In [32]:
import json

def convert_pretty_json_to_jsonl(pretty_json_path, output_jsonl_path):
    """Rewrite a JSON array file as JSON Lines (one object per line).

    The input is read completely before the output is opened, so the
    output file is left untouched if the input cannot be parsed or is
    not an array.

    Args:
        pretty_json_path: Path to a UTF-8 file containing a single JSON array.
        output_jsonl_path: Path of the JSONL file to write (overwritten).

    Raises:
        ValueError: If the top-level JSON value is not an array. Iterating
            any other container (e.g. an object) would silently write its
            keys as records.
        json.JSONDecodeError: If the input is not valid JSON.
    """
    with open(pretty_json_path, "r", encoding="utf-8") as infile:
        data = json.load(infile)  # Load the entire JSON array

    if not isinstance(data, list):
        raise ValueError(
            f"{pretty_json_path}: expected a top-level JSON array, "
            f"got {type(data).__name__}"
        )

    with open(output_jsonl_path, "w", encoding="utf-8") as outfile:
        for obj in data:
            # ensure_ascii=False keeps non-ASCII text as written in the
            # source file instead of expanding it to \uXXXX escapes.
            json_line = json.dumps(obj, ensure_ascii=False)
            outfile.write(json_line + "\n")

    print(f"Converted to JSONL and saved to {output_jsonl_path}")
# Convert the train and validation splits to line-delimited JSON.
# The test split is not converted in this cell.
for source_path, target_path in (
    ('oai_train_data.jsonl', 'oai_train_data1.jsonl'),
    ('oai_valid_data.jsonl', 'oai_valid_data1.jsonl'),
):
    convert_pretty_json_to_jsonl(source_path, target_path)

Converted to JSONL and saved to oai_train_data1.jsonl
Converted to JSONL and saved to oai_valid_data1.jsonl


In [34]:
import json

def validate_openai_chat_jsonl(filepath):
    """Check that every line of a JSONL file is a chat-format record.

    For each line the following is verified:
      * the line parses as JSON and is a JSON object,
      * it has a "messages" key whose value is a list,
      * every message is a dict with a "role" in
        {"system", "user", "assistant"} and a non-empty string "content".

    Args:
        filepath: Path to the UTF-8 encoded JSONL file to check.

    Returns:
        A list of human-readable error strings (each prefixed with the
        1-based line number) if any problem was found; otherwise the
        string "✅ All lines are valid.".
    """
    errors = []
    valid_roles = {"system", "user", "assistant"}

    with open(filepath, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, start=1):
            try:
                # Try to parse the line as JSON
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"Line {line_num}: Invalid JSON - {e}")
                continue

            # A line can be valid JSON without being an object (e.g. `42`
            # or a bare string); report it instead of failing on the
            # membership test / subscript below.
            if not isinstance(obj, dict):
                errors.append(f"Line {line_num}: Expected a JSON object.")
                continue

            # Check for 'messages' key
            if "messages" not in obj:
                errors.append(f"Line {line_num}: Missing 'messages' key.")
                continue

            messages = obj["messages"]

            if not isinstance(messages, list):
                errors.append(f"Line {line_num}: 'messages' should be a list.")
                continue

            for i, msg in enumerate(messages):
                if not isinstance(msg, dict):
                    errors.append(f"Line {line_num}, message {i+1}: Message is not a dict.")
                    continue
                # Role and content are checked independently so that a
                # single message can report both problems.
                if "role" not in msg:
                    errors.append(f"Line {line_num}, message {i+1}: Missing 'role'.")
                elif msg["role"] not in valid_roles:
                    errors.append(f"Line {line_num}, message {i+1}: Invalid role '{msg['role']}'.")
                if "content" not in msg:
                    errors.append(f"Line {line_num}, message {i+1}: Missing 'content'.")
                elif not isinstance(msg["content"], str) or not msg["content"].strip():
                    errors.append(f"Line {line_num}, message {i+1}: 'content' must be a non-empty string.")

    return errors if errors else "✅ All lines are valid."

# Validate both converted files; `result` is left holding the outcome of
# the last file checked (the validation split).
for jsonl_path in ("oai_train_data1.jsonl", "oai_valid_data1.jsonl"):
    result = validate_openai_chat_jsonl(jsonl_path)
    print(result)


✅ All lines are valid.
✅ All lines are valid.


In [29]:
import os
from openai import OpenAI

# Build an API client from the notebook-level `api_key`.
client = OpenAI(api_key=api_key)

# Upload the converted training split in binary mode with purpose="fine-tune".
with open("oai_train_data1.jsonl", "rb") as f:
    file = client.files.create(file=f, purpose="fine-tune")

# NOTE(review): `file` / `file_id` are reassigned by the validation-upload
# cell, so copy the printed ID somewhere before running that cell.
file_id = file.id
print(f"Uploaded File ID: {file_id}")


Uploaded File ID: file-1UazESknwdHCdJatYYZJpW


In [35]:
# Re-create the API client from the notebook-level `api_key`.
client = OpenAI(api_key=api_key)

# Upload the converted validation split in binary mode with purpose="fine-tune".
with open("oai_valid_data1.jsonl", "rb") as f:
    file = client.files.create(file=f, purpose="fine-tune")

# NOTE(review): this overwrites the `file` / `file_id` values set by the
# training-upload cell; after this cell they refer to the validation file.
file_id = file.id
print(f"Uploaded File ID: {file_id}")


Uploaded File ID: file-VMvsbXQeDCQN5yKF6xEh5j


In [54]:


# Re-create the API client from the notebook-level `api_key`.
client = OpenAI(api_key=api_key)

# Start a fine-tuning job. The two file IDs are hardcoded; they match the
# IDs printed by the training and validation upload cells in the recorded
# run and must be replaced by hand after any re-upload.
response = client.fine_tuning.jobs.create(
    model="gpt-4o-mini-2024-07-18",
    training_file="file-1UazESknwdHCdJatYYZJpW", 
    validation_file="file-VMvsbXQeDCQN5yKF6xEh5j",
    suffix="smart-4o-mini"
)

# Show the job's ID and its status as returned by the create call.
print(f"Job ID: {response.id}")
print(f"Status: {response.status}")
# Values printed by the recorded run, kept for reference:
# Job ID: ftjob-f7ObGrkv3sAziHeJP2wVW5hI
# Status: validating_files

Job ID: ftjob-f7ObGrkv3sAziHeJP2wVW5hI
Status: validating_files


In [None]:
# Values printed when the job was created, kept for reference:
# Job ID: ftjob-f7ObGrkv3sAziHeJP2wVW5hI
# Status: validating_files

# List fine-tuning jobs and print each job's ID and status.
# Note: this rebinds `response`, replacing the job-creation result.
response = client.fine_tuning.jobs.list()
for job in response.data:
    print(job.id, job.status)


In [55]:
# Print the IDs of the models returned by the API whose ID starts with
# "gpt-4o-mini".
for model in client.models.list():
    if model.id.startswith("gpt-4o-mini"):
        print(model.id)

gpt-4o-mini-audio-preview
gpt-4o-mini-realtime-preview
gpt-4o-mini-realtime-preview-2024-12-17
gpt-4o-mini-search-preview
gpt-4o-mini-search-preview-2025-03-11
gpt-4o-mini-tts
gpt-4o-mini-2024-07-18
gpt-4o-mini
gpt-4o-mini-audio-preview-2024-12-17
gpt-4o-mini-transcribe
