<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/distillation_openai_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install openai -q
!pip install python-dotenv -q
!pip install tiktoken -q
!pip install colab-env -q

In [6]:
# NOTE(review): presumably importing colab_env is what populates os.environ
# (including OPENAI_API_KEY) in Colab -- confirm. Without it this cell only
# works when a later cell has already been run, i.e. it fails on
# Restart & Run All.
import colab_env  # Only needed if running in Google Colab
from openai import OpenAI

# With no explicit api_key the client is configured from the environment.
client = OpenAI()

# List the models available to this account; the bare last expression
# renders the list as the cell output.
modellist = client.models.list()
modellist.data

[Model(id='gpt-4o-audio-preview-2024-12-17', created=1734034239, object='model', owned_by='system'),
 Model(id='gpt-4o-realtime-preview-2024-12-17', created=1733945430, object='model', owned_by='system'),
 Model(id='dall-e-3', created=1698785189, object='model', owned_by='system'),
 Model(id='dall-e-2', created=1698798177, object='model', owned_by='system'),
 Model(id='gpt-4o-audio-preview-2024-10-01', created=1727389042, object='model', owned_by='system'),
 Model(id='gpt-4-0314', created=1687882410, object='model', owned_by='openai'),
 Model(id='gpt-4o-realtime-preview-2024-10-01', created=1727131766, object='model', owned_by='system'),
 Model(id='gpt-4o-transcribe', created=1742068463, object='model', owned_by='system'),
 Model(id='gpt-4o-mini-transcribe', created=1742068596, object='model', owned_by='system'),
 Model(id='gpt-4o-realtime-preview', created=1727659998, object='model', owned_by='system'),
 Model(id='babbage-002', created=1692634615, object='model', owned_by='system'),
 

In [13]:
import openai
import json
import time
import io
import os
import colab_env  # Only needed if running in Google Colab

# --- Configuration ---
# Teacher: the model whose answers are collected as training targets.
teacher_model_name = "gpt-4o"
# Student: the model that gets fine-tuned on the teacher's answers
# (pinned to a specific version).
student_model_name = "gpt-3.5-turbo-0125"

# File name for the distillation JSONL. The pipeline visible in this notebook
# uploads the data from memory and does not read this value.
distillation_dataset_path = "distillation_data.jsonl"

# Teacher sampling: completions requested per prompt, and their temperature.
num_teacher_responses = 1
teacher_response_temperature = 0.7

# Fine-tuning hyperparameters. The job-creation step visible in this notebook
# submits only the training file and the model, so these are not applied.
fine_tuning_epochs = 3
fine_tuning_learning_rate_multiplier = None

# --- In-memory dataset ---
# Raw prompts; each one becomes a {"prompt": ...} record below.
_prompts = (
    "What is the capital of Canada?",
    "Explain the basics of quantum entanglement.",
    "Write a very short story set in Montreal in the springtime.",
    "Translate 'Thank you very much' into French.",
    "Summarize the key points of the latest IPCC report on climate change.",
    "Suggest three fun activities to do in Montreal this weekend.",
    "What is the best restaurant in Montreal?",
    "Write a haiku about a cat.",
    "Explain the difference between a list and a tuple in Python.",
    "What are the benefits of using AI for flight planning?",
    "If Newton, Galileo, Einstein, and Hinton met, what would they discuss about AI?",
    "Give me a Gemini 2.0 title for a blog post about AI in 2025.",
)
in_memory_dataset = [{"prompt": text} for text in _prompts]

# --- Initialize OpenAI Client ---
# Read the key from the environment and fail fast, with a clear message,
# when it is missing or empty.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("The OPENAI_API_KEY environment variable must be set.")

# Pass the key explicitly to the client used by every step below.
client = openai.OpenAI(api_key=openai_api_key)

# --- Step 1: Generate Teacher Outputs from In-Memory Data ---
# For every prompt, request `num_teacher_responses` completions from the
# teacher model and keep each one as a chat-format example:
#   {"messages": [{"role": "user", ...}, {"role": "assistant", ...}]}
print(f"Generating teacher outputs from '{teacher_model_name}' (in-memory data)...")
teacher_data_count = 0
distillation_examples = []  # one example dict per teacher response

for item in in_memory_dataset:
    prompt = item.get("prompt")
    if not prompt:
        print(f"Warning: Skipping item due to missing 'prompt': {item}")
        continue

    try:
        responses = client.chat.completions.create(
            model=teacher_model_name,
            messages=[{"role": "user", "content": prompt}],
            n=num_teacher_responses,
            temperature=teacher_response_temperature,
        )
        for response in responses.choices:
            teacher_output = response.message.content
            if not teacher_output:
                # A missing/empty completion would serialise as null or "" and
                # make a useless (or invalid) training line, so drop it.
                print(f"Warning: Skipping empty teacher response for prompt '{prompt}'")
                continue
            distillation_examples.append({
                "messages": [
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": teacher_output},
                ]
            })
            teacher_data_count += 1
        time.sleep(0.1)  # Be mindful of rate limits
    except Exception as e:
        # Best effort: one failing prompt must not abort the whole run.
        print(f"An error occurred while processing prompt '{prompt}': {e}")

print(f"Generated {teacher_data_count} teacher responses.")

if teacher_data_count == 0:
    # Raise instead of calling exit(): in a notebook kernel exit() asks the
    # kernel to shut down rather than raising, so it does not reliably stop
    # the remaining statements of the cell from running.
    raise RuntimeError("No teacher data was generated. Please check your in-memory dataset.")

# --- Step 2: Prepare Distillation Data in Memory (Strict JSONL Format) ---
# One JSON object per line, UTF-8 encoded. The examples collected above already
# have the {"messages": [...]} shape, so each one is serialised as-is.
distillation_data = "\n".join(
    json.dumps(example) for example in distillation_examples
).encode("utf-8")

# --- Step 3: Upload Distillation Dataset (from in-memory buffer) for Fine-tuning ---
# Send the JSONL bytes to the Files API and keep the returned file ID, which
# the fine-tuning job needs as its training file.
print("Uploading distillation dataset for fine-tuning...")
try:
    file_create_response = client.files.create(
        file=io.BytesIO(distillation_data),
        purpose="fine-tune"
    )
except Exception as e:
    print(f"Error uploading file: {e}")
    # Re-raise instead of calling exit(): in a notebook kernel exit() does not
    # reliably halt the cell, and the next step would then run without a
    # training_file_id.
    raise
else:
    training_file_id = file_create_response.id
    print(f"Training file uploaded with ID: {training_file_id}")

# --- Step 4: Initiate Fine-tuning Job (Minimal Parameters) ---
# Submit a fine-tuning job for the student model using the uploaded file.
# Only the required parameters are sent; no hyperparameters are passed.
print(f"Initiating fine-tuning of '{student_model_name}'...")
fine_tuning_params = {
    "training_file": training_file_id,
    "model": student_model_name,
}

try:
    fine_tuning_job = client.fine_tuning.jobs.create(**fine_tuning_params)
except Exception as e:
    # Report the failure and stop here: the success messages below must not be
    # printed when no job was created.
    print(f"Error initiating fine-tuning job: {e}")
else:
    fine_tuning_job_id = fine_tuning_job.id
    print(f"Fine-tuning job created with ID: {fine_tuning_job.id}")
    print("You can monitor the status of your fine-tuning job in the OpenAI dashboard or using the command:")
    print(f"`openai api fine_tuning.jobs.retrieve {fine_tuning_job_id}`")

    print("Distillation process initiated. Check the OpenAI dashboard for progress.")

    # The hint below refers to a created job that later fails, so it is only
    # shown when a job actually exists.
    print("\nIf this still fails, PLEASE check the OpenAI dashboard for the EXACT error message for the failed job and share it. That's the most crucial piece of information for fixing this.")

Generating teacher outputs from 'gpt-4o' (in-memory data)...
Generated 12 teacher responses.
Uploading distillation dataset for fine-tuning...
Training file uploaded with ID: file-Qmzk5i34MSP2oED75uxsui
Initiating fine-tuning of 'gpt-3.5-turbo-0125'...
Fine-tuning job created with ID: ftjob-ag9h9X2ORD37QF9V7zeah5BY
You can monitor the status of your fine-tuning job in the OpenAI dashboard or using the command:
`openai api fine_tuning.jobs.retrieve ftjob-ag9h9X2ORD37QF9V7zeah5BY`
Distillation process initiated. Check the OpenAI dashboard for progress.

If this still fails, PLEASE check the OpenAI dashboard for the EXACT error message for the failed job and share it. That's the most crucial piece of information for fixing this.


In [16]:
# List the account's models again after the fine-tuning job was submitted.
# Reuses the `client` created earlier in the notebook instead of re-importing
# and re-instantiating it in a copy-pasted cell.
modellist = client.models.list()
modellist.data

[Model(id='gpt-4o-audio-preview-2024-12-17', created=1734034239, object='model', owned_by='system'),
 Model(id='gpt-4o-realtime-preview-2024-12-17', created=1733945430, object='model', owned_by='system'),
 Model(id='dall-e-3', created=1698785189, object='model', owned_by='system'),
 Model(id='dall-e-2', created=1698798177, object='model', owned_by='system'),
 Model(id='gpt-4o-audio-preview-2024-10-01', created=1727389042, object='model', owned_by='system'),
 Model(id='gpt-4-0314', created=1687882410, object='model', owned_by='openai'),
 Model(id='gpt-4o-realtime-preview-2024-10-01', created=1727131766, object='model', owned_by='system'),
 Model(id='gpt-4o-transcribe', created=1742068463, object='model', owned_by='system'),
 Model(id='gpt-4o-mini-transcribe', created=1742068596, object='model', owned_by='system'),
 Model(id='gpt-4o-realtime-preview', created=1727659998, object='model', owned_by='system'),
 Model(id='babbage-002', created=1692634615, object='model', owned_by='system'),
 

In [17]:
import datetime

# Unix timestamp (seconds since the epoch) to make human-readable.
# NOTE(review): presumably the `created` value of an API object such as the
# fine-tuning job -- confirm where this number comes from.
timestamp = 1743100174

# Convert with an explicit UTC timezone. Without `tz`, fromtimestamp() returns
# a naive datetime in the kernel's local timezone, so the printed value would
# differ from machine to machine.
dt_object = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc)

print(dt_object)

2025-03-27 18:29:34
