In [None]:
# Log this notebook session in to the Hugging Face Hub (interactive prompt).
from huggingface_hub import notebook_login
notebook_login()

In [None]:
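# Disabled alternative: load AI-MO/NuminaMath-CoT instead of MetaMathQA.
# NOTE(review): the later cells read MetaMathQA column names; verify the column
# names of this dataset before switching.
# from datasets import load_dataset

# # Load dataset into a Colab-compatible cache directory
# ds = load_dataset("AI-MO/NuminaMath-CoT", cache_dir="/content/hf_cache")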

In [None]:
!pip install -U datasets huggingface_hub

In [None]:
from datasets import load_dataset

# Fetch MetaMathQA, keeping the downloaded files in a Colab-compatible cache directory
ds = load_dataset(
    path="meta-math/MetaMathQA",
    cache_dir="/content/hf_cache",
)


In [None]:
# Inspect what was loaded before sampling from it.
print(ds)

# Step 2: Downsample, then split into train and validation

In [None]:
# Sampling configuration, named so the split can be tuned and reproduced in one place.
SEED = 42
N_SAMPLES = 3000
VAL_FRACTION = 0.1

# Step 1: Shuffle and downsample. Never request more rows than the split holds,
# because `select` raises on out-of-range indices.
full_train = ds["train"]
sampled_ds = full_train.shuffle(seed=SEED).select(range(min(N_SAMPLES, len(full_train))))

# Step 2: Split into train / validation (90% / 10% with the values above)
split_ds = sampled_ds.train_test_split(test_size=VAL_FRACTION, seed=SEED)

train_data = split_ds["train"]
val_data = split_ds["test"]


# Convert dataset to JSON

In [None]:
import pandas as pd
import json

def save_openai_format(
    dataset_split,
    output_file,
    question_key="query",
    answer_key="response",
    system_prompt="You are a math tutor. Solve the user's problem step-by-step.",
):
    """Write a dataset split as OpenAI chat fine-tuning JSONL.

    Every row becomes one line containing a ``{"messages": [...]}`` object with
    a system, a user and an assistant message, in that order.

    Args:
        dataset_split: Iterable of mapping-like rows (a ``datasets.Dataset`` or
            a plain list of dicts both work).
        output_file: Path of the JSONL file to create (overwritten if present).
        question_key: Column used as the user message. Defaults to ``"query"``
            because, per the MetaMathQA dataset card, ``response`` answers the
            augmented ``query`` and not the seed ``original_question``; pairing
            ``original_question`` with ``response`` produces mismatched examples
            for the augmentation types that rewrite the question.
        answer_key: Column used as the assistant message.
        system_prompt: Text of the system message put in front of every example.

    Raises:
        KeyError: If a row lacks ``question_key`` or ``answer_key``.
    """
    system_message = {"role": "system", "content": system_prompt}

    with open(output_file, "w", encoding="utf-8") as f:
        for example in dataset_split:
            formatted = {
                "messages": [
                    system_message,
                    {"role": "user", "content": example[question_key]},
                    {"role": "assistant", "content": example[answer_key]},
                ]
            }
            # One JSON object per line, as the JSONL format requires.
            f.write(json.dumps(formatted) + "\n")

    print(f"✅ Saved to {output_file}")

# Step 4: Save to JSONL, one file per split
for split_rows, jsonl_path in (
    (train_data, "training_data_math.jsonl"),
    (val_data, "validation_data_math.jsonl"),
):
    save_openai_format(split_rows, jsonl_path)


# Set Up the API Client and Upload the Training Files

In [None]:
import os
from getpass import getpass

# Never store the key in the notebook. Read it from the environment and fall back
# to a hidden interactive prompt, so the secret does not end up in saved cells.
open_ai_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [None]:
import os
from openai import OpenAI
from time import sleep

# Initialize the OpenAI client that the upload, fine-tuning and chat cells use
client = OpenAI(api_key = open_ai_key)

In [None]:
def upload_training_file(file_path):
    """Send a local file to OpenAI with purpose "fine-tune" and return its file ID."""
    with open(file_path, "rb") as fh:
        uploaded = client.files.create(file=fh, purpose="fine-tune")
    return uploaded.id

In [None]:
# Upload both splits and keep the returned IDs for the fine-tuning job
training_file_id = upload_training_file("training_data_math.jsonl")
validation_file_id = upload_training_file("validation_data_math.jsonl")

print(f"Training File ID: {training_file_id}")
print(f"Validation File ID: {validation_file_id}")

# Start Fine-Tuning Job

In [None]:
def create_fine_tuning_job(training_file_id, validation_file_id=None, model="gpt-4o-mini-2024-07-18"):
    """Start a fine-tuning job for ``model`` on the given files and return the job ID."""
    job_request = {
        "training_file": training_file_id,
        "validation_file": validation_file_id,
        "model": model,
    }
    return client.fine_tuning.jobs.create(**job_request).id

# Kick off the job with the two uploaded files
job_id = create_fine_tuning_job(training_file_id, validation_file_id=validation_file_id)
print(f"Fine-tuning Job ID: {job_id}")


# Monitor the Job

In [None]:
import time

def monitor_job(job_id, poll_interval=30, api_client=None):
    """Poll a fine-tuning job until it reaches a terminal status.

    Prints the job status on every poll and each job event once.

    Args:
        job_id: ID of the fine-tuning job to watch.
        poll_interval: Seconds to sleep between polls.
        api_client: Client exposing ``fine_tuning.jobs``; defaults to the
            notebook-level ``client``.

    Returns:
        The job object as last retrieved, whose ``status`` is one of
        ``"succeeded"``, ``"failed"`` or ``"cancelled"``.
    """
    api = client if api_client is None else api_client
    # "cancelled" is terminal too; without it a cancelled job is polled forever.
    terminal_statuses = {"succeeded", "failed", "cancelled"}
    seen_event_ids = set()

    while True:
        job = api.fine_tuning.jobs.retrieve(job_id)
        print(f"Status: {job.status}")

        if job.status in terminal_statuses:
            return job

        events = api.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=5)
        # NOTE(review): the API appears to list the newest events first, so the
        # page is reversed for chronological output; confirm against the API docs.
        # Events already printed by an earlier poll are skipped.
        for event in reversed(events.data):
            if event.id in seen_event_ids:
                continue
            seen_event_ids.add(event.id)
            print(f"Event: {event.message}")
        time.sleep(poll_interval)

job = monitor_job(job_id)

# Always rebind the name, so an unsuccessful run cannot leave a model ID from an
# earlier run (or no name at all) in scope for the test and evaluation cells.
# NOTE(review): expected to be None unless the job succeeded; confirm in the API docs.
fine_tuned_model = job.fine_tuned_model

if job.status == "succeeded":
    print(f"🎯 Fine-tuned model ID: {fine_tuned_model}")
else:
    print("❌ Fine-tuning failed.")
    print(f"Final status: {job.status}")
    print(f"Error details: {getattr(job, 'error', None)}")


In [None]:
# Re-check the job started above. Use `job_id` rather than a pasted literal: a
# hard-coded ID only exists in the account that created it and goes stale on re-run.
job = client.fine_tuning.jobs.retrieve(job_id)
print("📌 Job Status:", job.status)
print("📦 Model ID:", job.fine_tuned_model)


In [None]:
# List recent events of the job started above. `job_id` replaces a pasted literal
# that pointed at a different, account-specific job.
events = client.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=10)
for event in events.data:
    print(f"[{event.created_at}] {event.message}")

# Test the Model

In [None]:
def test_model(model_id, question):
    response = client.chat.completions.create(
        model=model_id,
        messages=[
            {"role": "system", "content": "You are a math tutor."},
            {"role": "user", "content": question}
        ]
    )
    return response.choices[0].message.content.strip()

In [None]:
# Smoke test: ask the fine-tuned model a simple function-evaluation question.
result = test_model(fine_tuned_model, "If f(x) = 2x + 3, what is f(4)?")
print(result)

In [None]:
# Calculus check: ask for a definite integral ("integration" in the mathematical
# sense, not a software integration test).
result2 = test_model(fine_tuned_model, "Evaluate the definite integral of the function f(x) = 3x^2 + 2x + 1 from x = 0 to x = 2. Show your steps clearly.")
print("Integration result:", result2)

# Evaluation Phase

In [None]:
!pip install nltk rouge-score matplotlib

In [None]:
import nltk
# Fetch NLTK tokenizer data before the evaluation cell calls `word_tokenize`.
# NOTE(review): both "punkt" and "punkt_tab" are downloaded, presumably to cover
# different NLTK versions; confirm which one the installed version needs.
nltk.download("punkt")
nltk.download('punkt_tab')

In [None]:
import json
import re
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from openai import OpenAI

# Smoothing function passed to every sentence-level BLEU computation below.
smoothie = SmoothingFunction().method4

def evaluate_model(model_id, test_data, max_samples=100, question_key="query", reference_key="response"):
    """Score a model's answers against reference answers with BLEU and ROUGE.

    For up to ``max_samples`` rows, the question is sent to the model through
    ``test_model`` and the reply is compared with the reference text.

    Args:
        model_id: Name or ID of the chat model to evaluate.
        test_data: Dataset supporting ``len()`` and ``select(indices)`` whose
            rows contain ``question_key`` and ``reference_key``.
        max_samples: Upper bound on the number of rows evaluated.
        question_key: Column sent to the model. Defaults to ``"query"`` because,
            per the MetaMathQA dataset card, ``response`` answers the augmented
            ``query`` rather than ``original_question``; scoring replies to
            ``original_question`` against ``response`` compares answers to
            different questions for part of the data.
        reference_key: Column holding the reference answer.

    Returns:
        Three lists of per-example scores: ``(bleu, rouge1_f, rougeL_f)``.
    """
    bleu_scores, rouge1_scores, rougeL_scores = [], [], []
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

    # `select` raises on out-of-range indices, so cap the request at the split size.
    sample_count = min(max_samples, len(test_data))

    for i, item in enumerate(test_data.select(range(sample_count))):
        question = item[question_key]
        reference = item[reference_key]

        prediction = test_model(model_id, question)

        # Tokenization
        reference_tokens = word_tokenize(reference)
        prediction_tokens = word_tokenize(prediction)

        # BLEU
        bleu = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smoothie)
        bleu_scores.append(bleu)

        # ROUGE (F-measure of unigram overlap and of longest common subsequence)
        rouge = scorer.score(reference, prediction)
        rouge1_scores.append(rouge['rouge1'].fmeasure)
        rougeL_scores.append(rouge['rougeL'].fmeasure)

        # Truncate long texts so the per-example log stays readable.
        print(f"\nExample {i+1}")
        print("Q:", question)
        print("Expected:", reference[:200])
        print("Predicted:", prediction[:200])
        print(f"BLEU: {bleu:.4f}, ROUGE-1: {rouge['rouge1'].fmeasure:.4f}, ROUGE-L: {rouge['rougeL'].fmeasure:.4f}")

    return bleu_scores, rouge1_scores, rougeL_scores


In [None]:
from datasets import load_dataset

# Evaluate on the validation split created earlier instead of re-deriving it here.
# A second copy of the shuffle/select/split code only matches the first while both
# keep identical seeds and sizes; a silent drift would put training examples into
# the evaluation set.
test_data = val_data

# Run evaluation
bleu_scores, rouge1_scores, rougeL_scores = evaluate_model(fine_tuned_model, test_data, max_samples=100)


In [None]:
def plot_metric(scores, title):
    """Show a 20-bin histogram of ``scores`` under the given title."""
    _, ax = plt.subplots(figsize=(8, 4))
    ax.hist(scores, bins=20, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel("Score")
    ax.set_ylabel("Frequency")
    ax.grid(True)
    plt.show()

# One histogram per metric
for metric_scores, metric_title in (
    (bleu_scores, "BLEU Score Distribution"),
    (rouge1_scores, "ROUGE-1 Score Distribution"),
    (rougeL_scores, "ROUGE-L Score Distribution"),
):
    plot_metric(metric_scores, metric_title)
