## Before you begin

### Installation

In [47]:
%pip install requests azure-identity openai

Note: you may need to restart the kernel to use updated packages.


### Parameters

In [None]:
# Base model to fine-tune; passed as `model` when the fine-tuning job is created.
base_model = "gpt-4.1-mini-2025-04-14"

# JSONL file uploaded with purpose="fine-tune" and used as the training file of the job.
train_data_file = "Data/stock-train-hallucination_tools.jsonl"
# JSONL file uploaded with purpose="evals" and used as the data source of the evaluation runs.
eval_data_file = "Data/stock-test-hallucination_query_format.jsonl"

### Init OpenAI Client

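The client is created with `OpenAI()` after `load_dotenv()`, so it reads its credentials and endpoint from environment variables (for example `OPENAI_API_KEY`) defined in your shell or in a local `.env` file. If your endpoint relies on Azure token authentication instead, make sure to run `az login` in your terminal first.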

In [None]:
from dotenv import load_dotenv
from openai import OpenAI

# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# No arguments are passed, so the client takes its configuration from the environment.
client = OpenAI()

## Prepare your data

### Preview training data

In [4]:
# Preliminary check of the training file: load every record and print the first conversation.

import json

# Load the training set
from pathlib import Path

# The path is relative, so the current directory is assumed to be the root of your repository.
with Path(train_data_file).open("r", encoding="utf-8") as f:
    # Skip blank lines (e.g. an extra newline at the end of the file); json.loads
    # raises JSONDecodeError on an empty string.
    training_dataset = [json.loads(line) for line in f if line.strip()]


# Training dataset stats
print("Number of examples in training set:", len(training_dataset))
# Guard the preview so an empty file reports 0 examples instead of raising IndexError.
if training_dataset:
    print("First example in training set:")
    for message in training_dataset[0]["messages"]:
        print(message)

Number of examples in training set: 96
First example in training set:
{'role': 'system', 'content': "Don't make assumptions about what values to plug into functions. If you can't find the exact stock ticker symbol, you can ask for clarification. "}
{'role': 'user', 'content': "What was the highest price that Bank of America's stock reached last month?"}
{'role': 'assistant', 'tool_calls': [{'id': 'call_333605', 'type': 'function', 'function': {'name': 'get_last_nday_stock_price', 'arguments': '{"symbol": "BAC", "period": "1mo"}'}}]}


### Upload data to Azure OpenAI

In [5]:
from pathlib import Path

# Upload the training dataset file with the SDK; purpose="fine-tune" is sent along with the file.
with Path(train_data_file).open("rb") as file:
    training_response = client.files.create(file=file, purpose="fine-tune")

# Keep the returned file ID: it is passed as `training_file` when the fine-tuning job is created.
training_file_id = training_response.id

print("Training file ID:", training_file_id)

Training file ID: file-DN7WK8S1SsfNQ4K9dcX6mQ


## Finetune

### Submit finetuning job

In [6]:
# Create a fine-tuning job from the uploaded training file and the configured base model.
# NOTE(review): an earlier comment here stated that Azure OpenAI model names cannot contain
# dot/period characters, yet the configured base_model does contain dots — confirm the name
# is valid for the endpoint you are targeting.
response = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    model=base_model,  # Base model name defined in the Parameters cell.
)

# Remember the job ID; the following cells use it to retrieve the job.
job_id = response.id

# You can use the job ID to monitor the status of the fine-tuning job.
# The fine-tuning job will take some time to start and complete.

print("Job ID:", response.id)
print("Status:", response.status)
print(response.model_dump_json(indent=2))

Job ID: ftjob-kWszFvhCRHubbliaZPcUej4Q
Status: validating_files
{
  "id": "ftjob-kWszFvhCRHubbliaZPcUej4Q",
  "created_at": 1756165769,
  "error": {
    "code": null,
    "message": null,
    "param": null
  },
  "fine_tuned_model": null,
  "finished_at": null,
  "hyperparameters": {
    "batch_size": "auto",
    "learning_rate_multiplier": "auto",
    "n_epochs": "auto"
  },
  "model": "gpt-4.1-mini-2025-04-14",
  "object": "fine_tuning.job",
  "organization_id": "org-hTGGKhbVgQIQFEXlrcWgLHk9",
  "result_files": [],
  "seed": 258050106,
  "status": "validating_files",
  "trained_tokens": null,
  "training_file": "file-DN7WK8S1SsfNQ4K9dcX6mQ",
  "validation_file": null,
  "estimated_finish": null,
  "integrations": [],
  "metadata": null,
  "method": {
    "type": "supervised",
    "dpo": null,
    "reinforcement": null,
    "supervised": {
      "hyperparameters": {
        "batch_size": "auto",
        "learning_rate_multiplier": "auto",
        "n_epochs": "auto"
      }
    }
  },


### Track training job status

In [None]:
# Track training status

from IPython.display import clear_output
import time

# Statuses after which the job will not change any more. "cancelled" must be
# listed too, otherwise a cancelled job would keep this cell polling forever.
TERMINAL_JOB_STATUSES = ("succeeded", "failed", "cancelled")
# Seconds to wait between two status requests.
JOB_POLL_INTERVAL_SECONDS = 10

start_time = time.time()

# Get the status of our fine-tuning job.
response = client.fine_tuning.jobs.retrieve(job_id)

status = response.status

# If the job isn't done yet, poll it every JOB_POLL_INTERVAL_SECONDS seconds.
while status not in TERMINAL_JOB_STATUSES:
    time.sleep(JOB_POLL_INTERVAL_SECONDS)

    response = client.fine_tuning.jobs.retrieve(job_id)
    print(response.model_dump_json(indent=2))
    # Read the clock once so minutes and seconds are derived from the same instant.
    elapsed_minutes, elapsed_seconds = divmod(int(time.time() - start_time), 60)
    print(f"Elapsed time: {elapsed_minutes} minutes {elapsed_seconds} seconds")
    status = response.status
    print(f"Status: {status}")
    # wait=True defers clearing until new output arrives, so the latest snapshot stays visible.
    clear_output(wait=True)

print(f"Fine-tuning job {job_id} finished with status: {status}")

# List all fine-tuning jobs for this resource.
# NOTE(review): `response.data` presumably holds only the first page of results,
# so the count below may be lower than the real total — confirm against the SDK.
print("Checking other fine-tune jobs for this resource.")
response = client.fine_tuning.jobs.list()
print(f"Found {len(response.data)} fine-tune jobs.")

{
  "id": "ftjob-kWszFvhCRHubbliaZPcUej4Q",
  "created_at": 1756165769,
  "error": {
    "code": null,
    "message": null,
    "param": null
  },
  "fine_tuned_model": null,
  "finished_at": 1756166715,
  "hyperparameters": {
    "batch_size": 1,
    "learning_rate_multiplier": 2.0,
    "n_epochs": 3
  },
  "model": "gpt-4.1-mini-2025-04-14",
  "object": "fine_tuning.job",
  "organization_id": "org-hTGGKhbVgQIQFEXlrcWgLHk9",
  "result_files": [],
  "seed": 258050106,
  "status": "running",
  "trained_tokens": null,
  "training_file": "file-DN7WK8S1SsfNQ4K9dcX6mQ",
  "validation_file": null,
  "estimated_finish": null,
  "integrations": [],
  "metadata": null,
  "method": {
    "type": "supervised",
    "dpo": null,
    "reinforcement": null,
    "supervised": {
      "hyperparameters": {
        "batch_size": 1,
        "learning_rate_multiplier": 2.0,
        "n_epochs": 3
      }
    }
  },
  "user_provided_suffix": null,
  "usage_metrics": null,
  "shared_with_openai": false,
  "ev

### Retrieve finetuned model name

In [None]:
# Retrieve fine_tuned_model name
# To resume from a previously submitted job (e.g. in a fresh kernel), uncomment and set its ID:
# job_id = "ftjob-kWszFvhCRHubbliaZPcUej4Q"

response = client.fine_tuning.jobs.retrieve(job_id)

print(response.model_dump_json(indent=2))
# NOTE(review): the job dumps recorded earlier in this notebook show "fine_tuned_model": null
# while the job is still in progress, so this value may be None unless the job has succeeded —
# confirm before passing it to the evaluation runs.
fine_tuned_model = response.fine_tuned_model

NotFoundError: Error code: 404 - {'error': {'code': 'notFound', 'message': 'The specified entity cannot be found.'}}

## Evaluate

### Prepare Test Data

In [121]:
from pathlib import Path

# Upload the evaluation dataset file with the SDK; purpose="evals" is sent along with the file.
with Path(eval_data_file).open("rb") as file:
    eval_response = client.files.create(file=file, purpose="evals")

# Keep the returned file ID: the evaluation data source refers to the file by this ID.
eval_file_id = eval_response.id

print("Evaluation file ID:", eval_file_id)

Evaluation file ID: file-916415237d2d47daa74270298cd4446c


### Prepare Prompt for Data Sampling

In [122]:
# System message sent with every sampled query (note the trailing space, kept on purpose
# so the text is exactly the one used in the training examples shown above).
system_prompt = (
    "Don't make assumptions about what values to plug into functions. "
    "If you can't find the exact stock ticker symbol, you can ask for clarification. "
)


def _function_tool(name, required):
    """Build one function-tool definition with an empty `properties` map.

    Args:
        name: Function name exposed to the model.
        required: Names of the arguments listed under `required`.

    Returns:
        A dict in the `{"type": "function", "function": {...}}` tool format.
    """
    parameters = {"type": "object", "properties": {}, "required": list(required)}
    return {"type": "function", "function": {"name": name, "parameters": parameters}}


# The two stock-price functions the model may call during sampling.
tools = [
    _function_tool("get_current_stock_price", ["symbol"]),
    _function_tool("get_last_nday_stock_price", ["symbol", "period"]),
]

### Define python grader

In [None]:
# Source code of the Python grader, sent as a string when the eval is created.
# It has to be self-contained: everything it uses (including `json`) must be
# imported inside the string, because this notebook's imports are not part of it.
# Scores: 1.0 = tool calls match; 0.0 = only one side has tool calls;
# 0.11 / 0.12 / 0.13 / 0.14 = mismatch in count / function name / parsed
# arguments / raw argument strings. All mismatch scores are below 0.5.
python_grader = """
import json


def grade(sample, item) -> float:
    # Entry point: compare the sampled tool calls with the expected ones.
    # .get() keeps a sample without an 'output_tools' key from raising KeyError;
    # it is then treated like a sample with no tool calls.
    actual_tool_calls = sample.get('output_tools')
    expected_tool_calls = item['expected_tool_calls']
    return grade_tool_calls(actual_tool_calls, expected_tool_calls)


def grade_tool_calls(actual_tool_calls, expected_tool_calls):
    # Case 1: Both are empty (None or empty list)
    if (not actual_tool_calls) and (not expected_tool_calls):
        return 1.0

    # Case 2: One is empty, one is not
    if (not actual_tool_calls) != (not expected_tool_calls):
        return 0.0

    # Case 3: Both are not empty - check if function objects are equal
    return compare_function_calls(actual_tool_calls, expected_tool_calls)


def compare_function_calls(actual_calls, expected_calls):
    if len(actual_calls) != len(expected_calls):
        return 0.11

    for actual, expected in zip(actual_calls, expected_calls):
        # Compare function name and arguments, ignore call ID
        actual_func = actual.get("function", {})
        expected_func = expected.get("function", {})

        # Compare function name
        if actual_func.get("name") != expected_func.get("name"):
            return 0.12

        # Compare function arguments (parse JSON strings if needed)
        actual_args = actual_func.get("arguments", "{}")
        expected_args = expected_func.get("arguments", "{}")

        try:
            actual_args_dict = json.loads(actual_args) if isinstance(actual_args, str) else actual_args
            expected_args_dict = json.loads(expected_args) if isinstance(expected_args, str) else expected_args

            if actual_args_dict != expected_args_dict:
                return 0.13
        except json.JSONDecodeError:
            # If we can't parse, do string comparison
            if actual_args != expected_args:
                return 0.14

    return 1.0
"""


### Create Stock Hallucination Evaluation

In [124]:
# Create the evaluation definition: a custom data source schema plus one Python grader.
stock_eval = client.evals.create(
    name="Stock Hallucination Eval",
    data_source_config={
        "type": "custom",
        # Each dataset row must provide the user query and the tool calls expected for it.
        "item_schema": {
            "type": "object",
            "properties": {
                "query": {"type": "string"},
                "expected_tool_calls": {"type": "array"},
            },
            "required": ["query", "expected_tool_calls"],
        },
        # NOTE(review): presumably required so the grader receives the `sample` argument — confirm.
        "include_sample_schema": True,
    },
    testing_criteria=[
        {
            "type": "python",
            "name": "Tool Use Evaluator",
            # Grader source string defined in the "Define python grader" cell.
            "source": python_grader,
            # The grader returns 1.0 for a match and at most 0.14 otherwise.
            "pass_threshold": 0.5,
        }
    ],
)

### Define data source

In [125]:
# Data source shared by all sampling runs; each run adds its own "model" entry on top of it.
data_source_template = {
    "type": "completions",
    # Rows come from the evaluation file uploaded earlier.
    "source": {"type": "file_id", "id": eval_file_id},
    "input_messages": {
        "type": "template",
        "template": [
            {
                "type": "message",
                "role": "system",
                "content": {"type": "input_text", "text": system_prompt},
            },
            {
                "type": "message",
                "role": "user",
                # Placeholder; presumably filled with each row's `query` field — confirm.
                "content": {"type": "input_text", "text": "{{item.query}}"},
            },
        ],
    },
    "sampling_params": {
        "temperature": 0.0,
        # NOTE(review): the recorded output of the run-creation cell shows the endpoint rejecting
        # 'data_source.sampling_params.tools' as an unknown parameter — confirm that the API
        # version / endpoint in use supports tools here.
        "tools": tools,
    },
}

### Create sampling runs

In [None]:
def _start_completions_run(model):
    """Start one sampling run of the stock eval for `model`.

    The run is named after the model, and the shared data source template is
    copied with that model added under the "model" key.
    """
    run_data_source = {**data_source_template, "model": model}
    return client.evals.runs.create(
        name=model,
        eval_id=stock_eval.id,
        data_source=run_data_source,
    )


# One run per model under comparison: two stock models and the fine-tuned one.
gpt_41_completions_run = _start_completions_run("gpt-4.1")
gpt_41_mini_completions_run = _start_completions_run("gpt-4.1-mini")
gpt_41_mini_ft_completions_run = _start_completions_run(fine_tuned_model)

BadRequestError: Error code: 400 - {'error': {'message': "Unknown parameter: 'data_source.sampling_params.tools'.", 'type': 'invalid_request_error', 'param': 'data_source.sampling_params.tools', 'code': 'unknown_parameter'}}

In [None]:
def poll_runs(eval_id, run_ids, poll_interval: float = 5) -> None:
    """Poll the given eval runs together until every one of them has finished.

    On each round the ID, status and result counts of every run are printed.

    Args:
        eval_id: ID of the eval the runs belong to.
        run_ids: IDs of the runs to watch.
        poll_interval: Seconds to wait between two polling rounds.
    """
    # Imported here so the function also works when the cell that imports
    # `time` at notebook level has not been executed in this kernel.
    import time

    # Canceled runs never reach "completed" or "failed"; without them in this
    # set the loop would never end. Both spellings are accepted.
    finished_statuses = ("completed", "failed", "canceled", "cancelled")

    while True:
        runs = [client.evals.runs.retrieve(run_id, eval_id=eval_id) for run_id in run_ids]
        for run in runs:
            print(run.id, run.status, run.result_counts)
        if all(run.status in finished_statuses for run in runs):
            break
        time.sleep(poll_interval)


# Wait for all three sampling runs of the stock eval, printing their status on every polling round.
poll_runs(
    stock_eval.id,
    [
        gpt_41_completions_run.id,
        gpt_41_mini_completions_run.id,
        gpt_41_mini_ft_completions_run.id,
    ],
)

NotFoundError: Error code: 404 - {'error': {'message': 'Run evalrun_68acf4faa5488191a77a3dea38abaa0f cannot be found. Please confirm the run_id or permissions to view it.', 'type': 'invalid_request_error', 'param': None, 'code': None}}