## Before you begin

### Installation

In [47]:
%pip install requests azure-identity openai

Note: you may need to restart the kernel to use updated packages.


### Parameters

In [166]:

subscription = "6a6fff00-4464-4eab-a6b1-0b533c7202e0"
resource_group = "rg-jialiuai"
resource_name = "jialiu-aoai-ncus"
api_version = "2025-04-01-preview"

base_model = "gpt-4.1-mini"

model_deployment_name = "gpt-4.1-mini-stock-hallucination"

train_data_file = "Data/stock-train-hallucination_tools.jsonl"
test_data_file = "Data/stock-test-hallucination_query_format.jsonl"
eval_data_file_template = "Data/stock-eval-hallucination_{model}_query_format.jsonl"

### Init Azure OpenAI Client

It will use token authentication. Please make sure to run `az login` in your terminal.

In [None]:
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider

credential = DefaultAzureCredential()

token_provider = get_bearer_token_provider(
    credential,
    "https://cognitiveservices.azure.com/.default",
)

client = AzureOpenAI(
    azure_endpoint=f"https://{resource_name}.openai.azure.com/",
    api_version=api_version,
    azure_ad_token_provider=token_provider,
)

## Prepare your data

### Preview training data

In [66]:
# Now you need to run some preliminary checks on our training and validation files.

import json

# Load the training set
from pathlib import Path

# Assuming the current directory is the root of your repository
with Path(train_data_file).open("r", encoding="utf-8") as f:
    training_dataset = [json.loads(line) for line in f]


# Training dataset stats
print("Number of examples in training set:", len(training_dataset))
print("First example in training set:")
for message in training_dataset[0]["messages"]:
    print(message)

Number of examples in training set: 96
First example in training set:
{'role': 'system', 'content': "Don't make assumptions about what values to plug into functions. If you can't find the exact stock ticker symbol, you can ask for clarification. "}
{'role': 'user', 'content': "What was the highest price that Bank of America's stock reached last month?"}
{'role': 'assistant', 'tool_calls': [{'id': 'call_333605', 'type': 'function', 'function': {'name': 'get_last_nday_stock_price', 'arguments': '{"symbol": "BAC", "period": "1mo"}'}}]}


### Upload data to Azure OpenAI

In [67]:
from pathlib import Path

# Upload the training dataset files to Azure OpenAI with the SDK.
with Path(train_data_file).open("rb") as file:
    training_response = client.files.create(file=file, purpose="fine-tune")

training_file_id = training_response.id

print("Training file ID:", training_file_id)

Training file ID: file-c359ba8517e748afbcc9bfde84bbec67


## Finetune

### Submit finetuning job

In [57]:
response = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    model=base_model,  # Enter base model name. Note that in Azure OpenAI the model name contains dashes and cannot contain dot/period characters.
)

job_id = response.id

# You can use the job ID to monitor the status of the fine-tuning job.
# The fine-tuning job will take some time to start and complete.

print("Job ID:", response.id)
print("Status:", response.status)
print(response.model_dump_json(indent=2))

Job ID: ftjob-HnXTdssdEhCp9ANlaseHXZkh
Status: validating_files
{
  "id": "ftjob-HnXTdssdEhCp9ANlaseHXZkh",
  "created_at": 1756163279,
  "error": {
    "code": null,
    "message": null,
    "param": null
  },
  "fine_tuned_model": null,
  "finished_at": null,
  "hyperparameters": {
    "batch_size": "auto",
    "learning_rate_multiplier": "auto",
    "n_epochs": "auto"
  },
  "model": "gpt-4.1-mini-2025-04-14",
  "object": "fine_tuning.job",
  "organization_id": "org-hTGGKhbVgQIQFEXlrcWgLHk9",
  "result_files": [],
  "seed": 721143020,
  "status": "validating_files",
  "trained_tokens": null,
  "training_file": "file-QAfKMt8EoewWC17cPD4nM7",
  "validation_file": null,
  "estimated_finish": null,
  "integrations": [],
  "metadata": null,
  "method": {
    "type": "supervised",
    "dpo": null,
    "reinforcement": null,
    "supervised": {
      "hyperparameters": {
        "batch_size": "auto",
        "learning_rate_multiplier": "auto",
        "n_epochs": "auto"
      }
    }
  },


### Track training job status

In [58]:
# Track training status

from IPython.display import clear_output
import time

start_time = time.time()

# Get the status of our fine-tuning job.
response = client.fine_tuning.jobs.retrieve(job_id)

status = response.status

# If the job isn't done yet, poll it every 10 seconds.
while status not in ["succeeded", "failed"]:
    time.sleep(10)

    response = client.fine_tuning.jobs.retrieve(job_id)
    print(response.model_dump_json(indent=2))
    print(
        "Elapsed time: {} minutes {} seconds".format(
            int((time.time() - start_time) // 60), int((time.time() - start_time) % 60)
        )
    )
    status = response.status
    print(f"Status: {status}")
    clear_output(wait=True)

print(f"Fine-tuning job {job_id} finished with status: {status}")

# List all fine-tuning jobs for this resource.
print("Checking other fine-tune jobs for this resource.")
response = client.fine_tuning.jobs.list()
print(f"Found {len(response.data)} fine-tune jobs.")

KeyboardInterrupt: 

### Retrieve finetuned model name

In [68]:
# Retrieve fine_tuned_model name

response = client.fine_tuning.jobs.retrieve(job_id)

print(response.model_dump_json(indent=2))
fine_tuned_model = response.fine_tuned_model

NotFoundError: Error code: 404 - {'error': {'code': 'notFound', 'message': 'The specified entity cannot be found.'}}

## Deploy fine-tuned model

In [None]:
import json
import requests

token = credential.get_token("https://management.azure.com/.default")

deploy_params = {"api-version": "2025-04-01-preview"}
deploy_headers = {
    "Authorization": f"Bearer {token.token}",
    "Content-Type": "application/json",
}

deploy_data = {
    "sku": {"name": "GlobalStandard", "capacity": 50},
    "properties": {
        "model": {
            "format": "OpenAI",
            "name": fine_tuned_model,  # retrieve this value from the previous call, it will look like gpt-35-turbo-0613.ft-b044a9d3cf9c4228b5d393567f693b83
            "version": "1",
        }
    },
}
deploy_data = json.dumps(deploy_data)

request_url = f"https://management.azure.com/subscriptions/{subscription}/resourceGroups/{resource_group}/providers/Microsoft.CognitiveServices/accounts/{resource_name}/deployments/{model_deployment_name}"

print("Creating a new deployment...")

r = requests.put(request_url, params=deploy_params, headers=deploy_headers, data=deploy_data)

print(r)
print(r.reason)
print(r.json())

Creating a new deployment...
<Response [201]>
Created
{'id': '/subscriptions/6a6fff00-4464-4eab-a6b1-0b533c7202e0/resourceGroups/rg-jialiuai/providers/Microsoft.CognitiveServices/accounts/jialiu-aoai-ncus/deployments/gpt-4.1-mini-stock-hallucination', 'type': 'Microsoft.CognitiveServices/accounts/deployments', 'name': 'gpt-4.1-mini-stock-hallucination', 'sku': {'name': 'standard', 'capacity': 50}, 'properties': {'model': {'format': 'OpenAI', 'name': 'gpt-4.1-mini-2025-04-14.ft-dcd2c319f0e441a5a5a04f9c2090ecaa', 'version': '1'}, 'versionUpgradeOption': 'NoAutoUpgrade', 'currentCapacity': 50, 'capabilities': {'chatCompletion': 'true', 'area': 'US', 'responses': 'true', 'assistants': 'true'}, 'provisioningState': 'Creating', 'rateLimits': [{'key': 'request', 'renewalPeriod': 60, 'count': 50}, {'key': 'token', 'renewalPeriod': 60, 'count': 50000}]}, 'systemData': {'createdBy': 'daweil@microsoft.com', 'createdByType': 'User', 'createdAt': '2025-08-25T20:44:51.4079816Z', 'lastModifiedBy': 'd

## Evaluate

### Model to be evaluated

In [189]:
models_to_compare = [
    "gpt-4.1",
    "gpt-4.1-mini",
    model_deployment_name,
]

### Prepare test data

In [190]:
from pathlib import Path

# Replace 'Data/stock-test-token-reduction.jsonl' with the actual path to your file
test_file_path = Path(test_data_file)

with test_file_path.open(errors="ignore") as json_file:
    json_list = list(json_file)

### Prepare Prompt for Data Sampling

In [191]:
system_prompt = "Don't make assumptions about what values to plug into functions. If you can't find the exact stock ticker symbol, you can ask for clarification. "

tools = [
    {
      "type": "function",
      "function": {
        "name": "get_current_stock_price",
        "parameters": {
          "type": "object",
          "properties": {},
          "required": [
            "symbol"
          ]
        }
      }
    },
    {
      "type": "function",
      "function": {
        "name": "get_last_nday_stock_price",
        "parameters": {
          "type": "object",
          "properties": {},
          "required": [
            "symbol",
            "period"
          ]
        }
      }
    }
  ]

### Batch inferencing

In [None]:
from pathlib import Path

for model in models_to_compare:
    print(f"starting on model {model}")

    eval_file_path = Path(eval_data_file_template.format(model=model))
    with eval_file_path.open("w", encoding="utf-8") as output_file:
        for i, json_str in enumerate(json_list):
            row = json.loads(json_str)
            result = {
                "item": {
                    **row["item"],
                }
            }
            print(f"starting on {i}")
            completion = client.chat.completions.create(
                model=model,
                messages=[
                    {
                        "type": "message",
                        "role": "system",
                        "content": system_prompt,
                    },
                    {
                        "type": "message",
                        "role": "user",
                        "content": row["item"]["query"],
                    },
                ],
                temperature=0.0,  # to reduce randomness
                tools=tools,
                tool_choice="auto",
            )
            tool_calls = completion.choices[0].message.tool_calls
            if tool_calls:
                result["item"]["actual_tool_calls"] = [tool_calls[0].model_dump()]
            else:
                result["item"]["actual_tool_calls"] = []

            output_file.write(json.dumps(result) + "\n")


starting on model gpt-4.1
starting on 0
starting on 1
starting on 2
starting on 3
starting on 4
starting on 5
starting on 6
starting on 7
starting on 8
starting on 9
starting on model gpt-4.1-mini
starting on 0
starting on 1
starting on 2
starting on 3
starting on 4
starting on 5
starting on 6
starting on 7
starting on 8
starting on 9
starting on model gpt-4.1-mini-stock-hallucination
starting on 0
starting on 1
starting on 2
starting on 3
starting on 4
starting on 5
starting on 6
starting on 7
starting on 8
starting on 9


### Prepare Evaluation Data

In [193]:
from pathlib import Path

eval_file_mapping = {}

# Upload the training dataset files to Azure OpenAI with the SDK.
for model in models_to_compare:
    with Path(eval_data_file_template.format(model=model)).open("rb") as file:
        eval_response = client.files.create(file=file, purpose="evals")

    eval_file_id = eval_response.id
    eval_file_mapping[model] = eval_file_id

    print(f"Evaluation for model {model} with file ID: {eval_file_id}")

Evaluation for model gpt-4.1 with file ID: file-6303980bb96b4a6fac0b31a76b3da71d
Evaluation for model gpt-4.1-mini with file ID: file-e16b4c2eed6d4008a5118a464fbd6251
Evaluation for model gpt-4.1-mini-stock-hallucination with file ID: file-43e79c5e1ab04705a1b97f8f46316b29


### Define python grader

In [194]:
python_grader = """
def grade(sample, item) -> float:
    actual_tool_calls = item['actual_tool_calls']
    expected_tool_calls = item['expected_tool_calls']
    return grade_tool_calls(actual_tool_calls, expected_tool_calls)

def grade_tool_calls(actual_tool_calls, expected_tool_calls):
    # Case 1: Both are empty (None or empty list)
    if (not actual_tool_calls) and (not expected_tool_calls):
        return 10.0

    # Case 2: One is empty, one is not
    if (not actual_tool_calls) != (not expected_tool_calls):
        return 0.0

    # Case 3: Both are not empty - check if function objects are equal
    if actual_tool_calls and expected_tool_calls:
        return compare_function_calls(actual_tool_calls, expected_tool_calls)

    return 0.0


def compare_function_calls(actual_calls, expected_calls):
    if len(actual_calls) != len(expected_calls):
        return 1.0

    for actual, expected in zip(actual_calls, expected_calls):
        # Compare function name and arguments, ignore call ID
        actual_func = actual.get("function", {})
        expected_func = expected.get("function", {})

        # Compare function name
        if actual_func.get("name") != expected_func.get("name"):
            return 2.0

        # Compare function arguments (parse JSON strings if needed)
        actual_args = actual_func.get("arguments", "{}")
        expected_args = expected_func.get("arguments", "{}")

        try:
            actual_args_dict = json.loads(actual_args) if isinstance(actual_args, str) else actual_args
            expected_args_dict = json.loads(expected_args) if isinstance(expected_args, str) else expected_args

            if actual_args_dict != expected_args_dict:
                return 3.0
        except json.JSONDecodeError:
            # If we can't parse, do string comparison
            if actual_args != expected_args:
                return 4.0

    return 10.0
    
"""


### Create Stock Hallucination Evaluation

In [195]:
stock_eval = client.evals.create(
    name="Stock Hallucination Eval",
    data_source_config={
        "type": "custom",
        "item_schema": {
            "type": "object",
            "properties": {
                "query": {"type": "string"},
                "actual_tool_calls": {"type": "array"},
                "expected_tool_calls": {"type": "array"},
            },
            "required": ["query", "actual_tool_calls", "expected_tool_calls"],
        },
    },
    testing_criteria=[
        {
            "type": "python",
            "name": "Tool Use Evaluator",
            "source": python_grader,
            "pass_threshold": 9.9,
        }
    ],
)

### Define data source

In [196]:
for model in models_to_compare:
    data_source = {
        "type": "jsonl",
        "source": {"type": "file_id", "id": eval_file_mapping[model]},
    }

### Create sampling runs

In [197]:
gpt_41_completions_run = client.evals.runs.create(
    name="gpt-4.1",
    eval_id=stock_eval.id,
    data_source=data_source,
)
gpt_41_mini_completions_run = client.evals.runs.create(
    name="gpt-4.1-mini",
    eval_id=stock_eval.id,
    data_source=data_source,
)
gpt_41_mini_ft_completions_run = client.evals.runs.create(
    name=model_deployment_name,
    eval_id=stock_eval.id,
    data_source=data_source,
)

In [198]:
def poll_runs(eval_id, run_ids):
    # poll both runs at the same time, until they are complete or failed
    while True:
        runs = [client.evals.runs.retrieve(run_id, eval_id=eval_id) for run_id in run_ids]
        for run in runs:
            print(run.id, run.status, run.result_counts)
        if all(run.status in ("completed", "failed") for run in runs):
            break
        time.sleep(5)


poll_runs(
    stock_eval.id,
    [
        gpt_41_completions_run.id,
        gpt_41_mini_completions_run.id,
        gpt_41_mini_ft_completions_run.id,
    ],
)

evalrun_68ad1ce5231c8191b1fc3cb3ccf5cf1f queued ResultCounts(errored=0, failed=0, passed=0, total=0)
evalrun_68ad1ce601fc81919648ff8911ff8a60 queued ResultCounts(errored=0, failed=0, passed=0, total=0)
evalrun_68ad1ce69ca081919d0ee0ca58495228 queued ResultCounts(errored=0, failed=0, passed=0, total=0)
evalrun_68ad1ce5231c8191b1fc3cb3ccf5cf1f in_progress ResultCounts(errored=0, failed=0, passed=0, total=0)
evalrun_68ad1ce601fc81919648ff8911ff8a60 in_progress ResultCounts(errored=0, failed=0, passed=0, total=0)
evalrun_68ad1ce69ca081919d0ee0ca58495228 in_progress ResultCounts(errored=0, failed=0, passed=0, total=0)
evalrun_68ad1ce5231c8191b1fc3cb3ccf5cf1f completed ResultCounts(errored=0, failed=5, passed=5, total=10)
evalrun_68ad1ce601fc81919648ff8911ff8a60 in_progress ResultCounts(errored=0, failed=0, passed=0, total=0)
evalrun_68ad1ce69ca081919d0ee0ca58495228 in_progress ResultCounts(errored=0, failed=0, passed=0, total=0)
evalrun_68ad1ce5231c8191b1fc3cb3ccf5cf1f completed ResultCount