In [1]:
import os
from dotenv import load_dotenv
from openai import AzureOpenAI

from datasets import load_dataset, concatenate_datasets
import pandas as pd
import re

In [2]:
load_dotenv()

True

In [3]:
endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
deployment_name=os.environ["AZURE_DEPLOYMENT_NAME"] #This will correspond to the custom name you chose for your deployment when you deployed a model. Use a gpt-35-turbo-instruct deployment. 
api_key = os.environ["AZURE_OPENAI_API_KEY"]

In [4]:
client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=api_key, 
    api_version="2024-02-01",
)

In [9]:
completion = client.chat.completions.create(
    model=deployment_name,
    messages=[
        {
            "role": "user",
            "content": "Hello, who are you and how can you help me? What are your capabilities?",
        }
    ]
)

print(completion.to_json())

{
  "id": "chatcmpl-9buZKvnH9Z0EJS4vhJPMaiiLQJs4l",
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "Hello! I'm an AI developed by OpenAI, and I'm here to assist you with a variety of tasks. I can help you with:\n\n1. **Information Retrieval**: I can provide information on a wide range of topics, from history and science to technology and culture.\n2. **Writing Assistance**: I can help with drafting, editing, and brainstorming ideas for essays, articles, stories, or any other writing projects.\n3. **Problem Solving**: I can assist with solving mathematical problems, coding issues, and providing step-by-step guidance for various technical and academic challenges.\n4. **Language Translation**: I can translate text between multiple languages with fairly high accuracy.\n5. **Learning and Education**: I can explain complex concepts, provide summaries of topics, and assist with learning new subjects.\n6. **Conversation**: I can enga

In [6]:
orca_math = load_dataset("microsoft/orca-math-word-problems-200k", split='train')

In [7]:
orca_math_questions = orca_math.shuffle(seed=42).select(range(10))
orca_math_5shot = orca_math.shuffle(seed=42).select(range(30, 35))


print(orca_math_questions[0])
print(orca_math_5shot[0])

{'question': 'Sally had 13 peaches at her roadside fruit dish.  She went to the orchard and picked peaches to stock up. She picked 55 peaches. There are _____ peaches now.', 'answer': 'Sally originally had 13 peaches. She picked 55 more peaches. To find out the total number of peaches she has now, we add the two amounts together:\n\n13 (original peaches) + 55 (picked peaches) = 68 peaches\n\nSo, there are 68 peaches now.'}
{'question': 'At what rate percent on simple interest will Rs. 750 amount to Rs. 900 in 5 years?', 'answer': 'To calculate the rate of simple interest, we can use the formula:\n\nSimple Interest (SI) = Principal (P) × Rate (R) × Time (T) / 100\n\nWe are given:\nPrincipal (P) = Rs. 750\nAmount (A) = Rs. 900\nTime (T) = 5 years\n\nWe need to find the Rate (R).\n\nFirst, we calculate the Simple Interest (SI) by subtracting the Principal (P) from the Amount (A):\n\nSI = A - P\nSI = Rs. 900 - Rs. 750\nSI = Rs. 150\n\nNow we can plug the values into the formula and solve f

In [8]:
system_prompt = """ 
You are an expert in the domains math, coding and logical thinking.
You are there to help users solve tasks in these domains efficiently and accurately. 
Follow these guidelines when responding to user instructions:
    \n
    1. **Understand the Task**: Carefully read and comprehend the given instruction or task description.
    2. **Generate Detailed Responses**: Provide clear, concise, and detailed responses to address the user's task. Ensure the response is complete and accurate.
    3. **Maintain Context**: Ensure that your response maintains the context of the task and addresses all aspects of the instruction.
    4. **Use Appropriate Formatting**: Structure your responses in a readable and logical format, using bullet points, lists, or code blocks as necessary.
    5. **Be Accurate and Concise**: Avoid unnecessary information. Be precise and to the point, ensuring the information is correct and useful.
\n
Now solve the following task based on the given instruction:
\n
### Task: 
\n
"""

In [76]:
# Define the apply_chat_template function
def apply_chat_template(question, examples):
    formatted_texts = [
        {"role": "system", "content": system_prompt},
        ]
    for example in examples:
        user_message = f"{example['question']} \n\n ### Response:"
        assistant_message = example['answer']
        formatted_texts.append({"role": "user", "content": user_message})
        formatted_texts.append({"role": "assistant", "content": assistant_message})
    
    formatted_texts.append({"role": "user", "content": question['question']})

    return formatted_texts

In [70]:
print(apply_chat_template(orca_math_questions[0], orca_math_5shot))

[{'role': 'system', 'content': " \nYou are an expert in the domains math, coding and logical thinking.\nYou are there to help users solve tasks in these domains efficiently and accurately. \nFollow these guidelines when responding to user instructions:\n    \n\n    1. **Understand the Task**: Carefully read and comprehend the given instruction or task description.\n    2. **Generate Detailed Responses**: Provide clear, concise, and detailed responses to address the user's task. Ensure the response is complete and accurate.\n    3. **Maintain Context**: Ensure that your response maintains the context of the task and addresses all aspects of the instruction.\n    4. **Use Appropriate Formatting**: Structure your responses in a readable and logical format, using bullet points, lists, or code blocks as necessary.\n    5. **Be Accurate and Concise**: Avoid unnecessary information. Be precise and to the point, ensuring the information is correct and useful.\n\n\nNow solve the following task 

In [79]:
def run_questions(questions, examples):
    answers = []

    for question in questions:
        prompt = apply_chat_template(question, examples)

        #print(prompt)
        completion = client.chat.completions.create(
            model=deployment_name,
            messages=prompt
        )
        answers.append(completion.choices[0].message.content)
    
    return answers


The Correct Answer in this Dataset is generated with GPT-4. So for this example it is more a comparison of GPT-4 vs GPT-4o.

In [80]:
questions = []
correct_answers = []
gpt_answers = []

for question in orca_math_questions: 
    questions.append(question['question'])
    correct_answers.append(question['answer'])

gpt_answers = run_questions(orca_math_questions, orca_math_5shot)

df = pd.DataFrame({'question': questions, 'correct_answer': correct_answers, 'gpt_answer': gpt_answers})
df 

Unnamed: 0,question,correct_answer,gpt_answer
0,Sally had 13 peaches at her roadside fruit dis...,Sally originally had 13 peaches. She picked 55...,"To find out how many peaches Sally has now, yo..."
1,two stations p and q are 200 km apart on a str...,Let's calculate the distance covered by the fi...,To find out the time at which the two trains w...
2,"In a recent election, Geoff received a certain...","To win the election, a candidate needed to rec...",Let's denote the number of votes Geoff receive...
3,"Quentin, Skylar, and Colten have a total of 38...",Let's denote the number of chickens that Skyla...,Let's break down the problem step by step to f...
4,25 binders can bind 1400 books in some days. 3...,Let's denote the number of days it takes for 2...,"To solve this problem, we will use the concept..."
5,The card shop has two boxes of cards. The firs...,"First, let's calculate the cost of the cards f...","To determine the total cost of the cards, incl..."
6,Luca went to a sandwich shop for lunch. The sa...,Let's calculate the cost of each item step by ...,"Given the provided information, Luca purchased..."
7,How is 2% expressed as a decimal fraction?,2% expressed as a decimal fraction is 0.02. To...,"To express 2% as a decimal fraction, follow th..."
8,alfred buys an old scooter for $ 4700 and spen...,"To calculate the gain percent, we first need t...","To calculate Alfred's gain percent, we'll firs..."
9,Christine is substituting aquafaba and flaxsee...,"First, let's determine the total number of egg...","To solve this problem, let's break it down ste..."


In [98]:
judge_system_prompt = """
Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. 
You will be given assistant A's answer and assistant B's answer. 
Your job is to evaluate which assistant's answer is better.\n\n
Begin your evaluation by generating your own answer to the prompt. 
You must provide your answers before judging any answers.\n\n
When evaluating the assistants' answers, compare both assistants' answers with your answer. 
You must identify and correct any mistakes or inaccurate information.\n\n
Then consider if the assistant's answers are helpful, relevant, and concise. 
Helpful means the answer correctly responds to the prompt or follows the instructions. 
Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. 
Relevant means all parts of the response closely connect or are appropriate to what is being asked. 
Concise means the response is clear and not verbose or excessive.\n\n
Then consider the creativity and novelty of the assistant's answers when needed. 
Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\n
After providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n
1. Assistant A is significantly better: [[A>>B]]\n
2. Assistant A is slightly better: [[A>B]]\n
3. Tie, relatively the same: [[A=B]]\n
4. Assistant B is slightly better: [[B>A]]\n
5. Assistant B is significantly better: [[B>>A]]\n\n

Example output: \"My final verdict is tie: [[A=B]]\"."
"""

In [93]:
def format_judge_prompts(df):
    formatted_texts = []
    
    for idx, row in df.iterrows():
        messages = []
        user_message = f"{row['question']} \n\n ### Response:"
        assistant_a_message = f"\n\n<|The Start of Assistant A's Answer|>\n{row['correct_answer']}\n<|The End of Assistant A's Answer|>"
        assistant_b_message = f"\n\n<|The Start of Assistant B's Answer|>\n{row['gpt_answer']}\n<|The End of Assistant B's Answer|>"
        messages.append({"role": "system", "content": judge_system_prompt})
        messages.append({"role": "user", "content": user_message})
        messages.append({"role": "assistant", "content": assistant_a_message})
        messages.append({"role": "assistant", "content": assistant_b_message})

        formatted_texts.append(messages)
    
    return formatted_texts

In [96]:
def run_benchmark(df): 
    prompts = format_judge_prompts(df)
    results = []

    for prompt in prompts:
        #print(prompt)
        completion = client.chat.completions.create(
            model=deployment_name,
            messages=prompt
        )
        print("Judge Result: " + completion.choices[0].message.content)
        results.append(completion.choices[0].message.content)
    
    return results

In [97]:
df['judge_result'] = run_benchmark(df)
df

Judge Result: "My final verdict is tie: [[A=B]]".
Judge Result: ### My Answer:
Let's determine when the two trains will meet.

1. The first train starts at 7 a.m. and travels at 20 km/h. By the time the second train starts at 8 a.m., the first train will have already been traveling for 1 hour. 
   - Distance covered by the first train in 1 hour = 20 km/h * 1 hour = 20 km.

2. The initial distance between the two trains at 8 a.m. will be:
   - 200 km (total distance) - 20 km (distance already covered by the first train) = 180 km.

3. The second train starts at 8 a.m. and travels at 25 km/h towards the first train.

4. The two trains are approaching each other, so their relative speed is:
   - 20 km/h + 25 km/h = 45 km/h.

5. To find the time it takes for them to meet, we can use the formula:
   - Time = Distance / Relative Speed
   - Time = 180 km / 45 km/h = 4 hours.

6. Since the second train starts at 8 a.m. and it will take 4 hours for them to meet:
   - 8 a.m. + 4 hours = 12 p.m. (

Unnamed: 0,question,correct_answer,gpt_answer,judge_result
0,Sally had 13 peaches at her roadside fruit dis...,Sally originally had 13 peaches. She picked 55...,"To find out how many peaches Sally has now, yo...","""My final verdict is tie: [[A=B]]""."
1,two stations p and q are 200 km apart on a str...,Let's calculate the distance covered by the fi...,To find out the time at which the two trains w...,### My Answer:\nLet's determine when the two t...
2,"In a recent election, Geoff received a certain...","To win the election, a candidate needed to rec...",Let's denote the number of votes Geoff receive...,"**My Response:**\n\nFirst, let's determine the..."
3,"Quentin, Skylar, and Colten have a total of 38...",Let's denote the number of chickens that Skyla...,Let's break down the problem step by step to f...,Here is my answer to the user prompt:\n\n1. Le...
4,25 binders can bind 1400 books in some days. 3...,Let's denote the number of days it takes for 2...,"To solve this problem, we will use the concept...",### My Answer:\nGiven:\n- 25 binders can bind ...
5,The card shop has two boxes of cards. The firs...,"First, let's calculate the cost of the cards f...","To determine the total cost of the cards, incl...",### My Answer:\n\n1. **Calculate the cost of t...
6,Luca went to a sandwich shop for lunch. The sa...,Let's calculate the cost of each item step by ...,"Given the provided information, Luca purchased...",### My Answer\n\nTo determine how many dollars...
7,How is 2% expressed as a decimal fraction?,2% expressed as a decimal fraction is 0.02. To...,"To express 2% as a decimal fraction, follow th...",2% expressed as a decimal fraction is 0.02. Th...
8,alfred buys an old scooter for $ 4700 and spen...,"To calculate the gain percent, we first need t...","To calculate Alfred's gain percent, we'll firs...","To determine Alfred's gain percent, we can fol..."
9,Christine is substituting aquafaba and flaxsee...,"First, let's determine the total number of egg...","To solve this problem, let's break it down ste...",My answer to the prompt would be as follows:\n...


In [100]:
# Function to extract the pattern using the provided regex pattern
def extract_final_outcome(text):
    match = re.search(r'\[\[([AB<>=]+)\]\]', text)
    return f'[[{match.group(1)}]]' if match else None

In [101]:
# Apply the function to the 'judge_result' column
df['final_outcome'] = df['judge_result'].apply(extract_final_outcome)
df

Unnamed: 0,question,correct_answer,gpt_answer,judge_result,final_outcome
0,Sally had 13 peaches at her roadside fruit dis...,Sally originally had 13 peaches. She picked 55...,"To find out how many peaches Sally has now, yo...","""My final verdict is tie: [[A=B]]"".",[[A=B]]
1,two stations p and q are 200 km apart on a str...,Let's calculate the distance covered by the fi...,To find out the time at which the two trains w...,### My Answer:\nLet's determine when the two t...,[[A=B]]
2,"In a recent election, Geoff received a certain...","To win the election, a candidate needed to rec...",Let's denote the number of votes Geoff receive...,"**My Response:**\n\nFirst, let's determine the...",[[B>>A]]
3,"Quentin, Skylar, and Colten have a total of 38...",Let's denote the number of chickens that Skyla...,Let's break down the problem step by step to f...,Here is my answer to the user prompt:\n\n1. Le...,[[A=B]]
4,25 binders can bind 1400 books in some days. 3...,Let's denote the number of days it takes for 2...,"To solve this problem, we will use the concept...",### My Answer:\nGiven:\n- 25 binders can bind ...,[[A=B]]
5,The card shop has two boxes of cards. The firs...,"First, let's calculate the cost of the cards f...","To determine the total cost of the cards, incl...",### My Answer:\n\n1. **Calculate the cost of t...,[[A=B]]
6,Luca went to a sandwich shop for lunch. The sa...,Let's calculate the cost of each item step by ...,"Given the provided information, Luca purchased...",### My Answer\n\nTo determine how many dollars...,[[A>B]]
7,How is 2% expressed as a decimal fraction?,2% expressed as a decimal fraction is 0.02. To...,"To express 2% as a decimal fraction, follow th...",2% expressed as a decimal fraction is 0.02. Th...,[[A=B]]
8,alfred buys an old scooter for $ 4700 and spen...,"To calculate the gain percent, we first need t...","To calculate Alfred's gain percent, we'll firs...","To determine Alfred's gain percent, we can fol...",[[A=B]]
9,Christine is substituting aquafaba and flaxsee...,"First, let's determine the total number of egg...","To solve this problem, let's break it down ste...",My answer to the prompt would be as follows:\n...,[[B>>A]]


In [102]:
# Initialize scores
score_A = 10
score_B = 10

# Update scores based on final_outcome
for outcome in df['final_outcome']:
    if outcome == '[[A>>B]]':
        score_A += 2
        score_B -= 2
    elif outcome == '[[A>B]]':
        score_A += 1
        score_B -= 1
    elif outcome == '[[B>>A]]':
        score_A -= 2
        score_B += 2
    elif outcome == '[[B>A]]':
        score_A -= 1
        score_B += 1

# Print final scores
print("="*20 + " Final Scores " + "="*20 + "\n")
print(f"Model A (GPT-4) Score: {score_A}\n")
print(f"Model B (GPT-4o) Score: {score_B}")


Model A (GPT-4) Score: 7

Model B (GPT-4o) Score: 13


A full Elo-System would be more complex and consider wins and draws against much better ranked models as more valuable compared to wins or draws against lower ranked models.