In [None]:
!pip install bert-score

In [None]:
!pip install openai

In [3]:
from openai import OpenAI
import os
import json
from tqdm import tqdm

In [7]:
def connect_4o(input):
  client = OpenAI(api_key="")

  response = client.responses.create(
    model="gpt-4o",
    input=[
      {
        "role": "system",
        "content": [
          {
            "type": "input_text",
            "text": "Your task is to generate a clear, plain language answer for user inquiries about what factor contributes to the decision-making of the Autonomous Driving System. The focus is on what is the key feature making the system generate current decision.\n \nYou will receive a json file containing:\n1. The identified key features with its description;\n2. The numerical analysis value associated with identified key features. \n3. The numerical value represents the velocity value of agent behavior.\n4. The time interval between each scene time is 0.1s.\n \nBased on these inputs and information, generate a comprehensive answer that:\n- Directly answer what feature results in the system behavior.\n- Explains the result based on the calculated quantitative value.\n- Uses non-technical language and avoids any mention of internal logic formulas or variable names.\n \nProvide your answer without any introductory or concluding remarks.\n \nExample: \nInput: { \"key_feature\": \"observed position of agent_4 at scene time 20 at the 19 timestamp\",  \"quantitative_value\": -4.150000000000027}\nAnswer: The reason why ego makes such decision at 2s is because it observes agent 4 decreases its speed 4.15m/s at 0.1s ago."
          }
        ]
      },
      {
        "role": "user",
        "content": [
          {
            "type": "input_text",
            "text": input
          }
        ]
      }
    ],
    text={
      "format": {
        "type": "text"
      }
    },
    reasoning={},
    tools=[],
    temperature=1,
    max_output_tokens=2048,
    top_p=1,
    store=True
  )

  gpt_response_message_content = next((out.content[0].text for out in response.output if out.type == 'message'), None)

  return gpt_response_message_content

In [8]:
def connect_gpt_o1(input):
  client = OpenAI(api_key="")

  response = client.responses.create(model="o1", input=[
    {
      "role": "developer",
      "content": [
        {
          "type": "input_text",
          "text": "Your task is to generate a clear, plain language answer for user inquiries about what factor contributes to the decision-making of the Autonomous Driving System. The focus is on what is the key feature making the system generate current decision.\n \nYou will receive a json file containing:\n1. The identified key features with its description;\n2. The numerical analysis value associated with identified key features. \n3. The numerical value represents the velocity value of agent behavior.\n4. The time interval between each scene time is 0.1s.\n \nBased on these inputs and information, generate a comprehensive answer that:\n- Directly answer what feature results in the system behavior.\n- Explains the result based on the calculated quantitative value.\n- Uses non-technical language and avoids any mention of internal logic formulas or variable names.\n \nProvide your answer without any introductory or concluding remarks.\n \nExample: \nInput: { \"key_feature\": \"observed position of agent_4 at scene time 20 at the 19 timestamp\",  \"quantitative_value\": -4.150000000000027}\nAnswer: The reason why ego makes such decision at 2s is because it observes agent 4 decreases its speed 4.15m/s at 0.1s ago."
        }
      ]
    },
    {
      "role": "user",
      "content": [
        {
          "type": "input_text",
          "text": input
        }
      ]
    }
  ],
    text={
    "format": {
      "type": "text"
    }
  },
    reasoning={
    "effort": "low"
  },
    tools=[],
    store=True
  )

  gpt_response_message_content = next((out.content[0].text for out in response.output if out.type == 'message'), None)

  return gpt_response_message_content

In [10]:
from bert_score import score
import torch

def calculate_bertscore(generated_texts, reference_texts, lang="en", model_type=None, device="cuda:0" if torch.cuda.is_available() else "cpu"):
    """
    Calculates the BERTScore (Precision, Recall, and F1) between generated and reference texts.

    Args:
        generated_texts (list of str): A list of generated sentences.
        reference_texts (list of str or list of list of str): A list of reference sentences.
                                                             If multiple references per generated text,
                                                             provide a list of lists.
        lang (str): The language of the texts (e.g., "en" for English).
                    Required if model_type is not specified.
        model_type (str, optional): The pre-trained BERT model to use (e.g., "bert-base-uncased").
                                    If None, a default model for the specified language is used.
        device (str, optional): The device to run the model on ("cuda" for GPU, "cpu" for CPU).
                                Defaults to "cuda:0" if a GPU is available, otherwise "cpu".

    Returns:
        tuple: A tuple containing three torch.Tensor objects:
               - precision: BERTScore Precision for each sentence.
               - recall: BERTScore Recall for each sentence.
               - f1: BERTScore F1 score for each sentence.
    """
    try:
        P, R, F1 = score(generated_texts, reference_texts, lang=lang, model_type=model_type, device=device)
        return P, R, F1
    except Exception as e:
        print(f"An error occurred during BERTScore calculation: {e}")
        return None, None, None

In [19]:
def evaluate_4o_assistant_accuracy(folder_path):
    correct = 0
    total = 0

    for filename in tqdm(os.listdir(folder_path)):
        if filename.endswith(".json"):
            file_path = os.path.join(folder_path, filename)
            try:
                with open(file_path, 'r') as f:
                    data = json.load(f)

                dimension = data.get("dimension")
                key_feature = data.get("key_feature")
                extracted_info = data.get("extracted_info")
                quant_value = data.get("quantitative_value")
                ground_truth = data.get("ground_truth")

                query = key_feature + " " + str(quant_value)
                ref = key_feature + " " + str(ground_truth)

                if key_feature and extracted_info and ground_truth is not None:
                    output_1 = connect_4o(query)
                    output_2 = connect_4o(ref)
                    # print(output_1, output_2)
                    P, R, F1 = calculate_bertscore([output_1], [output_2])
                    total += F1

            except Exception as e:
                print(f"Failed to process {filename}: {e}")

    if total == 0:
        print("No valid files processed.")
        return 0.0
    print(total)
    accuracy = total/len(os.listdir(folder_path)) * 100
    print(f"Assistant Accuracy: {correct}/{total} = {accuracy:.2f}%")
    return accuracy


In [20]:
def evaluate_o1_assistant_accuracy(folder_path):
    correct = 0
    total = 0

    for filename in tqdm(os.listdir(folder_path)):
        if filename.endswith(".json"):
            file_path = os.path.join(folder_path, filename)
            try:
                with open(file_path, 'r') as f:
                    data = json.load(f)

                dimension = data.get("dimension")
                key_feature = data.get("key_feature")
                extracted_info = data.get("extracted_info")
                quant_value = data.get("quantitative_value")
                ground_truth = data.get("ground_truth")

                query = key_feature + " " + str(quant_value)
                ref = key_feature + " " + str(ground_truth)

                if key_feature and extracted_info and ground_truth is not None:
                    output_1 = connect_gpt_o1(query)
                    output_2 = connect_gpt_o1(ref)
                    # print(output_1, output_2)
                    P, R, F1 = calculate_bertscore([output_1], [output_2])
                    total += F1

            except Exception as e:
                print(f"Failed to process {filename}: {e}")

    if total == 0:
        print("No valid files processed.")
        return 0.0

    print(total)

    accuracy = total/len(os.listdir(folder_path)) * 100
    print(f"Assistant Accuracy: {correct}/{total} = {accuracy:.2f}%")
    return accuracy


In [22]:
if __name__ == "__main__":

    folder_path = "/content/drive/MyDrive/output_jsons_after_logical_calculation"
    accuracy_4o = evaluate_o1_assistant_accuracy(folder_path)
    # accuracy_o3 = evaluate_o3_assistant_accuracy(folder_path)
    print(f"The accuracy of quantitative value calculation after the logical reasoning is: {accuracy_4o:.2f}%")
    # print(f"The accuracy of quantitative value calculation after the logical reasoning is: {accuracy_o3:.2f}%")

  0%|          | 0/23 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  4%|▍         | 1/23 [00:29<10:42, 29.20s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  9%|▊         | 2/23 [01:55<22:00, 62.89s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 13%|█▎        | 3/23 [02:41<18:25, 55.28

tensor([21.9668])





TypeError: unsupported format string passed to Tensor.__format__