In [None]:
!pip install bert-score

In [None]:
!pip install openai

In [17]:
from openai import OpenAI
import os
import json
from tqdm import tqdm

In [18]:
def connect_4o(input):
  client = OpenAI(api_key="")

  response = client.responses.create(
    model="gpt-4o",
    input=[
      {
        "role": "system",
        "content": [
          {
            "type": "input_text",
          "text": "Your job is to clearly and directly explain the thought process of a decision-making process.\n \nThe uploaded JSON file contains data stored within an autonomous driving system, specifically designed to plan the future trajectory of ego agent based on the information of environment.\n \nDetails of the file format:\nThe data of surrounding agents contains the position, heading, and velocity, where ego agent makes its decision based upon.\n \nYou need to analyze and provide direct explanation of why system makes such suggestions for  ego agent based on the information stored in the json. Please answer the questions you are given directly without any introductory or concluding statements."
          }
        ]
      },
      {
        "role": "user",
        "content": [
          {
            "type": "input_text",
            "text": input
          }
        ]
      }
    ],
    text={
      "format": {
        "type": "text"
      }
    },
    reasoning={},
    tools=[],
    temperature=1,
    max_output_tokens=2048,
    top_p=1,
    store=True
  )

  gpt_response_message_content = next((out.content[0].text for out in response.output if out.type == 'message'), None)

  return gpt_response_message_content

In [19]:
def connect_gpt_o1(input):
  client = OpenAI(api_key="")

  response = client.responses.create(model="o1", input=[
    {
      "role": "developer",
      "content": [
        {
          "type": "input_text",
          "text": "Your job is to clearly and directly explain the thought process of a decision-making process.\n \nThe uploaded JSON file contains data stored within an autonomous driving system, specifically designed to plan the future trajectory of ego agent based on the information of environment.\n \nDetails of the file format:\nThe data of surrounding agents contains the position, heading, and velocity, where ego agent makes its decision based upon.\n \nYou need to analyze and provide direct explanation of why system makes such suggestions for  ego agent based on the information stored in the json. Please answer the questions you are given directly without any introductory or concluding statements."
        }
      ]
    },
    {
      "role": "user",
      "content": [
        {
          "type": "input_text",
          "text": input
        }
      ]
    }
  ],
    text={
    "format": {
      "type": "text"
    }
  },
    reasoning={
    "effort": "low"
  },
    tools=[],
    store=True
  )

  gpt_response_message_content = next((out.content[0].text for out in response.output if out.type == 'message'), None)

  return gpt_response_message_content

In [20]:
from bert_score import score
import torch

def calculate_bertscore(generated_texts, reference_texts, lang="en", model_type=None, device="cuda:0" if torch.cuda.is_available() else "cpu"):
    """
    Calculates the BERTScore (Precision, Recall, and F1) between generated and reference texts.

    Args:
        generated_texts (list of str): A list of generated sentences.
        reference_texts (list of str or list of list of str): A list of reference sentences.
                                                             If multiple references per generated text,
                                                             provide a list of lists.
        lang (str): The language of the texts (e.g., "en" for English).
                    Required if model_type is not specified.
        model_type (str, optional): The pre-trained BERT model to use (e.g., "bert-base-uncased").
                                    If None, a default model for the specified language is used.
        device (str, optional): The device to run the model on ("cuda" for GPU, "cpu" for CPU).
                                Defaults to "cuda:0" if a GPU is available, otherwise "cpu".

    Returns:
        tuple: A tuple containing three torch.Tensor objects:
               - precision: BERTScore Precision for each sentence.
               - recall: BERTScore Recall for each sentence.
               - f1: BERTScore F1 score for each sentence.
    """
    try:
        P, R, F1 = score(generated_texts, reference_texts, lang=lang, model_type=model_type, device=device)
        return P, R, F1
    except Exception as e:
        print(f"An error occurred during BERTScore calculation: {e}")
        return None, None, None

In [21]:
ref_gt = {}

for filename in os.listdir("/content/drive/MyDrive/output_jsons_after_logical_calculation"):
  file_path = os.path.join("/content/drive/MyDrive/output_jsons_after_logical_calculation", filename)
  with open(file_path, 'r') as f:
    data = json.load(f)
    key_feature = data.get("key_feature")
    ground_truth = data.get("ground_truth")
    # print(key_feature, quant_value, ground_truth)
    # query = key_feature + " " + str(quant_value)
    ref = key_feature + " " + str(ground_truth)
    ref_gt[filename.split("_T")[0]] = ref

In [22]:
ref_gt

{'scene_0': 'observed position of agent_4 at scene time 20 at the 19 timestamp 4.15',
 'scene_4': 'observed position of agent_4 at scene time 122 at the 19 timestamp 0.1',
 'scene_3': 'observed position of agent_4 at scene time 122 at the 19 timestamp 9.85',
 'scene_1': 'observed position of agent_4 at scene time 87 at the 19 timestamp 16.55',
 'scene_7': 'observed position of agent_4 at scene time 120 at the 19 timestamp 5.02',
 'scene_12': 'observed position of agent_4 at scene time 117 at the 19 timestamp 11.1',
 'scene_13': 'observed position of agent_4 at scene time 117 at the 19 timestamp 12',
 'scene_16': 'observed position of agent_4 at scene time 20 at the 19 timestamp 6.8',
 'scene_17': 'observed position of agent_4 at scene time 22 at the 19 timestamp 0.05',
 'scene_20': 'observed position of agent_4 at scene time 21 at the 19 timestamp 10.4',
 'scene_21': 'observed position of agent_4 at scene time 111 at the 19 timestamp 3.9',
 'scene_22': 'observed position of agent_4 at 

In [23]:
for filename in os.listdir("/content/drive/MyDrive/processed_data_new"):
  print(filename.split(".")[0])


scene_29
scene_0
scene_26
scene_41
scene_31
scene_27
scene_3
scene_20
scene_4
scene_16
scene_21
scene_17
scene_40
scene_1
scene_23
scene_7
scene_13
scene_12
scene_42
scene_24
scene_35
scene_28
scene_22


In [27]:
def evaluate_4o_assistant_accuracy(folder_path):
    correct = 0
    total = 0

    for filename in tqdm(os.listdir(folder_path)):
        if filename.endswith(".json"):
            file_path = os.path.join(folder_path, filename)
            try:
                with open(file_path, 'r') as f:
                    data = json.load(f)

                query = str(data["scene_data"]["general_data"]["observed_agents"]["front_camera"]) + str(data["scene_data"]["general_data"]["ego_agent"])
                ref = ref_gt[filename.split(".")[0]]
                output_1 = connect_4o(query)
                output_2 = connect_4o(ref)
                # print(output_1, output_2)
                P, R, F1 = calculate_bertscore([output_1], [output_2])
                total += F1

            except Exception as e:
                print(f"Failed to process {filename}: {e}")

    if total == 0:
        print("No valid files processed.")
        return 0.0
    print(total)
    accuracy = total/len(os.listdir(folder_path)) * 100
    print(f"Assistant Accuracy: {correct}/{total} = {accuracy:.2f}%")
    return accuracy


In [28]:
def evaluate_o1_assistant_accuracy(folder_path):
    correct = 0
    total = 0

    for filename in tqdm(os.listdir(folder_path)):
        if filename.endswith(".json"):
            file_path = os.path.join(folder_path, filename)
            try:
                with open(file_path, 'r') as f:
                    data = json.load(f)

                query = str(data["scene_data"]["general_data"]["observed_agents"]["front_camera"]) + str(data["scene_data"]["general_data"]["ego_agent"])
                ref = ref_gt[filename.split(".")[0]]
                output_1 = connect_gpt_o1(query)
                output_2 = connect_gpt_o1(ref)
                # print(output_1, output_2)
                P, R, F1 = calculate_bertscore([output_1], [output_2])
                total += F1

            except Exception as e:
                print(f"Failed to process {filename}: {e}")

    if total == 0:
        print("No valid files processed.")
        return 0.0

    print(total)

    accuracy = total/len(os.listdir(folder_path)) * 100
    print(f"Assistant Accuracy: {correct}/{total} = {accuracy:.2f}%")
    return accuracy


In [31]:
if __name__ == "__main__":

    folder_path = "/content/drive/MyDrive/processed_data_new"
    accuracy_4o = evaluate_o1_assistant_accuracy(folder_path)
    # accuracy_o3 = evaluate_o3_assistant_accuracy(folder_path)
    print(f"The accuracy of quantitative value calculation after the logical reasoning is: {accuracy_4o:.2f}%")
    # print(f"The accuracy of quantitative value calculation after the logical reasoning is: {accuracy_o3:.2f}%")

  0%|          | 0/24 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  8%|▊         | 2/24 [00:42<07:52, 21.47s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 12%|█▎        | 3/24 [01:18<09:36, 27.44s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 17%|█▋        | 4/24 [01:53<10:06, 30.30

tensor([18.9554])





TypeError: unsupported format string passed to Tensor.__format__