In [2]:

import openai
from openai import OpenAI
import json

client = OpenAI(
    # KEEP IT PRIVATE!
    api_key="sk-***",
)
max_retries = 5
retry_delay = 2.0



In [5]:
json_path = 'dataset_train.json'
with open(json_path, 'r') as file:
    data = file.read()
index_dict = json.loads(data)
dataset = index_dict['dataset']


In [23]:
from collections import Counter

def flatten_frames(src_dataset):
    """
    Return a list of dicts — one per frame — with keys:
        frame_id, meta_action (str), waypoints_2d (str), image_paths (dict)
    Scene/agent IDs are discarded.
    """
    out = []

    for scene in src_dataset.values():
        for agent in scene.values():
            for fid, frame in agent.items():
                # ---------- 1) meta_action  ----------
                lat_vals = []
                lon_vals = []
                for rec in frame["meta_actions"].values():
                    if rec is not None:
                        lat, lon = rec.get("lateral"), rec.get("longitudinal")
                    lat_vals.append(lat)
                    lon_vals.append(lon)
                    

                majority_lat = Counter(lat_vals).most_common(1)[0][0] if lat_vals else None
                majority_lon = Counter(lon_vals).most_common(1)[0][0] if lon_vals else None
                meta_action_str = str([majority_lat, majority_lon])

                # ---------- 2) waypoints_2d  ----------
                tuples = []
                # sort by the integer part of 'dt_X'
                for k in sorted(frame["waypoints_3d"],
                                key=lambda s: int(s.split('_')[1])):
                    if frame["waypoints_3d"][k] is not None:
                        x, _, z = frame["waypoints_3d"][k]
                        tuples.append((round(x, 1), round(z, 1)))
                waypoints_str = str(tuples)
                
                speeds = [st["speed"] for st in frame["agent_state"].values()
                          if "speed" in st]
                speed_val = round(speeds[0],1) if speeds else None
                # ---------- 3) pack result ----------
                out.append({
                    "frame_id":      fid,
                    "image_paths":   frame["image_paths"],  # untouched
                    "meta_action":   meta_action_str,
                    "waypoints_2d":  waypoints_str,
                    "speed": speed_val,
                })

    return out

In [28]:
outset = flatten_frames(dataset)
outset[167]

{'frame_id': 'frame_8',
 'image_paths': {'CAM_BACK': '../data/nuscenes-full/samples/CAM_BACK/n015-2018-07-18-11-18-34+0800__CAM_BACK__1531884160887525.jpg',
  'CAM_BACK_LEFT': '../data/nuscenes-full/samples/CAM_BACK_LEFT/n015-2018-07-18-11-18-34+0800__CAM_BACK_LEFT__1531884160897423.jpg',
  'CAM_BACK_RIGHT': '../data/nuscenes-full/samples/CAM_BACK_RIGHT/n015-2018-07-18-11-18-34+0800__CAM_BACK_RIGHT__1531884160877893.jpg',
  'CAM_FRONT': '../data/nuscenes-full/samples/CAM_FRONT/n015-2018-07-18-11-18-34+0800__CAM_FRONT__1531884160862460.jpg',
  'CAM_FRONT_LEFT': '../data/nuscenes-full/samples/CAM_FRONT_LEFT/n015-2018-07-18-11-18-34+0800__CAM_FRONT_LEFT__1531884160854844.jpg',
  'CAM_FRONT_RIGHT': '../data/nuscenes-full/samples/CAM_FRONT_RIGHT/n015-2018-07-18-11-18-34+0800__CAM_FRONT_RIGHT__1531884160870339.jpg'},
 'meta_action': "['VEER_RIGHT', 'DECEL']",
 'waypoints_2d': '[(0.3, 11.5), (1.0, 20.5), (2.8, 27.7)]',
 'speed': 7.0}

In [3]:
def extract_cam_front_objects(data_dict):
    """
    For each scene in data_dict, collect only the objects whose keys
    contain 'CAM_FRONT'. Return a list of dictionaries, one per scene:
    
    {
      'scene_id': ...,
      'scene_description': ...,
      'objects': [
          {
            'object_key': ...,
            'Category': ...,
            'Status': ...,
            'Visual_description': ...,
            '2d_bbox': ...
          },
          ...
      ]
    }
    """
    scenes_output = []

    for scene_id, scene_data in data_dict.items():
        
        scene_desc = scene_data.get("scene_description", "N/A")
        key_frames = scene_data.get("key_frames", {})
        # We'll gather only front-camera objects in this list
        for frame_id, frame_data in key_frames.items():
            key_object_infos = frame_data.get("key_object_infos", {})
            image_paths = frame_data.get("image_paths", {})
            cam_front_objects = {}
            cam_front_objects['objects'] = []
            cam_front_objects['image_path'] = image_paths
            for obj_key, obj_info in key_object_infos.items():
                
                category = obj_info.get("Category", "Unknown")
                status = obj_info.get("Status", "Unknown")
                visual_desc = obj_info.get("Visual_description", "")
                bbox = obj_info.get("2d_bbox", [])

                cam_front_objects["objects"].append({
                    "object_key": obj_key,
                    "Category": category,
                    "Status": status,
                    "Visual_description": visual_desc,
                    "2d_bbox": bbox
                    # Note: we are NOT including 'frame_id' here
                })
            scene_entry = {
                "scene_id": scene_id,
                "scene_description": scene_desc,
                "frame_id": frame_id,
                "objects": cam_front_objects
            }
            
            scenes_output.append(scene_entry)
        
    
    return scenes_output

results = extract_cam_front_objects(index_dict)
print(results[0])

{'scene_id': 'f0f120e4d4b0441da90ec53b16ee169d', 'scene_description': 'The ego vehicle proceeds through the intersection, continuing along the current roadway.', 'frame_id': '4a0798f849ca477ab18009c3a20b7df2', 'objects': {'objects': [{'object_key': '<c1,CAM_BACK,1088.3,497.5>', 'Category': 'Vehicle', 'Status': 'Moving', 'Visual_description': 'Brown SUV.', '2d_bbox': [966.6, 403.3, 1224.1, 591.7]}, {'object_key': '<c2,CAM_BACK,864.2,468.3>', 'Category': 'Vehicle', 'Status': 'Moving', 'Visual_description': 'Black sedan.', '2d_bbox': [816.7, 431.6, 917.2, 505.0]}, {'object_key': '<c3,CAM_FRONT,1043.2,82.2>', 'Category': 'Traffic element', 'Status': None, 'Visual_description': 'Green light.', '2d_bbox': [676.4, 0.0, 1452.6, 171.5]}], 'image_path': {'CAM_FRONT': '../nuscenes/samples/CAM_FRONT/n008-2018-09-18-13-10-39-0400__CAM_FRONT__1537291010612404.jpg', 'CAM_FRONT_LEFT': '../nuscenes/samples/CAM_FRONT_LEFT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_LEFT__1537291010604799.jpg', 'CAM_FRONT_R

In [22]:
import base64

def encode_image(path_to_image: str) -> str:
    """
    Reads an image file from disk and returns a base64-encoded string (JPEG).
    """
    with open(path_to_image, "rb") as f:
        image_bytes = f.read()
    return base64.b64encode(image_bytes).decode("utf-8")

def build_autonomous_driving_prompt(
    camera_info_dict,
    use_base64=False
):
    """
    Builds a text prompt for GPT by either inserting
    base64-encoded images or textual descriptions.
    """
    # Start with a short context
    system_prompt = '''
        You are an advanced autonomous driving assistant with expertise in scene understanding, object detection, and action planning.
    '''

    # The user message includes demonstration data + the new scenario
    user_prompt = '''
    Below is data from an autonomous driving scenario. You are provided with:
    1) Six camera images (front-left, front, front-right, back-left, back, back-right).
    2) A demonstration example showing how you should reason step-by-step and provide a final action.

    Please follow this structure:
    1) Summarize the detected objects and their statuses from the images.
    2) Predict the future movement or intent of key objects.
    3) Propose reasoning on the above content and give some corresponding potential safe and correct driving actions.

    ---

    ### Demonstration Example

    **Camera Views (Example)**:
    1) front-left: [an image showing parked cars on the curb, no pedestrians].
    2) front: [an image showing a blue sedan ~30m ahead, slight braking.
    3) front-right: [an image showing clear sidewalk, no immediate obstacles].
    4) back-left: [an image showing a black SUV approaching quickly in the left lane].
    5) back: Clear, [an image showing no vehicle behind in the same lane].
    6) back-right: [an image showing a bicycle rider moving in the right lane behind].

    **Sample Step-by-Step Reasoning (Example)**:
    1) Detected Objects & Status:
    - Blue sedan ahead is braking slightly.
    - Black SUV is behind in the adjacent lane, accelerating.
    - Bicycle behind to the right, stable speed.
    2) Future Movement Prediction:
    - The sedan may slow further or maintain a slower speed.
    - The SUV may attempt to pass or merge.
    - The bicycle will continue along the right lane.
    3) Action Planning:
    - Since the sedan is slowing, be prepared to reduce speed.
    - The SUV might merge, so keep safe distance and monitor left mirror.
    - Maintain lane position and reduce speed to maintain a safe following distance.

    ---

    ### Now Your Turn

    Below is the new scenario for which we need the same type of reasoning. Please produce a step-by-step reasoning content including perception, prediction and planning, following the style shown in the example.
    
    '''
    system_content = [{"type": "text", "text": system_prompt}]
    user_content = [{"type": "text", "text": user_prompt}]
    assistant_content = [{"type": "text", "text": "Step-by-Step reasoning:"}]
    user_content.append({"type": "text", "text": "Camera Views:\n"})
    if use_base64:
        # Insert base64 data (GPT-3.5/4 standard models typically cannot decode, but let's show it anyway)
        for view, image_path in camera_info_dict.items():
            user_content.append({"type": "text", "text": f"{view}:"})
            base64_image = encode_image(image_path)
            user_content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
            })                    
    else:
        # Insert textual descriptions
        for view, desc in camera_info_dict.items():
            user_content.append({"type": "text", "text": f"{view}:"})
            user_content.append({"type": "text", "text": desc})
    

    return system_content, user_content, assistant_content

def generate_reasoning_and_action(client, camera_dict, use_base64=False):
    """
    1. Optionally encode images or prepare text descriptions.
    2. Build a prompt using that info.
    3. Call OpenAI GPT to generate chain-of-thought reasoning + final action.
    """

    # Build the prompt
    system_content, user_content, assistant_content = build_autonomous_driving_prompt(
        camera_info_dict=camera_dict,
        use_base64=use_base64
    )

    # Make the ChatCompletion call
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_content}
        ],
        temperature=0.7,
        max_tokens=512
    )

    # Extract the GPT response
    answer = response.choices[0].message.content
    return answer

In [6]:
meta_data = str(results[0]['objects']['objects'])
meta_data

"[{'object_key': '<c1,CAM_BACK,1088.3,497.5>', 'Category': 'Vehicle', 'Status': 'Moving', 'Visual_description': 'Brown SUV.', '2d_bbox': [966.6, 403.3, 1224.1, 591.7]}, {'object_key': '<c2,CAM_BACK,864.2,468.3>', 'Category': 'Vehicle', 'Status': 'Moving', 'Visual_description': 'Black sedan.', '2d_bbox': [816.7, 431.6, 917.2, 505.0]}, {'object_key': '<c3,CAM_FRONT,1043.2,82.2>', 'Category': 'Traffic element', 'Status': None, 'Visual_description': 'Green light.', '2d_bbox': [676.4, 0.0, 1452.6, 171.5]}]"

In [11]:
image_path = results[1]['objects']['image_path']
image_path

{'CAM_FRONT': '../nuscenes/samples/CAM_FRONT/n008-2018-09-18-13-10-39-0400__CAM_FRONT__1537291002262404.jpg',
 'CAM_FRONT_LEFT': '../nuscenes/samples/CAM_FRONT_LEFT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_LEFT__1537291002254799.jpg',
 'CAM_FRONT_RIGHT': '../nuscenes/samples/CAM_FRONT_RIGHT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_RIGHT__1537291002270482.jpg',
 'CAM_BACK': '../nuscenes/samples/CAM_BACK/n008-2018-09-18-13-10-39-0400__CAM_BACK__1537291002287558.jpg',
 'CAM_BACK_LEFT': '../nuscenes/samples/CAM_BACK_LEFT/n008-2018-09-18-13-10-39-0400__CAM_BACK_LEFT__1537291002297405.jpg',
 'CAM_BACK_RIGHT': '../nuscenes/samples/CAM_BACK_RIGHT/n008-2018-09-18-13-10-39-0400__CAM_BACK_RIGHT__1537291002278113.jpg'}

In [12]:
output = generate_reasoning_and_action(client, image_path, use_base64=True)
output

'**Detected Objects & Status:**\n\n1. **Front View (CAM_FRONT):**\n   - Green traffic light visible.\n   - Wet road conditions due to rain.\n   - No immediate vehicles or pedestrians in the intersection.\n\n2. **Front-Left View (CAM_FRONT_LEFT):**\n   - Red traffic light for cross traffic.\n   - No vehicles or pedestrians approaching from the left.\n\n3. **Front-Right View (CAM_FRONT_RIGHT):**\n   - Red traffic light for cross traffic.\n   - No vehicles or pedestrians approaching from the right.\n\n4. **Back View (CAM_BACK):**\n   - A vehicle directly behind, maintaining a close distance.\n   - Wet road conditions.\n\n5. **Back-Left View (CAM_BACK_LEFT):**\n   - Parked vehicles and postal trucks visible.\n   - No moving vehicles in the immediate vicinity.\n\n6. **Back-Right View (CAM_BACK_RIGHT):**\n   - Clear sidewalk with no pedestrians or cyclists.\n   - No immediate obstacles.\n\n**Future Movement Prediction:**\n\n- **Front:** The green light suggests it is safe to proceed, but cau

In [13]:
def qa_to_multiple_choice(client, question, answer):
    """
    Sends a request to GPT to convert a single Q/A into a 5-option multiple-choice question.
    Returns a tuple: (revised_question_string, correct_answer_letter).
    """

    # You can fine-tune this system prompt if desired
    system_message = (
        "You are a helpful assistant that rewrites autonomous driving Q/A into multiple-choice format. "
        "You must produce exactly one correct option that matches the original answer, "
        "and four distractors that are plausible but incorrect. "
        "Return output as a Python tuple of two elements: "
        "(\"Revised MC question\", \"CorrectAnswerLetter\"). "
        "Only one letter from A,B,C,D,E should be correct."
    )

    # User message with the raw Q/A. We instruct GPT how to format
    user_message = f"""
        Original Question: {question}
        Original Answer: {answer}

        Instructions:
        1. Convert the Q/A into a multiple-choice question with exactly 5 options (A, B, C, D, E).
        2. Only one option should be correct(can be anyone from A to E), reflecting the original answer.
        3. Provide 4 other distractor options that are different from the correct one.
        4. Format the final output strictly as a Python tuple with two elements:
        ( "<multiline MC question>", "<single letter denoting correct answer>" )
        5. The MC question should look like this:

        <QUESTION>
        A) ...
        B) ...
        C) ...
        D) ...
        E) ...

        6. The second element of the tuple is the letter of the correct choice, e.g. "B".
        7. Do not add extra text or explanation outside the tuple.

        """

    response = client.chat.completions.create(
        model="gpt-4o",  # or another ChatGPT-compatible model
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ],
        temperature=0.7,
        max_tokens=400,
    )

    # The assistant's reply should be a string that looks like: 
    # ( "Predict the behavior of the ego vehicle...\nA) ...\nB) ...\nC) ...\nD) ...\nE) ...", "C" )
    return response.choices[0].message.content.strip()

In [14]:
scene_id = results[0]['scene_id']
frame_id = results[0]['frame_id']
QA = index_dict[scene_id]['key_frames'][frame_id]['QA']['behavior'][0]
question = QA['Q']
answer = QA['A']
answer

'The ego vehicle is going straight. The ego vehicle is driving fast.'

In [15]:
import ast
new_QA = qa_to_multiple_choice(client, question, answer)
new_QA = new_QA.replace('```','').replace('python','').replace('\n','').replace('\\n','')
new_QA = ast.literal_eval(new_QA)
new_Q = new_QA[0]
new_A = new_QA[1]
new_Q

'What is the behavior of the ego vehicle?A) The ego vehicle is turning left.B) The ego vehicle is stopping.C) The ego vehicle is going straight and driving fast.D) The ego vehicle is reversing.E) The ego vehicle is changing lanes to the right.'

In [16]:
def build_verify_prompt(
    reasoning_context,
    question
):
    """
    Builds a text prompt for GPT by either inserting
    base64-encoded images or textual descriptions.
    """
    # Start with a short context
    system_prompt = '''
        You are an advanced autonomous driving assistant that uses provided reasoning context
        to answer multiple-choice questions. You should pick the single best correct option."
    '''

    # The user message includes demonstration data + the new scenario
    user_prompt = '''
    Below is data from an autonomous driving scenario. You are provided with:
    1) Reasoning context derived from a real driving scenario.
    2) A multi-choice question asking about the correct and safe driving action.

    Instruction
    1) Please analyze the reasoning context carefully, then select the single best answer (A, B, C, D, or E).
    2) Only output a single letter

    
    '''
    system_content = [{"type": "text", "text": system_prompt}]
    user_content = [{"type": "text", "text": user_prompt}]
    assistant_content = [{"type": "text", "text": "The single letter answer is:"}]
    
    user_content.append({"type": "text", "text": "Reasoning context:\n"})
    user_content.append({"type": "text", "text": reasoning_context})
    user_content.append({"type": "text", "text": "Multi-choice question:\n"})
    user_content.append({"type": "text", "text": question})
    

    return system_content, user_content, assistant_content

def generate_final_action(client, reasoning_context, question):
    """
    1. Optionally encode images or prepare text descriptions.
    2. Build a prompt using that info.
    3. Call OpenAI GPT to generate chain-of-thought reasoning + final action.
    """

    # Build the prompt
    system_content, user_content, assistant_content = build_verify_prompt(
        reasoning_context,
        question
    )

    # Make the ChatCompletion call
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_content}
        ],
        temperature=0,
        max_tokens=4
    )

    # Extract the GPT response
    answer = response.choices[0].message.content
    return answer

In [17]:
action = generate_final_action(client, output, new_Q)
action

'C'

In [18]:
def build_refine_prompt(
    camera_info_dict,
    reasoning_context,
    use_base64=False
):
    """
    Builds a text prompt for GPT by either inserting
    base64-encoded images or textual descriptions.
    """
    # Start with a short context
    system_prompt = '''
        You are an advanced autonomous driving assistant specialized in 
        concise reasoning about camera images to determine the correct driving action.
    '''

    # The user message includes demonstration data + the new scenario
    user_prompt = '''
    Below is data from an autonomous driving scenario. You are provided with:
    1) Six camera images (front-left, front, front-right, back-left, back, back-right).
    2) A current chain-of-thought reasoning.

    Goal: Produce a shorter, more concise version of the reasoning that only includes details 
    necessary for deriving the final driving action. Remove unnecessary analysis, extraneous 
    tangents, or repeated points. Rephrase any sentences to be more succinct while preserving 
    meaning.

    Instructions:
    1. Review the camera images and the current reasoning.
    2. Delete or omit irrelevant details that do not influence the final driving action.
    3. Rephrase what's left so it's concise but still logically consistent.
    
    '''
    system_content = [{"type": "text", "text": system_prompt}]
    user_content = [{"type": "text", "text": user_prompt}]
    assistant_content = [{"type": "text", "text": "Concise reasoning:"}]
    user_content.append({"type": "text", "text": "Camera Views:\n"})
    if use_base64:
        # Insert base64 data (GPT-3.5/4 standard models typically cannot decode, but let's show it anyway)
        for view, image_path in camera_info_dict.items():
            user_content.append({"type": "text", "text": f"{view}:"})
            base64_image = encode_image(image_path)
            user_content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
            })                    
    else:
        # Insert textual descriptions
        for view, desc in camera_info_dict.items():
            user_content.append({"type": "text", "text": f"{view}:"})
            user_content.append({"type": "text", "text": desc})

    user_content.append({"type": "text", "text": "Current reasoning chain:\n"})
    user_content.append({"type": "text", "text": reasoning_context})
    

    return system_content, user_content, assistant_content

def generate_concise_reasoning(client, camera_dict, reasoning_context, use_base64=False):
    """
    1. Optionally encode images or prepare text descriptions.
    2. Build a prompt using that info.
    3. Call OpenAI GPT to generate chain-of-thought reasoning + final action.
    """

    # Build the prompt
    system_content, user_content, assistant_content = build_refine_prompt(
        camera_info_dict=camera_dict,
        reasoning_context=reasoning_context,
        use_base64=use_base64
    )

    # Make the ChatCompletion call
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_content}
        ],
        temperature=1,
        max_tokens=256
    )

    # Extract the GPT response
    answer = response.choices[0].message.content
    return answer

In [19]:
concise_reasoning = generate_concise_reasoning(client, image_path, output, use_base64=True)
concise_reasoning

'**Camera Analysis:**\n\n1. **Front View (CAM_FRONT):** \n   - Green light, wet road, no vehicles/pedestrians.\n\n2. **Front-Left (CAM_FRONT_LEFT) and Front-Right (CAM_FRONT_RIGHT):** \n   - Cross traffic stopped at red lights, no approaching vehicles/pedestrians.\n\n3. **Back View (CAM_BACK):**\n   - Vehicle close behind, wet conditions.\n\n4. **Left and Right Rear Views:** \n   - Clear of moving vehicles, no immediate obstacles.\n\n**Action Plan:**\n\n- Proceed through the intersection cautiously due to green light and wet roads.\n- Monitor the vehicle behind and ensure a safe distance.\n- Maintain smooth acceleration to prevent slipping.'

In [20]:
action = generate_final_action(client, concise_reasoning, new_Q)
action

'C'

In [21]:
assert action == new_A

In [27]:


from transformers import pipeline
import transformers
import torch
system_prompt = '''
        You are an advanced autonomous driving assistant that uses provided reasoning context
        to answer multiple-choice questions. You should pick the single best correct option.
    '''

user_prompt = '''
    Below is data from an autonomous driving scenario. You are provided with:
    1) Reasoning context derived from a real driving scenario.
    2) A multi-choice question asking about the correct and safe driving action.

    Instruction
    1) Please analyze the reasoning context carefully, then select the single best answer (A, B, C, D, or E).
    2) Only output a single letter
    '''

model_id = "meta-llama/Meta-Llama-3-8B"

pipe = transformers.pipeline("text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto")
messages = user_prompt+'\n Reasoning context:'+concise_reasoning+'\n Multi-choice question:'+new_Q+'\n The single letter answer is:'

out = pipe(messages)
print(out)

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Meta-Llama-3-8B.
401 Client Error. (Request ID: Root=1-67f5a78e-60f95e072e4bbcab7595e853;945ac74c-38fe-4291-839a-3b02a936622b)

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B is restricted. You must have access to it and be authenticated to access it. Please log in.

In [32]:
from huggingface_hub import login

login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…