In [1]:

import json
import os

import openai
from openai import OpenAI

# Read the API key from the environment rather than embedding it in the notebook,
# so it cannot leak through the saved file or version control.
# Raises KeyError right here if OPENAI_API_KEY is not set.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)
# NOTE(review): no cell visible in this notebook references these two settings — confirm
# whether retry handling was meant to be wired into the API calls.
max_retries = 5
retry_delay = 2.0



In [2]:
# Load the training index from disk and keep the mapping stored under its 'dataset' key.
json_path = 'dataset_train.json'
with open(json_path, 'r') as file:
    data = file.read()  # raw JSON text
index_dict = json.loads(data)
# flatten_frames() walks this mapping as scene -> agent -> frame.
dataset = index_dict['dataset']


In [3]:
from collections import Counter

def flatten_frames(src_dataset):
    """
    Flatten a nested scene -> agent -> frame mapping into one record per frame.

    Parameters
    ----------
    src_dataset : dict
        Mapping of scene -> agent -> {frame_id: frame}. Each frame must
        provide "meta_actions", "agent_state" and "image_paths";
        "waypoints_3d" is optional.

    Returns
    -------
    list of dict
        One dict per frame with keys:
            frame_id     - the frame's key inside its agent mapping
            image_paths  - the frame's "image_paths" value, untouched
            meta_action  - [lateral_values, longitudinal_values]; each list
                           has one entry per record in "meta_actions"
                           (None where the record itself is missing)
            waypoints_2d - str() of a list of (x, z) tuples rounded to 1 dp
            speed        - first "speed" found in "agent_state" rounded to
                           1 dp, or None when no entry carries one
        Scene/agent IDs are discarded.

    Raises
    ------
    KeyError
        If a frame lacks "meta_actions", "agent_state" or "image_paths".
    """
    out = []

    for scene in src_dataset.values():
        for agent in scene.values():
            for fid, frame in agent.items():
                # ---------- 1) meta_action  ----------
                # Keep every record (one per key of "meta_actions") instead of
                # collapsing them. Earlier variants used only the 'dt_2.00'
                # record or a per-axis majority vote.
                lat_vals = []
                lon_vals = []
                for rec in frame["meta_actions"].values():
                    if rec is not None:
                        lat, lon = rec.get("lateral"), rec.get("longitudinal")
                    else:
                        # A missing record must not inherit the previous
                        # record's (or previous frame's) values.
                        lat, lon = None, None
                    lat_vals.append(lat)
                    lon_vals.append(lon)
                meta_action = [lat_vals, lon_vals]

                # ---------- 2) waypoints_2d  ----------
                # Waypoints are taken in the mapping's own order; nothing is
                # re-sorted here. Only the first and third components are kept.
                tuples = []
                for wp in (frame.get("waypoints_3d") or {}).values():
                    if wp and len(wp) >= 3:
                        # Index instead of unpacking so longer entries
                        # (allowed by the >= 3 guard) do not raise.
                        x, z = wp[0], wp[2]
                        tuples.append((round(x, 1), round(z, 1)))
                waypoints_str = str(tuples)

                # ---------- 3) speed  ----------
                speeds = [st["speed"] for st in frame["agent_state"].values()
                          if "speed" in st]
                speed_val = round(speeds[0], 1) if speeds else None

                # ---------- 4) pack result ----------
                out.append({
                    "frame_id":      fid,
                    "image_paths":   frame["image_paths"],  # untouched
                    "meta_action":   meta_action,
                    "waypoints_2d":  waypoints_str,
                    "speed": speed_val,
                })

    return out

In [4]:
# Flatten the whole dataset once and inspect a single frame record.
index = 999  # position in the flattened list; later cells reuse it
outset = flatten_frames(dataset)
outset[index]

{'frame_id': 'frame_3',
 'image_paths': {'CAM_BACK': '../data/nuscenes-full/samples/CAM_BACK/n015-2018-07-18-11-50-34+0800__CAM_BACK__1531886384137525.jpg',
  'CAM_BACK_LEFT': '../data/nuscenes-full/samples/CAM_BACK_LEFT/n015-2018-07-18-11-50-34+0800__CAM_BACK_LEFT__1531886384147423.jpg',
  'CAM_BACK_RIGHT': '../data/nuscenes-full/samples/CAM_BACK_RIGHT/n015-2018-07-18-11-50-34+0800__CAM_BACK_RIGHT__1531886384127893.jpg',
  'CAM_FRONT': '../data/nuscenes-full/samples/CAM_FRONT/n015-2018-07-18-11-50-34+0800__CAM_FRONT__1531886384112473.jpg',
  'CAM_FRONT_LEFT': '../data/nuscenes-full/samples/CAM_FRONT_LEFT/n015-2018-07-18-11-50-34+0800__CAM_FRONT_LEFT__1531886384104844.jpg',
  'CAM_FRONT_RIGHT': '../data/nuscenes-full/samples/CAM_FRONT_RIGHT/n015-2018-07-18-11-50-34+0800__CAM_FRONT_RIGHT__1531886384120339.jpg'},
 'meta_action': [['VEER_RIGHT', 'VEER_RIGHT', 'STRAIGHT'],
  ['MAINTAIN', 'ACCELERATE', 'ACCELERATE']],
 'waypoints_2d': '[(0.2, 2.8), (0.5, 7.3), (1.0, 11.9), (1.5, 16.5), (1.9

In [8]:
import base64

def encode_image(path_to_image: str) -> str:
    """
    Load the file at `path_to_image` in binary mode and return its contents
    as base64 text.

    The bytes are encoded as-is; no image decoding or format check happens.
    """
    with open(path_to_image, "rb") as image_file:
        raw = image_file.read()
    encoded = base64.b64encode(raw)
    return str(encoded, "utf-8")

def build_autonomous_driving_prompt(camera_info_dict, use_base64=False):
    """
    Assemble the chat content for the surround-view scene-analysis request.

    camera_info_dict maps a view name to either an image path (when
    use_base64 is true the file is read with encode_image and embedded as a
    base64 JPEG data URL) or a text description of that view (otherwise).

    Returns a (system_content, user_content, assistant_content) tuple; each
    element is a list of content-part dicts. The instructions request three
    numbered sections - Detected Objects, Predicted Movements, Road Condition
    Ahead - and forbid steering or speed commands.
    """

    def text_part(text):
        # Every textual piece of a message uses this same shape.
        return {"type": "text", "text": text}

    # -------- System role ---------------------------------------------------
    system_prompt = """
You are an autonomous‑driving vision analyst.
Think step‑by‑step and output ONLY the three sections below.
Do NOT suggest steering or speed commands.
""".strip()

    # -------- User instructions & demo --------------------------------------
    user_prompt = """
### Task
From the six camera views, give a brief *situation report*:

1) **Detected Objects** – main vehicles, pedestrains, traffic lights, and road signs, etc., their state, lane/relative position, ≈distance (m).  
2) **Predicted Movements** – likely next motion for each key object.  
3) **Road Condition Ahead** – geometry in front of the ego car (e.g., “straight & clear”, “tight left‑hand curve”).


### Example

Camera Views (sample):
• front‑left – parked cars at curb  
• front       – blue sedan 30 m ahead, braking  
• front‑right – clear sidewalk  
• back‑left   – black SUV closing in left lane  
• back        – clear  
• back‑right  – cyclist 20 m behind

**Model Output**
1) Blue sedan ahead braking ~30 m on the front view; black SUV left‑rear closing fast; cyclist right‑rear steady ~20 m.  
2) Sedan will slow further; SUV may merge right; cyclist continues straight.  
3) Road ahead straight and unobstructed.

---

Now analyse the new scene:

""".lstrip()

    # -------- Assemble messages --------------------------------------------
    system_content = [text_part(system_prompt)]
    assistant_content = [text_part("1)")]  # cue for the start of the answer
    user_content = [text_part(user_prompt), text_part("Camera Views:")]

    # One entry per camera: a label plus inline image, or a single text line.
    for view, payload in camera_info_dict.items():
        if use_base64:
            user_content.append(text_part(f"{view}:"))
            data_url = f"data:image/jpeg;base64,{encode_image(payload)}"
            user_content.append({"type": "image_url", "image_url": {"url": data_url}})
        else:
            user_content.append(text_part(f"{view}: {payload}"))

    return system_content, user_content, assistant_content

def generate_reasoning_and_action(client, camera_dict, use_base64=False):
    """
    Request a scene report for one set of camera views.

    Steps:
    1. Build the message content with build_autonomous_driving_prompt
       (`camera_dict` and `use_base64` are passed straight through).
    2. Send one chat-completion request to "gpt-4o".
    3. Return the message content of the first choice.
    """
    system_part, user_part, assistant_part = build_autonomous_driving_prompt(
        camera_info_dict=camera_dict,
        use_base64=use_base64,
    )

    conversation = [
        {"role": "system", "content": system_part},
        {"role": "user", "content": user_part},
        {"role": "assistant", "content": assistant_part},
    ]

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=conversation,
        temperature=1.0,
        max_tokens=1024,
    )

    return completion.choices[0].message.content

In [99]:
# Camera-name -> image-file mapping of the selected frame (despite the singular name, a dict).
image_path = outset[index]['image_paths']
image_path

{'CAM_BACK': '../data/nuscenes-full/samples/CAM_BACK/n015-2018-07-18-11-50-34+0800__CAM_BACK__1531886337887525.jpg',
 'CAM_BACK_LEFT': '../data/nuscenes-full/samples/CAM_BACK_LEFT/n015-2018-07-18-11-50-34+0800__CAM_BACK_LEFT__1531886337898056.jpg',
 'CAM_BACK_RIGHT': '../data/nuscenes-full/samples/CAM_BACK_RIGHT/n015-2018-07-18-11-50-34+0800__CAM_BACK_RIGHT__1531886337878549.jpg',
 'CAM_FRONT': '../data/nuscenes-full/samples/CAM_FRONT/n015-2018-07-18-11-50-34+0800__CAM_FRONT__1531886337862460.jpg',
 'CAM_FRONT_LEFT': '../data/nuscenes-full/samples/CAM_FRONT_LEFT/n015-2018-07-18-11-50-34+0800__CAM_FRONT_LEFT__1531886337854851.jpg',
 'CAM_FRONT_RIGHT': '../data/nuscenes-full/samples/CAM_FRONT_RIGHT/n015-2018-07-18-11-50-34+0800__CAM_FRONT_RIGHT__1531886337870339.jpg'}

In [100]:
# Step 1: scene report generated from the camera images, which are embedded as base64.
output = generate_reasoning_and_action(client, image_path, use_base64=True)
output

'1) **Detected Objects**: \n   - **Front**: A white bus is approximately 20 meters ahead in the same lane.\n   - **Back**: A red vehicle is following at a moderate distance, roughly 30 meters.\n   - **Front-Left, Front-Right, Back-Left, Back-Right**: No vehicles or pedestrians detected, clear views with grass and trees.\n\n2) **Predicted Movements**:\n   - The bus ahead may slow down further or continue straight. No immediate braking action detected.\n   - The red vehicle behind appears to be maintaining its distance, likely to continue straight.\n\n3) **Road Condition Ahead**:\n   - The road is straight with slight curves, unobstructed, bordered by grass and trees.'

In [106]:
def build_verify_prompt(reasoning_context: str, speed: float):
    """
    Build the chat content asking for a driving meta-action pair plus a
    0-5 confidence score.

    `reasoning_context` and `speed` are interpolated into the user message;
    the system message carries the output format, the allowed actions and
    the decision rules. Returns (system_content, user_content,
    assistant_content), each a list of text content-part dicts.
    """

    def text_part(text):
        # Every piece of a message uses this same shape.
        return {"type": "text", "text": text}

    # Surrounding whitespace of the literal is stripped before use.
    instructions = """
    You are an autonomous‑driving assistant.  
    Input: reasoning context + ego speed.  
    Task: decide what the ego vehicle should do from the lateral and longitudinal aspects.

    Output format (no extra text):
    (['<LATERAL>', '<LONGITUDINAL>'], <CONFIDENCE>)    # confidence ∈ 0‑5

    Allowed meta‑actions
      • Lateral:   VEER_LEFT | VEER_RIGHT | CHANGE_LANE_LEFT | CHANGE_LANE_RIGHT
                  STRAIGHT  | TURN_LEFT  | TURN_RIGHT
      • Longitudinal: ACCELERATE | MAINTAIN | DECELERATE | REVERSE

    Decision rules
      1. Avoid collisions; keep safe gaps.
      2. Stay on drivable surface.
      3. keep reasonable speed when road is clear
      4. Turning with low speed and deceleration.

    Considerations(IMPORTANT)
    • Lateral:
      1. Check roadway geometry first. If the main lane curves ahead, select the action that
        follows the curve (never output STRAIGHT in this case).
      2. Then account for pedestrians, vehicles, or other obstacles and steer to avoid
        any potential collision.

    • Longitudinal:
      1. Begin with the current speed.
      2. Decide on a change:
        - If the vehicle is moving too slowly for conditions, ACCELERATE.  
        - If it’s too fast or needs extra margin, DECELERATE. 
        - If ego is turning left or right, DECELERATE.
        - Otherwise, MAINTAIN the present speed.

   
    """.strip()

    context_part = text_part(f"Reasoning context:\n{reasoning_context}")
    speed_part = text_part(f"Ego speed: {speed}m/s")

    return (
        [text_part(instructions)],
        [context_part, speed_part],
        [text_part("Meta‑action and confidence:")],
    )

def generate_final_action(client, reasoning_context, speed):
    """
    Ask the model for a meta-action decision based on a textual scene report.

    Steps:
    1. Build the message content with build_verify_prompt from
       `reasoning_context` and `speed`.
    2. Send one chat-completion request to "gpt-4.1".
    3. Return the message content of the first choice (the raw reply text).
    """
    system_part, user_part, assistant_part = build_verify_prompt(
        reasoning_context,
        speed,
    )

    conversation = [
        {"role": "system", "content": system_part},
        {"role": "user", "content": user_part},
        {"role": "assistant", "content": assistant_part},
    ]

    completion = client.chat.completions.create(
        model="gpt-4.1",
        messages=conversation,
        temperature=1,
        max_tokens=512,
    )

    return completion.choices[0].message.content

In [107]:
# Ego speed of the selected frame as stored by flatten_frames (rounded; may be None).
speed = outset[index]['speed']
print(speed)
# Step 2: turn the scene report from step 1 into a meta-action reply (still a raw string here).
action = generate_final_action(client, output, speed)
action

8.8


"(['STRAIGHT', 'DECELERATE'], 4)"

In [53]:
def build_refine_prompt(
    camera_info_dict,
    reasoning_context,
    use_base64=False
):
    """
    Build the chat content asking the model to condense an existing
    reasoning text.

    camera_info_dict maps a view name to an image path (use_base64 true:
    the file is read with encode_image and embedded as a base64 JPEG data
    URL) or to a text description (use_base64 false). `reasoning_context`
    is appended unchanged as the last user part.

    Returns (system_content, user_content, assistant_content), each a list
    of content-part dicts.
    """

    def text_part(text):
        # Every textual piece of a message uses this same shape.
        return {"type": "text", "text": text}

    # Role description (sent exactly as written, indentation included)
    system_prompt = '''
        You are an advanced autonomous driving assistant specialized in 
        concise reasoning about camera images to determine the correct driving action.
    '''

    # Task description; the camera views and the reasoning follow as extra parts
    user_prompt = '''
    Below is data from an autonomous driving scenario. You are provided with:
    1) Six camera images (front-left, front, front-right, back-left, back, back-right).
    2) A current chain-of-thought reasoning.

    Goal: Produce a shorter, more concise version of the reasoning that only includes details 
    necessary for deriving the final driving action. Remove unnecessary analysis, extraneous 
    tangents, or repeated points. Rephrase any sentences to be more succinct while preserving 
    meaning.

    Instructions:
    1. Review the camera images and the current reasoning.
    2. Delete or omit irrelevant details that do not influence the final driving action.
    3. Rephrase what's left so it's concise but still logically consistent.
    
    '''

    system_content = [text_part(system_prompt)]
    assistant_content = [text_part("Concise reasoning:")]
    user_content = [text_part(user_prompt), text_part("Camera Views:\n")]

    for view, payload in camera_info_dict.items():
        # The view label is always its own part; the payload part depends on the mode.
        user_content.append(text_part(f"{view}:"))
        if use_base64:
            # Image mode: embed the file as an inline data URL.
            data_url = f"data:image/jpeg;base64,{encode_image(payload)}"
            user_content.append({"type": "image_url", "image_url": {"url": data_url}})
        else:
            # Text mode: the payload is the description itself.
            user_content.append(text_part(payload))

    user_content.extend([
        text_part("Current reasoning chain:\n"),
        text_part(reasoning_context),
    ])

    return system_content, user_content, assistant_content

def generate_concise_reasoning(client, camera_dict, reasoning_context, use_base64=False):
    """
    Ask the model to shorten an existing reasoning text for one set of
    camera views.

    Steps:
    1. Build the message content with build_refine_prompt (`camera_dict`,
       `reasoning_context` and `use_base64` are passed straight through).
    2. Send one chat-completion request to "gpt-4.1".
    3. Return the message content of the first choice.
    """
    system_part, user_part, assistant_part = build_refine_prompt(
        camera_info_dict=camera_dict,
        reasoning_context=reasoning_context,
        use_base64=use_base64,
    )

    conversation = [
        {"role": "system", "content": system_part},
        {"role": "user", "content": user_part},
        {"role": "assistant", "content": assistant_part},
    ]

    completion = client.chat.completions.create(
        model="gpt-4.1",
        messages=conversation,
        temperature=1,
        max_tokens=256,
    )

    return completion.choices[0].message.content

In [56]:
# Optional step: condense the scene report, giving the model the same images for reference.
concise_reasoning = generate_concise_reasoning(client, image_path, output, use_base64=True)
concise_reasoning

'The road ahead is clear, with a gentle left curve. There are construction barriers and a worker on the left, but they do not obstruct the lane. A pedestrian on the right sidewalk is moving away. No vehicles or obstacles are present in the path. Proceed forward with caution near the construction area.'

In [57]:
import ast
# Re-run the decision step on the condensed reasoning instead of the full scene report.
action = generate_final_action(client, concise_reasoning, speed)
# The reply is requested in the form "(['<LATERAL>', '<LONGITUDINAL>'], <CONFIDENCE>)";
# parse it and keep only the [lateral, longitudinal] pair. literal_eval raises if the
# model's reply is not a valid Python literal.
action = ast.literal_eval(action)[0]
action

['TURN_LEFT', 'MAINTAIN']

In [161]:
# Compare the predicted [lateral, longitudinal] pair with the frame's stored meta-action.
# NOTE(review): flatten_frames() currently stores 'meta_action' as a list of two lists,
# while ast.literal_eval() below needs a string — this cell appears to have been run
# against an earlier, string-valued format. Confirm the intended comparison before re-running.
#action = ast.literal_eval(action)[0]
print(set(action),set(ast.literal_eval(outset[index]['meta_action'])))
assert set(action) == set(ast.literal_eval(outset[index]['meta_action']))

{'STRAIGHT', 'DECEL'} {'STRAIGHT', 'DECEL'}


In [32]:
from huggingface_hub import login

# Start the Hugging Face login flow; the saved cell output is the widget it rendered.
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…