#### Dependencies

In [35]:
# %%capture
# import os
# if "COLAB_" not in "".join(os.environ.keys()):
#     !pip install unsloth
# else:
#     # Do this only in Colab and Kaggle notebooks! Otherwise use pip install unsloth
#     !pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
#     !pip install --no-deps cut_cross_entropy unsloth_zoo
#     !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
#     !pip install --no-deps unsloth

In [None]:
import os
import numpy as np
import pandas as pd
import cv2
from PIL import Image
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from tqdm import tqdm
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import train_test_split
from transformers import TextStreamer
import re
import torch.nn.functional as F
from itertools import product
import base64
import io
from dotenv import load_dotenv
import openai
from openai._exceptions import RateLimitError

#### Hyperparameters

In [None]:
# Read settings from a local .env file (python-dotenv); the OpenAI API key is
# later looked up with os.getenv("OPENAI_API_KEY").
load_dotenv()

DATA_DIR = "data"  # Root folder holding train.csv and the train/ videos
MODEL_NAME = "gpt-4o-mini"  # Chat model queried for every sample

# One timestamped folder per run so repeated evaluations never overwrite each
# other. The name is derived from MODEL_NAME so it cannot drift out of sync
# with the model that actually produced the results.
OUTPUT_DIR = (
    f"evaluation/{MODEL_NAME}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
)

In [None]:
# Evaluation grid: each list holds the candidate values for one setting, and
# every (frame count, frame gap) combination is evaluated.

# Candidate numbers of frames to sample from each video.
NUM_FRAMES = [10, 15, 20]

# Candidate gaps, in seconds, between two consecutive sampled frames.
TIME_BETWEEN_FRAMES = [0.5, 1.0, 1.5]

#### Load Data

In [None]:
# Where the training clips live and where their metadata table is stored.
train_videos_folder = f"{DATA_DIR}/train/"
train_csv_path = f"{DATA_DIR}/train.csv"

# Metadata table for the training videos; the trailing expression previews
# the first rows as the cell output.
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
class ImageDataset(Dataset):
    """Dataset yielding a short sequence of video frames per metadata row.

    Each row of ``data`` must provide ``id`` (used to build the zero-padded
    ``<id>.mp4`` file name inside ``videos_folder``) and ``target``. An
    optional ``time_of_event`` column gives the timestamp, in seconds, to
    sample backwards from; when it is absent or missing for a row, frames are
    sampled backwards from the end of the video instead.

    Args:
        data: pandas DataFrame with the per-video metadata.
        videos_folder: Directory containing the ``.mp4`` files.
        num_frames: Maximum number of frames to extract per video.
        time_between_frames: Spacing between sampled frames, in seconds.
        transform: Optional callable applied to every extracted PIL image.
    """

    def __init__(
        self, data, videos_folder, num_frames, time_between_frames, transform=None
    ):
        self.data = data
        self.videos_folder = videos_folder
        self.transform = transform  # Any image transformations (e.g., augmentations)
        self.num_frames = num_frames
        self.time_between_frames = time_between_frames

    def __len__(self):
        """Return the number of metadata rows (one sample per row)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(images, label, video_id)`` for the row at position ``idx``.

        ``images`` is a list of (optionally transformed) PIL images ordered
        newest frame first, ``label`` is ``row["target"]`` wrapped in a
        tensor, and ``video_id`` is the raw ``row["id"]`` value.
        """
        # iloc raises IndexError past the last row, which is what terminates a
        # plain ``for sample in dataset`` loop.
        row = self.data.iloc[idx]

        vid_path = os.path.join(
            self.videos_folder, f"{str(int(row['id'])).zfill(5)}.mp4"
        )

        # Series.get yields None when the column does not exist; pd.isna then
        # treats None, NaN and NaT alike (np.isnan would raise on non-floats).
        event_time = row.get("time_of_event")
        if pd.isna(event_time):
            event_time = None

        images = self.get_multiple_frames(vid_path, event_time)

        label = row["target"]

        # Apply transformations
        if self.transform is not None:
            images = [self.transform(image) for image in images]

        return images, torch.tensor(label), row["id"]  # Convert label to tensor

    def get_frame(self, video_path, time):
        """Decode the frame at ``time`` seconds and return it as an RGB PIL image.

        Raises:
            ValueError: If no frame could be read at that position.
        """
        cap = cv2.VideoCapture(video_path)
        try:
            cap.set(cv2.CAP_PROP_POS_MSEC, time * 1000)
            success, frame = cap.read()
        finally:
            # Always free the decoder handle, even if seeking/reading raises.
            cap.release()

        if not success:
            raise ValueError(f"Failed to read frame at {time} seconds.")

        # Convert BGR to RGB
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        return Image.fromarray(frame)

    def _video_end_time(self, video_path):
        """Return the timestamp, in seconds, of the last frame of the video.

        Raises:
            ValueError: If the video reports no usable frame rate (for
                example because the file could not be opened).
        """
        cap = cv2.VideoCapture(video_path)
        try:
            last_frame_index = cap.get(cv2.CAP_PROP_FRAME_COUNT) - 1
            fps = cap.get(cv2.CAP_PROP_FPS)
        finally:
            cap.release()

        if not fps or fps <= 0:
            # Avoid an opaque ZeroDivisionError for unreadable videos.
            raise ValueError(f"Could not read the frame rate of video: {video_path}")
        return last_frame_index / fps

    def _frame_timestamps(self, end_time):
        """List the timestamps to sample, walking backwards from ``end_time``.

        At most ``num_frames`` timestamps are produced, spaced
        ``time_between_frames`` apart. The walk stops once it would move
        before the start of the video, so no negative position is requested.
        """
        timestamps = []
        for i in range(self.num_frames):
            timestamp = end_time - i * self.time_between_frames
            # Small tolerance so float rounding just below zero still counts
            # as "the first frame" instead of being dropped.
            if timestamp < -1e-9:
                break
            timestamps.append(max(timestamp, 0.0))
        return timestamps

    def get_multiple_frames(self, video_path, time):
        """Return up to ``num_frames`` frames ending at ``time`` seconds.

        Args:
            video_path: Path of the video file to read.
            time: Timestamp in seconds to sample backwards from. ``0`` yields
                just the first frame; ``None`` means "the end of the video".

        Returns:
            A list of PIL images ordered newest first. The list is shorter
            than ``num_frames`` when the start of the video is reached or a
            frame cannot be read.
        """
        if time == 0:
            # Get the first frame
            frame = self.get_frame(video_path, time)
            return [frame]

        if time is None:
            # Get the last frames
            time = self._video_end_time(video_path)

        # NOTE(review): frames are collected newest-first, so consumers see
        # them in reverse chronological order - confirm this is intended.
        frames = []
        for timestamp in self._frame_timestamps(time):
            try:
                frames.append(self.get_frame(video_path, timestamp))
            except ValueError:
                # Stop at the first unreadable position and keep what we have.
                break
        return frames


# Shrink every frame to one third of 480x854 (assumed to be the source
# height x width - TODO confirm) before it is encoded and sent to the model.
transform = transforms.Compose([transforms.Resize((480 // 3, 854 // 3))])

#### Load Models

In [None]:
# Configure the module-level OpenAI client with the key taken from the
# environment; the value is None when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")


def generate_response(conversation, max_retries=5, initial_backoff=1.0):
    """Send ``conversation`` to the chat model and return its reply text.

    Rate-limit errors are retried with exponential backoff so that a single
    throttled request does not abort a long evaluation run.

    Args:
        conversation: List of chat messages passed as ``messages`` to the
            chat-completions endpoint.
        max_retries: How many times to retry after a rate-limit error before
            giving up. Negative values are treated as 0.
        initial_backoff: Seconds to wait before the first retry; the wait
            doubles after every further rate-limit error.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped, or an empty string when the reply carries no text content.

    Raises:
        RateLimitError: If the request is still rate limited after
            ``max_retries`` retries.
    """
    import time  # Local import: the file's import cell does not load `time`.

    retries_left = max(max_retries, 0)
    delay = initial_backoff

    while True:
        try:
            response = openai.chat.completions.create(
                model=MODEL_NAME,
                messages=conversation,
                n=1,
            )
        except RateLimitError:
            if retries_left == 0:
                raise
            retries_left -= 1
            time.sleep(delay)
            delay *= 2
            continue

        # `content` can be None (no text in the reply); never call .strip() on it.
        content = response.choices[0].message.content
        return (content or "").strip()

#### Convert Datasets

In [None]:
# Task description given to the model. NOTE: despite the name, this text is
# sent as part of the user message by convert_to_conversation.
SYSTEM_PROMPT = """
You are an expert in accident reconstruction and traffic analysis. You will analyze a sequence of dashcam images with a chain of thought reasoning to determine whether there is an immediate threat of vehicle collision. Consider each of the following factors:

1. Vehicle Positions: Identify the locations of all vehicles in each frame and how they change over time.
2. Trajectories: Determine the direction, speed, and acceleration of each vehicle by comparing their positions across frames.
3. Nearby Vehicles and Traffic: Identify surrounding vehicles, pedestrians, and any traffic congestion that could impact movement.
4. Traffic Signals: Consider whether traffic signals indicate a stop, go, or caution state and how that affects the vehicle interactions. Pay special attention to whether vehicles are violating or obeying traffic signal rules.
5. Road Conditions and Visibility: Note any obstructions, road markings, or weather conditions that could contribute to the situation.
"""

# Output contract. The answer must be a bare Yes/No because the response
# parser (extract_label) only recognises those two values inside <answer>.
SYSTEM_FORMAT_PROMPT = """
Respond in the following format. Inside <answer>, write exactly "Yes" if there is an immediate threat of vehicle collision, or "No" if there is not:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""


def encode_image(image):
    """Return ``image`` serialized as JPEG and base64-encoded into a str.

    ``image`` must offer a PIL-style ``save(file_obj, format=...)`` method.
    """
    with io.BytesIO() as jpeg_buffer:
        image.save(jpeg_buffer, format="JPEG")
        jpeg_bytes = jpeg_buffer.getvalue()

    encoded = base64.b64encode(jpeg_bytes)
    return encoded.decode("utf-8")


def convert_to_conversation(images):
    """Build a one-message chat request holding both prompts and all images.

    The single user message carries, in order: the task prompt, the
    response-format prompt, then one ``image_url`` part per image, each
    embedded as a base64 JPEG data URL.

    Args:
        images: Iterable of images accepted by ``encode_image``.

    Returns:
        A list containing exactly one user-role message dict.
    """
    text_parts = [
        {"type": "text", "text": SYSTEM_PROMPT},
        {"type": "text", "text": SYSTEM_FORMAT_PROMPT},
    ]

    image_parts = []
    for frame in images:
        data_url = f"data:image/jpeg;base64,{encode_image(frame)}"
        image_parts.append({"type": "image_url", "image_url": {"url": data_url}})

    user_message = {"role": "user", "content": text_parts + image_parts}
    return [user_message]

#### Generate Validation Predictions

In [None]:
# Matches an <answer> block whose first word is "yes" or "no" (any case).
# Leading markup/punctuation (e.g. "**") and trailing text up to the closing
# tag are tolerated, so replies like "Yes." or "**No**" still parse.
_ANSWER_PATTERN = re.compile(r"(?i)<answer>\W*(yes|no)\b[^<]*</answer>")


def extract_label(decoded_text):
    """Map the model's ``<answer>`` block to a binary label.

    Args:
        decoded_text: Raw model response text.

    Returns:
        ``1`` when the answer starts with "yes", ``0`` when it starts with
        "no" (both case-insensitive), or ``None`` when the text is not a
        string or holds no recognisable answer block.
    """
    if not isinstance(decoded_text, str):
        return None

    match = _ANSWER_PATTERN.search(decoded_text)
    if match is None:
        return None

    return 1 if match.group(1).lower() == "yes" else 0

In [None]:
# Run the full evaluation grid and append every configuration's predictions
# to a single CSV inside OUTPUT_DIR.
os.makedirs(OUTPUT_DIR, exist_ok=True)
output_filename = f"{OUTPUT_DIR}/results.csv"

# Materialise the grid so tqdm knows the total and can show real progress.
param_grid = list(product(NUM_FRAMES, TIME_BETWEEN_FRAMES))

for num_frames, time_between_frames in tqdm(param_grid):
    print(
        f"Evaluating with num_frames={num_frames} and time_between_frames={time_between_frames}"
    )

    results = []

    # `df` is the source metadata table and must stay untouched across grid
    # points; results go into their own dataframe below.
    dataset = ImageDataset(
        df,
        train_videos_folder,
        num_frames=num_frames,
        time_between_frames=time_between_frames,
        transform=transform,
    )

    for images, target, video_id in tqdm(dataset):
        conversation = convert_to_conversation(images)
        output_text = generate_response(conversation)
        extracted_label = extract_label(output_text)

        # Store more detailed results
        results.append(
            {
                "id": video_id,
                "model": MODEL_NAME,
                "num_frames": num_frames,
                "time_between_frames": time_between_frames,
                # Plain Python number, so the CSV holds "1" rather than "tensor(1)".
                "target": target.item(),
                "response": output_text,
                "extracted_label": extracted_label,
                "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            }
        )

    results_df = pd.DataFrame(results)
    # Append mode: write the header only when the file is first created so the
    # CSV does not end up with a header row per grid point.
    results_df.to_csv(
        output_filename,
        mode="a",
        index=False,
        header=not os.path.exists(output_filename),
    )
    print(f"Results saved to: {output_filename}")