In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the data paths used later in this
# notebook point below /content/drive/MyDrive.
drive.mount("/content/drive")

In [None]:
import os
import numpy as np
import pandas as pd
import cv2
from PIL import Image
from IPython.display import display
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from torch.utils.data import DataLoader
from tqdm import tqdm

In [None]:
# Hyperparameters
# Offset in seconds subtracted from a clip's `time_of_event` to pick the frame
# that is extracted from the video.
SECONDS_BEFORE = 0.5

In [None]:
# Location of the challenge data inside the mounted Google Drive.
PROJECT_DIR = "nexcar-challenge"
DATA_DIR = "data"

train_csv_path = f"/content/drive/MyDrive/{PROJECT_DIR}/{DATA_DIR}/train.csv"
train_videos_folder = f"/content/drive/MyDrive/{PROJECT_DIR}/{DATA_DIR}/train/"

# Training metadata; the dataset class in this notebook reads its `id`,
# `time_of_event` and `target` columns.
df = pd.read_csv(train_csv_path)
# Optional filter: uncomment to keep only the rows whose target equals 1.
# df = df[df['target']==1]
df.head()

In [None]:
class ImageDataset(Dataset):
    """Dataset yielding one video frame and its label per metadata row.

    The frame is taken ``seconds_before`` seconds ahead of the row's
    ``time_of_event``. Rows without an event time, and rows whose event occurs
    earlier than the offset, use the frame at t = 0.

    Args:
        data: DataFrame providing the ``id``, ``time_of_event`` and ``target``
            columns.
        videos_folder: Directory holding the clips, named after the
            zero-padded five-digit id (``00042.mp4``).
        transform: Optional callable applied to the extracted PIL image.
        seconds_before: Offset in seconds before the event. ``None`` (the
            default) falls back to the module-level ``SECONDS_BEFORE``.
    """

    def __init__(self, data, videos_folder, transform=None, seconds_before=None):
        self.data = data
        self.videos_folder = videos_folder
        self.transform = transform  # Any image transformations (e.g., augmentations)
        self.seconds_before = (
            SECONDS_BEFORE if seconds_before is None else seconds_before
        )

    def __len__(self):
        """Return the number of metadata rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, label_tensor)`` for the row at position ``idx``."""
        row = self.data.iloc[idx]

        vid_path = os.path.join(
            self.videos_folder, f"{str(int(row['id'])).zfill(5)}.mp4"
        )

        event_time = row["time_of_event"]
        if pd.isna(event_time):
            # No event recorded for this clip: use the very first frame.
            frame_time = 0.0
        else:
            # Clamp at zero: an event earlier than the offset would otherwise
            # request a negative timestamp from the decoder.
            frame_time = max(0.0, event_time - self.seconds_before)
        image = self.get_frame(vid_path, frame_time)

        label = row["target"]

        # Apply transformations
        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label)  # Convert label to tensor

    def get_frame(self, video_path, time_sec):
        """Decode the frame at ``time_sec`` seconds and return it as an RGB PIL image.

        Raises:
            ValueError: If the video cannot be opened or no frame can be read
                at the requested position.
        """
        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                raise ValueError(f"Failed to open video: {video_path}")
            cap.set(cv2.CAP_PROP_POS_MSEC, time_sec * 1000)
            success, frame = cap.read()
        finally:
            # Always free the decoder handle, even when seeking/reading raises.
            cap.release()

        if not success:
            raise ValueError(
                f"Failed to read frame at {time_sec} seconds from {video_path}."
            )

        # Convert BGR to RGB
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        return Image.fromarray(frame)


# Resize every frame to one fixed size (torchvision expects height, width).
transform = transforms.Compose(
    [
        transforms.Resize((480, 854)),  # Resize to a standard size
    ]
)


def collate_frames(batch):
    """Batch ``(PIL image, label tensor)`` samples without stacking the images.

    The dataset yields PIL images, which the DataLoader's default collate
    function rejects with a TypeError, so the images are returned as a plain
    list and only the labels are stacked into a single tensor.
    """
    images, labels = zip(*batch)
    return list(images), torch.stack(labels)


dataset = ImageDataset(df, train_videos_folder, transform=transform)
dataloader = DataLoader(dataset, shuffle=True, collate_fn=collate_frames)

In [None]:
# Sanity check: fetch the first sample, render its frame and print its label.
image, label = dataset[0]
display(image)
print(label)

In [None]:
# Prompt attached to every frame; it asks for a bare Yes/No answer.
instruction = """Examine the provided dashcam image and analyze the positions and trajectories of all visible vehicles and obstacles. Based solely on the visual cues in the image, determine if there is an imminent risk of collision. Answer with only "Yes" or "No"."""


def convert_to_conversation(image, label):
    """Wrap one labelled frame as a two-turn chat sample.

    The user turn carries the shared ``instruction`` text followed by the
    image; the assistant turn carries 'Yes' when ``label`` is truthy and 'No'
    otherwise.

    Returns:
        dict: ``{"messages": [user_turn, assistant_turn]}``.
    """
    answer = "Yes" if label else "No"

    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image", "image": image},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": answer}],
    }

    return {"messages": [user_turn, assistant_turn]}


# Build the chat-formatted training set eagerly: every (frame, label) sample
# produced by the dataset becomes one conversation; tqdm shows the progress.
converted_dataset = [
    convert_to_conversation(*sample) for sample in tqdm(dataset)
]

In [None]:
# Inspect the first converted conversation sample.
converted_dataset[0]

In [None]:
%%capture
# Install Unsloth and its dependencies; `%%capture` hides the installer output.
import os
if "COLAB_" not in "".join(os.environ.keys()):
    # No COLAB_* environment variable present: plain install with dependencies.
    !pip install unsloth
else:
    # Do this only in Colab and Kaggle notebooks! Otherwise use pip install unsloth
    # Most packages are installed with --no-deps here, and xformers is pinned
    # to 0.0.29.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    !pip install --no-deps cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
from unsloth import FastVisionModel  # FastLanguageModel for LLMs
import torch

# Pre-quantized 4-bit checkpoints supported by Unsloth (4x faster downloading,
# no OOMs). More models at https://huggingface.co/unsloth
fourbit_models = [
    # Llama 3.2 vision support
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit",  # Can fit in a 80GB card!
    "unsloth/Llama-3.2-90B-Vision-bnb-4bit",
    # Pixtral
    "unsloth/Pixtral-12B-2409-bnb-4bit",  # Pixtral fits in 16GB!
    "unsloth/Pixtral-12B-Base-2409-bnb-4bit",  # Pixtral base model
    # Qwen2 VL support
    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",
    # Any Llava variant works!
    "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit",
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
]

# Load the first checkpoint of the list together with its tokenizer.
# Alternative repo id kept for reference: "unsloth/Llama-3.2-11B-Vision-Instruct"
model, tokenizer = FastVisionModel.from_pretrained(
    fourbit_models[0],
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for long context
    load_in_4bit=True,  # Use 4bit to reduce memory use. False for 16bit LoRA.
)

In [None]:
# Attach LoRA adapters to the loaded model.
# At least one of the vision/language layer groups AND one of the
# attention/MLP module groups must be enabled; with every finetune_* flag set
# to False no LoRA target modules are selected, so there is nothing to train
# (Unsloth rejects such a configuration). The flags below follow Unsloth's
# reference vision fine-tuning setup; switch individual groups off to save
# memory.
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers=True,  # False if not finetuning vision layers
    finetune_language_layers=True,  # False if not finetuning language layers
    finetune_attention_modules=True,  # False if not finetuning attention layers
    finetune_mlp_modules=True,  # False if not finetuning MLP layers
    r=16,  # The larger, the higher the accuracy, but might overfit
    lora_alpha=16,  # Recommended alpha == r at least
    lora_dropout=0,
    bias="none",
    random_state=3407,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
    # target_modules = "all-linear", # Optional now! Can specify a list if needed
)