In [None]:
# Dependencies -- uncomment the next line to install them into the kernel's environment:
# %pip install torch torchvision opencv-python tqdm pycocotools

In [None]:
import os
import torch
from torch.utils.data import Dataset # Import Dataset
import torchvision.transforms.functional as F
from PIL import Image
import random
# from pycocotools.coco import COCO # You can remove this import as it's no longer needed

class CustomYOLODataset(Dataset):
    """Detection dataset pairing images with YOLO-format ``.txt`` label files.

    Every image in ``img_folder`` is matched with the file of the same base
    name (``image.jpg`` -> ``image.txt``) in ``label_folder``. Each label line
    is ``class_id x_center y_center width height`` with coordinates normalized
    to ``[0, 1]``. Samples are returned as ``(image_tensor, target)`` where
    ``target`` is a dict with the keys ``boxes`` (``[x_min, y_min, x_max,
    y_max]`` in pixels), ``labels``, ``image_id``, ``area`` and ``iscrowd``.

    NOTE(review): ``labels`` holds the raw YOLO class ids, which start at 0.
    torchvision detection models presumably treat label 0 as background --
    confirm whether the ids need an offset before training with them.

    Args:
        img_folder: Directory containing the images.
        label_folder: Directory containing the YOLO label files.
        train: When True, a random horizontal flip is applied to the image
            and its boxes with probability 0.5.
    """

    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    def __init__(self, img_folder, label_folder, train=False):
        self.img_folder = img_folder
        self.label_folder = label_folder
        self.train = train

        # Sorted so that an index always refers to the same image.
        self.image_files = sorted(
            f for f in os.listdir(img_folder)
            if f.lower().endswith(self._IMAGE_EXTENSIONS)
        )

    def __len__(self):
        return len(self.image_files)

    @staticmethod
    def _parse_label_file(label_path, image_width, image_height):
        """Read one YOLO label file and return ``(boxes, labels, areas)`` as lists.

        Boxes are converted to pixel ``[x_min, y_min, x_max, y_max]``, clamped
        to the image, and dropped when they have no positive width/height.
        Blank lines, lines that do not have exactly five fields and lines with
        non-numeric fields are skipped instead of raising.
        """
        boxes, labels, areas = [], [], []
        with open(label_path, 'r') as f:
            for line in f:
                parts = line.split()
                if len(parts) != 5:
                    continue
                try:
                    class_value, x_center, y_center, box_w, box_h = map(float, parts)
                except ValueError:
                    continue

                # Normalized center/size -> pixel corners, clamped to the image.
                x_min = max(0.0, (x_center - box_w / 2) * image_width)
                y_min = max(0.0, (y_center - box_h / 2) * image_height)
                x_max = min(float(image_width), (x_center + box_w / 2) * image_width)
                y_max = min(float(image_height), (y_center + box_h / 2) * image_height)

                if x_max > x_min and y_max > y_min:
                    boxes.append([x_min, y_min, x_max, y_max])
                    labels.append(int(class_value))
                    areas.append((x_max - x_min) * (y_max - y_min))
        return boxes, labels, areas

    @staticmethod
    def _build_target(boxes, labels, areas, idx):
        """Pack parsed annotations into the target dict returned by ``__getitem__``."""
        count = len(boxes)
        return {
            # Explicit (count, 4) shape so an image without boxes yields (0, 4).
            "boxes": torch.as_tensor(boxes, dtype=torch.float32).reshape(count, 4),
            "labels": torch.as_tensor(labels, dtype=torch.int64),
            "image_id": torch.tensor([idx]),  # the dataset index doubles as the image id
            "area": torch.as_tensor(areas, dtype=torch.float32),
            # YOLO txt labels carry no crowd flag, so every box is marked non-crowd.
            "iscrowd": torch.zeros((count,), dtype=torch.int64),
        }

    def __getitem__(self, idx):
        img_name = self.image_files[idx]
        img_path = os.path.join(self.img_folder, img_name)
        # 'image.jpg' -> 'image.txt'
        label_name = img_name.rsplit('.', 1)[0] + '.txt'
        label_path = os.path.join(self.label_folder, label_name)

        img = Image.open(img_path).convert("RGB")
        image_width, image_height = img.size

        # A missing label file means the image has no annotated objects.
        if os.path.exists(label_path):
            boxes, labels, areas = self._parse_label_file(label_path, image_width, image_height)
        else:
            boxes, labels, areas = [], [], []

        formatted_target = self._build_target(boxes, labels, areas, idx)

        # Augmentation: random horizontal flip of the image and its boxes.
        if self.train and random.random() < 0.5:
            img = F.hflip(img)
            bbox = formatted_target["boxes"]
            if bbox.numel() > 0:
                # new x_min = width - old x_max, new x_max = width - old x_min
                bbox[:, [0, 2]] = image_width - bbox[:, [2, 0]]

        img = F.to_tensor(img)
        return img, formatted_target

In [None]:
def get_transform(train):
    """Return the image transform pipeline for the requested mode.

    Args:
        train: When truthy, the pipeline contains a random horizontal flip
            (p=0.5); otherwise there is nothing to apply.

    Returns:
        A ``torchvision.transforms.Compose`` for training, or ``None`` when
        no transform is needed.

    Note:
        The flip here is applied to the image only. ``CustomYOLODataset``
        performs its own flip that also mirrors the boxes, so do not combine
        the two on detection samples.
    """
    if not train:
        return None

    # Imported locally: no cell imports torchvision.transforms under the name ``T``.
    import torchvision.transforms as T

    return T.Compose([T.RandomHorizontalFlip(0.5)])

# --- 3. Model Definition ---
def get_model(num_classes):
    """Build a Faster R-CNN (ResNet-50 FPN) with a fresh box-prediction head.

    The detector is created with its default pre-trained weights and the
    box predictor is replaced by a new ``FastRCNNPredictor`` sized for
    ``num_classes`` outputs.

    Args:
        num_classes: Number of output classes of the new head.

    Returns:
        The model with the replaced ``roi_heads.box_predictor``.
    """
    # Imported locally so this definition does not depend on a later cell
    # having been executed first.
    import torchvision
    from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

    # ``weights="DEFAULT"`` already selects the pre-trained checkpoint; the
    # former ``pretrain=True`` keyword is not a documented argument of this
    # builder and has been dropped.
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")

    # The new head must accept the same feature size as the one it replaces.
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    return model

# --- 4. Collate Function for Dataloader ---
def collate_fn(batch):
    """Regroup a batch of per-sample tuples into one tuple per field.

    A batch ``[(img0, tgt0), (img1, tgt1)]`` becomes
    ``((img0, img1), (tgt0, tgt1))``; nothing is stacked, so the samples may
    differ in size.
    """
    per_field = zip(*batch)
    return tuple(per_field)

In [None]:
import torch
import torchvision
import time
import json
import os
import cv2
from torchvision.transforms import functional as F
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from tqdm import tqdm

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
def load_model(model_name, weight_path, num_classes=92):
    """Instantiate a detector and load its weights from ``weight_path``.

    Args:
        model_name: One of ``'fasterrcnn'``, ``'retinanet'`` or ``'yolo'``.
        weight_path: Checkpoint file. For the torchvision models it is read
            with ``torch.load`` and passed to ``load_state_dict``; for
            ``'yolo'`` it is handed to the ``ultralytics/yolov5`` hub loader.
        num_classes: Number of classes the torchvision model is built with
            (ignored for ``'yolo'``). Defaults to the previous fixed value.

    Returns:
        ``(model, is_yolo)`` -- the model moved to ``device`` and a flag that
        is True only for the YOLO model.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    supported = ('fasterrcnn', 'retinanet', 'yolo')
    if model_name not in supported:
        # Previously an unknown name fell through and failed later with an
        # UnboundLocalError on ``model``.
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected one of {supported}"
        )

    if model_name == 'yolo':
        # NOTE(review): this uses the YOLOv5 hub entry point -- confirm that
        # the checkpoint at weight_path is a YOLOv5 one.
        model = torch.hub.load('ultralytics/yolov5', 'custom', path=weight_path).autoshape()
        return model.to(device), True

    if model_name == 'fasterrcnn':
        builder = torchvision.models.detection.fasterrcnn_resnet50_fpn
    else:
        builder = torchvision.models.detection.retinanet_resnet50_fpn
    # ``weights=None`` replaces the deprecated ``pretrained=False``; the
    # detector weights come from the checkpoint loaded below.
    model = builder(weights=None, num_classes=num_classes)

    # SECURITY: weights_only=False unpickles arbitrary objects, so only load
    # checkpoints from a trusted source. If the file holds a plain state
    # dict, weights_only=True is the safer setting.
    state_dict = torch.load(weight_path, map_location=device, weights_only=False)
    model.load_state_dict(state_dict)
    model.to(device).eval()
    return model, False

def run_inference(model, image_paths, is_yolo=False, coco_gt=None):
    """Run ``model`` over ``image_paths`` and collect COCO-style detections.

    Args:
        model: Detector to run. With ``is_yolo`` it is called on the RGB
            array with ``size=640`` and read through ``.xyxy[0]``; otherwise
            it is called on a batched image tensor and read through the
            ``boxes`` / ``labels`` / ``scores`` entries of its first output.
        image_paths: Sized sequence of image file paths.
        is_yolo: Selects which of the two call conventions above is used.
        coco_gt: Unused; kept so existing callers keep working.

    Returns:
        ``(results, avg_time, fps)``. ``results`` is a list of dicts with
        ``image_id``, ``category_id``, ``bbox`` (``[x, y, w, h]``) and
        ``score``. ``avg_time`` is the mean model time per processed image in
        seconds and ``fps`` its inverse. When no image could be processed the
        timings are ``0.0``.

    Image ids are 1-based and advance only for images that could be read,
    which is how ``convert_yolo_to_coco`` numbers the ground-truth images.
    """
    if len(image_paths) == 0:
        print("[WARNING] No images to run inference on.")
        return [], 0.0, 0.0

    def _sync():
        # CUDA kernels run asynchronously; wait for them so the wall-clock
        # interval covers the whole forward pass.
        if device.type == "cuda":
            torch.cuda.synchronize()

    results = []
    total_time = 0.0
    image_id = 0
    for img_path in tqdm(image_paths):
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            print(f"[WARNING] Could not read image: {img_path}")
            continue
        image_id += 1
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        if is_yolo:
            _sync()
            start = time.perf_counter()
            results_yolo = model(img_rgb, size=640)
            _sync()
            total_time += time.perf_counter() - start
            for det in results_yolo.xyxy[0]:
                x1, y1, x2, y2, conf, cls = det.cpu().numpy()
                results.append({
                    "image_id": image_id,
                    "category_id": int(cls),
                    # corner format -> COCO [x, y, width, height]
                    "bbox": [float(x1), float(y1), float(x2 - x1), float(y2 - y1)],
                    "score": float(conf)
                })
        else:
            tensor_img = F.to_tensor(img_rgb).unsqueeze(0).to(device)
            _sync()
            start = time.perf_counter()
            with torch.no_grad():
                outputs = model(tensor_img)[0]
            _sync()
            total_time += time.perf_counter() - start
            for box, label, score in zip(outputs['boxes'], outputs['labels'], outputs['scores']):
                x1, y1, x2, y2 = box.tolist()
                results.append({
                    "image_id": image_id,
                    "category_id": int(label),
                    "bbox": [x1, y1, x2 - x1, y2 - y1],
                    "score": float(score)
                })

    if image_id == 0:
        print("[WARNING] None of the images could be read.")
        return results, 0.0, 0.0

    avg_time = total_time / image_id
    # Guard against a zero interval on very coarse clocks.
    fps = 1 / avg_time if avg_time > 0 else float("inf")
    print(f"[INFO] Avg Inference Time: {avg_time:.4f}s | FPS: {fps:.2f}")
    return results, avg_time, fps

def evaluate_coco(preds, ann_file):
    """Score detections against a COCO annotation file with ``COCOeval``.

    Args:
        preds: Detections in COCO results format, either as a list of dicts
            or as the path of a JSON file containing such a list.
        ann_file: Path of the COCO ground-truth annotation JSON.

    Returns:
        The ``COCOeval.stats`` array for ``'bbox'`` evaluation. ``stats[0]``
        is AP at IoU 0.50:0.95 and ``stats[1]`` is AP at IoU 0.50. When there
        are no detections at all, an all-zero array of the same length (12)
        is returned, because ``loadRes`` cannot handle an empty result list.
    """
    if isinstance(preds, str):
        with open(preds, "r") as f:
            preds = json.load(f)

    if isinstance(preds, list) and len(preds) == 0:
        import numpy as np  # local import: numpy is not imported at file level

        print("[WARNING] No detections to evaluate; reporting 0 for all COCO metrics.")
        return np.zeros(12)

    coco_gt = COCO(ann_file)
    coco_dt = coco_gt.loadRes(preds)
    coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    return coco_eval.stats  # [0] = mAP@0.5:0.95, [1] = mAP@0.5, ...

def benchmark(model_name, weight_path, image_dir, ann_file):
    """Load one model, run it over ``image_dir`` and report accuracy and speed.

    Args:
        model_name: Name understood by ``load_model``.
        weight_path: Checkpoint passed to ``load_model``.
        image_dir: Directory with the evaluation images.
        ann_file: COCO ground-truth JSON used by ``evaluate_coco``.

    Returns:
        A dict with the model name, ``mAP@0.5:0.95``, ``mAP@0.5``, ``FPS``
        and ``Inference Time (s)``.

    Side effects:
        Writes the detections to ``predictions.json`` in the working
        directory (overwritten on every call).
    """
    print(f"--- Benchmarking {model_name.upper()} ---")
    model, is_yolo = load_model(model_name, weight_path)

    # Same selection and ordering as convert_yolo_to_coco (case-insensitive,
    # including .jpeg); run_inference numbers images by position, so the two
    # listings must agree for predictions to line up with the ground truth.
    image_paths = [
        os.path.join(image_dir, f)
        for f in sorted(os.listdir(image_dir))
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    ]

    preds, avg_time, fps = run_inference(model, image_paths, is_yolo)
    with open("predictions.json", "w") as f:
        json.dump(preds, f)
    stats = evaluate_coco("predictions.json", ann_file)
    return {
        "model": model_name,
        "mAP@0.5:0.95": round(stats[0], 3),
        "mAP@0.5": round(stats[1], 3),
        "FPS": round(fps, 2),
        "Inference Time (s)": round(avg_time, 4)
    }

# def benchmark(model_name, weight_path, image_dir, label_file):
#     print(f"--- Benchmarking {model_name.upper()} ---")
    

In [None]:
import os
import json
import cv2

def convert_yolo_to_coco(yolo_label_dir, image_dir, output_json, categories):
    """Convert a directory of YOLO ``.txt`` labels into one COCO annotation JSON.

    Images are visited in sorted file-name order and given 1-based ids;
    images that cannot be read are skipped without consuming an id. Each
    label line ``class_id x_center y_center width height`` (normalized) is
    turned into a COCO ``[x, y, width, height]`` box in pixels.

    Args:
        yolo_label_dir: Directory with one ``<image base name>.txt`` per image.
        image_dir: Directory with the ``.jpg`` / ``.jpeg`` / ``.png`` images.
        output_json: Path of the JSON file to write.
        categories: List of COCO category dicts, written out unchanged.

    Boxes are clamped to the image and dropped when nothing with positive
    width and height remains, mirroring what ``CustomYOLODataset`` does.
    Lines that do not have five numeric fields are skipped.
    """
    images = []
    annotations = []
    ann_id = 1
    image_id = 1

    for filename in sorted(os.listdir(image_dir)):
        if not filename.lower().endswith(('.jpg', '.png', '.jpeg')):
            continue

        image_path = os.path.join(image_dir, filename)
        label_path = os.path.join(yolo_label_dir, os.path.splitext(filename)[0] + ".txt")

        # The image is read only to obtain its pixel size.
        img = cv2.imread(image_path)
        if img is None:
            print(f"[WARNING] Could not read image: {filename}")
            continue

        h, w = img.shape[:2]

        images.append({
            "id": image_id,
            "width": w,
            "height": h,
            "file_name": filename
        })

        # An image without a label file simply contributes no annotations.
        if os.path.exists(label_path):
            with open(label_path, "r") as f:
                for line in f:
                    parts = line.strip().split()
                    if len(parts) != 5:
                        continue
                    try:
                        class_id, xc, yc, bw, bh = map(float, parts)
                    except ValueError:
                        continue

                    # Normalized center/size -> pixel corners, clamped to the image.
                    x_min = max(0.0, (xc - bw / 2) * w)
                    y_min = max(0.0, (yc - bh / 2) * h)
                    x_max = min(float(w), (xc + bw / 2) * w)
                    y_max = min(float(h), (yc + bh / 2) * h)
                    width = x_max - x_min
                    height = y_max - y_min
                    if width <= 0 or height <= 0:
                        continue

                    annotations.append({
                        "id": ann_id,
                        "image_id": image_id,
                        "category_id": int(class_id),
                        "bbox": [x_min, y_min, width, height],
                        "area": width * height,
                        "iscrowd": 0
                    })
                    ann_id += 1

        image_id += 1

    coco_format = {
        "info": {
            "description": "Converted from YOLO",
            "version": "1.0",
            "year": 2025,
            "contributor": "",
            "date_created": ""
        },
        "licenses": [],
        "images": images,
        "annotations": annotations,
        "categories": categories
    }

    with open(output_json, "w") as f:
        json.dump(coco_format, f, indent=2)

    print(f"[INFO] COCO annotation JSON saved to: {output_json}")


In [None]:
# Placeholder COCO category table: ids 0 through 90, each named "class_<id>".
categories = [dict(id=cat_id, name=f"class_{cat_id}") for cat_id in range(91)]

In [None]:
# Build the COCO-format ground truth for the val2017 split from its YOLO
# labels. output_json is a relative path, so the file is written to the
# current working directory.
convert_yolo_to_coco(
    yolo_label_dir="/kaggle/input/coco-minitrain-10k/coco_minitrain_10k/labels/val2017",
    image_dir="/kaggle/input/coco-minitrain-10k/coco_minitrain_10k/images/val2017",
    output_json="converted_val2017.json",
    categories=categories
)

In [None]:
# Benchmark every configured model on the val2017 images and print a summary.
data_root = '/kaggle/input/coco-minitrain-10k/coco_minitrain_10k/'

# (model name, checkpoint path) pairs to benchmark.
models = [
    ('fasterrcnn', '/kaggle/input/faster-r-cnn/other/default/1/fasterrcnn_best_model.pt'),
    ('dert', '/kaggle/input/dert/other/default/1/rtdetr_coco_mini.pt'),
    ('yolo', '/kaggle/input/yolo/other/default/1/yolo11n.pt'),
]

image_dir = os.path.join(data_root, 'images', 'val2017')
# Same relative path the conversion cell writes to, so the file is found
# whatever the working directory is.
ann_file = 'converted_val2017.json'

# Names load_model() knows how to build; anything else is skipped with a
# warning so that one unsupported entry does not abort the whole run.
supported_models = {'fasterrcnn', 'retinanet', 'yolo'}

all_results = []
for model_name, weight in models:
    if model_name not in supported_models:
        print(f"[WARNING] Skipping {model_name!r}: load_model() has no loader for it.")
        continue
    result = benchmark(model_name, weight, image_dir, ann_file)
    all_results.append(result)

print("\n--- Summary Benchmark ---")
for r in all_results:
    print(r)
