In [3]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image

import torch
import torchvision
from torchvision import transforms
import cv2
import matplotlib.pyplot as plt

from ultralytics import YOLO

IMAGE_DIR = Path(".")     # folder with 10 images
OUTPUT_CSV = "results.csv"
YOLO_MODEL_PATH = "yolov8n.pt" # YOLOv8 model
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SCORE_THRESHOLD = 0.5          # Faster R-CNN

print("Using device:", DEVICE)

Using device: cpu


In [4]:
# COCO category names (for Faster R-CNN label mapping)
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana',
    'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table',
    'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock',
    'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]


def get_image_stats(img: Image.Image):
    """
    Simple, non-deep-learning features:
    - width, height, aspect ratio
    - mean R, G, B values
    - edge density (fraction of edge pixels using Canny)
    """
    arr = np.array(img.convert("RGB"))
    h, w, _ = arr.shape
    aspect_ratio = w / h

    mean_r = arr[:, :, 0].mean()
    mean_g = arr[:, :, 1].mean()
    mean_b = arr[:, :, 2].mean()

    # Edge density using Canny
    gray = cv2.cvtColor(arr, cv2.COLOR_RGB2GRAY)
    edges = cv2.Canny(gray, 100, 200)
    edge_density = edges.mean() / 255.0  # between 0 and 1

    return w, h, aspect_ratio, mean_r, mean_g, mean_b, edge_density

In [5]:
def run_yolov8_on_image(model, img_path: Path):
    """
    Run YOLOv8 on a single image.
    Returns: num_objects, mean_confidence, labels_str, time_sec
    """
    start = time.perf_counter()
    results = model(str(img_path), verbose=False)
    elapsed = time.perf_counter() - start

    result = results[0]
    boxes = result.boxes

    if boxes is None or len(boxes) == 0:
        return 0, 0.0, "", elapsed

    confs = boxes.conf.cpu().numpy()
    clses = boxes.cls.cpu().numpy().astype(int)
    names = result.names  # dict id -> name
    labels = [names[int(c)] for c in clses]

    labels_str = ", ".join(labels)
    mean_conf = float(confs.mean())

    return len(confs), mean_conf, labels_str, elapsed


def load_fasterrcnn_model():
    """
    Load pretrained Faster R-CNN (ResNet50-FPN).
    """
    try:
        weights = torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT
        model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=weights)
    except Exception:
        model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

    model.to(DEVICE)
    model.eval()
    return model


def run_fasterrcnn_on_image(model, img_path: Path):
    """
    Run Faster R-CNN on a single image.
    Returns: num_objects, mean_confidence, labels_str, time_sec
    """
    img = Image.open(img_path).convert("RGB")
    transform = transforms.Compose([transforms.ToTensor()])
    img_tensor = transform(img).to(DEVICE)

    start = time.perf_counter()
    with torch.no_grad():
        outputs = model([img_tensor])
    elapsed = time.perf_counter() - start

    output = outputs[0]
    scores = output["scores"].detach().cpu().numpy()
    labels = output["labels"].detach().cpu().numpy()

    keep = scores >= SCORE_THRESHOLD
    scores = scores[keep]
    labels = labels[keep]

    if len(scores) == 0:
        return 0, 0.0, "", elapsed

    label_names = [COCO_INSTANCE_CATEGORY_NAMES[int(l)] if int(l) < len(COCO_INSTANCE_CATEGORY_NAMES)
                   else f"class_{int(l)}"
                   for l in labels]

    labels_str = ", ".join(label_names)
    mean_conf = float(scores.mean())

    return len(scores), mean_conf, labels_str, elapsed

In [6]:
print("Loading YOLOv8 model...")
yolo_model = YOLO(YOLO_MODEL_PATH)

print("Loading Faster R-CNN model...")
frcnn_model = load_fasterrcnn_model()

Loading YOLOv8 model...
Loading Faster R-CNN model...


In [7]:
# Collect image paths
image_paths = sorted(
    [p for p in IMAGE_DIR.iterdir() if p.suffix.lower() in [".jpg", ".jpeg", ".png", ".bmp", ".webp"]]
)

print(f"Found {len(image_paths)} images:")
for p in image_paths:
    print(" -", p.name)

rows = []

for img_path in image_paths:
    print("\nProcessing:", img_path.name)
    img = Image.open(img_path).convert("RGB")

    # non-DL features for each image
    width, height, aspect_ratio, mean_r, mean_g, mean_b, edge_density = get_image_stats(img)

    # YOLOv8
    y_num, y_conf, y_labels, y_time = run_yolov8_on_image(yolo_model, img_path)
    rows.append({
        "image": img_path.name,
        "model": "YOLOv8",
        "num_objects": y_num,
        "mean_confidence": y_conf,
        "labels": y_labels,
        "time_sec": y_time,
        "width": width,
        "height": height,
        "aspect_ratio": aspect_ratio,
        "mean_R": mean_r,
        "mean_G": mean_g,
        "mean_B": mean_b,
        "edge_density": edge_density,
    })

    # Faster R-CNN
    f_num, f_conf, f_labels, f_time = run_fasterrcnn_on_image(frcnn_model, img_path)
    rows.append({
        "image": img_path.name,
        "model": "Faster R-CNN",
        "num_objects": f_num,
        "mean_confidence": f_conf,
        "labels": f_labels,
        "time_sec": f_time,
        "width": width,
        "height": height,
        "aspect_ratio": aspect_ratio,
        "mean_R": mean_r,
        "mean_G": mean_g,
        "mean_B": mean_b,
        "edge_density": edge_density,
    })

df = pd.DataFrame(rows)
df

Found 10 images:
 - craps.jpg
 - disneyland.jpg
 - football.jpg
 - golf.jpg
 - planes.jpg
 - poker.jpg
 - scottsdale.jpg
 - tennis.jpg
 - theater.jpg
 - trains.jpg

Processing: craps.jpg

Processing: disneyland.jpg

Processing: football.jpg

Processing: golf.jpg

Processing: planes.jpg

Processing: poker.jpg

Processing: scottsdale.jpg

Processing: tennis.jpg

Processing: theater.jpg

Processing: trains.jpg


Unnamed: 0,image,model,num_objects,mean_confidence,labels,time_sec,width,height,aspect_ratio,mean_R,mean_G,mean_B,edge_density
0,craps.jpg,YOLOv8,8,0.694454,"person, person, person, person, person, person...",0.30099,768,432,1.777778,139.562126,124.786627,94.245177,0.158327
1,craps.jpg,Faster R-CNN,14,0.882932,"person, person, person, person, person, person...",3.837,768,432,1.777778,139.562126,124.786627,94.245177,0.158327
2,disneyland.jpg,YOLOv8,6,0.630281,"person, person, teddy bear, person, teddy bear...",0.204593,2560,1709,1.497952,125.820949,128.655682,128.063492,0.06006
3,disneyland.jpg,Faster R-CNN,6,0.819509,"person, person, class_88, person, person, clas...",3.938322,2560,1709,1.497952,125.820949,128.655682,128.063492,0.06006
4,football.jpg,YOLOv8,9,0.446588,"person, person, person, person, person, person...",0.158972,1400,1100,1.272727,108.329962,104.877162,106.246036,0.037428
5,football.jpg,Faster R-CNN,13,0.822198,"person, person, person, person, skateboard, pe...",3.195332,1400,1100,1.272727,108.329962,104.877162,106.246036,0.037428
6,golf.jpg,YOLOv8,3,0.898602,"person, person, person",0.118016,1200,817,1.468788,128.803407,134.141771,110.85115,0.10996
7,golf.jpg,Faster R-CNN,4,0.913684,"person, person, person, clock",4.010725,1200,817,1.468788,128.803407,134.141771,110.85115,0.10996
8,planes.jpg,YOLOv8,1,0.63803,airplane,0.159034,281,179,1.569832,138.355594,135.319509,133.864928,0.139068
9,planes.jpg,Faster R-CNN,6,0.645034,"surfboard, tie, airplane, airplane, tie, surfb...",4.032042,281,179,1.569832,138.355594,135.319509,133.864928,0.139068
