# Object Detection with Image or Webcam on Ryzen AI

This example demonstrates object detection model inference on the embedded Neural Processing Unit (NPU) of an AMD Ryzen AI-enabled PC, using either a single image or a live webcam feed.

In [2]:
# Before starting, be sure you've installed the requirements listed in the requirements.txt file:
# NOTE(review): the path below is an absolute, machine-specific location; point it at your own
# checkout of requirements.txt before running this cell.
%pip install -r C:\Users\isaia\Hackathon\EcoVision\backend\app\requirements.txt

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### 1. Get Model from Ryzen AI model zoo
The YOLOv8 model from the [Ryzen AI model zoo](https://huggingface.co/amd) is used in this example. You may choose any other object detection model; only minor changes to the pre- and post-processing are needed.

In [3]:
import sys
import os

sys.path.append(os.path.abspath('C:/Users/isaia/Hackathon/RyzenAI-SW/tutorial/yolov8/yolov8_python'))

import torch
import torch.nn as nn
import onnxruntime
import numpy as np
import cv2
import random
from huggingface_hub import hf_hub_download
from yolov8_utils import get_directories, non_max_suppression, plot_images, output_to_target
from PIL import Image
import matplotlib.pyplot as plt
from IPython.display import clear_output

# Notebook 
# dependencies
from huggingface_hub import hf_hub_download
from yolov8_utils import get_directories

# Directory returned by the yolov8_utils helper; used below as the download target.
current_dir = get_directories()

# Download Yolov8 model from Ryzen AI model zoo. Registration is required before download.
# hf_hub_download returns the local path of the downloaded file; as the last expression
# of the cell, that path is what the notebook displays as the cell output.
hf_hub_download(repo_id="amd/yolov8m", filename="yolov8m.onnx", local_dir=str(current_dir))

'C:\\Users\\isaia\\Hackathon\\RyzenAI-SW\\tutorial\\yolov8\\yolov8_python\\yolov8m.onnx'

### 3. Model inference on NPU with webcam

Now that we have validated the model with a single image, we will use the webcam as a live input and run inference on the NPU.

In [None]:

import sys
import os

# Add the directory containing yolov8_utils.py to the Python path
sys.path.append(os.path.abspath('C:/Users/isaia/Hackathon/RyzenAI-SW/tutorial/yolov8/yolov8_python'))

import torch
import torch.nn as nn
import onnxruntime
import numpy as np
import cv2
import random
from huggingface_hub import hf_hub_download
from yolov8_utils import get_directories, non_max_suppression, plot_images, output_to_target
from PIL import Image
import matplotlib.pyplot as plt
from IPython.display import clear_output

# Notebook dependencies
# Directory returned by the yolov8_utils helper; used as the model download target.
current_dir = get_directories()

# Download Yolov8 model from Ryzen AI model zoo. Registration is required before download.
# The returned value is the local path of the ONNX file, which is fed to the session below.
onnx_model_path = hf_hub_download(repo_id="amd/yolov8m", filename="yolov8m.onnx", local_dir=str(current_dir))

# Load labels of coco dataset
# One label per line; the position in this list is the class index used when
# filtering detections in the webcam loop.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
coco_names_path = 'C:/Users/isaia/Hackathon/EcoVision/backend/data/coco.names'
with open(coco_names_path, 'r') as f:
    names = f.read().splitlines()

print(f"Class names: {names}")

# Model input size.
# NOTE(review): not referenced elsewhere in this cell; the webcam loop defines
# its own input_shape.
imgsz = [640, 640]

# Point to the config file path used for the VitisAI Execution Provider
# The path is relative, so it is resolved against the kernel's working directory.
config_file_path = "./vaip_config.json"

# Default ONNX Runtime session options (nothing is customised here).
npu_options = onnxruntime.SessionOptions()

# Inference session restricted to the VitisAI execution provider, configured
# through the vaip_config.json file above.
npu_session = onnxruntime.InferenceSession(
    onnx_model_path,
    providers = ['VitisAIExecutionProvider'],
    sess_options=npu_options,
    provider_options=[{'config_file': config_file_path}]
)

# Paths to anchors and strides files
# NOTE(review): post_process() does not read these two module-level variables;
# it uses its own default paths, which point to a different directory.
anchors_path = 'C:/Users/isaia/Hackathon/RyzenAI-SW/tutorial/yolov8/yolov8_python/anchors.npy'
strides_path = 'C:/Users/isaia/Hackathon/RyzenAI-SW/tutorial/yolov8/yolov8_python/strides.npy'

# Define Distribution Focal Loss
class DFL(nn.Module):
    """
    Distribution Focal Loss decoding layer.

    Turns per-side bin logits into a single distance per box side by taking
    the softmax-weighted average of the bin indices ``0 .. c1 - 1``.
    """

    def __init__(self, c1=16):
        """Build the frozen 1x1 convolution whose weights are the bin indices."""
        super().__init__()
        self.c1 = c1
        # Frozen 1x1 conv: a weighted sum over the c1 bins, never trained.
        self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False)
        bin_indices = torch.arange(c1, dtype=torch.float)
        self.conv.weight.data[:] = bin_indices.view(1, c1, 1, 1)

    def forward(self, x):
        """Map ``[batch, 4 * c1, anchors]`` logits to ``[batch, 4, anchors]`` distances."""
        batch, _, n_anchors = x.shape
        # Group channels as (side, bin), then move the bins to dim 1 for the softmax.
        per_side = x.view(batch, 4, self.c1, n_anchors)
        bin_probs = per_side.transpose(2, 1).softmax(1)
        expected = self.conv(bin_probs)
        return expected.view(batch, 4, n_anchors)

# Convert distance format to bounding box
def dist2bbox(distance, anchor_points, xywh=True, dim=-1):
    """
    Convert (left, top, right, bottom) distances from anchor points into boxes.

    Args:
        distance: Tensor whose ``dim`` axis holds the four side distances.
        anchor_points: Anchor coordinates broadcastable against each half of
            ``distance``.
        xywh: If True return (center_x, center_y, width, height); otherwise
            return the two corners (x1, y1, x2, y2).
        dim: Axis along which the distances are split and the result is joined.

    Returns:
        Tensor of boxes concatenated along ``dim``.
    """
    left_top, right_bottom = distance.split(2, dim)
    corner_min = anchor_points - left_top
    corner_max = anchor_points + right_bottom
    if not xywh:
        return torch.cat((corner_min, corner_max), dim)
    center = (corner_min + corner_max) / 2
    size = corner_max - corner_min
    return torch.cat((center, size), dim)


# Corrected post-processing function
def post_process(x, anchors_path=None, strides_path=None):
    """
    Decode raw YOLOv8 head outputs into boxes plus six custom class scores.

    Args:
        x: Sequence of tensors shaped ``[batch, 144, H, W]`` (64 DFL box
            channels followed by 80 COCO class logits), one per output scale.
        anchors_path: Optional path to the anchors ``.npy`` file. Defaults to
            the EcoVision data directory used by this notebook.
        strides_path: Optional path to the strides ``.npy`` file. Defaults to
            the EcoVision data directory used by this notebook.

    Returns:
        Tensor of shape ``[batch, 4 + 6, total_points]``: xywh boxes scaled by
        the strides, followed by the scores of the custom classes in the order
        plastic, metal, paper, glass, organic, other.
    """
    box_features = 64  # 4 box sides * 16 DFL bins
    class_features = 80  # Original number of COCO classes

    if anchors_path is None:
        anchors_path = 'C:/Users/isaia/Hackathon/EcoVision/backend/data/anchors.npy'
    if strides_path is None:
        strides_path = 'C:/Users/isaia/Hackathon/EcoVision/backend/data/strides.npy'

    # Custom class -> COCO source classes. The model emits the 80 COCO classes
    # in 0-based order, so these must be 0-based COCO-80 indices (not the
    # 1-based, gapped 91-ID numbering found in some COCO label tables).
    # Adjust the groups to whatever you want to detect.
    coco_groups = (
        (39,),         # 0 plastic: bottle
        (43,),         # 1 metal:   knife
        (73,),         # 2 paper:   book
        (40, 41),      # 3 glass:   wine glass, cup
        (47, 49, 46),  # 4 organic: apple, orange, banana
        (67,),         # 5 other:   cell phone
    )

    dfl = DFL(16)

    # Load anchors and strides
    anchors = torch.tensor(np.load(anchors_path))
    strides = torch.tensor(np.load(strides_path))

    batch_size = x[0].shape[0]

    # Flatten the spatial dims of every scale and join them: [batch, channels, total_points]
    reshaped_outputs = [xi.view(batch_size, xi.shape[1], -1) for xi in x]
    combined_output = torch.cat(reshaped_outputs, dim=2)

    # Split into bounding box regression and class logits
    box, cls = combined_output.split((box_features, class_features), dim=1)

    # Decode the DFL distances into xywh boxes and scale them by the strides
    dbox = dist2bbox(dfl(box), anchors.unsqueeze(0), xywh=True, dim=1) * strides

    # Class logits -> probabilities
    cls = cls.sigmoid()

    # Each custom class takes the highest score among its COCO source classes
    custom_cls = torch.stack(
        [cls[:, list(ids), :].amax(dim=1) for ids in coco_groups], dim=1
    )

    return torch.cat((dbox, custom_cls), dim=1)

# NOTE: non_max_suppression is applied inside the webcam loop below, right after
# post_process(). A module-level call at this point was removed: `preds` does not
# exist yet on a fresh kernel, so it raised NameError and stopped the cell before
# the loop could start.

def frame_process(frame, input_shape=(640, 640)):
    """
    Convert a captured frame into a normalized channel-first float tensor.

    Args:
        frame: Image array as returned by ``cv2.VideoCapture.read``
            (assumed to be HxWx3 BGR — confirm for other sources).
        input_shape: Target size handed to ``cv2.resize`` as its ``dsize``
            argument.

    Returns:
        ``torch.Tensor`` of shape ``(3, H, W)`` with values divided by 255.
    """
    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, input_shape)
    # Integer pixel values -> float32, scaled from 0-255 to 0.0-1.0.
    img = torch.from_numpy(img).float() / 255
    # HWC -> CHW. Use the tensor's own permute rather than np.transpose, which
    # only works on a torch tensor through NumPy's array-conversion fallback.
    return img.permute(2, 0, 1)
    
# Video input
# Video input (device index 0)
cap = cv2.VideoCapture(0)

# Loop-invariant size passed to frame_process for every frame.
input_shape = (640, 640)

try:
    while True:
        clear_output(wait=True)
        ret, frame = cap.read()

        if not ret:
            # No frame could be read: stop the loop.
            break

        im = frame_process(frame, input_shape)
        if len(im.shape) == 3:
            im = im[None]  # add the batch dimension
        # frame_process returns channel-first data; the session is fed channel-last.
        outputs = npu_session.run(None, {npu_session.get_inputs()[0].name: im.permute(0, 2, 3, 1).cpu().numpy()})

        # Postprocessing: back to channel-first tensors, decode, then NMS
        outputs = [torch.tensor(item).permute(0, 3, 1, 2) for item in outputs]
        preds = post_process(outputs)
        preds = non_max_suppression(
            preds, 0.15, 0.7, agnostic=False, max_det=300, classes=None
        )

        # Print class indices in predictions
        for pred in preds:
            for det in pred:
                class_idx = int(det[5])
                print(f"Class index: {class_idx}")
                if class_idx >= len(names):
                    print(f"Warning: Class index {class_idx} is out of range for names list")

        # Keep only detections whose class index has an entry in `names`
        valid_preds = []
        for pred in preds:
            valid_indices = [i for i, det in enumerate(pred) if int(det[5]) < len(names)]
            if valid_indices:
                valid_pred = pred[valid_indices]
            else:
                # Empty tensor with the right shape when nothing valid remains
                valid_pred = torch.zeros((0, 6), device=pred.device if len(pred) > 0 else 'cpu')
            valid_preds.append(valid_pred)

        # Check if we have any valid predictions before plotting
        if any(len(pred) > 0 for pred in valid_preds):
            plot_images(
                im,
                *output_to_target(valid_preds, max_det=6),
                frame,
                fname="output.jpg",
                names=names,
            )
        else:
            # Just display the original frame if no valid detections
            plt.figure(figsize=(12, 8))
            plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            plt.axis('off')
            plt.title("No valid detections")
            plt.show()
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to stop the live loop.
    pass
finally:
    # Release the camera on every exit path: interrupt, failed read, or error.
    cap.release()

Class index: 0
