<a href="https://colab.research.google.com/github/ericyoc/yolo-inference-obj-detect-webcam-google-co-lab-poc/blob/main/yolo_inference_obj_det_webcam_google_co_lab_poc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the ultralytics package (uncomment the line below if it is not already installed)
#!pip install ultralytics

In [2]:
# Import dependencies
import cv2
import numpy as np
from IPython.display import display, Javascript
from google.colab.output import eval_js
from base64 import b64decode, b64encode
import io
import PIL
import os
import json

In [3]:
# Turn on Colab's custom widget manager for this notebook's cell outputs.
# NOTE(review): the detection UI below is driven through display(Javascript(...))
# and eval_js rather than widgets, so this call may not be required — confirm
# before removing.
from google.colab import output
output.enable_custom_widget_manager()

In [4]:
# Fetch the YOLOv8-large checkpoint only when no file named 'yolov8l.pt' exists
# in the working directory. This is an existence check only: an incomplete file
# left by an interrupted download would be treated as present.
if not os.path.exists('yolov8l.pt'):
    # Shell escape: download the checkpoint from the ultralytics assets release
    !wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8l.pt

# Import the YOLO model class used by the detection loop further down
from ultralytics import YOLO

--2024-05-18 22:20:49--  https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8l.pt
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/521807533/407a6ed1-e224-4534-8672-b3118af5125c?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20240518%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240518T222049Z&X-Amz-Expires=300&X-Amz-Signature=6648f6f30e3a31eeae99f17be90ab05805ab9d64b73e1d289b5b0d350131c3aa&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=521807533&response-content-disposition=attachment%3B%20filename%3Dyolov8l.pt&response-content-type=application%2Foctet-stream [following]
--2024-05-18 22:20:49--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/521807533/407a6ed1-e224-4534-8672-b3118af5125c?X-Amz-Algo

In [5]:
# Decode a browser data URL into an OpenCV image
def js_to_image(js_reply):
    """Turn a ``data:...;base64,<payload>`` string into an OpenCV image.

    Args:
        js_reply: Data-URL string; the text between the first and second
            comma is taken as the base64 payload.

    Returns:
        Whatever ``cv2.imdecode`` yields for the decoded bytes when asked
        for a colour image.

    Raises:
        IndexError: If ``js_reply`` contains no comma.
    """
    encoded_payload = js_reply.split(',')[1]
    raw_pixels = np.frombuffer(b64decode(encoded_payload), dtype=np.uint8)
    # cv2.IMREAD_COLOR is the named constant for flag value 1
    return cv2.imdecode(raw_pixels, flags=cv2.IMREAD_COLOR)

In [6]:
# JPEG-encode an OpenCV image and return it as base64 text
def image_to_base64(img):
    """Encode ``img`` as JPEG and return the bytes as a base64 string.

    Args:
        img: Image object accepted by ``cv2.imencode``.

    Returns:
        The JPEG bytes, base64-encoded and decoded to a UTF-8 ``str``
        (no ``data:`` prefix is added here).
    """
    # imencode returns a (status, buffer) pair; only the buffer is used
    jpeg_buffer = cv2.imencode('.jpg', img)[1]
    return b64encode(jpeg_buffer.tobytes()).decode('utf-8')

In [7]:
# Helpers and entry point for the live webcam detection loop

def _draw_detection(img, detection):
    """Draw one labelled green bounding box on ``img`` in place."""
    x1, y1, x2, y2, class_name, confidence = detection
    cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
    cv2.putText(img, f"{class_name}: {confidence:.2f}", (x1, y1 - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)


def _capture_filename(labels, single_detection):
    """Build the download filename for a captured frame (never just '.jpg')."""
    if single_detection is not None:
        class_name, confidence = single_detection[4], single_detection[5]
        stem = f"{class_name}_{confidence:.2f}"
    else:
        stem = "_".join(labels)
    # Colons and spaces are replaced with underscores
    stem = stem.replace(":", "_").replace(" ", "_")
    # With no detections the stem is empty; fall back to a usable name
    return (stem or "capture") + ".jpg"


def start_video_and_detect(model_path='yolov8l.pt', confidence_threshold=0.5,
                           nms_threshold=0.45, device=None):
    """Stream webcam frames from the browser and overlay YOLO detections.

    Injects a JavaScript helper (``stream_frame``) into the cell output, then
    loops: fetch one frame, run the model, draw boxes, and hand the annotated
    frame plus label list back to the browser on the *next* ``stream_frame``
    call. The loop ends when the browser reports ``shutdown`` (Exit button).

    Browser controls:
        * Capture  - downloads the current annotated frame as a JPEG.
        * 1 Object - toggles a mode where only the last detection in the
          result list is drawn (all detections are still listed as labels).
        * Exit     - stops the camera, removes the UI and ends the loop.

    Args:
        model_path: Weights file passed to ``YOLO``.
        confidence_threshold: Minimum confidence, passed to the model as
            ``conf`` and re-checked per detection.
        nms_threshold: Value passed to the model as ``iou``.
        device: Device string passed to the model. ``None`` selects
            ``'cuda'`` when ``torch.cuda.is_available()`` and ``'cpu'``
            otherwise, so the loop also runs on a CPU-only runtime.

    Returns:
        None.
    """
    js = Javascript('''
        var video;
        var div = null;
        var stream;
        var captureCanvas;
        var imgElement;
        var labelElement;
        var captureButton;
        var exitButton;
        var singleObjectButton;

        var pendingResolve = null;
        var shutdown = false;
        var captured = false;
        var singleObject = false;

        // Stop the camera track and drop every DOM element created below.
        function removeDom() {
            if (stream && stream.getVideoTracks().length > 0) {
                stream.getVideoTracks()[0].stop();
            }
            if (video) {
                video.remove();
            }
            if (div) {
                div.remove();
            }
            video = null;
            div = null;
            stream = null;
            imgElement = null;
            labelElement = null;
            captureButton = null;
            exitButton = null;
            singleObjectButton = null;
        }

        // Runs once per animation frame; resolves a waiting stream_frame call
        // with a JPEG data URL of the current video frame, or with an empty
        // string once shutdown was requested.
        function onAnimationFrame() {
            if (!shutdown) {
                window.requestAnimationFrame(onAnimationFrame);
            }
            if (pendingResolve) {
                var result = "";
                if (!shutdown) {
                    captureCanvas.getContext('2d').drawImage(video, 0, 0, 640, 480);
                    result = captureCanvas.toDataURL('image/jpeg', 0.8);
                }
                var lp = pendingResolve;
                pendingResolve = null;
                lp(result);
            }
        }

        // Build the UI on first use; later calls return the existing stream.
        async function createDom() {
            if (div !== null) {
                return stream;
            }

            div = document.createElement('div');
            div.style.border = '2px solid black';
            div.style.padding = '3px';
            div.style.width = '100%';
            div.style.maxWidth = '600px';
            div.style.position = 'relative';
            document.body.appendChild(div);

            video = document.createElement('video');
            video.style.display = 'block';
            video.width = div.clientWidth - 6;
            video.setAttribute('playsinline', '');
            stream = await navigator.mediaDevices.getUserMedia(
                {video: { facingMode: "environment"}});
            div.appendChild(video);

            imgElement = document.createElement('img');
            imgElement.style.position = 'absolute';
            imgElement.style.zIndex = 1;
            div.appendChild(imgElement);

            labelElement = document.createElement('div');
            labelElement.innerHTML = '<span>Detected Objects:</span>';
            labelElement.style.position = 'absolute';
            labelElement.style.zIndex = 2;
            labelElement.style.top = '10px';
            labelElement.style.left = '10px';
            labelElement.style.color = 'white';
            labelElement.style.backgroundColor = 'rgba(0, 0, 0, 0.7)';
            labelElement.style.padding = '5px';
            labelElement.style.borderRadius = '5px';
            div.appendChild(labelElement);

            captureButton = document.createElement('button');
            captureButton.innerHTML = 'Capture';
            captureButton.style.position = 'absolute';
            captureButton.style.zIndex = 3;
            captureButton.style.bottom = '10px';
            captureButton.style.left = '10px';
            captureButton.onclick = function() {
                captured = true;
            };
            div.appendChild(captureButton);

            exitButton = document.createElement('button');
            exitButton.innerHTML = 'Exit';
            exitButton.style.position = 'absolute';
            exitButton.style.zIndex = 3;
            exitButton.style.bottom = '10px';
            exitButton.style.right = '10px';
            exitButton.onclick = function() {
                shutdown = true;
                removeDom();
            };
            div.appendChild(exitButton);

            singleObjectButton = document.createElement('button');
            singleObjectButton.innerHTML = '1 Object';
            singleObjectButton.style.position = 'absolute';
            singleObjectButton.style.zIndex = 3;
            singleObjectButton.style.bottom = '10px';
            singleObjectButton.style.right = '80px';
            singleObjectButton.onclick = function() {
                singleObject = !singleObject;
                if (singleObject) {
                    singleObjectButton.style.backgroundColor = 'green';
                } else {
                    singleObjectButton.style.backgroundColor = '';
                }
            };
            div.appendChild(singleObjectButton);

            video.srcObject = stream;
            await video.play();

            captureCanvas = document.createElement('canvas');
            captureCanvas.width = 640;
            captureCanvas.height = 480;
            window.requestAnimationFrame(onAnimationFrame);

            return stream;
        }

        // Show the previous frame's labels/overlay, then wait for and return
        // the next captured frame together with the current button state.
        async function stream_frame(label, imgData) {
            if (shutdown) {
                return JSON.stringify({shutdown: true});
            }

            var preCreate = Date.now();
            stream = await createDom();

            var preShow = Date.now();
            // Rebuild the label on every call so the list clears again when
            // a frame has no detections.
            var labelHtml = '<span>Detected Objects:</span>';
            if (label != "") {
                labelHtml += '<br>' + label;
            }
            labelElement.innerHTML = labelHtml;

            if (imgData != "") {
                // The overlay is absolutely positioned inside the relatively
                // positioned container, so use offsets relative to that
                // container rather than viewport coordinates.
                imgElement.style.top = video.offsetTop + "px";
                imgElement.style.left = video.offsetLeft + "px";
                imgElement.style.width = video.offsetWidth + "px";
                imgElement.style.height = video.offsetHeight + "px";
                imgElement.src = imgData;
            }

            var preCapture = Date.now();
            var result = await new Promise(function(resolve, reject) {
                pendingResolve = resolve;
            });

            return JSON.stringify({
                create: preShow - preCreate,
                show: preCapture - preShow,
                capture: Date.now() - preCapture,
                img: result,
                captured: captured,
                shutdown: shutdown,
                singleObject: singleObject
            });
        }
    ''')

    display(js)

    # Choose the inference device; a hard-coded 'cuda' fails on CPU runtimes
    if device is None:
        import torch  # local import: only needed for auto-detection
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load the YOLO model
    model = YOLO(model_path)

    # Overlay state produced by the previous iteration; empty on the first call
    label_text = ""
    img_data = ""

    while True:
        # One round trip per frame: push the previous overlay and pull the next
        # frame. json.dumps yields correctly escaped JavaScript string literals.
        js_reply = json.loads(eval_js(
            f'stream_frame({json.dumps(label_text)}, {json.dumps(img_data)})'))

        if js_reply.get("shutdown", False):
            break

        # Convert JavaScript response to OpenCV Image
        img = js_to_image(js_reply["img"])

        # Perform object detection
        results = model(img, verbose=False, device=device,
                        conf=confidence_threshold, iou=nms_threshold)

        # Each row is indexed below as: x1, y1, x2, y2, confidence, class id
        detections = results[0].boxes.data

        single_mode = js_reply.get("singleObject", False)
        labels = []
        last_detection = None
        for detection in detections:
            confidence = float(detection[4])
            if confidence < confidence_threshold:
                continue
            class_name = model.names[int(detection[5])]
            x1, y1, x2, y2 = map(int, detection[:4])
            labels.append(f"{class_name}: {confidence:.2f}")

            box = (x1, y1, x2, y2, class_name, confidence)
            if single_mode:
                # Remember only the most recent qualifying detection
                last_detection = box
            else:
                _draw_detection(img, box)

        # last_detection is only ever set in single-object mode
        if last_detection is not None:
            _draw_detection(img, last_detection)

        # Overlay payload for the next stream_frame call
        label_text = "<br>".join(labels)
        img_data = f'data:image/jpeg;base64,{image_to_base64(img)}'

        # Check if the capture button is clicked
        if js_reply.get("captured", False):
            filename = _capture_filename(labels, last_detection)

            # Trigger a browser download of the annotated frame and reset the
            # browser-side captured flag in the same round trip
            eval_js(f'''
                var link = document.createElement('a');
                link.download = {json.dumps(filename)};
                link.href = {json.dumps(img_data)};
                link.click();
                captured = false;
            ''')

    # Clean up (kept from the original; this function opens no OpenCV windows)
    cv2.destroyAllWindows()

In [8]:
# Notebook entry point
def main():
    """Launch the webcam stream and run detection until the loop ends."""
    start_video_and_detect()

In [9]:
# Start the demo when this cell is executed; main() is skipped when
# __name__ is not "__main__".
if __name__ == "__main__":
    main()

<IPython.core.display.Javascript object>