In [None]:
!pip install ultralytics

In [1]:
from ultralytics import YOLO
import cv2
import math 

from transformers import GLPNImageProcessor, GLPNForDepthEstimation
from transformers import pipeline

import torch
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt


# model
# YOLO object detector loaded from a local ``yolov8n.pt`` weights file.
# NOTE: the path is absolute and machine-specific; it must exist on the host
# running this notebook.
model = YOLO("/Users/suryanshshrivastava/IDD_Depth_Estimatipn/yolov8n.pt")

#depth_model
# GLPN image processor and depth-estimation network, both loaded from the
# "vinvino02/glpn-kitti" pretrained checkpoint. ``processor`` turns an image
# into model inputs; ``depth_model`` produces ``predicted_depth``.
processor = GLPNImageProcessor.from_pretrained("vinvino02/glpn-kitti")
depth_model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-kitti")

# object classes
# 80 human-readable labels, looked up by the integer class id of each
# detection (``classNames[cls]``), so the order here must match the class
# indices emitted by ``model``.
# NOTE(review): presumably the COCO label order -- confirm against
# ``model.names`` for the loaded weights.
classNames = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
              "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
              "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
              "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
              "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
              "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
              "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
              "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
              "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
              "teddy bear", "hair drier", "toothbrush"
              ]


# start webcam
# Live demo: detect objects in each webcam frame with YOLO, estimate a depth
# map for the same frame with GLPN, and overlay every detection with its class
# name and the 75th-percentile depth value inside its bounding box.
# Press "q" in the preview window to stop.
cap = cv2.VideoCapture(0)
# Request a 640x480 capture size (named properties instead of the raw ids 3/4).
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

# Drawing parameters never change between frames, so define them once.
BOX_COLOR = (255, 0, 255)
BOX_THICKNESS = 3
LABEL_FONT = cv2.FONT_HERSHEY_SIMPLEX
LABEL_SCALE = 1
LABEL_COLOR = (255, 0, 0)
LABEL_THICKNESS = 2
DEPTH_PERCENTILE = 75

try:
    while True:
        success, img = cap.read()
        if not success or img is None:
            # Camera missing, busy, or disconnected: stop instead of feeding
            # ``None`` into the colour conversion below.
            print("Failed to read a frame from the webcam; stopping.")
            break

        # ``stream=True`` yields results lazily; they are consumed in the
        # detection loop further down.
        results = model(img, stream=True)

        # OpenCV delivers BGR frames; the depth processor is given an RGB
        # PIL image.
        rgb_frame = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(rgb_frame)

        # prepare image for the depth model
        inputs = processor(images=pil_image, return_tensors="pt")

        with torch.no_grad():
            predicted_depth = depth_model(**inputs).predicted_depth

        # Resize the prediction back to the frame size. PIL reports
        # (width, height) while ``interpolate`` expects (height, width).
        prediction = torch.nn.functional.interpolate(
            predicted_depth.unsqueeze(1),
            size=pil_image.size[::-1],
            mode="bicubic",
            align_corners=False,
        )

        # Scale to 0..255 relative to this frame's maximum. The values are
        # therefore per-frame relative depth, not an absolute distance.
        output = prediction.squeeze().cpu().numpy()
        peak = np.max(output)
        if peak > 0:
            # Clip before the cast so bicubic overshoot cannot wrap around
            # in uint8.
            depth_map = np.clip(output * 255 / peak, 0, 255).astype("uint8")
        else:
            # Degenerate prediction: avoid dividing by zero.
            depth_map = np.zeros(output.shape, dtype="uint8")
        frame_h, frame_w = depth_map.shape[:2]

        # coordinates
        for r in results:
            for box in r.boxes:
                # bounding box, converted to int pixel coordinates
                x1, y1, x2, y2 = (int(v) for v in box.xyxy[0])

                # put box in cam
                cv2.rectangle(img, (x1, y1), (x2, y2), BOX_COLOR, BOX_THICKNESS)

                # confidence, rounded up to two decimals
                confidence = math.ceil((box.conf[0]*100))/100
                print("Confidence --->",confidence)

                # class name
                cls = int(box.cls[0])
                print("Class name -->", classNames[cls])

                # Clamp the box to the depth map so negative coordinates do
                # not wrap around, and skip the percentile when the clamped
                # region is empty (np.percentile fails on empty input).
                roi = depth_map[max(y1, 0):min(y2, frame_h),
                                max(x1, 0):min(x2, frame_w)]
                if roi.size:
                    depth_text = str(np.percentile(roi, DEPTH_PERCENTILE))
                else:
                    depth_text = "n/a"

                # object details
                cv2.putText(img, classNames[cls] + ' : ' + depth_text, (x1, y1),
                            LABEL_FONT, LABEL_SCALE, LABEL_COLOR, LABEL_THICKNESS)

        cv2.imshow('Webcam', img)

        if cv2.waitKey(1) == ord('q'):
            break
finally:
    # Always free the camera and close the preview window, even when a frame
    # fails mid-loop or the cell is interrupted.
    cap.release()
    cv2.destroyAllWindows()

  from .autonotebook import tqdm as notebook_tqdm



0: 480x640 1 person, 1 chair, 123.2ms
Confidence ---> 0.91
Class name --> person
Confidence ---> 0.52
Class name --> chair
Speed: 5.0ms preprocess, 123.2ms inference, 22.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 chair, 114.0ms
Confidence ---> 0.79
Class name --> person
Confidence ---> 0.57
Class name --> chair
Confidence ---> 0.29
Class name --> person
Speed: 2.4ms preprocess, 114.0ms inference, 1.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 chair, 104.2ms
Confidence ---> 0.82
Class name --> person
Confidence ---> 0.6
Class name --> chair
Confidence ---> 0.36
Class name --> person
Speed: 1.4ms preprocess, 104.2ms inference, 1.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 chair, 100.5ms
Confidence ---> 0.85
Class name --> person
Confidence ---> 0.57
Class name --> chair
Confidence ---> 0.4
Class name --> person
Speed: 2.3ms preprocess, 100.5ms inference, 1.1ms postprocess per image at shap




0: 480x640 2 persons, 1 chair, 142.1ms
Confidence ---> 0.92
Class name --> person
Confidence ---> 0.49
Class name --> chair
Confidence ---> 0.33
Class name --> person
Speed: 2.0ms preprocess, 142.1ms inference, 1.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 78.9ms
Confidence ---> 0.9
Class name --> person
Speed: 2.3ms preprocess, 78.9ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 chair, 143.7ms
Confidence ---> 0.94
Class name --> person
Confidence ---> 0.32
Class name --> chair
Speed: 5.0ms preprocess, 143.7ms inference, 1.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 chair, 94.5ms
Confidence ---> 0.94
Class name --> person
Confidence ---> 0.38
Class name --> chair
Speed: 1.5ms preprocess, 94.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 chair, 96.4ms
Confidence ---> 0.92
Class name --> person
Confidence ---> 0.49
Class name --> chair
S

: 

In [None]:
# Grab ten frames from the default webcam and save them as
# opencv0.png ... opencv9.png in the working directory.
# ``camera`` was never defined in this notebook, so open the device here
# (index 0, the same device the webcam cell uses) and release it afterwards.
camera = cv2.VideoCapture(0)
try:
    for i in range(10):
        return_value, image = camera.read()
        if not return_value:
            # A failed grab returns no image; writing it would raise.
            print("Failed to read frame", i, "from the webcam; stopping.")
            break
        cv2.imwrite('opencv'+str(i)+'.png', image)
finally:
    camera.release()

In [None]:
def plot_results(pil_img, target):
    """Show ``pil_img`` with each target box outlined and labelled with a depth value.

    A depth map is predicted for the whole image with the notebook-level
    ``processor`` / ``depth_model`` pair, rescaled to 0..255 relative to its
    own maximum, and the 75th percentile of the values inside each box is
    printed next to the box's label.

    Args:
        pil_img: Image to analyse and display. Its ``size`` attribute is read
            as (width, height), as PIL images report it.
        target: Mapping with a ``'boxes'`` entry (iterable of
            ``(xmin, ymin, xmax, ymax)``) and a ``'labels'`` entry indexable
            by box position.

    Returns:
        None. The annotated figure is displayed with ``plt.show()``.
    """
    inputs = processor(images=pil_img, return_tensors="pt")

    with torch.no_grad():
        # Use the GLPN depth network. ``model`` in this notebook is the YOLO
        # detector, not the depth estimator.
        outputs = depth_model(**inputs)
        predicted_depth = outputs.predicted_depth

    # interpolate to original size; ``size`` is (width, height) but
    # ``interpolate`` expects (height, width)
    prediction = torch.nn.functional.interpolate(
        predicted_depth.unsqueeze(1),
        size=pil_img.size[::-1],
        mode="bicubic",
        align_corners=False,
    )

    # Scale to 0..255 relative to the image's maximum depth value, guarding
    # against a zero maximum and uint8 wrap-around.
    output = prediction.squeeze().cpu().numpy()
    peak = np.max(output)
    if peak > 0:
        depth_map = np.clip(output * 255 / peak, 0, 255).astype("uint8")
    else:
        depth_map = np.zeros(output.shape, dtype="uint8")
    map_h, map_w = depth_map.shape[:2]

    # visualize the prediction
    plt.figure(figsize=(16,10))
    plt.imshow(pil_img)
    ax = plt.gca()
    for i, (xmin, ymin, xmax, ymax) in enumerate(target['boxes']):
        ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                   fill=False, linewidth=3))

        # Clamp the box to the depth map so negative coordinates do not wrap
        # around, and avoid np.percentile on an empty region.
        roi = depth_map[max(int(ymin), 0):min(int(ymax), map_h),
                        max(int(xmin), 0):min(int(xmax), map_w)]
        depth_text = np.percentile(roi, 75) if roi.size else 'n/a'

        text = f'{target["labels"][i]}: {depth_text}'
        ax.text(xmin, ymin, text, fontsize=15,
                bbox=dict(facecolor='yellow', alpha=0.5))
    plt.axis('off')
    plt.show()