### Stereo Camera YOLOv8 with Navigation

In [1]:
import ultralytics
ultralytics.checks()
from ultralytics import YOLO
import cv2

import numpy as np
import scipy
import scipy.optimize

from stereo_image_utils import get_cost, draw_detections, annotate_class2 
from stereo_image_utils import get_horiz_dist_corner_tl, get_horiz_dist_corner_br, get_dist_to_centre_tl, get_dist_to_centre_br, get_dist_to_centre_cntr

Ultralytics YOLOv8.1.45  Python-3.11.8 torch-2.2.2+cpu CPU (AMD Ryzen 5 2600 Six-Core Processor)
Setup complete  (12 CPUs, 15.9 GB RAM, 152.9/232.2 GB disk)


In [2]:
# Checkpoint to load; the letter after "yolov8" selects the model size
# (n, s, m, l or x).
# See https://github.com/ultralytics/ultralytics for more information.
weights_file = "yolov8n.pt"
model = YOLO(weights_file)
# Class names taken from the underlying network of the loaded model.
network = model.model
names = network.names

In [6]:
# Base URLs of the left and right cameras. A static URL is configured in the
# ESP32-CAM sketch, and the cameras are reached over the local network
# (url = 192.168.1.xxx).
# You may need to change these if you connect through an iPhone hotspot or
# some other network.
URL_left = "http://192.168.1.185"
URL_right = "http://192.168.1.184"
# Focal length. Pre-calibrated in the stereo_image_v6 notebook.
fl = 1.8739765360371479
# Used with `base` and the image width when converting matched-box distances
# into a range estimate in the main loop.
# NOTE(review): presumably the tangent of half the horizontal field of view,
# also from the stereo_image_v6 calibration -- confirm.
tantheta = 0.4326786982030149
# NOTE(review): presumably the separation between the two cameras, in cm
# (the loop prints distances in cm) -- confirm.
base = 10.3

In [9]:
def _read_rotated(cap):
    """Read one frame from ``cap``, release it, and rotate the frame.

    Args:
        cap: An opened ``cv2.VideoCapture``.

    Returns:
        A ``(ok, frame)`` tuple. When the read succeeds, ``frame`` is the
        image rotated 90 degrees clockwise. When it fails, ``ok`` is False and
        ``frame`` is None; no rotation is attempted because ``cv2.rotate``
        cannot handle an empty frame.
    """
    ok, frame = cap.read()
    cap.release()
    if not ok or frame is None:
        return False, None
    return True, cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)


if __name__ == '__main__':
    # Live loop: grab a stereo pair, run YOLO on both images, match the
    # detections between the two views and print/draw a distance per match.
    # Press ESC in one of the preview windows to stop.
    cap_left = None
    cap_right = None
    out_l = None  # most recent left-image result; printed when ESC is pressed
    try:
        while True:
            # A fresh connection is opened for every stereo pair and released
            # again right after the single read.
            cap_left = cv2.VideoCapture(URL_left + ":81/stream")
            cap_right = cv2.VideoCapture(URL_right + ":81/stream")

            if not cap_left.isOpened():
                print("left not opened")
                break
            ret_l, frame_l = _read_rotated(cap_left)
            if ret_l:
                cv2.imshow("left_eye", frame_l)

            if not cap_right.isOpened():
                print("right not opened")
                break
            ret_r, frame_r = _read_rotated(cap_right)
            if ret_r:
                cv2.imshow("right_eye", frame_r)

            if ret_l and ret_r:
                out_l = model.predict(
                    source=cv2.cvtColor(frame_l, cv2.COLOR_BGR2RGB),
                    save=False, conf=0.3, save_txt=False, show=False,
                )[0]
                out_r = model.predict(
                    source=cv2.cvtColor(frame_r, cv2.COLOR_BGR2RGB),
                    save=False, conf=0.3, save_txt=False, show=False,
                )[0]

                # Matching needs at least one detection in each image.
                if out_l.boxes.shape[0] > 0 and out_r.boxes.shape[0] > 0:
                    # find the image centre
                    sz1 = frame_r.shape[1]
                    centre = sz1 / 2

                    # det holds the bounding boxes and lbls the class labels;
                    # index 0 is the left image, index 1 the right image.
                    det = [np.array(out_l.boxes.xyxy), np.array(out_r.boxes.xyxy)]
                    lbls = [out_l.boxes.cls, out_r.boxes.cls]

                    # get the cost of matching each object in the left image
                    # to each object in the right image
                    cost = get_cost(det, lbls=lbls, sz1=centre)

                    # choose optimal matches based on the cost;
                    # tracks[0] indexes left boxes, tracks[1] right boxes.
                    tracks = scipy.optimize.linear_sum_assignment(cost)

                    # horizontal distances between the top-left / bottom-right
                    # corners of every left/right box pair
                    dists_tl = get_horiz_dist_corner_tl(det)
                    dists_br = get_horiz_dist_corner_br(det)

                    # distance of each left box's corners to the image centre
                    dctl = get_dist_to_centre_tl(det[0], cntr=centre)
                    dcbr = get_dist_to_centre_br(det[0], cntr=centre)

                    # class names of the left-image detections
                    nm0 = [names[c.item()] for c in lbls[0]]

                    # For every matched pair use the corner whose
                    # distance-to-centre value is smaller for the left box.
                    fd = [
                        dists_tl[i][j] if dctl[i] < dcbr[i] else dists_br[i][j]
                        for i, j in zip(*tracks)
                    ]

                    # find distance away
                    dists_away = (base / 2) * sz1 * (1 / tantheta) / np.array(fd) + fl

                    cat_dist = []
                    for i, dist in zip(tracks[0], dists_away):
                        cat_dist.append(f'{nm0[i]} {dist:.1f}cm')
                        print(f'{nm0[i]} is {dist:.1f}cm away')

                    # Draw the matched boxes and their distance captions on a
                    # copy of each image, then replace the raw previews.
                    frames_ret = []
                    for side, frame in enumerate((frame_l, frame_r)):
                        img = frame.copy()
                        picked = list(tracks[side])
                        deti = det[side].astype(np.int32)
                        draw_detections(img, deti[picked], obj_order=list(tracks[0]))
                        annotate_class2(img, deti[picked], lbls[side][picked], cat_dist)
                        frames_ret.append(img)
                    cv2.imshow("left_eye", frames_ret[0])
                    cv2.imshow("right_eye", frames_ret[1])

            # Poll the keyboard on every pass, including passes where a frame
            # was dropped, so the windows keep refreshing and ESC always works.
            key = cv2.waitKey(1)
            if key == 27:  # esc key
                if out_l is not None:
                    print(out_l)
                break
    finally:
        # Runs on normal exit and on errors/interrupts alike, so no window or
        # stream connection is left behind.
        cv2.destroyAllWindows()
        for cap in (cap_left, cap_right):
            if cap is not None:
                cap.release()


0: 640x480 1 bottle, 1 cup, 114.0ms
Speed: 2.0ms preprocess, 114.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 1 bottle, 1 cup, 1 toothbrush, 110.0ms
Speed: 2.0ms preprocess, 110.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)
bottle is 35.7cm away
cup is 38.9cm away

0: 640x480 1 bottle, 103.0ms
Speed: 2.0ms preprocess, 103.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 1 bottle, 1 cup, 1 toothbrush, 98.0ms
Speed: 2.0ms preprocess, 98.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)
bottle is 35.7cm away

0: 640x480 1 bottle, 1 cup, 107.0ms
Speed: 2.0ms preprocess, 107.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 1 bottle, 1 cup, 1 toothbrush, 109.0ms
Speed: 2.0ms preprocess, 109.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 480)
bottle is 35.4cm away
cup is 39.0cm away

0: 640x480 1 bottle, 1 keyboard, 97.0ms
Speed: 2.0ms