In [2]:
# !pip install opencv-python
# !pip install tensorflow
# !pip install --upgrade tensorflow-hub
!pip3 install --upgrade opencv-python





[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
!uninstall opencv-python
!uninstall opencv-contrib-python

: 

In [1]:
import cv2
import tensorflow as tf
import numpy as np

In [3]:
def preprocess(image):
    """Turn one image into the batched ``tf.uint8`` tensor passed to the model.

    The image is converted to a tensor, resized to 640x640, given a leading
    batch axis of size one and finally cast to ``tf.uint8``.
    """
    as_tensor = tf.convert_to_tensor(image)

    # tf.image.resize is applied before the batch axis is added.
    resized = tf.image.resize(as_tensor, [640, 640])
    batched = tf.expand_dims(resized, 0)

    # The cast is the last step, so the returned tensor is uint8.
    return tf.cast(batched, tf.uint8)


In [4]:
def postprocess(outputs, min_score=0.5):
    """Keep only the detections whose score reaches ``min_score``.

    Args:
        outputs: Mapping with the keys ``'detection_boxes'``,
            ``'detection_classes'`` and ``'detection_scores'``; every value
            must expose ``.numpy()``. The class and box arrays are indexed
            with a mask built from the score array, so their leading
            dimensions have to match the shape of the scores.
        min_score: Inclusive lower bound on the score of a kept detection.
            Defaults to 0.5, the threshold that used to be hard-coded.

    Returns:
        Tuple ``(boxes, classes, scores)`` of NumPy arrays restricted to the
        detections with ``score >= min_score``.
    """
    detection_boxes = outputs['detection_boxes'].numpy()
    detection_classes = outputs['detection_classes'].numpy()
    detection_scores = outputs['detection_scores'].numpy()

    # A boolean mask over the score array selects the same entries from all
    # three arrays (for the boxes it indexes the leading axes only).
    keep = detection_scores >= min_score

    return detection_boxes[keep], detection_classes[keep], detection_scores[keep]


In [5]:
# Human-readable label for every class id (91 entries).
# Class ids are 1-based while this list is 0-based: the code in this notebook
# looks a label up with class_names[class_id - 1] and derives an id with
# class_names.index(name) + 1, so the order of the entries must not change.
# NOTE(review): the entries look like the COCO label map - confirm that they
# match the label map of the detector that is loaded.
class_names = [
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", 
    "train", "truck", "boat", "traffic light", "fire hydrant", 
    "street sign", "stop sign", "parking meter", "bench", "bird", 
    "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", 
    "zebra", "giraffe", "hat", "backpack", "umbrella", "shoe", 
    "eye glasses", "handbag", "tie", "suitcase", "frisbee", 
    "skis", "snowboard", "sports ball", "kite", "baseball bat", 
    "baseball glove", "skateboard", "surfboard", "tennis racket", 
    "bottle", "plate", "wine glass", "cup", "fork", "knife", 
    "spoon", "bowl", "banana", "apple", "sandwich", "orange", 
    "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", 
    "chair", "couch", "potted plant", "bed", "mirror", "dining table", 
    "window", "desk", "toilet", "door", "tv", "laptop", "mouse", 
    "remote", "keyboard", "cell phone", "microwave", "oven", 
    "toaster", "sink", "refrigerator", "blender", "book", "clock", 
    "vase", "scissors", "teddy bear", "hair drier", "toothbrush", 
    "hair brush"
]

In [11]:
def get_top_person(boxes, classes, scores):
    """Pick the best-scoring detection whose class is "person".

    The three sequences are parallel: entry ``i`` of each describes the same
    detection. Returns ``(box, class, score)`` of the winner; if no entry is a
    person the result is ``(None, None, -1)``. On equal scores the earliest
    entry is kept.
    """
    # Class ids are one larger than the position of the label in class_names.
    person_id = class_names.index('person') + 1

    best_box, best_class, best_score = None, None, -1

    for idx, class_id in enumerate(classes):
        if class_id != person_id:
            continue
        # Strict comparison: a later detection must beat the current best.
        if scores[idx] > best_score:
            best_box, best_class, best_score = boxes[idx], class_id, scores[idx]

    return best_box, best_class, best_score

In [71]:
def convert_to_pixel_coordinates(bbox, input_frame):
    """Scale a ``(ymin, xmin, ymax, xmax)`` box by the size of ``input_frame``.

    The y values are multiplied by ``input_frame.shape[0]`` and the x values by
    ``input_frame.shape[1]``; every product is truncated with ``int``. The
    result is the list ``[ymin, xmin, ymax, xmax]`` in the same order.
    """
    top, left, bottom, right = bbox
    rows = input_frame.shape[0]
    cols = input_frame.shape[1]

    return [
        int(top * rows),
        int(left * cols),
        int(bottom * rows),
        int(right * cols),
    ]

In [59]:
def visualize_detections(image, boxes, classes, scores):
    """Draw each detection as a green box with a "label: score" caption.

    ``boxes``, ``classes`` and ``scores`` are parallel sequences. Each box is
    converted with ``convert_to_pixel_coordinates`` and the label is looked up
    as ``class_names[class_id - 1]``. Drawing happens on a uint8 copy made with
    ``np.array``; that copy is returned and ``image`` itself is not modified.
    """
    canvas = np.array(image, dtype=np.uint8)
    green = (0, 255, 0)

    for idx, box in enumerate(boxes):
        class_id, score = classes[idx], scores[idx]

        # Box corners in pixels, relative to the size of the canvas.
        top, left, bottom, right = convert_to_pixel_coordinates(box, canvas)
        cv2.rectangle(canvas, (left, top), (right, bottom), green, 2)

        # The caption is placed 10 px above the top-left corner of the box.
        caption = f"{class_names[int(class_id)-1]}: {score:.2f}"
        cv2.putText(canvas, caption, (left, top-10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, green, 2)

    return canvas

# Use the function


In [19]:
# Fetch the detector from TensorFlow Hub and write a local SavedModel copy to
# ./model/ (the path the next cell loads from).
import tensorflow as tf
import tensorflow_hub as hub

# URL (handle) of the model on TensorFlow Hub
module_handle = "https://tfhub.dev/tensorflow/ssd_mobilenet_v2/2"
# Load the model through tensorflow_hub
# NOTE(review): presumably this needs network access on first use - confirm.
detector = hub.load(module_handle)

# Save the loaded model as a SavedModel under ./model/
tf.saved_model.save(detector, "./model/")



INFO:tensorflow:Assets written to: ./model/assets


INFO:tensorflow:Assets written to: ./model/assets


In [8]:
# Load the SavedModel stored under ./model/; `model` is what the webcam loops
# below call on every preprocessed frame.
# NOTE(review): ./model/ is written by the download cell above, which carries
# a later execution count than this cell - on a fresh kernel run that cell
# first (or make sure the directory already exists).
model = tf.saved_model.load('./model/')

In [107]:
def track_object(cap, bbox, tracker, frame):
    """Initialise ``tracker`` and follow the object into the next frame.

    Args:
        cap: Capture object providing ``isOpened()`` and ``read()``.
        bbox: Initial box handed to ``tracker.init``. The webcam loop in this
            notebook passes trackers an ``(x, y, w, h)`` box in pixels.
        tracker: Tracker object providing ``init(frame, bbox)`` and
            ``update(frame)``.
        frame: Frame that ``bbox`` refers to.

    Returns:
        ``(ok, bbox)``. ``ok`` is True when the tracker located the object in
        the frame read from ``cap``; ``bbox`` is then the box reported by the
        tracker. When the capture is closed or no frame can be read the result
        is ``(False, bbox)`` with the box that was passed in. (The function
        used to fall off the end and return None here, which breaks callers
        that unpack two values.)
    """
    # The return value of init() is not used: the original stored it and never
    # read it.
    tracker.init(frame, bbox)

    # Exactly one new frame is processed per call. The original `while` loop
    # returned during its first iteration, so it never ran a second time.
    if not cap.isOpened():
        return False, bbox

    ret, frame = cap.read()
    if not ret:
        return False, bbox

    ok, bbox = tracker.update(frame)
    if not ok:
        return False, bbox

    # Draw the box as (x, y, w, h), the same interpretation of update()'s
    # result that the webcam loop uses. Indexing it as (ymin, xmin, ymax, xmax)
    # swapped the axes and treated width/height as corner coordinates.
    p1 = (int(bbox[0]), int(bbox[1]))
    p2 = (int(bbox[0] + bbox[2]), int(bbox[1] + bbox[3]))
    cv2.rectangle(frame, p1, p2, (255, 0, 0), 2, 1)
    return True, bbox

In [116]:
# Detect-then-track demo on the default webcam.
# While not tracking, every frame is run through the detector. As soon as the
# best "person" detection scores above PERSON_SCORE_THRESHOLD a KCF tracker
# takes over; when the tracker loses the target, detection resumes.
# Press 'q' in the window to stop.
PERSON_SCORE_THRESHOLD = 0.7
TRACK_COLOR = (0, 255, 225)

# Open the webcam
cap = cv2.VideoCapture(0)

tracking_mode = False
tracker = None
bbox = (0, 0, 0, 0)

# try/finally: the camera and the window are released even if the model or
# OpenCV raises inside the loop (otherwise the device stays locked until the
# kernel is restarted).
try:
    while True:
        # Read a frame from the webcam
        ret, frame = cap.read()
        if not ret:
            break

        if not tracking_mode:
            # OpenCV delivers frames in BGR channel order; convert to RGB
            # before handing the image to the detector.
            # NOTE(review): the TF Hub model card specifies the expected input
            # format - confirm it is RGB.
            input_frame = preprocess(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Run the model and keep the best "person" detection
            outputs = model(input_frame)
            boxes, classes, scores = postprocess(outputs)
            boxes, classes, scores = get_top_person(boxes, classes, scores)

            if boxes is not None:
                ymin, xmin, ymax, xmax = convert_to_pixel_coordinates(boxes, frame)

                # Clip the box to the frame and convert it to the
                # (x, y, w, h) layout that is passed to the tracker.
                frame_h, frame_w = frame.shape[:2]
                x, y = max(xmin, 0), max(ymin, 0)
                w, h = min(xmax, frame_w) - x, min(ymax, frame_h) - y

                # A confident, non-empty box switches to tracking mode.
                if scores > PERSON_SCORE_THRESHOLD and w > 0 and h > 0:
                    tracker = cv2.TrackerKCF_create()
                    # Initialise on the raw frame, i.e. before the detection
                    # overlay is drawn, so the tracker does not learn the
                    # rectangle and caption as part of the target.
                    tracker.init(frame, (x, y, w, h))
                    bbox = (x, y, w, h)
                    tracking_mode = True

                # Every detection is drawn, whatever its score.
                frame = visualize_detections(frame, [boxes], [classes], [scores])

            cv2.imshow('Tracker', frame)
        else:
            try:
                ok, bbox = tracker.update(frame)
            except cv2.error as e:
                print("Tracking error: ", e)
                # Without this, `ok` would be undefined (or stale from the
                # previous frame) in the checks below.
                ok = False

            if ok:
                # Drawing functions need integer pixel coordinates.
                x, y = int(bbox[0]), int(bbox[1])
                p2 = (int(bbox[0] + bbox[2]), int(bbox[1] + bbox[3]))
                cv2.rectangle(frame, (x, y), p2, TRACK_COLOR, 2, 1)
                cv2.putText(frame, "Tracking mode", (x, y - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.9, TRACK_COLOR, 2)
            else:
                # If tracking failed, switch back to detection mode
                tracking_mode = False

            # Show the frame in both cases so the window does not freeze on
            # the frame where the target is lost.
            cv2.imshow('Tracker', frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the webcam and close the window
    cap.release()
    cv2.destroyAllWindows()

In [73]:
# Detection-only preview: runs the detector on every webcam frame and draws
# the best-scoring "person" detection. Press 'q' in the window to stop.

# Open the webcam
cap = cv2.VideoCapture(0)

# try/finally: the camera and the window are released even if the model or
# OpenCV raises inside the loop.
try:
    while True:
        # Read a frame from the webcam
        ret, frame = cap.read()
        if not ret:
            break

        # OpenCV delivers frames in BGR channel order; convert to RGB before
        # handing the image to the detector.
        # NOTE(review): the TF Hub model card specifies the expected input
        # format - confirm it is RGB.
        input_frame = preprocess(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Run the model
        outputs = model(input_frame)

        # Keep only the best "person" detection, if there is one
        boxes, classes, scores = postprocess(outputs)
        boxes, classes, scores = get_top_person(boxes, classes, scores)

        if boxes is not None:
            frame = visualize_detections(frame, [boxes], [classes], [scores])

        # Display the (possibly annotated) frame
        cv2.imshow('Webcam Stream', frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the webcam and close the window
    cap.release()
    cv2.destroyAllWindows()