## Demo Tracking pose and face

## Use input size :
- file cfg
- yolo3_weight.h5 when convert
- file train.py

In [1]:
import colorsys
import cv2
import dlib
import face_recognition
from keras import backend as K
from keras.utils import multi_gpu_model
from keras.layers import Input
from keras.models import load_model

from imutils.video import WebcamVideoStream
from imutils.video import FPS

import numpy as np
import os
from PIL import Image, ImageFont, ImageDraw
from timeit import default_timer as timer
from scipy.spatial import distance

from yolo3.model import yolo_eval, yolo_body, tiny_yolo_body
from yolo3.utils import letterbox_image

os.environ['CUDA_VISIBLE_DEVICES'] = '0'
gpu_num = 1

Using TensorFlow backend.


In [2]:
model_path = '../model_data/yolo_weights.h5'
anchors_path = '../model_data/yolo_anchors.txt'
classes_path = '../model_data/coco_classes.txt'
score = 0.4
iou = 0.35
model_image_size = (416, 416)
sess = K.get_session()

In [3]:
# Get class
classes_path = os.path.expanduser(classes_path)
with open(classes_path) as f:
    class_names = f.readlines()

class_names = [c.strip() for c in class_names]

# Anchors
anchors_path = os.path.expanduser(anchors_path)
with open(anchors_path) as f:
    anchors = f.readline()
anchors = [float(x) for x in anchors.split(',')]
anchors = np.array(anchors).reshape(-1, 2)

In [4]:
# Load model
model_path = os.path.expanduser(model_path)
assert model_path.endswith('.h5'), 'Keras model end with file .h5'

num_anchors = len(anchors)
num_classes = len(class_names)

is_tiny_version = num_anchors==6
try:
    yolo_model = load_model(model_path, compile=False)
except:
    if is_tiny_version:
        yolo_model = tiny_yolo_body(Input(shape=(None, None, 3)), num_anchors//2, num_classes)
    else:
        yolo_model = yolo_body(Input(shape=(None, None, 3)), num_anchors//3, num_classes)
    
    yolo_model.load_weights(model_path)
else:
    yolo_model.layers[-1].output_shape[-1] == num_anchors/len(yolo_model.output) * (num_classes + 5), 'Mismatch between model and given anchor and class sizes'
    
print("{} model, anchors, and classes loaded.".format(model_path))

../model_data/yolo_weights.h5 model, anchors, and classes loaded.


In [5]:
# OPENCV_OBJECT_TRACKERS = {
#     'crst': cv2.TrackerCSRT_create,
#     'kcf': cv2.TrackerKCF_create,
#     'boosting': cv2.TrackerBoosting_create,
#     'mil': cv2.TrackerMIL_create,
#     'tld': cv2.TrackerTLD_create,
#     'medianflow': cv2.TrackerMedianFlow_create,
#     'mosse': cv2.TrackerMOSSE_create
# }
# tracker = OPENCV_OBJECT_TRACKERS['kcf']()

In [6]:
tracker = cv2.MultiTracker_create()

In [7]:
input_image_shape = K.placeholder(shape=(2, ))
boxes, scores, classes = yolo_eval(yolo_model.output, anchors, len(class_names), input_image_shape, score_threshold=score, iou_threshold=iou)
num_frame = 0
font = cv2.FONT_HERSHEY_DUPLEX
initBB = None
fps = None

# Video capture
video_capture = WebcamVideoStream(src=0).start()

while True:
    num_frame += 1

    # Read video frame and flip camera
    frame = video_capture.read()
    frame = cv2.flip(frame, 1)
    (H, W) = frame.shape[:2]
#     frame_process = np.copy(frame)
    fps = FPS().start()
    if initBB is not None:
        (success, boxes) = tracker.update(frame)
        print(success)
        print(boxes)
        if success:
            for newbox in boxes:
                p1 = (int(newbox[0]), int(newbox[1]))
                p2 = (int(newbox[0] + newbox[2]), int(newbox[1] + newbox[3]))
                cv2.rectangle(frame, p1, p2, (200, 0, 0))
#             (x, y, w, h) = [int(v) for v in box]
#             cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 3)
#             point = np.asarray([x + w/2. , y + h/2.])
#             center_points.append(point)

        fps.update()
        fps.stop()
        
        info = [
            ("Tracker", (str(tracker)).split(" ")[0] + ">"),
            ("Success", "Yes" if success else "NO"),
            ("FPS", "{:2f}".format(fps.fps())),
        ]
    
        for (i, (k, v)) in enumerate(info):
            text = "{}: {}".format(k, v)
            cv2.putText(frame, text, (10, H - ((i * 20) + 20)),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
    
    if initBB is None:
        image = Image.fromarray(frame)

        # Process detect pe
        boxed_image = letterbox_image(image, tuple(reversed(model_image_size)))
        image_data = np.array(boxed_image, dtype='float32')
        image_data /= 255.
        image_data = np.expand_dims(image_data, 0)

        # Rim keras backend tensorflow forward neural network
        out_boxes, out_scores, out_classes = sess.run([boxes, scores, classes],
                                                     feed_dict={
                                                         yolo_model.input: image_data,
                                                         input_image_shape: [image.size[1], image.size[0]],
                                                         K.learning_phase(): 0
                                                     })

        for i, c in reversed(list(enumerate(out_classes))):
            predicted_class = class_names[c]
            box = out_boxes[i]
            score = out_scores[i]
            if predicted_class == "person":
                label = '{} {:.2f}'.format(predicted_class, score)
                top, left, bottom, right = box
                print(type(top))
                top = int(top)
                left = int(left)
                bottom = int(bottom)
                right = int(right)
                print("run yolov3")
#                 cv2.rectangle(frame, (left, top), (right, bottom), (255, 0, 0), 3)
                bbox = (left, top, right - left, bottom - top)
                tracker.add(cv2.TrackerKCF_create(), frame, bbox)
                print(left, top, right, bottom)
                initBB = 1
                
                
            
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
#         if centerpoints:
#             index_distance_min, distance_min = compare_points(centerpoints, point)

#             # Compare distance min with (bottom - top) / 4
#             if distance_min < (bottom - top) / 4.:
#                 # point new same name index distance min
#                 name = namefromcenterpoint[index_distance_min]        
#                 label = name + ": " + label + "don't compute"
#                 cv2.putText(frame, label, (left + 6, top + 20), font, 1.0, (0, 0, 255), 1)

#                 # Update center point
#                 centerpoints[index_distance_min] = point
#             else:
#                 #-------------------------------------------------------#
#                 # Face recognition
#                 crop_img = frame_process[top:bottom, left:right]
#                 # Convert the image from BGR color to RGB to face_recognition use
#                 rgb_frame = crop_img[:, :, ::-1]

#                 # Find all the faces and face encodings in the current frame of video
#                 face_locations = face_recognition.face_locations(rgb_frame)
#                 face_encodings = face_recognition.face_encodings(rgb_frame, face_locations)

#                 if not face_encodings:
#                     cv2.putText(frame, label, (left + 6, top + 20), font, 1.0, (0, 0, 255), 1)
#                 else:
#                     frame, name = detect_name(frame, face_locations, face_encodings, known_face_encodings, 
#                                 known_face_names, (top, left, bottom, right), label)
#                     centerpoints.append(point)
#                     namefromcenterpoint.append(name)
#         else:
#             # Face recognition
#             crop_img = frame_process[top:bottom, left:right]
#             # Convert the image from BGR color to RGB to face_recognition use
#             rgb_frame = crop_img[:, :, ::-1]

#             # Find all the faces and face encodings in the current frame of video
#             face_locations = face_recognition.face_locations(rgb_frame)
#             face_encodings = face_recognition.face_encodings(rgb_frame, face_locations)

#             if not face_encodings:
#                 cv2.putText(frame, label, (left + 6, top + 20), font, 1.0, (0, 0, 255), 1)
#             else:
#                 frame, name = detect_name(frame, face_locations, face_encodings, known_face_encodings, 
#                             known_face_names, (top, left, bottom, right), label)
#                 centerpoints.append(point)
#                 namefromcenterpoint.append(name)
    #         #-------------------------------------------------------#        

    cv2.imshow("Frame", frame)
    #
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

<class 'numpy.float32'>
run yolov3
180 25 245 90
<class 'numpy.float32'>
run yolov3
98 -1 630 477
True
[[180.  25.  65.  65.]
 [ 98.  -1. 532. 478.]]
True
[[174.  26.  65.  65.]
 [ 98.  -1. 532. 478.]]
False
[[174.  26.  65.  65.]
 [ 92.  -7. 532. 478.]]
False
[[174.  26.  65.  65.]
 [ 86. -11. 532. 478.]]
False
[[174.  26.  65.  65.]
 [ 86. -11. 532. 478.]]
False
[[174.  26.  65.  65.]
 [ 84. -11. 532. 478.]]
False
[[174.  26.  65.  65.]
 [ 84. -13. 532. 478.]]
False
[[174.  26.  65.  65.]
 [ 70. -19. 532. 478.]]
False
[[174.  26.  65.  65.]
 [ 58. -23. 532. 478.]]
False
[[174.  26.  65.  65.]
 [ 64. -21. 532. 478.]]
False
[[174.  26.  65.  65.]
 [ 64. -21. 532. 478.]]
False
[[174.  26.  65.  65.]
 [ 66. -21. 532. 478.]]
False
[[174.  26.  65.  65.]
 [ 68. -21. 532. 478.]]
False
[[174.  26.  65.  65.]
 [ 68. -21. 532. 478.]]
False
[[174.  26.  65.  65.]
 [ 68. -21. 532. 478.]]
False
[[174.  26.  65.  65.]
 [ 68. -21. 532. 478.]]
False
[[174.  26.  65.  65.]
 [ 68. -21. 532. 478.]]
Fal

KeyboardInterrupt: 

## Test

## Add

In [None]:
# from threading import Thread
# import cv2
# import imutils
# class WebcamVideoStream:
#     def __init__(self, src=0):
#         self.stream = cv2.VideoCapture(src)
#         self.stream.set(3, 800)
#         self.stream.set(4, 600)
#         (self.grabbed, self.frame) = self.stream.read()
        
#         self.stopped = False
        
#     def start(self):
#         # Start the thread to read frames from the video stream
#         Thread(target=self.update, args=()).start()
#         return self
    
#     def update(self):
#         while True:
#             if self.stopped:
#                 return
            
#             (self.grabbed, self.frame) = self.stream.read()
            
#     def read(self):
#         return self.frame
    
#     def stop(self):
#         self.stopped = True  