# Person Detection

### 動画の中の人を検出し、検出した領域にボックスを表示する
### モデル： YOLO v2

In [3]:
import pandas as pd
import numpy as np
import cv2

from keras.layers import Conv2D, Input, MaxPooling2D, Concatenate, Lambda
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras import backend as K
from keras.preprocessing import image
from keras.applications.imagenet_utils import preprocess_input
from tensorflow import space_to_depth

from operator import itemgetter

In [4]:
# Load the whole weight file as one flat float32 vector, then drop the first
# four 32-bit words (presumably the Darknet file header -- TODO confirm against
# the version of the weight file in use).
raw_weights = np.fromfile("yolov2.weights", dtype=np.float32)
weights = raw_weights[4:]

In [5]:
# Convolution layer table consumed by the model-building loop.
# Each entry is [input channels, output channels, kernel size]; the order must
# match the order in which the layers' parameters are stored in `weights`.
layers=[
    [3, 32, 3],  # 0
    [32, 64, 3],  # 1
    [64, 128, 3],  # 2
    [128, 64, 1],  # 3
    [64, 128, 3],  # 4
    [128, 256, 3],  # 5
    [256, 128, 1],  # 6
    [128, 256, 3],  # 7
    [256, 512, 3],  # 8
    [512, 256, 1],  # 9
    [256, 512, 3],  # 10
    [512, 256, 1],  # 11
    [256, 512, 3],  # 12: output of this layer is saved for the passthrough branch
    [512, 1024, 3],  # 13
    [1024, 512, 1],  # 14
    [512, 1024, 3],  # 15
    [1024, 512, 1],  # 16
    [512, 1024, 3],  # 17
    [1024, 1024, 3],  # 18
    [1024, 1024, 3],  # 19: main-branch output that is later concatenated
    [512, 64, 1],  # 20: applied to the saved layer-12 output (passthrough branch)
    [1280, 1024, 3],  # 21: input is the concatenation of both branches (64*4 + 1024)
    [1024, 425, 1]  # 22: final layer; 425 = 5 boxes * (4 coords + 1 objectness + 80 classes)
]

In [6]:
def space_to_depth_x2(x):
    """Apply TensorFlow's ``space_to_depth`` to ``x`` with a block size of 2.

    Wrapped in a named function so it can be handed to a Keras ``Lambda``
    layer.
    """
    block = 2
    rearranged = space_to_depth(x, block_size=block)
    return rearranged

In [7]:
# Build the network layer by layer from the `layers` table, consuming the flat
# `weights` vector sequentially.  For every convolution the vector holds, in
# this order: bias (out_ch values), then -- for batch-normalised layers only --
# gamma, running mean and running variance (out_ch values each), then the
# convolution kernel.
stride = 1
padding = 'same'
alpha = 0.1                            # negative slope of the LeakyReLU
pool_size = 2
maxpooling_timing = [0, 1, 4, 7, 12]   # conv indices followed by a max-pooling
passthrough_timing = 12                # conv index whose activation is saved
passthrough_layer = 20                 # conv index applied to the saved activation
concate_timing = 20                    # conv index after which both branches are joined
start = 0                              # read cursor into `weights`
inputs = Input(shape=(None, None, 3), name="input")

for i, (in_ch, out_ch, k_size) in enumerate(layers):
    # Every layer except the last uses batch normalisation + LeakyReLU;
    # the last one is a plain linear convolution with a bias.
    batch_normalize = i < (len(layers) - 1)
    activate = 'leaky' if batch_normalize else 'linear'

    # Plain slicing replaces the original exec()-built statements: same
    # result, no dynamic code, and it keeps working inside a function.
    bias = weights[start:start + out_ch]
    start += out_ch

    if batch_normalize:
        bn_weights = weights[start:start + out_ch * 3].reshape(3, out_ch)
        start += out_ch * 3
        bn_weight_list = [
            bn_weights[0],  # gamma
            bias,           # beta
            bn_weights[1],  # avg_mean
            bn_weights[2],  # avg_var
        ]

    # from (out_dim, in_dim, height, width) to (height, width, in_dim, out_dim)
    kernel_size = in_ch * out_ch * (k_size ** 2)
    kernel = weights[start:start + kernel_size] \
        .reshape(out_ch, in_ch, k_size, k_size) \
        .transpose(2, 3, 1, 0)
    start += kernel_size
    # With batch normalisation the bias is used as the BN beta instead, so the
    # convolution itself carries no bias term.
    conv_weights = [kernel] if batch_normalize else [kernel, bias]

    # Pick the tensor this convolution is applied to.
    if i == 0:
        layer_input = inputs
    elif i == passthrough_layer:
        layer_input = passthrough
    else:
        layer_input = x

    x = Conv2D(out_ch,
               (k_size, k_size),
               strides=(stride, stride),
               use_bias=not batch_normalize,
               weights=conv_weights,
               padding=padding,
               name="conv" + str(i))(layer_input)

    if batch_normalize:
        x = BatchNormalization(weights=bn_weight_list,
                               name="batch" + str(i))(x)

    if activate == 'leaky':
        x = LeakyReLU(alpha=alpha,
                      name="leaky" + str(i))(x)

    # Save the activation (before pooling) that feeds the passthrough branch.
    if i == passthrough_timing:
        passthrough = x

    if i in maxpooling_timing:
        x = MaxPooling2D(pool_size=(pool_size, pool_size),
                         name="maxpooling" + str(i))(x)

    # Remember the main-branch output right before the passthrough convolution.
    if i == (passthrough_layer - 1):
        opponent = x

    # Rearrange 2x2 spatial blocks of the passthrough branch into channels so
    # it can be concatenated with the main branch.
    if i == passthrough_layer:
        x = Lambda(space_to_depth_x2,
                   name="lambda" + str(i))(x)

    if i == concate_timing:
        x = Concatenate(name="concate" + str(i))([x, opponent])

model = Model(inputs=inputs, outputs=x)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              (None, None, None, 3 0                                            
__________________________________________________________________________________________________
conv0 (Conv2D)                  (None, None, None, 3 864         input[0][0]                      
__________________________________________________________________________________________________
batch0 (BatchNormalization)     (None, None, None, 3 128         conv0[0][0]                      
__________________________________________________________________________________________________
leaky0 (LeakyReLU)              (None, None, None, 3 0           batch0[0][0]                     
__________________________________________________________________________________________________
maxpooling

In [8]:
class Bbox:
    """A detected box: two corner points plus its class label and score."""

    def __init__(self, xmin, ymin, xmax, ymax, class_label, class_score):
        # Corner coordinates (top-left and bottom-right).
        self.xmin, self.ymin = xmin, ymin
        self.xmax, self.ymax = xmax, ymax
        # Classification result attached to the box.
        self.clabel, self.cscore = class_label, class_score

In [9]:
def bbox_iou(box1, box2):
    """Return the intersection-over-union (IoU) of two axis-aligned boxes.

    Args:
        box1: object exposing ``xmin``, ``ymin``, ``xmax`` and ``ymax``.
        box2: object exposing ``xmin``, ``ymin``, ``xmax`` and ``ymax``.

    Returns:
        The overlap area divided by the union area, or ``0.0`` when the boxes
        do not overlap or the union area is not positive.
    """
    area1 = (box1.xmax - box1.xmin) * (box1.ymax - box1.ymin)
    area2 = (box2.xmax - box2.xmin) * (box2.ymax - box2.ymin)

    # Extent of the overlap along each axis; non-positive means no overlap.
    inter_w = min(box1.xmax, box2.xmax) - max(box1.xmin, box2.xmin)
    inter_h = min(box1.ymax, box2.ymax) - max(box1.ymin, box2.ymin)
    # Check each axis separately: multiplying two negative extents would
    # otherwise produce a bogus positive intersection for disjoint boxes.
    if inter_w <= 0 or inter_h <= 0:
        return 0.0

    intersection = inter_w * inter_h
    union = area1 + area2 - intersection
    if union <= 0:
        return 0.0
    return intersection / union

In [13]:
# Run the detector on the first `num_frames` frames of a video, draw a box
# around every detected person, write the annotated frames to 'person.mp4'
# and dump the box coordinates to 'coordinates.csv'.
video_path = './Shoppers.mp4'
start_timing = 0  # millisecond
num_frames = int(24*5)  # frames
input_shape = (416, 416)  # network input size, indexed as (height, width) below
grid_h = int(input_shape[0] / 32)
grid_w = int(input_shape[1] / 32)
num_box = 5
num_class = 80
obj_thresh = 0.5
bbox_iou_thresh = 0.3
# Prior box sizes as (width, height) pairs, one pair per box slot, expressed
# in grid-cell units (they are divided by grid_w / grid_h when decoding).
init_box = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828
person_label = 0  # class id that is drawn and exported

# video open
vid = cv2.VideoCapture(video_path)
if not vid.isOpened():
    raise IOError("cannot open video: %s" % video_path)
vidw = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
vidh = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
# video setting: reuse the codec and frame rate of the source video
vid_FourCC = int(vid.get(cv2.CAP_PROP_FOURCC))
vid_fps = vid.get(cv2.CAP_PROP_FPS)
out = cv2.VideoWriter('person.mp4', vid_FourCC, vid_fps, (vidw, vidh))

coordinates = []

try:
    # start_timing
    if start_timing > 0:
        vid.set(cv2.CAP_PROP_POS_MSEC, start_timing)

    # processing each frame
    for f in range(num_frames):
        remain, frame = vid.read()
        if not remain:
            print("process complete")
            break
        frame_h, frame_w = frame.shape[:2]

        # resize -- cv2.resize expects the target size as (width, height)
        resized = cv2.resize(frame, (input_shape[1], input_shape[0]))
        # BGR to RGB
        rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        net_input = np.array([rgb / 255.])
        # predict
        predict = model.predict(net_input)
        predict = predict.reshape(grid_h, grid_w, num_box, 4 + 1 + num_class)
        # class probability = sigmoid(objectness) * sigmoid(class logit)
        # NOTE(review): Darknet's YOLO v2 region layer applies a softmax over
        # the class logits rather than a per-class sigmoid -- confirm which
        # one is intended before changing, since it shifts every score.
        predict[..., 5:] = (1/(1+np.exp(-predict[..., 4:5]))) * (1/(1+np.exp(-predict[..., 5:])))
        # zero out every class score that does not exceed the threshold
        predict[..., 5:] *= predict[..., 5:] > obj_thresh

        # bounding boxes
        boxes = []
        for gh in range(grid_h):
            for gw in range(grid_w):
                for b in range(num_box):
                    classes = predict[gh, gw, b, 5:]
                    box_label = np.argmax(classes)
                    class_score = classes[box_label]
                    if class_score <= obj_thresh:
                        continue
                    # coordinate and size, normalised to [0, 1] of the image
                    bx, by, bw, bh = predict[gh, gw, b, :4]
                    bx = (gw + 1/(1+np.exp(-bx))) / grid_w
                    by = (gh + 1/(1+np.exp(-by))) / grid_h
                    bw = init_box[2 * b] * np.exp(bw) / grid_w
                    bh = init_box[2 * b + 1] * np.exp(bh) / grid_h
                    boxes.append(Bbox(bx - bw/2, by - bh/2, bx + bw/2, by + bh/2,
                                      box_label, class_score))

        # non maximal suppression, done independently per class
        for c in range(num_class):
            indices = [[l, box.cscore] for l, box in enumerate(boxes) if box.clabel == c]
            indices.sort(key=itemgetter(1), reverse=True)
            for i in range(len(indices)):
                index_i = indices[i][0]
                # A box that has already been suppressed must not suppress others.
                if boxes[index_i].cscore == 0:
                    continue
                for j in range(i + 1, len(indices)):
                    index_j = indices[j][0]
                    if bbox_iou(boxes[index_i], boxes[index_j]) > bbox_iou_thresh:
                        boxes[index_j].cscore = 0

        boxes = [box for box in boxes if box.cscore > obj_thresh]

        # A [frame index, 0, 0, 0] row marks the start of each frame in the CSV.
        coordinates.append([f, 0, 0, 0])
        for box in boxes:
            # Only persons are clamped, exported and drawn.
            if box.clabel != person_label:
                continue
            # Scale to pixel coordinates and clamp into the frame.
            xmin = max(int(round(box.xmin * frame_w)), 0)
            ymin = max(int(round(box.ymin * frame_h)), 0)
            xmax = min(int(round(box.xmax * frame_w)), frame_w - 1)
            ymax = min(int(round(box.ymax * frame_h)), frame_h - 1)

            coordinates.append([xmin, ymin, xmax, ymax])
            cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)

        # frame write
        out.write(frame)
finally:
    # Always release the capture and the writer, even if a frame fails.
    vid.release()
    out.release()
    cv2.destroyAllWindows()

co = pd.DataFrame(coordinates)
co.to_csv("coordinates.csv", index=False, header=False)