There are a lot of ways to do face detection, e.g. the OpenCV Haar cascade classifier.

## This document is based on MTCNN by Zhang et al.
[Joint Face Detection and Alignment Using Multitask Cascaded Convolutional Networks](https://arxiv.org/ftp/arxiv/papers/1604/1604.02878.pdf)
<br>K Zhang, Z Zhang, Z Li, Y Qiao - IEEE Signal Processing Letters, 2016 - ieeexplore.ieee.org
<p><img src="images/MCTNN.png" style="width:580px;height:194px;" align="left"><br><br><br><br><br><br><br><br><br><br><br>


## Code and pretrained weights are mainly from <a href="https://github.com/davidsandberg/facenet">David Sandberg GitHub</a>
The architecture is based on MTCNNv2. The environment requirements are listed in the table below, and two usages are demonstrated (all run in Jupyter):
    1. batch photo detect
    2. Real time face detect
本文架構為MTCNNv2, 執行環境需求如下表所示, 在此將展示2種用法(全程於jupyter介面運行)
    1. 批次圖片偵測
    2. 動態即時偵測
<table align="left">
  <thead>
    <tr>
      <th><p style="text-align:center;">Environments</p></th>
      <th><p style="text-align:center;">Frameworks</th>
      <th><p style="text-align:center;">Main Packages</th>
      <th><p style="text-align:center;">Notes</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td><p style="text-align:center;">Python 3.7.7</td>
      <td><p style="text-align:center;">Tensorflow 1.15</td>
      <td><p style="text-align:center;">numpy, etc.,</td>
      <td><p style="text-align:center;">We suggest creating your environment with Anaconda (or similar),<br>since it installs many of the required packages automatically<br>建議採用Anaconda建立環境, 可節省諸多package安裝過程</td>
    </tr>
    <tr>
      <td><p style="text-align:center;"> </td>
      <td><p style="text-align:center;"> </td>
      <td><p style="text-align:center;">OpenCV '4.4.0'</td>
      <td><p style="text-align:center;">Suggest to install the package by .whl file<br>建議下載whl檔案安裝, 透過Anaconda pip安裝的版本無法正常結束影片</td>
    </tr>
      <tr>
      <td><p style="text-align:center;"> </td>
      <td><p style="text-align:center;"> </td>
      <td><p style="text-align:center;"> </td>
      <td><p style="text-align:center;">we will use pretrained weights from David to run the model<br>我們會用David Sandberg已經學習好的參數(det1/det2/det3)來跑模型</td>
    </tr>
  </tbody>
</table>

In [1]:
""" Tensorflow implementation of the face detection / alignment algorithm found at
https://github.com/kpzhang93/MTCNN_face_detection_alignment
"""
# MIT License
# 
# Copyright (c) 2016 David Sandberg
# 
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# 
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from six import string_types, iteritems

import numpy as np
import tensorflow as tf
#from math import floor
import cv2
import os
import imageio                         # try this if you got error on 'scipy.misc' has no attribute 'imread'
import matplotlib.pyplot as plt

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def layer(op):
    """Decorator for composable network layers.

    The wrapped method ``op(self, layer_input, *args, **kwargs)`` is turned
    into a chainable call: the wrapper picks the layer input from
    ``self.terminals``, runs ``op``, stores the result in ``self.layers``
    under the layer name, feeds the result as the next input and returns
    ``self``.

    The wrapper keeps the metadata (``__name__``, ``__doc__``) of ``op``.

    Raises:
        RuntimeError: when the layer is invoked while ``self.terminals`` is empty.
    """
    import functools  # local import so the block does not depend on file-level imports

    @functools.wraps(op)
    def layer_decorated(self, *args, **kwargs):
        # Automatically set a name if not provided (auto names are derived
        # from the wrapped method's own name).
        name = kwargs.setdefault('name', self.get_unique_name(op.__name__))
        # Figure out the layer inputs: a single terminal is passed as-is,
        # several terminals are passed as a list.
        num_inputs = len(self.terminals)
        if num_inputs == 0:
            raise RuntimeError('No input variables found for layer %s.' % name)
        if num_inputs == 1:
            layer_input = self.terminals[0]
        else:
            layer_input = list(self.terminals)
        # Perform the operation and get the output.
        layer_output = op(self, layer_input, *args, **kwargs)
        # Add to layer LUT.
        self.layers[name] = layer_output
        # This output is now the input for the next layer.
        self.feed(layer_output)
        # Return self for chained calls.
        return self

    return layer_decorated

In [3]:
class Network(object):
    """Base class for chainable TensorFlow network definitions.

    Subclasses implement ``setup`` and describe their graph by chaining the
    ``@layer``-decorated methods defined here: ``conv``, ``prelu``,
    ``max_pool``, ``fc`` and ``softmax``. The remaining methods are helpers:
    ``load``, ``feed``, ``get_output``, ``get_unique_name``, ``make_var`` and
    ``validate_padding``.

    The layer components are thin hand-written wrappers around TensorFlow
    ops; they mainly fix the variable names ('weights', 'biases', 'alpha')
    under which pretrained parameters are later looked up by ``load``.
    """

    def __init__(self, inputs, trainable=True):
        """Store the inputs and build the network by calling ``setup``.

        inputs: mapping of input names to input nodes (copied into the layer table)
        trainable: forwarded to every variable created through ``make_var``
        """
        # The input nodes for this network
        self.inputs = inputs
        # The current list of terminal nodes
        self.terminals = []
        # Mapping from layer names to layers
        self.layers = dict(inputs)
        # If true, the resulting variables are set as trainable
        self.trainable = trainable

        self.setup()

    def setup(self):
        """Construct the network. """
        raise NotImplementedError('Must be implemented by the subclass.')

    def load(self, data_path, session, ignore_missing=False):
        """Load network weights.

        data_path: The path to the numpy-serialized network weights
        session: The current TensorFlow session
        ignore_missing: If true, serialized weights for missing layers are ignored.

        The file is expected to contain a dict of the form
        ``{op_name: {param_name: value}}``; each value is assigned to the
        existing variable ``op_name/param_name``.
        """
        # The dict is stored as a pickled object array, so allow_pickle must
        # be enabled explicitly (NumPy >= 1.16.3 defaults it to False).
        # NOTE(security): unpickling can execute arbitrary code; only load
        # weight files from a trusted source.
        data_dict = np.load(data_path, allow_pickle=True, encoding='latin1').item()  # pylint: disable=no-member

        for op_name in data_dict:
            with tf.variable_scope(op_name, reuse=True):
                for param_name, data in iteritems(data_dict[op_name]):
                    try:
                        var = tf.get_variable(param_name)
                        session.run(var.assign(data))
                    except ValueError:
                        # Swallowed only when the caller asked to ignore
                        # weights that have no matching variable.
                        if not ignore_missing:
                            raise

    def feed(self, *args):
        """Set the input(s) for the next operation by replacing the terminal nodes.
        The arguments can be either layer names or the actual layers.
        """
        assert len(args) != 0
        self.terminals = []
        for fed_layer in args:
            if isinstance(fed_layer, string_types):
                # A string is treated as the name of a previously built layer.
                try:
                    fed_layer = self.layers[fed_layer]
                except KeyError:
                    raise KeyError('Unknown layer name fed: %s' % fed_layer)
            self.terminals.append(fed_layer)
        return self

    def get_output(self):
        """Returns the current network output."""
        return self.terminals[-1]

    def get_unique_name(self, prefix):
        """Returns an index-suffixed unique name for the given prefix.
        This is used for auto-generating layer names based on the type-prefix.
        """
        ident = sum(layer_name.startswith(prefix) for layer_name in self.layers) + 1
        return '%s_%d' % (prefix, ident)

    def make_var(self, name, shape):
        """Creates a new TensorFlow variable."""
        return tf.get_variable(name, shape, trainable=self.trainable)

    def validate_padding(self, padding):
        """Verifies that the padding is one of the supported ones."""
        assert padding in ('SAME', 'VALID')

    @layer
    def conv(self,
             inp,
             k_h,     # kernel height
             k_w,     # kernel width
             c_o,     # number of output channels (filters)
             s_h,     # stride along the height
             s_w,     # stride along the width
             name,    # variable scope; the weights are stored/looked up under this name
             relu=True,
             padding='SAME',   # note: the default padding is 'SAME'
             group=1,
             biased=True):
        """2-D convolution with optional bias and optional ReLU.

        Creates the variables ``<name>/weights`` and, when ``biased`` is
        true, ``<name>/biases``.
        """
        # Verify that the padding is acceptable
        self.validate_padding(padding)
        # Get the number of channels in the input
        c_i = int(inp.get_shape()[-1])
        # Verify that the grouping parameter is valid
        # NOTE(review): `group` only shrinks the kernel's input-channel
        # dimension below; no per-group convolution is performed, so values
        # other than 1 are presumably unsupported - confirm before relying on it.
        assert c_i % group == 0
        assert c_o % group == 0

        with tf.variable_scope(name) as scope:
            kernel = self.make_var('weights', shape=[k_h, k_w, c_i // group, c_o])
            # Convolve the input with the kernel (w * a)
            output = tf.nn.conv2d(input=inp, filter=kernel, strides=[1, s_h, s_w, 1], padding=padding)
            # Add the biases (w * a + b)
            if biased:
                biases = self.make_var('biases', [c_o])
                output = tf.nn.bias_add(output, biases)
            if relu:
                # ReLU non-linearity
                output = tf.nn.relu(output, name=scope.name)
            return output

    @layer
    def prelu(self, inp, name):
        """Parametric ReLU: ``relu(x) - alpha * relu(-x)``.

        ``alpha`` is a variable with one entry per channel (last axis of
        ``inp``), created as ``<name>/alpha``.
        """
        with tf.variable_scope(name):
            i = int(inp.get_shape()[-1])
            alpha = self.make_var('alpha', shape=(i,))
            output = tf.nn.relu(inp) + tf.multiply(alpha, -tf.nn.relu(-inp))
        return output

    @layer
    def max_pool(self, inp, k_h, k_w, s_h, s_w, name, padding='SAME'):
        """Max pooling with a ``k_h`` x ``k_w`` window and ``s_h`` x ``s_w`` strides.

        Note that the default padding is 'SAME'.
        """
        self.validate_padding(padding)
        return tf.nn.max_pool(value = inp,
                              ksize=[1, k_h, k_w, 1],
                              strides=[1, s_h, s_w, 1],
                              padding=padding,
                              name=name)

    @layer
    def fc(self, inp, num_out, name, relu=True):
        """Fully connected layer with ``num_out`` outputs and optional ReLU.

        A rank-4 input is flattened to ``[-1, dim]`` first. Creates the
        variables ``<name>/weights`` and ``<name>/biases``.
        """
        with tf.variable_scope(name):
            input_shape = inp.get_shape()                        # e.g. [None, 3, 3, 64]
            if input_shape.ndims == 4:
                # The input is spatial. Vectorize it first.
                dim = 1
                for d in input_shape[1:].as_list():             # e.g. d = 3, 3, 64
                    dim *= int(d)                               # e.g. dim = 3 * 3 * 64
                feed_in = tf.reshape(inp, [-1, dim])            # e.g. [None, 3, 3, 64] -> [None, 576]
            else:
                feed_in, dim = (inp, input_shape[-1].value)
            weights = self.make_var('weights', shape=[dim, num_out])   # e.g. shape = [3 * 3 * 64, 128]
            biases = self.make_var('biases', [num_out])                # e.g. shape = [128]
            # With relu: relu(x * w + b); without: x * w + b
            op = tf.nn.relu_layer if relu else tf.nn.xw_plus_b
            fc = op(feed_in, weights, biases, name=name)
            return fc

    @layer
    def softmax(self, target, axis, name=None):
        """Multi dimensional softmax,
        refer to https://github.com/tensorflow/tensorflow/issues/210
        compute softmax along the dimension of target
        the native softmax only supports batch_size x dimension
        """
        # Subtract the per-axis maximum before exponentiating.
        max_axis = tf.reduce_max(target, axis, keepdims=True)
        target_exp = tf.exp(target-max_axis)
        normalize = tf.reduce_sum(target_exp, axis, keepdims=True)
        softmax = tf.div(target_exp, normalize, name)
        return softmax

### Proposal Network (P-Net)
<br>To obtain the candidate windows and their bounding box regression vectors in a similar manner as <a href="https://arxiv.org/abs/1502.02766">Deep Dense Face Detector </a>(DDFD)
<br>Then we use the estimated bounding box regression vectors to calibrate the candidates. After that, we employ non-maximum suppression (NMS) to merge highly overlapped candidates.
<br>全積捲網路(No FC layer)，主要可以對應不同輸入尺寸，最大特色就是取代了sliding window(下方說明)
<br>於PNet, RNet, ONet 三層中層數最淺，主要功能是僅可能把判定可能存在人臉的位置框選出來
<p style="text-align:left;"><img src="images/MTCNN_Architectures_4.png"  style="width:636px;height:174px;" align="left">
<br><br><br><br><br><br><br><br>
<br>Since we jointly perform face detection and alignment, here we use four different kinds of data annotation in our training process:
<br>(i)   Negatives: Regions whose Intersection-over-Union (IoU) ratio is less than 0.3 to any ground-truth face
<br>(ii)  Positives: IoU above 0.65 to a ground truth face
<br>(iii) Part faces: IoU between 0.4 and 0.65 to a ground truth face
<br>(iv)  Landmark faces: faces labeled 5 landmarks’ positions.
训练数据由四部分组成：
<br>Negatives图像 : 跟ground-truth相比iou值小于0.3
<br>Positives图像 : 跟ground-truth相比iou值大于0.65
<br>Part faces图像 : 跟ground-truth相比iou值介于0.4和0.65之间
<br>Landmark faces的是有標註 5 個faces landmark位置的图像。
<br>
<br>Negatives and positives are used for face classification tasks.
<br>positives and part faces are used for bounding box regression.
<br>Landmark faces are used for facial landmark localization.
<br>Negatives & positives : 主要用來做face classification tasks 的cost計算. (P-net)
<br>positives & part faces : 用來做bounding box regression 的cost計算.  (R-net)
<br>Landmark faces : 用來做facial landmark localization 的cost計算.       (O-net)
<br>上述各種cost(或lost)計算, 例如:是否存在人脸的类别损失只通过neg和pos数据来对参数进行更新，
<br>具體方式是通过label中的类别值做了一个遮罩来划分数据，只计算neg和pos的损失，不计算其他数据的损失
<br>论文中有个小技巧就是只通过前70%的数据进行更新参数，说是模型准确率会有提升，在代码中也都有体现，具体实现可以参考代码。
<br>
<br>The training data for each network is described as follows:
<br>P-Net : We randomly crop several patches from WIDER FACE <a href="https://arxiv.org/abs/1511.06523">[24]</a> to collect positives, negatives and part face. Then, we crop faces from CelebA <a href="https://arxiv.org/abs/1411.7766">[23]</a> as landmark faces.
<br>R-Net: We use first stage of our framework to detect faces from WIDER FACE <a href="https://arxiv.org/abs/1511.06523">[24]</a> to collect positives, negatives and part face while landmark faces are detected from CelebA <a href="https://arxiv.org/abs/1411.7766">[23]</a>.
<br>O-Net: Similar to R-Net to collect data but we use first two stages of our framework to detect faces.
<br>

PNet的输入都resize成12x12的输入，通過PNet得到了
<br>1. Classification Softmax結果 : 是否有人脸的概率(code中, convolution得到的1x1x2結果, 會再做一個softmax), 維度為[batch,2]。
<br>2. Box_Bounding結果 :  即人臉框的資訊(x,y,w,h), 維度為[batch,4]。
<br>3. Face_Landmark結果 :  即人臉Face_Landmark的資訊, 5個點的(x,y), 維度為[batch,10]。
<br>
<br>利用下圖說明Fully Convolution Network(FCN)的優勢(No FC layer)，為什麼說取代了sliding window
<br>首先注意到PNet input為12x12x3, 最後一層特徵圖為1x1x32 ([12, 12, 3] -> [10, 10, 10] -> [5, 5, 10] -> [3, 3, 16] -> [1, 1, 32])
<br>但是, 下一層的RNet input卻是24x24x3, why? 哪來的?
<br>為了方便說明, 這邊我們假設原始照片就是24x24x3, 因此RNet輸入24x24沒問題, 那PNet的12x12是哪來的?<br>
<br>現在我們先回到傳統的sliding window作法, 用一個12x12的slider window去跑, 設定stride=2, 那麼我們總共要跑7x7=49次sliding window
<br>如果我們直接把24x24的數據丟進去這個FCN, 我們會得到[24, 24, 3] -> [22, 22, 10] -> [11, 11, 10] -> [9, 9, 16] -> [7, 7, 32]
<br>因此如果為了簡化訓練資料, 直接用12x12來訓練FCN的PNet, 當我用24x24的資料丟進去時, 得到的7x7結果, 就相當於49個sliding window的結果, 一次完成
<br>各種不同输入尺度，都可以依此類推，取代sliding window, 如果這49個結果中, 某一個結果判定可能有目標, 再利用P-Net的BB結果回到24x24原圖把框框畫出來
<br><br>所以PNet的輸入設定12x12，主要為了速度考量，FCN的規劃也允許 R-Net / O-Net 可以用不同尺寸輸入。
<br>一張圖片輸入到PNet中會得到[1, 1, 2], [1, 1, 4], [1, 1, 10]的预测值，類似yolo Anchor box。
<br><img src="images/P-Net.png" style="width:425px;height:166px;" align="left">

In [4]:
class PNet(Network):
    """Proposal network (P-Net) built only from conv / PReLU / max-pool layers.

    Outputs (by layer name):
      * 'prob1'   - softmax over the 2 channels of 'conv4-1' (face classification)
      * 'conv4-2' - 4-channel convolution output (bounding-box regression)
    """

    def setup(self):
        """Build the shared trunk and the two output heads.

        The shapes in the comments are for a single 12x12x3 input.
        """
        # --- shared trunk -------------------------------------------------
        self.feed('data')  # pylint: disable=no-value-for-parameter, no-member
        # 3x3 conv, 10 filters, stride 1, VALID: 12x12x3 -> 10x10x10
        self.conv(3, 3, 10, 1, 1, padding='VALID', relu=False, name='conv1')
        self.prelu(name='PReLU1')
        # 2x2 max-pool, stride 2 (default padding 'SAME'): 10x10x10 -> 5x5x10
        self.max_pool(2, 2, 2, 2, name='pool1')
        # 3x3 conv, 16 filters, stride 1, VALID: 5x5x10 -> 3x3x16
        self.conv(3, 3, 16, 1, 1, padding='VALID', relu=False, name='conv2')
        self.prelu(name='PReLU2')
        # 3x3 conv, 32 filters, stride 1, VALID: 3x3x16 -> 1x1x32
        self.conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv3')
        self.prelu(name='PReLU3')

        # --- head 1: face classification -----------------------------------
        # 1x1 conv, 2 filters (default padding 'SAME'): 1x1x32 -> 1x1x2
        self.conv(1, 1, 2, 1, 1, relu=False, name='conv4-1')
        # softmax along axis 3 (the 2-channel axis), not "3 classes"
        self.softmax(3, name='prob1')

        # --- head 2: bounding-box regression --------------------------------
        # Branch off the trunk again at 'PReLU3'; 1x1 conv, 4 filters.
        self.feed('PReLU3')  # pylint: disable=no-value-for-parameter
        self.conv(1, 1, 4, 1, 1, relu=False, name='conv4-2')

## Refine Network (R-Net)
<br>Further rejects a large number of false candidates, performs calibration with bounding box regression, and NMS candidate merge.
<br>經過第一輪的PNet選秀後, 選出眾多判定可能人臉的bounding box, 篩選出分數比 設定值threshold[0] 高的, 再將這些分數較高的進行non-maximum suppression,<br> 淘汰非最大值且IoU高於設定值(0.5)的圖片, 再進到第二輪RNet再判定一次，但要resize成24x24的作為RNet輸入。
<br>
<p style="text-align:left;"><img src="images/MTCNN_Architectures_5.png" style="width:636px;height:174px;" align="left">
<br>
<br><img src="images/R-Net.png" style="width:425px;height:166px;" align="left">

In [5]:
class RNet(Network):
    """Refine network (R-Net): conv trunk followed by fully connected heads.

    Outputs (by layer name):
      * 'prob1'   - softmax over axis 1 of the 2-unit layer 'conv5-1'
      * 'conv5-2' - 4-unit fully connected output (bounding-box regression)
    """

    def setup(self):
        """Build the shared trunk and the two output heads."""
        # --- shared trunk -------------------------------------------------
        self.feed('data')  # pylint: disable=no-value-for-parameter, no-member
        self.conv(3, 3, 28, 1, 1, padding='VALID', relu=False, name='conv1')
        self.prelu(name='prelu1')
        # Default padding 'SAME' is used here, which is why a 22x22 map
        # pools to 11x11 (ceil(22 / 2)).
        self.max_pool(3, 3, 2, 2, name='pool1')
        self.conv(3, 3, 48, 1, 1, padding='VALID', relu=False, name='conv2')
        self.prelu(name='prelu2')
        self.max_pool(3, 3, 2, 2, padding='VALID', name='pool2')
        self.conv(2, 2, 64, 1, 1, padding='VALID', relu=False, name='conv3')
        self.prelu(name='prelu3')
        # Despite the 'conv4' name this is a fully connected layer.
        self.fc(128, relu=False, name='conv4')
        self.prelu(name='prelu4')

        # --- head 1: face classification -----------------------------------
        self.fc(2, relu=False, name='conv5-1')
        self.softmax(1, name='prob1')

        # --- head 2: bounding-box regression --------------------------------
        self.feed('prelu4')  # pylint: disable=no-value-for-parameter
        self.fc(4, relu=False, name='conv5-2')

## Output Network (O-Net)
<br>Similar to the second stage(R-Net), but in this stage we aim to describe the face in more details. 
<br>In particular, the network will output five facial landmarks’ positions.
<p style="text-align:left;"><img src="images/MTCNN_Architectures_6.png" style="width:636px;height:174px;" align="left">
<br><p style="text-align:left;"><img src="images/O-Net.png" style="width:692px;height:169px;" align="left">

In [6]:
class ONet(Network):
    """Output network (O-Net): conv trunk followed by three fully connected heads.

    Outputs (by layer name):
      * 'prob1'   - softmax over axis 1 of the 2-unit layer 'conv6-1'
      * 'conv6-2' - 4-unit fully connected output (bounding-box regression)
      * 'conv6-3' - 10-unit fully connected output (presumably the five
                    facial landmarks as coordinate pairs)
    """

    def setup(self):
        """Build the shared trunk and the three output heads."""
        # --- shared trunk -------------------------------------------------
        self.feed('data')  # pylint: disable=no-value-for-parameter, no-member
        self.conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv1')
        self.prelu(name='prelu1')
        self.max_pool(3, 3, 2, 2, name='pool1')                    # default padding 'SAME'
        self.conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv2')
        self.prelu(name='prelu2')
        self.max_pool(3, 3, 2, 2, padding='VALID', name='pool2')
        self.conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv3')
        self.prelu(name='prelu3')
        self.max_pool(2, 2, 2, 2, name='pool3')                    # default padding 'SAME'
        self.conv(2, 2, 128, 1, 1, padding='VALID', relu=False, name='conv4')
        self.prelu(name='prelu4')
        # Despite the 'conv5' name this is a fully connected layer.
        self.fc(256, relu=False, name='conv5')
        self.prelu(name='prelu5')

        # --- head 1: face classification -----------------------------------
        self.fc(2, relu=False, name='conv6-1')
        self.softmax(1, name='prob1')

        # --- head 2: bounding-box regression --------------------------------
        self.feed('prelu5')  # pylint: disable=no-value-for-parameter
        self.fc(4, relu=False, name='conv6-2')

        # --- head 3: facial landmarks ---------------------------------------
        self.feed('prelu5')  # pylint: disable=no-value-for-parameter
        self.fc(10, relu=False, name='conv6-3')

In [7]:
# function [boundingbox] = bbreg(boundingbox,reg)
def bbreg(boundingbox, reg):
    """Calibrate bounding boxes with regression offsets.

    boundingbox: 2-D array whose first four columns are x1, y1, x2, y2;
                 it is modified in place and also returned.
    reg: per-box offsets, one column per corner coordinate, expressed as a
         fraction of the box width (x) or height (y). An array whose second
         dimension is 1 is first reshaped to (shape[2], shape[3]).
    """
    if reg.shape[1] == 1:
        reg = np.reshape(reg, (reg.shape[2], reg.shape[3]))

    # Box extents use the inclusive-pixel convention (+1).
    box_w = boundingbox[:, 2] - boundingbox[:, 0] + 1
    box_h = boundingbox[:, 3] - boundingbox[:, 1] + 1

    # Shift every corner by its offset scaled with the matching extent.
    calibrated = np.column_stack([
        boundingbox[:, 0] + reg[:, 0] * box_w,
        boundingbox[:, 1] + reg[:, 1] * box_h,
        boundingbox[:, 2] + reg[:, 2] * box_w,
        boundingbox[:, 3] + reg[:, 3] * box_h,
    ])
    boundingbox[:, 0:4] = calibrated
    return boundingbox
 
def generateBoundingBox(imap, reg, scale, t):
    """Use the P-Net heatmap to generate candidate bounding boxes.

    imap: 2-D face-probability map (one value per sliding-window position)
    reg: 3-D bounding-box regression map; the last axis holds the 4 offsets
    scale: scale at which the image was fed to P-Net; window coordinates are
           divided by it to map them back to the original image
    t: score threshold; only positions with ``imap >= t`` are kept

    Typical call (taken from the original docstring):
        boxes, _ = generateBoundingBox(out1[0,:,:,1].copy(), out0[0,:,:,:].copy(), scale, threshold[0])

    Returns ``(boundingbox, reg)``:
        boundingbox: (N, 9) array = [window start (2), window end (2), score, reg (4)]
        reg: (N, 4) array with the regression offsets of the kept positions
    For N == 0 the shapes are (0, 9) and (0, 4), i.e. the same column counts
    as for non-empty results.
    """
    stride = 2      # stride between neighbouring windows (P-Net has one stride-2 max-pool)
    cellsize = 12   # side length of the window covered by one heatmap cell

    # Both maps are transposed before indexing, so (y, x) below index the
    # transposed maps.
    imap = np.transpose(imap)
    dx1 = np.transpose(reg[:, :, 0])
    dy1 = np.transpose(reg[:, :, 1])
    dx2 = np.transpose(reg[:, :, 2])
    dy2 = np.transpose(reg[:, :, 3])
    y, x = np.where(imap >= t)     # positions whose score reaches the threshold

    if y.shape[0] == 1:
        # NOTE(review): the offset maps are flipped vertically when exactly
        # one position passes the threshold. The reason is not visible here
        # (the original author also questioned it); kept unchanged to
        # preserve behaviour.
        dx1 = np.flipud(dx1)
        dy1 = np.flipud(dy1)
        dx2 = np.flipud(dx2)
        dy2 = np.flipud(dy2)
    score = imap[(y, x)]
    # One row per kept position, columns dx1, dy1, dx2, dy2.
    reg = np.transpose(np.vstack([dx1[(y, x)], dy1[(y, x)], dx2[(y, x)], dy2[(y, x)]]))

    if reg.size == 0:
        # No position passed the threshold: keep the 4-column layout so the
        # result stacks cleanly with non-empty results.
        reg = np.empty((0, 4))
    bb = np.transpose(np.vstack([y, x]))                    # indices of the kept positions
    q1 = np.fix((stride * bb + 1) / scale)                  # window start in original-image coordinates
    q2 = np.fix((stride * bb + cellsize - 1 + 1) / scale)   # window end = start + cellsize, rescaled
    boundingbox = np.hstack([q1, q2, np.expand_dims(score, 1), reg])
    return boundingbox, reg
 
# function pick = nms(boxes,threshold,type)
def nms(boxes, threshold, method):
    """Greedy non-maximum suppression.

    boxes: 2-D array with columns x1, y1, x2, y2, score (further columns are ignored)
    threshold: boxes overlapping an already picked box by more than this are dropped
    method: 'Min' divides the intersection by the smaller of the two box
            areas; any other value uses intersection-over-union

    Returns the indices of the kept boxes, highest score first. For an empty
    ``boxes`` an empty (0, 3) array is returned (callers only check ``.size``).
    """
    if boxes.size == 0:
        return np.empty((0, 3))
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    s = boxes[:, 4]
    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = np.argsort(s)                    # candidate indices, ascending by score
    # Index dtype: int16 would overflow for more than 32767 boxes.
    pick = np.zeros_like(s, dtype=np.intp)
    counter = 0
    while order.size > 0:
        i = order[-1]                        # best remaining candidate
        pick[counter] = i
        counter += 1
        idx = order[0:-1]                    # all other remaining candidates
        # Intersection rectangle between the picked box and each remaining box.
        xx1 = np.maximum(x1[i], x1[idx])
        yy1 = np.maximum(y1[i], y1[idx])
        xx2 = np.minimum(x2[i], x2[idx])
        yy2 = np.minimum(y2[i], y2[idx])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        # Compare by value: identity comparison with a string literal is
        # unreliable and could silently select the IoU branch.
        if method == 'Min':
            o = inter / np.minimum(area[i], area[idx])
        else:
            o = inter / (area[i] + area[idx] - inter)
        # Keep only candidates whose overlap with the picked box is small enough.
        order = order[np.where(o <= threshold)]
    return pick[0:counter]

# function [dy edy dx edx y ey x ex tmpw tmph] = pad(total_boxes,w,h)
def pad(total_boxes, w, h):
    """Compute the padding coordinates (pad the bounding boxes to square).

    total_boxes: 2-D array whose first four columns are x1, y1, x2, y2
    w, h: width and height of the image the boxes refer to

    Returns ``dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph`` (int32 arrays, one
    entry per box). Coordinates are 1-based: (y, ey, x, ex) is the box
    clipped to the image (source region), (dy, edy, dx, edx) is where that
    region lands inside a ``tmph`` x ``tmpw`` patch (target region), e.g.

        tmp = np.zeros((tmph[k], tmpw[k], 3))
        tmp[dy[k]-1:edy[k], dx[k]-1:edx[k], :] = img[y[k]-1:ey[k], x[k]-1:ex[k], :]
    """
    # Box sizes (inclusive-pixel convention).
    tmpw = (total_boxes[:, 2] - total_boxes[:, 0] + 1).astype(np.int32)
    tmph = (total_boxes[:, 3] - total_boxes[:, 1] + 1).astype(np.int32)
    numbox = total_boxes.shape[0]

    # Target region defaults to the full patch.
    dx = np.ones(numbox, dtype=np.int32)
    dy = np.ones(numbox, dtype=np.int32)
    edx = tmpw.copy()
    edy = tmph.copy()

    # Source region defaults to the box itself (astype returns new arrays).
    x = total_boxes[:, 0].astype(np.int32)
    y = total_boxes[:, 1].astype(np.int32)
    ex = total_boxes[:, 2].astype(np.int32)
    ey = total_boxes[:, 3].astype(np.int32)

    # Right edge beyond the image: shrink the target and clip the source.
    past_right = ex > w
    edx[past_right] = -ex[past_right] + w + tmpw[past_right]
    ex[past_right] = w

    # Bottom edge beyond the image.
    past_bottom = ey > h
    edy[past_bottom] = -ey[past_bottom] + h + tmph[past_bottom]
    ey[past_bottom] = h

    # Left edge before the first pixel: start the target further in.
    before_left = x < 1
    dx[before_left] = 2 - x[before_left]
    x[before_left] = 1

    # Top edge before the first pixel.
    before_top = y < 1
    dy[before_top] = 2 - y[before_top]
    y[before_top] = 1

    return dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph

# function [bboxA] = rerec(bboxA)
def rerec(bboxA):
    """Convert bboxA to square.

    Each box (columns x1, y1, x2, y2) is replaced by a square whose side is
    the longer of its width and height, centred on the original box centre.
    ``bboxA`` is modified in place and also returned.
    """
    box_h = bboxA[:, 3] - bboxA[:, 1]
    box_w = bboxA[:, 2] - bboxA[:, 0]
    side = np.maximum(box_w, box_h)
    # Move the top-left corner so the centre stays where it was.
    bboxA[:, 0] = bboxA[:, 0] + box_w * 0.5 - side * 0.5
    bboxA[:, 1] = bboxA[:, 1] + box_h * 0.5 - side * 0.5
    # Bottom-right corner = top-left corner + side (broadcast over x and y).
    bboxA[:, 2:4] = bboxA[:, 0:2] + side[:, np.newaxis]
    return bboxA

def imresample(img, sz):
    """Resize ``img`` to ``sz`` = (height, width) using OpenCV area interpolation."""
    target_h, target_w = sz[0], sz[1]
    # cv2.resize expects the target size as (width, height), i.e. in the
    # opposite order of ``sz``. Other available interpolation flags:
    # INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_LANCZOS4.
    return cv2.resize(img, (target_w, target_h), interpolation=cv2.INTER_AREA)  #@UndefinedVariable

    # This method is kept for debugging purpose
#     h=img.shape[0]
#     w=img.shape[1]
#     hs, ws = sz
#     dx = float(w) / ws
#     dy = float(h) / hs
#     im_data = np.zeros((hs,ws,3))
#     for a1 in range(0,hs):
#         for a2 in range(0,ws):
#             for a3 in range(0,3):
#                 im_data[a1,a2,a3] = img[int(floor(a1*dy)),int(floor(a2*dx)),a3]
#     return im_data


In [8]:
def create_mtcnn(sess, model_path):
    """Build P-Net, R-Net and O-Net in ``sess``'s graph and load their weights.

    sess: TensorFlow session used to assign the pretrained weights and to run
          the returned functions
    model_path: directory containing det1.npy, det2.npy and det3.npy (all
                three must be in the same folder); when falsy, the current
                working directory is used

    Returns ``(pnet_fun, rnet_fun, onet_fun)``; each takes an image batch and
    returns the network's output arrays via ``sess.run``.
    """
    if not model_path:
        # The upstream code used the directory of this file
        # (os.path.split(os.path.realpath(__file__))); __file__ is not
        # available when running in a notebook, so fall back to the cwd.
        model_path = os.getcwd()

    # Temporarily make np.load default to allow_pickle=True / encoding='latin1'
    # so the pickled weight dictionaries can be read. The previous defaults
    # are saved and restored in `finally`, so a failure while loading cannot
    # leave the process-wide np.load patched.
    saved_load_defaults = np.load.__defaults__
    np.load.__defaults__ = (None, True, True, 'latin1')
    try:
        with tf.variable_scope('pnet'):
            # P-Net is fully convolutional, hence the free spatial dimensions.
            data = tf.placeholder(tf.float32, (None,None,None,3), 'input')
            pnet = PNet({'data':data})
            pnet.load(os.path.join(model_path, 'det1.npy'), sess)
        with tf.variable_scope('rnet'):
            data = tf.placeholder(tf.float32, (None,24,24,3), 'input')
            rnet = RNet({'data':data})
            rnet.load(os.path.join(model_path, 'det2.npy'), sess)
        with tf.variable_scope('onet'):
            data = tf.placeholder(tf.float32, (None,48,48,3), 'input')
            onet = ONet({'data':data})
            onet.load(os.path.join(model_path, 'det3.npy'), sess)
    finally:
        np.load.__defaults__ = saved_load_defaults

    # Outputs are fetched by tensor name; the names follow from the variable
    # scopes above and the layer names used inside each network.
    pnet_fun = lambda img : sess.run(('pnet/conv4-2/BiasAdd:0', 'pnet/prob1:0'), feed_dict={'pnet/input:0':img})
    rnet_fun = lambda img : sess.run(('rnet/conv5-2/conv5-2:0', 'rnet/prob1:0'), feed_dict={'rnet/input:0':img})
    onet_fun = lambda img : sess.run(('onet/conv6-2/conv6-2:0', 'onet/conv6-3/conv6-3:0', 'onet/prob1:0'), feed_dict={'onet/input:0':img})
    return pnet_fun, rnet_fun, onet_fun

### Create scale pyramid and detect
<br>initially resize it to different scales to build an image pyramid, which is the input of the following three-stage cascaded framework
<br><p style="text-align:left;"><img src="images/MTCNN_Architectures_3.png"  style="width:561px;height:196px;" align="left">

In [9]:
def detect_face(img, minsize, pnet, rnet, onet, threshold, factor):
    """Detect faces in an image with the three-stage MTCNN cascade.

    Args:
        img: input image array indexed as [row, column, channel].
        minsize: minimum face size to look for, in pixels.
        pnet, rnet, onet: callables running the three networks on a batch.
        threshold: [th1, th2, th3], the score thresholds of the three stages.
        factor: scale factor between two consecutive pyramid levels.

    Returns:
        A ``(total_boxes, points)`` tuple.  ``total_boxes`` has one row per
        face, laid out as x1, y1, x2, y2, score once stage 1 produced at
        least one candidate (it is an empty (0, 9) array otherwise).
        ``points`` holds the O-Net landmark output, one column per face
        (an empty array when stage 3 did not run).  A degenerate crop in
        stage 2 or 3 yields an empty (0, 5) / (10, 0) pair.
    """
    factor_count=0
    total_boxes=np.empty((0,9))
    points=np.empty(0)
    h=img.shape[0]        # image height
    w=img.shape[1]        # image width
    minl=np.amin([h, w])  # shortest image edge
    m=12.0/minsize        # scale that maps a `minsize` face onto the 12-pixel window used below
    minl=minl*m           # shortest edge after that first rescale

    # Build the list of pyramid scales: m, m*factor, m*factor^2, ...
    # until the shortest edge would drop below 12 pixels.
    scales=[]
    while minl>=12:
        scales += [m*np.power(factor, factor_count)]
        minl = minl*factor
        factor_count += 1

    # first stage: P-Net on every pyramid level
    for scale in scales:
        hs=int(np.ceil(h*scale))                      # height of this pyramid level
        ws=int(np.ceil(w*scale))                      # width of this pyramid level
        im_data = imresample(img, (hs, ws))           # resampled copy of the image at this scale
        im_data = (im_data-127.5)*0.0078125           # centre on zero and scale by 1/128
        img_x = np.expand_dims(im_data, 0)            # add a leading batch axis
        img_y = np.transpose(img_x, (0,2,1,3))        # swap the two spatial axes before the network
        out = pnet(img_y)                             # out[0]: box regression, out[1]: face probabilities
        out0 = np.transpose(out[0], (0,2,1,3))        # swap the spatial axes back
        out1 = np.transpose(out[1], (0,2,1,3))        # channel 1 is used as the "face" score below

        boxes, _ = generateBoundingBox(out1[0,:,:,1].copy(), out0[0,:,:,:].copy(), scale, threshold[0])

        # non-maximum suppression within this scale
        pick = nms(boxes.copy(), 0.5, 'Union')
        if boxes.size>0 and pick.size>0:
            boxes = boxes[pick,:]
            total_boxes = np.append(total_boxes, boxes, axis=0)

    numbox = total_boxes.shape[0]
    if numbox>0:
        # non-maximum suppression across all scales
        pick = nms(total_boxes.copy(), 0.7, 'Union')
        total_boxes = total_boxes[pick,:]
        regw = total_boxes[:,2]-total_boxes[:,0]       # candidate width
        regh = total_boxes[:,3]-total_boxes[:,1]       # candidate height
        # Shift each corner by the regression offsets stored in columns 5..8.
        qq1 = total_boxes[:,0]+total_boxes[:,5]*regw
        qq2 = total_boxes[:,1]+total_boxes[:,6]*regh
        qq3 = total_boxes[:,2]+total_boxes[:,7]*regw
        qq4 = total_boxes[:,3]+total_boxes[:,8]*regh
        total_boxes = np.transpose(np.vstack([qq1, qq2, qq3, qq4, total_boxes[:,4]])) # [x1, y1, x2, y2, score]
        total_boxes = rerec(total_boxes.copy())        # helper defined elsewhere; presumably squares the boxes
        total_boxes[:,0:4] = np.fix(total_boxes[:,0:4]).astype(np.int32)
        # Crop/paste coordinates for every box, computed against the image size.
        dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h)

    numbox = total_boxes.shape[0]
    if numbox>0:
        # second stage: R-Net on 24x24 crops
        tempimg = np.zeros((24,24,3,numbox))
        for k in range(0,numbox):
            tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3))
            # Copy the part of the box that lies inside the image into a zero canvas.
            tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:]
            if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0:
                tempimg[:,:,:,k] = imresample(tmp, (24, 24))
            else:
                # Degenerate crop: report "no faces".  The previous
                # `np.empty()` call raised TypeError (shape is mandatory) and
                # callers unpack two values.
                return np.empty((0, 5)), np.empty((10, 0))
        tempimg = (tempimg-127.5)*0.0078125            # centre on zero and scale by 1/128
        tempimg1 = np.transpose(tempimg, (3,1,0,2))    # batch axis first, spatial axes swapped
        out = rnet(tempimg1)                           # out[0]: box regression, out[1]: face probabilities
        out0 = np.transpose(out[0])
        out1 = np.transpose(out[1])
        score = out1[1,:]                              # row 1 is used as the "face" score
        ipass = np.where(score>threshold[1])
        total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)])
        mv = out0[:,ipass[0]]
        if total_boxes.shape[0]>0:
            pick = nms(total_boxes, 0.7, 'Union')
            total_boxes = total_boxes[pick,:]
            total_boxes = bbreg(total_boxes.copy(), np.transpose(mv[:,pick]))
            total_boxes = rerec(total_boxes.copy())

    numbox = total_boxes.shape[0]
    if numbox>0:
        # third stage: O-Net on 48x48 crops
        total_boxes = np.fix(total_boxes).astype(np.int32)
        dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h)
        tempimg = np.zeros((48,48,3,numbox))
        for k in range(0,numbox):
            tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3))
            tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:]
            if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0:
                tempimg[:,:,:,k] = imresample(tmp, (48, 48))
            else:
                # Same degenerate-crop handling as in the second stage.
                return np.empty((0, 5)), np.empty((10, 0))
        tempimg = (tempimg-127.5)*0.0078125
        tempimg1 = np.transpose(tempimg, (3,1,0,2))
        out = onet(tempimg1)                           # out[0]: box regression, out[1]: landmarks, out[2]: face probabilities
        out0 = np.transpose(out[0])
        out1 = np.transpose(out[1])
        out2 = np.transpose(out[2])
        score = out2[1,:]
        points = out1
        ipass = np.where(score>threshold[2])
        points = points[:,ipass[0]]
        total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)])
        mv = out0[:,ipass[0]]

        # Map the landmark outputs from box-relative values to image
        # coordinates: rows 0..4 are scaled by the box width and offset by
        # x1, rows 5..9 by the box height and y1.
        box_w = total_boxes[:,2]-total_boxes[:,0]+1
        box_h = total_boxes[:,3]-total_boxes[:,1]+1
        points[0:5,:] = np.tile(box_w,(5, 1))*points[0:5,:] + np.tile(total_boxes[:,0],(5, 1))-1
        points[5:10,:] = np.tile(box_h,(5, 1))*points[5:10,:] + np.tile(total_boxes[:,1],(5, 1))-1
        if total_boxes.shape[0]>0:
            total_boxes = bbreg(total_boxes.copy(), np.transpose(mv))
            pick = nms(total_boxes.copy(), 0.7, 'Min')
            total_boxes = total_boxes[pick,:]
            points = points[:,pick]

    return total_boxes, points


In [10]:
"""Performs face alignment and stores face thumbnails in the output directory."""
# MIT License
# 
# Copyright (c) 2016 David Sandberg
# 
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# 
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from scipy import misc
import sys
import os
import argparse                       # creat a parser to 從命令列直接讀取引數, 在jupyter運行時會因為sys.argv[1:]不為空而報錯
                                       # David Sandberg原py檔案, 是在命令列呼叫, 我們於jupyter運行會有些小問題(解法見下方)
import tensorflow as tf
import numpy as np
#import facenet                       # 部分facenet所需函式已帶入
#import detect_face                   # David Sandberg 將MTCNN 各層架構寫在此檔案 (即上方各cell 因此無須再import)
import random
from time import sleep
import subprocess                    # Popen in store_revision_info() will need it
from skimage.transform import resize  # try this if you got error on 'scipy.misc' has no attribute 'imresize', scipy.misc.imresize is deprecated
import imageio                        # scipy.misc.imsave has been deprecated in newer Scipy versions.

In [11]:
def _run_git(cmd, cwd):
    """Run the git command *cmd* in *cwd* and return its stripped stdout as text.

    If the process cannot be started, the command line followed by the OS
    error message is returned instead, so the report always gets a value.
    """
    try:
        gitproc = subprocess.Popen(cmd, stdout = subprocess.PIPE, cwd=cwd)
        (stdout, _) = gitproc.communicate()
    except OSError as e:
        return '%s: %s' % (' '.join(cmd), e.strerror)
    # communicate() returns bytes on Python 3; formatting them with '%s'
    # would write "b'...'" into the report, so decode to text first.
    return stdout.strip().decode('utf-8', 'replace')


def store_revision_info(src_path, output_dir, arg_string):
    """Write ``revision_info.txt`` describing the run into *output_dir*.

    Args:
        src_path: directory in which the git commands are executed.
        output_dir: existing directory that receives ``revision_info.txt``.
        arg_string: text recorded on the "arguments" line.

    The file contains the arguments, the TensorFlow version, the output of
    ``git rev-parse HEAD`` and the output of ``git diff HEAD``.
    """
    git_hash = _run_git(['git', 'rev-parse', 'HEAD'], src_path)   # current commit
    git_diff = _run_git(['git', 'diff', 'HEAD'], src_path)        # uncommitted local changes

    # Store a text file in the log directory.  UTF-8 is requested explicitly
    # because the decoded diff may contain non-ASCII characters.
    rev_info_filename = os.path.join(output_dir, 'revision_info.txt')
    with open(rev_info_filename, "w", encoding='utf-8') as text_file:
        text_file.write('arguments: %s\n--------------------\n' % arg_string)
        text_file.write('tensorflow version: %s\n--------------------\n' % tf.__version__)  # @UndefinedVariable
        text_file.write('git hash: %s\n--------------------\n' % git_hash)
        text_file.write('%s' % git_diff)

In [12]:
class ImageClass:
    """Pair a class name with the list of image paths that belong to it."""

    def __init__(self, name, image_paths):
        self.name, self.image_paths = name, image_paths

    def __len__(self):
        # Number of images recorded for this class.
        return len(self.image_paths)

    def __str__(self):
        # "<name>, <count> images"
        return self.name + ', {} images'.format(len(self.image_paths))

In [13]:
def get_image_paths(facedir):
    """Return the path of every entry inside *facedir*.

    An empty list is returned when *facedir* is not an existing directory.
    Entries are not filtered and keep the order given by ``os.listdir``.
    """
    if not os.path.isdir(facedir):
        return []
    return [os.path.join(facedir, entry) for entry in os.listdir(facedir)]

In [14]:
def get_dataset(path, has_class_directories=True):
    """Build one ``ImageClass`` per sub-directory of *path*.

    *path* is user-expanded first.  Only directories directly below it are
    treated as classes; they are processed in sorted name order and each
    one is paired with the paths returned by ``get_image_paths``.
    ``has_class_directories`` is accepted but not used.
    """
    root = os.path.expanduser(path)
    class_names = sorted(
        entry for entry in os.listdir(root)
        if os.path.isdir(os.path.join(root, entry))
    )

    dataset = []
    for class_name in class_names:
        facedir = os.path.join(root, class_name)
        dataset.append(ImageClass(class_name, get_image_paths(facedir)))
    return dataset

In [15]:
def to_rgb(img):
    """Replicate a 2-D image into three identical uint8 channels."""
    rows, cols = img.shape
    rgb = np.empty((rows, cols, 3), dtype=np.uint8)
    # Broadcast the single plane over the trailing channel axis.
    rgb[...] = np.expand_dims(img, axis=2)
    return rgb

In [16]:
def main(args):
    """Detect, crop and save the faces of every image under ``<cwd>/input_dir``.

    Args:
        args: namespace providing ``output_dir``, ``random_order``,
            ``detect_multiple_faces``, ``margin`` and ``image_size``.

    For each class folder of the dataset the images are read, faces are
    located with ``detect_face``, each selected face is cropped with
    ``args.margin`` extra pixels, resized to ``args.image_size`` and written
    as PNG below ``args.output_dir``.  Every processed image is also logged
    in a ``bounding_boxes_<random>.txt`` file in the output directory.

    NOTE(review): the dataset location is hard-coded to the ``input_dir``
    folder of the current working directory; ``args.input_dir`` is not read.
    """
    sleep(random.random())  # NOTE(review): presumably staggers parallel runs - confirm

    output_dir = os.path.expanduser(args.output_dir)    # where the aligned face images are stored
    if not os.path.exists(output_dir):
        # Fixed: this used the undefined name `output_dir_path` (NameError).
        os.makedirs(output_dir)

    # Store some git revision info in a text file in the log directory.
    # os.getcwd() is used because __file__ was not usable in the author's notebook setup.
    src_path = os.getcwd()
    store_revision_info(src_path, output_dir, ' '.join(sys.argv))
    dataset_path = os.getcwd()
    # One ImageClass per sub-folder of input_dir, each with the paths of its images.
    dataset = get_dataset(os.path.join(dataset_path, 'input_dir'))

    print('Creating networks and loading parameters')

    # Create a session and load the pre-trained MTCNN model to obtain the
    # Proposal (P-Net), Refine (R-Net) and Output (O-Net) network callables.
    with tf.Graph().as_default():
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
        with sess.as_default():
            pnet, rnet, onet = create_mtcnn(sess, None)

    minsize = 20 # minimum size of face
    threshold = [ 0.6, 0.7, 0.7 ]  # three steps's threshold
    factor = 0.709 # scale factor

    # Add a random key to the filename to allow alignment using multiple processes
    random_key = np.random.randint(0, high=99999)
    bounding_boxes_filename = os.path.join(output_dir, 'bounding_boxes_%05d.txt' % random_key)

    with open(bounding_boxes_filename, "w") as text_file:  # log of the boxes found in every image
        nrof_images_total = 0
        nrof_successfully_aligned = 0
        if args.random_order:
            random.shuffle(dataset)
        for cls in dataset:                                        # one entry per person/class
            output_class_dir = os.path.join(output_dir, cls.name)  # output folder of this class
            if not os.path.exists(output_class_dir):
                os.makedirs(output_class_dir)
                if args.random_order:
                    random.shuffle(cls.image_paths)
            for image_path in cls.image_paths:
                nrof_images_total += 1
                filename = os.path.splitext(os.path.split(image_path)[1])[0]
                output_filename = os.path.join(output_class_dir, filename+'.png')
                print(image_path)
                if not os.path.exists(output_filename):            # skip images that were already processed
                    try:
                        # imageio replaces the removed scipy.misc.imread
                        img = imageio.imread(image_path)
                    except (IOError, ValueError, IndexError) as e:
                        errorMessage = '{}: {}'.format(image_path, e)
                        print(errorMessage)
                    else:
                        if img.ndim<2:
                            print('Unable to align "%s"' % image_path)
                            text_file.write('%s\n' % (output_filename))
                            continue
                        if img.ndim == 2:
                            img = to_rgb(img)                      # grayscale -> 3 channels
                        img = img[:,:,0:3]                         # keep only the first three channels

                        # bounding_boxes has one row per face: x1, y1, x2, y2, score.
                        # The second return value (landmarks) is not used here.
                        bounding_boxes, _ = detect_face(img, minsize, pnet, rnet, onet, threshold, factor)
                        nrof_faces = bounding_boxes.shape[0]
                        if nrof_faces>0:
                            det = bounding_boxes[:,0:4]            # box corners only
                            det_arr = []                           # boxes selected for cropping
                            img_size = np.asarray(img.shape)[0:2]  # [height, width]
                            if nrof_faces>1:
                                if args.detect_multiple_faces:
                                    for i in range(nrof_faces):
                                        det_arr.append(np.squeeze(det[i]))
                                else:
                                    # Keep a single face: favour large boxes close to the image centre.
                                    bounding_box_size = (det[:,2]-det[:,0])*(det[:,3]-det[:,1])
                                    img_center = img_size / 2
                                    offsets = np.vstack([ (det[:,0]+det[:,2])/2-img_center[1], (det[:,1]+det[:,3])/2-img_center[0] ])
                                    offset_dist_squared = np.sum(np.power(offsets,2.0),0)
                                    index = np.argmax(bounding_box_size-offset_dist_squared*2.0)
                                    det_arr.append(det[index,:])
                            else:                                  # exactly one face
                                det_arr.append(np.squeeze(det))

                            for i, det in enumerate(det_arr):
                                det = np.squeeze(det)
                                # Grow the box by margin/2 on each side, clipped to the image.
                                bb = np.zeros(4, dtype=np.int32)
                                bb[0] = np.maximum(det[0]-args.margin/2, 0)
                                bb[1] = np.maximum(det[1]-args.margin/2, 0)
                                bb[2] = np.minimum(det[2]+args.margin/2, img_size[1])
                                bb[3] = np.minimum(det[3]+args.margin/2, img_size[0])
                                cropped = img[bb[1]:bb[3],bb[0]:bb[2],:]
                                # skimage resize replaces the removed scipy.misc.imresize
                                scaled = resize(cropped, output_shape=(args.image_size,args.image_size))
                                nrof_successfully_aligned += 1
                                filename_base, file_extension = os.path.splitext(output_filename)
                                if args.detect_multiple_faces:
                                    output_filename_n = "{}_{}{}".format(filename_base, i, file_extension)
                                else:
                                    output_filename_n = "{}{}".format(filename_base, file_extension)
                                # imageio.imwrite replaces the removed scipy.misc.imsave
                                imageio.imwrite(output_filename_n, scaled)
                                text_file.write('%s %d %d %d %d\n' % (output_filename_n, bb[0], bb[1], bb[2], bb[3]))
                        else:
                            print('Unable to align "%s"' % image_path)
                            text_file.write('%s\n' % (output_filename))

    print('Total number of images: %d' % nrof_images_total)
    print('Number of successfully aligned images: %d' % nrof_successfully_aligned)

### Create dataset input directory before you run
<br>1. Create an "input_dir" folder as the input path and put the photos to process inside it, in one sub-folder per person/class (only sub-folders are scanned) 建立 "input_dir" 資料夾, 把要輸入的照片放裡面
<br>2. Create an empty "output_dir" folder as the output path 建立 "output_dir" 資料夾, 作為結果輸出路徑(空資料夾)
<br><p style="text-align:left;"><img src="images/dir_struc.png"  style="width:861px;height:196px;" align="left">

## Usage 1. batch photo detect 批次圖片偵測

In [None]:
import argparse

def _str2bool(value):
    """Convert a command-line string such as "False" or "yes" to a bool.

    ``type=bool`` cannot be used for this: ``bool("False")`` is ``True``
    because every non-empty string is truthy.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got %r' % value)


# Command-line style options for main(); they are parsed from an explicit
# list because sys.argv is not usable as-is inside a notebook.
parser = argparse.ArgumentParser()
parser.add_argument('input_dir', type=str, help='Directory with unaligned images.')
parser.add_argument('output_dir', type=str, help='Directory with aligned face thumbnails.')
parser.add_argument('--image_size', type=int, help='Image size (height, width) in pixels.', default=182)
parser.add_argument('--margin', type=int, help='Margin for the crop around the bounding box (height, width) in pixels.', default=44)
parser.add_argument('--random_order', help='Shuffles the order of images to enable alignment using multiple processes.', action='store_true')
parser.add_argument('--gpu_memory_fraction', type=float, help='Upper bound on the amount of GPU memory that will be used by the process.', default=1.0)
parser.add_argument('--detect_multiple_faces', type=_str2bool, help='Detect and align multiple faces per image.', default=False)

args = parser.parse_args(args=["input_dir", "output_dir", "--image_size", "182", "--margin", "44", "--random_order",
                               "--gpu_memory_fraction", "0", "--detect_multiple_faces", "False"])
main(args)

### You will see results like the ones shown below in the output_dir
<br><p style="text-align:left;"><img src="images/output.png"  style="width:561px;height:196px;" align="left">

## Usage 2. Real time face detect 動態即時偵測

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from six import string_types, iteritems
 
import sys
import os
import numpy as np
import tensorflow as tf
#from math import floor
import cv2
#import detect_face
import random
from time import sleep
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
 
# Real-time demo: grab frames from the default camera, run MTCNN on each
# frame and draw a green rectangle around every detected face.
video = cv2.VideoCapture(0)

try:
    print('Creating networks and loading parameters')

    with tf.Graph().as_default():
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
        with sess.as_default():
            pnet, rnet, onet = create_mtcnn(sess, None)
    minsize = 20                   # minimum size of face
    threshold = [0.6, 0.7, 0.7]    # thresholds of the three stages
    factor = 0.709                 # pyramid scale factor
    while(video.isOpened()) :        # stop when the capture device is closed
        ret, frame = video.read()    # capture frame-by-frame

        if ret:                      # some webcams need a "warm-up" before returning frames
            # NOTE(review): OpenCV frames are presumably BGR while the batch
            # path feeds imageio images - confirm whether a colour
            # conversion is wanted before detection.
            bounding_boxes, _ = detect_face(frame, minsize, pnet, rnet, onet, threshold, factor)
            nrof_faces = bounding_boxes.shape[0]
            print('face number :{}'.format(nrof_faces))
            for face_position in bounding_boxes:
                face_position = face_position.astype(int)
                cv2.rectangle(frame, (face_position[0], face_position[1]), (face_position[2], face_position[3]), (0, 255, 0), 2)
            cv2.imshow('show', frame)    # display the resulting frame
        if cv2.waitKey(5) & 0xFF == ord('q'): # wait 5 ms for a key press; "q" quits
            break
finally:
    # Always free the camera and close the preview window, even when model
    # loading or detection raises.
    video.release()
    cv2.destroyAllWindows()

### You will see a result like the one shown below
<br><p style="text-align:left;"><img src="images/output_1.png"  style="width:561px;height:196px;" align="left">

## optional

In [None]:
# Print every global variable created by the three MTCNN networks.

with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        create_mtcnn(sess, None)   # builds the networks inside this graph
        for variable in tf.global_variables():
            print(variable)

tf.reset_default_graph()  # clear the default graph afterwards