Real time video demo for Face Emotion Recognition

This part sets up an environment in which you can process and manipulate images, display various types of content within the notebook, and handle HTML and JavaScript interactions.

In [1]:
# import dependencies
from IPython.display import display, Javascript, Image
from google.colab.output import eval_js
from base64 import b64decode, b64encode
import cv2
import numpy as np
import PIL
import io
import html
import time

The js_to_image function is used to convert images received from the JavaScript side into a format suitable for processing with OpenCV in Python. Conversely, the bbox_to_bytes function takes processed bounding box information and converts it into a format suitable for overlaying on the video stream in the browser.

In [2]:
# function to convert the JavaScript object into an OpenCV image
def js_to_image(js_reply):
  """
  Turn a webcam frame received from JavaScript into an OpenCV image.

  Params:
          js_reply: string of the form "<prefix>,<base64 data>" holding the
                    encoded frame; only the part after the first comma is used
  Returns:
          img: OpenCV BGR image
  """
  # everything after the first comma is the base64 payload
  encoded_frame = js_reply.split(',')[1]
  # base64 text -> raw bytes -> flat uint8 buffer that OpenCV can decode
  frame_buffer = np.frombuffer(b64decode(encoded_frame), dtype=np.uint8)
  # IMREAD_COLOR (== 1) asks for a 3-channel colour image
  return cv2.imdecode(frame_buffer, cv2.IMREAD_COLOR)

# function to convert OpenCV Rectangle bounding box image into base64 byte string to be overlayed on video stream
def bbox_to_bytes(bbox_array):
  """
  Encode an RGBA overlay array as a PNG data URL for the browser.

  Params:
          bbox_array: Numpy array (pixels) containing rectangle to overlay on video stream.
                      It is interpreted as RGBA.
  Returns:
        bytes: 'data:image/png;base64,...' string holding the PNG-encoded overlay
  """
  # `import PIL` on its own does not load the PIL.Image submodule; import it
  # explicitly so this function does not depend on some other library having
  # imported it first.
  import PIL.Image

  # convert array into PIL image
  bbox_PIL = PIL.Image.fromarray(bbox_array, 'RGBA')
  # format bbox into png for return; the buffer is released when the block exits
  with io.BytesIO() as iobuf:
    bbox_PIL.save(iobuf, format='png')
    png_base64 = b64encode(iobuf.getvalue()).decode('ascii')

  # format return string
  bbox_bytes = 'data:image/png;base64,{}'.format(png_base64)

  return bbox_bytes

This part of the code creates the face_cascade object, a Haar Cascade classifier for detecting frontal faces, initialized from the pre-trained model XML file that ships with OpenCV. The object can then be used to detect faces in images or video frames.

In [3]:
# Build the Haar Cascade frontal-face detector from the XML file located
# under cv2.data.haarcascades.
_frontal_face_xml = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
face_cascade = cv2.CascadeClassifier(cv2.samples.findFile(_frontal_face_xml))

This code is designed to create an interactive live video stream using the webcam as input within a Jupyter notebook environment. It bridges the gap between Python and JavaScript to manage the video stream, capture frames, and provide the captured frame data back to the Python environment for analysis or further processing.

In [4]:
# JavaScript to properly create our live video stream using our webcam as input
def video_stream():
  """
  Inject the browser-side webcam helpers into the notebook output.

  The displayed script defines the JavaScript functions removeDom,
  onAnimationFrame, createDom and stream_frame. Nothing is captured here:
  the page elements and the camera stream are created by createDom, which
  stream_frame calls. stream_frame returns '' once a stop click has been
  registered, otherwise an object with timing fields and the captured
  frame under 'img'.
  """
  # The script text is runtime data for the browser and is kept verbatim.
  display(Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;

    var pendingResolve = null;
    var shutdown = false;

    function removeDom() {
       stream.getVideoTracks()[0].stop();
       video.remove();
       div.remove();
       video = null;
       div = null;
       stream = null;
       imgElement = null;
       captureCanvas = null;
       labelElement = null;
    }

    function onAnimationFrame() {
      if (!shutdown) {
        window.requestAnimationFrame(onAnimationFrame);
      }
      if (pendingResolve) {
        var result = "";
        if (!shutdown) {
          captureCanvas.getContext('2d').drawImage(video, 0, 0, 640, 480);
          result = captureCanvas.toDataURL('image/jpeg', 0.8)
        }
        var lp = pendingResolve;
        pendingResolve = null;
        lp(result);
      }
    }

    async function createDom() {
      if (div !== null) {
        return stream;
      }

      div = document.createElement('div');
      div.style.border = '2px solid black';
      div.style.padding = '3px';
      div.style.width = '100%';
      div.style.maxWidth = '600px';
      document.body.appendChild(div);

      const modelOut = document.createElement('div');
      modelOut.innerHTML = "<span>Status:</span>";
      labelElement = document.createElement('span');
      labelElement.innerText = 'No data';
      labelElement.style.fontWeight = 'bold';
      modelOut.appendChild(labelElement);
      div.appendChild(modelOut);

      video = document.createElement('video');
      video.style.display = 'block';
      video.width = div.clientWidth - 6;
      video.setAttribute('playsinline', '');
      video.onclick = () => { shutdown = true; };
      stream = await navigator.mediaDevices.getUserMedia(
          {video: { facingMode: "environment"}});
      div.appendChild(video);

      imgElement = document.createElement('img');
      imgElement.style.position = 'absolute';
      imgElement.style.zIndex = 1;
      imgElement.onclick = () => { shutdown = true; };
      div.appendChild(imgElement);

      const instruction = document.createElement('div');
      instruction.innerHTML =
          '<span style="color: red; font-weight: bold;">' +
          'When finished, click here or on the video to stop this demo</span>';
      div.appendChild(instruction);
      instruction.onclick = () => { shutdown = true; };

      video.srcObject = stream;
      await video.play();

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 640; //video.videoWidth;
      captureCanvas.height = 480; //video.videoHeight;
      window.requestAnimationFrame(onAnimationFrame);

      return stream;
    }
    async function stream_frame(label, imgData) {
      if (shutdown) {
        removeDom();
        shutdown = false;
        return '';
      }

      var preCreate = Date.now();
      stream = await createDom();

      var preShow = Date.now();
      if (label != "") {
        labelElement.innerHTML = label;
      }

      if (imgData != "") {
        var videoRect = video.getClientRects()[0];
        imgElement.style.top = videoRect.top + "px";
        imgElement.style.left = videoRect.left + "px";
        imgElement.style.width = videoRect.width + "px";
        imgElement.style.height = videoRect.height + "px";
        imgElement.src = imgData;
      }

      var preCapture = Date.now();
      var result = await new Promise(function(resolve, reject) {
        pendingResolve = resolve;
      });
      shutdown = false;

      return {'create': preShow - preCreate,
              'show': preCapture - preShow,
              'capture': Date.now() - preCapture,
              'img': result};
    }
    '''))

def video_frame(label, bbox):
  """
  Push a status label and overlay image to the browser and fetch one frame.

  Params:
          label: text shown next to "Status:" (an empty string leaves it unchanged)
          bbox: data-URL string drawn over the video (an empty string leaves it unchanged)
  Returns:
          data: whatever the JavaScript stream_frame call evaluates to, i.e. ''
                after a stop click, otherwise an object with the keys
                'create', 'show', 'capture' and 'img'
  """
  import json

  # json.dumps yields a correctly quoted and escaped string literal, so a
  # quote, backslash or newline in either argument can no longer break out of
  # (or inject code into) the evaluated expression. str() keeps the previous
  # behaviour of always passing strings to the JavaScript side.
  js_call = 'stream_frame({}, {})'.format(json.dumps(str(label)),
                                          json.dumps(str(bbox)))
  data = eval_js(js_call)
  return data

This part downloads the pre-trained neural network model from Google Drive into /content/model, loads it, and defines the list of emotion labels that is used later to turn the model's predictions into class names.

In [5]:
# Ensure the /content/model folder exists
import os
if not os.path.exists('/content/model'):
    os.makedirs('/content/model')


!pip install gdown
import gdown

url = 'https://drive.google.com/uc?id=1PFF6omNRkpwRRKSVQIYRHBR1IA5kyQPU'
output = '/content/model/modelv10.h5'
gdown.download(url, output, quiet=False)

from tensorflow.keras.models import load_model
model = load_model('/content/model/modelv10.h5')

emotions = ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']
class_labels = emotions



Downloading...
From: https://drive.google.com/uc?id=1PFF6omNRkpwRRKSVQIYRHBR1IA5kyQPU
To: /content/model/modelv10.h5
100%|██████████| 55.0M/55.0M [00:00<00:00, 112MB/s]


# New Section

This code captures the webcam video stream, detects faces, classifies the emotions associated with the detected faces, overlays bounding boxes and emotion labels on the frames, and tracks the frequency of each detected emotion.

In [6]:
# Live demo: read webcam frames, detect faces, classify the emotion of each
# face, draw a labelled box per face and tally how often each emotion is seen.

# Initialize a dictionary to keep track of the frequency of each emotion detected
emotion_counts = {emotion: 0 for emotion in emotions}

# Not used in this cell. The name is kept in scope in case another cell relies
# on it, but it now comes from the public numpy namespace instead of the
# private numpy.core.multiarray path.
from numpy import result_type

# Drawing settings, hoisted out of the loop because they never change.
# The overlay is interpreted as RGBA by bbox_to_bytes.
font = cv2.FONT_HERSHEY_SIMPLEX
box_color = (255, 0, 0)
text_color = (0, 0, 255)

# start streaming video from webcam
video_stream()
# label for video
label_html = 'Capturing...'
# initialize bounding box overlay to empty
bbox = ''
# not used in this cell; kept in case another cell reads it
count = 0
while True:
    js_reply = video_frame(label_html, bbox)
    # '' means the stream was stopped. An empty 'img' is returned when the
    # stop click lands while a frame is pending; passing it on would make
    # js_to_image fail, so it is treated as a stop as well.
    if not js_reply or not js_reply["img"]:
        break

    # convert JS response to OpenCV Image
    img = js_to_image(js_reply["img"])
    if img is None:
        # the frame could not be decoded; ask for the next one
        continue

    # grayscale image for face detection (the decoded image is BGR, so the
    # BGR conversion code is the one that applies the right channel weights)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # get face region coordinates (once per frame)
    faces = face_cascade.detectMultiScale(gray)

    # create transparent overlay with the same size as the captured frame
    frame_height, frame_width = img.shape[:2]
    bbox_array = np.zeros([frame_height, frame_width, 4], dtype=np.uint8)

    # Process each detected face: classify it, count it, draw it
    for (x, y, w, h) in faces:
        # Crop the detected face and resize it to the 48x48 model input
        cropped_face = img[y:y+h, x:x+w]
        resized_face = cv2.resize(cropped_face, (48, 48))

        # Grayscale, scaled to the range 0..1
        gray_resized = cv2.cvtColor(resized_face, cv2.COLOR_BGR2GRAY)
        normalized = gray_resized / 255.0

        # Add batch and channel axes: (48, 48) -> (1, 48, 48, 1)
        normalized = normalized[np.newaxis, :, :, np.newaxis]

        # Perform prediction using the FER model
        result = model.predict(normalized, verbose=0)
        emotion_result = class_labels[np.argmax(result)]

        # Update the emotion_counts dictionary
        emotion_counts[emotion_result] += 1

        # Draw this face's box and put its own label just above it, so that
        # several faces in one frame each get the right emotion. The label is
        # clamped so it stays inside the frame for faces near the top edge.
        bbox_array = cv2.rectangle(bbox_array, (x, y), (x+w, y+h), box_color, 2)
        text_origin = (int(x), max(int(y) - 10, 25))
        cv2.putText(bbox_array, str(emotion_result), text_origin, font, 1,
                    text_color, 2, cv2.LINE_4)

    # make every drawn pixel opaque and leave the rest transparent
    bbox_array[:, :, 3] = (bbox_array.max(axis=2) > 0).astype(np.uint8) * 255
    # convert overlay of bbox into bytes
    bbox_bytes = bbox_to_bytes(bbox_array)
    # update bbox so next frame gets new overlay
    bbox = bbox_bytes

# After the loop ends, find the emotion with the highest frequency. With all
# counts at zero max() would simply return the first label, so that case is
# reported separately.
if any(emotion_counts.values()):
    most_dominant_emotion = max(emotion_counts, key=emotion_counts.get)
    print("The most dominant emotion detected during the camera capture was:", most_dominant_emotion)
else:
    most_dominant_emotion = None
    print("No faces were detected during the camera capture.")

<IPython.core.display.Javascript object>

The most dominant emotion detected during the camera capture was: Neutral
