Source: https://github.com/Ahmednull/L2CS-Net

In [None]:
# Colab-only setup: attach Google Drive and work from the project folder on it.
# Outside Colab the whole block is skipped.
if "google.colab" in str(get_ipython()):
    import os
    from google.colab import drive

    drive.mount("/content/drive/")
    os.chdir("/content/drive/MyDrive/Colab Notebooks/L2CS")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
! pip install -r requirements.txt  

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
! pip install git+https://github.com/elliottzheng/face-detection.git@master

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/elliottzheng/face-detection.git@master
  Cloning https://github.com/elliottzheng/face-detection.git (to revision master) to /tmp/pip-req-build-8st7hn6_
  Running command git clone -q https://github.com/elliottzheng/face-detection.git /tmp/pip-req-build-8st7hn6_


In [None]:
import argparse
import numpy as np
import cv2
import time

import torch
import torch.nn as nn
from torch.autograd import Variable
from torchvision import transforms
import torch.backends.cudnn as cudnn
import torchvision

from PIL import Image
from utils import select_device, draw_gaze
from PIL import Image, ImageOps

from face_detection import RetinaFace
from model import L2CS

from google.colab.patches import cv2_imshow

In [None]:
def draw_gaze_on_frame(frame):
    """Detect faces in a frame and draw each face's box and gaze estimate on it.

    Relies on notebook globals created by the configuration cell:
    ``detector``, ``transformations``, ``model``, ``softmax``, ``idx_tensor``
    and ``gpu``.

    Params:
        frame: OpenCV BGR image (numpy array). It is annotated in place.
    Returns:
        The same ``frame`` object, with a green rectangle and a gaze drawing
        (via ``draw_gaze``) for every detection scoring at least 0.95.
    """
    # Imported locally under a private alias: a later cell runs
    # `from IPython.display import display, Javascript, Image`, which rebinds the
    # global name `Image` and would break `Image.fromarray` in this function.
    from PIL import Image as PILImage

    faces = detector(frame)
    if faces is None:
        return frame

    for box, landmarks, score in faces:
        # Ignore low-confidence detections.
        if score < .95:
            continue

        # Clamp the top-left corner so the crop never starts outside the frame.
        x_min = max(int(box[0]), 0)
        y_min = max(int(box[1]), 0)
        x_max = int(box[2])
        y_max = int(box[3])
        bbox_width = x_max - x_min
        bbox_height = y_max - y_min

        # Crop the face; a degenerate box yields an empty crop, which
        # cv2.resize would reject, so such detections are skipped.
        face = frame[y_min:y_max, x_min:x_max]
        if face.size == 0:
            continue
        face = cv2.resize(face, (224, 224))
        face = cv2.cvtColor(face, cv2.COLOR_BGR2RGB)

        # Preprocess and add the batch dimension expected by the model.
        face_tensor = transformations(PILImage.fromarray(face))
        face_tensor = face_tensor.cuda(gpu).unsqueeze(0)

        # Gaze prediction: per-bin scores for pitch and yaw.
        gaze_pitch, gaze_yaw = model(face_tensor)
        pitch_predicted = softmax(gaze_pitch)
        yaw_predicted = softmax(gaze_yaw)

        # Continuous prediction in degrees: expected bin index, scaled by 4
        # and shifted by -180.
        pitch_predicted = torch.sum(pitch_predicted.data[0] * idx_tensor) * 4 - 180
        yaw_predicted = torch.sum(yaw_predicted.data[0] * idx_tensor) * 4 - 180

        # Degrees -> radians, as numpy scalars on the CPU.
        pitch_predicted = pitch_predicted.cpu().detach().numpy() * np.pi / 180.0
        yaw_predicted = yaw_predicted.cpu().detach().numpy() * np.pi / 180.0

        draw_gaze(x_min, y_min, bbox_width, bbox_height, frame,
                  (pitch_predicted, yaw_predicted), color=(0, 0, 255))
        cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 1)
    return frame

In [None]:
def getArch(arch, bins):
    """Build the L2CS network for the requested ResNet backbone.

    NOTE(review): `getArch` was called but never defined or imported in this
    notebook. This definition assumes the constructor signature
    `L2CS(block, layers, num_bins)` -- confirm against model.py.

    Params:
        arch: one of 'ResNet18', 'ResNet34', 'ResNet50', 'ResNet101', 'ResNet152'.
        bins: number of output bins per gaze angle.
    Returns:
        An un-initialised L2CS model.
    Raises:
        ValueError: if `arch` is not a supported backbone name.
    """
    resnet = torchvision.models.resnet
    layouts = {
        'ResNet18': (resnet.BasicBlock, [2, 2, 2, 2]),
        'ResNet34': (resnet.BasicBlock, [3, 4, 6, 3]),
        'ResNet50': (resnet.Bottleneck, [3, 4, 6, 3]),
        'ResNet101': (resnet.Bottleneck, [3, 4, 23, 3]),
        'ResNet152': (resnet.Bottleneck, [3, 8, 36, 3]),
    }
    if arch not in layouts:
        raise ValueError('Unsupported architecture: {}'.format(arch))
    block, layers = layouts[arch]
    return L2CS(block, layers, bins)


# ---- Model / detector configuration -------------------------------------
cudnn.enabled = True
arch= 'ResNet50'
batch_size = 1
cam = 0
gpu = select_device(0, batch_size=batch_size)
snapshot_path = 'models/L2CSNet_gaze360.pkl'

transformations = transforms.Compose([
    transforms.Resize(448),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

model=getArch(arch, 90)
print('Loading snapshot.')
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
saved_state_dict = torch.load(snapshot_path)
model.load_state_dict(saved_state_dict)
model.cuda(gpu)
model.eval()


softmax = nn.Softmax(dim=1)
detector = RetinaFace(gpu_id=0)
# Bin indices 0..89, used to turn per-bin probabilities into an expected value.
idx_tensor = [idx for idx in range(90)]
idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu)
x=0

# ---- Input / output video -----------------------------------------------
input_video_path = "bertrand.mp4"
video_path = "bertrand-gaze.avi"

print('opening video')
cap = cv2.VideoCapture(input_video_path)

# Fail early, before any capture property is read.
if not cap.isOpened():
    raise IOError("Cannot open video file: " + input_video_path)

w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Fall back to 15 fps when the container does not report a frame rate.
fps = float(cap.get(cv2.CAP_PROP_FPS)) or 15.0
fourcc = cv2.VideoWriter_fourcc('F','M','P','4')
# A single writer whose frame size matches the frames that will be written;
# a size mismatch would produce an output file without usable frames.
video = cv2.VideoWriter(video_path, fourcc, fps, (w,h))

# ---- Per-frame processing -----------------------------------------------
print('process each video frame')
i = 0
try:
    with torch.no_grad():
        while True:
            success, frame = cap.read()
            if not success:
                # End of the video (or a read error): stop instead of spinning forever.
                break
            i += 1

            start_fps = time.time()
            frame = draw_gaze_on_frame(frame)
            elapsed = time.time() - start_fps

            myFPS = 1.0 / elapsed if elapsed > 0 else 0.0
            cv2.putText(frame, 'FPS: {:.1f}'.format(myFPS), (10, 20),cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 255, 0), 1, cv2.LINE_AA)

            print(str(i), end=',')
            video.write(frame)
finally:
    # Always finalise the output file and free the capture, even on error.
    cap.release()
    video.release()

In [None]:
! ffmpeg -i bertrand-gaze.avi -vcodec h264 -acodec mp2 bertrand-gaze.mp4

In [None]:
from IPython.display import HTML
from base64 import b64encode
# Read the converted clip inside a context manager so the file handle is closed.
with open('bertrand-gaze.mp4', 'rb') as mp4_file:
    mp4 = mp4_file.read()

# Embed the clip in the notebook as a base64 data URL.
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=400 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)

## Real-time webcam feed

In [None]:
from IPython.display import display, Javascript, Image
from google.colab.output import eval_js
from base64 import b64decode, b64encode
import cv2
import numpy as np
import PIL
import io
import html
import time

In [None]:
# Turn the data-URL string returned by the browser into an OpenCV image.
def js_to_image(js_reply):
  """
  Decode a webcam snapshot received from JavaScript.

  Params:
          js_reply: string whose part after the first comma is the
                    base64-encoded image data
  Returns:
          img: OpenCV BGR image
  """
  # Everything after the first comma is the base64 payload.
  encoded_payload = js_reply.split(',')[1]
  # View the decoded bytes as a flat uint8 buffer for OpenCV.
  raw_buffer = np.frombuffer(b64decode(encoded_payload), dtype=np.uint8)
  # flags=1 asks OpenCV for a 3-channel colour image.
  return cv2.imdecode(raw_buffer, flags=1)

# Encode an RGBA overlay array as a PNG data URL that the browser can draw over the video.
def bbox_to_bytes(bbox_array):
  """
  Serialise an overlay image to a base64 PNG data URL.

  Params:
          bbox_array: Numpy array (pixels) interpreted as an RGBA image.
  Returns:
        bytes: 'data:image/png;base64,...' string
  """
  png_buffer = io.BytesIO()
  # Render the array as an RGBA PNG into the in-memory buffer.
  PIL.Image.fromarray(bbox_array, 'RGBA').save(png_buffer, format='png')
  # Base64-encode the PNG bytes and wrap them in a data URL.
  encoded_png = b64encode(png_buffer.getvalue()).decode('utf-8')
  return 'data:image/png;base64,{}'.format(encoded_png)

In [None]:
# JavaScript to properly create our live video stream using our webcam as input
def video_stream():
  """
  Inject the JavaScript that drives the live webcam preview into the cell output.

  The script defines these browser-side functions:
    removeDom:        stops the first video track, removes the video and
                      container elements and clears the related variables.
    onAnimationFrame: re-schedules itself until shutdown is set; when a
                      request is pending it draws the video onto a 640x480
                      canvas and resolves the request with a JPEG data URL
                      (quality 0.8), or with an empty string once shutdown
                      is set.
    createDom:        builds the preview container (status label, video
                      element, overlay image, stop instruction), requests
                      the camera through getUserMedia with facingMode
                      "environment" and starts playback; returns the
                      existing stream if the container already exists.
    stream_frame:     entry point evaluated from Python. Updates the label
                      and overlay image when they are non-empty, waits for
                      the next captured frame and returns an object with the
                      keys 'create', 'show', 'capture' (timings in ms) and
                      'img'. Returns an empty string after a stop click.

  Clicking the video, the overlay image or the instruction text sets the
  shutdown flag.
  """
  js = Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;
    
    var pendingResolve = null;
    var shutdown = false;
    
    function removeDom() {
       stream.getVideoTracks()[0].stop();
       video.remove();
       div.remove();
       video = null;
       div = null;
       stream = null;
       imgElement = null;
       captureCanvas = null;
       labelElement = null;
    }
    
    function onAnimationFrame() {
      if (!shutdown) {
        window.requestAnimationFrame(onAnimationFrame);
      }
      if (pendingResolve) {
        var result = "";
        if (!shutdown) {
          captureCanvas.getContext('2d').drawImage(video, 0, 0, 640, 480);
          result = captureCanvas.toDataURL('image/jpeg', 0.8)
        }
        var lp = pendingResolve;
        pendingResolve = null;
        lp(result);
      }
    }
    
    async function createDom() {
      if (div !== null) {
        return stream;
      }

      div = document.createElement('div');
      div.style.border = '2px solid black';
      div.style.padding = '3px';
      div.style.width = '100%';
      div.style.maxWidth = '600px';
      document.body.appendChild(div);
      
      const modelOut = document.createElement('div');
      modelOut.innerHTML = "<span>Status:</span>";
      labelElement = document.createElement('span');
      labelElement.innerText = 'No data';
      labelElement.style.fontWeight = 'bold';
      modelOut.appendChild(labelElement);
      div.appendChild(modelOut);
           
      video = document.createElement('video');
      video.style.display = 'block';
      video.width = div.clientWidth - 6;
      video.setAttribute('playsinline', '');
      video.onclick = () => { shutdown = true; };
      stream = await navigator.mediaDevices.getUserMedia(
          {video: { facingMode: "environment"}});
      div.appendChild(video);

      imgElement = document.createElement('img');
      imgElement.style.position = 'absolute';
      imgElement.style.zIndex = 1;
      imgElement.onclick = () => { shutdown = true; };
      div.appendChild(imgElement);
      
      const instruction = document.createElement('div');
      instruction.innerHTML = 
          '<span style="color: red; font-weight: bold;">' +
          'When finished, click here or on the video to stop this demo</span>';
      div.appendChild(instruction);
      instruction.onclick = () => { shutdown = true; };
      
      video.srcObject = stream;
      await video.play();

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 640; //video.videoWidth;
      captureCanvas.height = 480; //video.videoHeight;
      window.requestAnimationFrame(onAnimationFrame);
      
      return stream;
    }
    async function stream_frame(label, imgData) {
      if (shutdown) {
        removeDom();
        shutdown = false;
        return '';
      }

      var preCreate = Date.now();
      stream = await createDom();
      
      var preShow = Date.now();
      if (label != "") {
        labelElement.innerHTML = label;
      }
            
      if (imgData != "") {
        var videoRect = video.getClientRects()[0];
        imgElement.style.top = videoRect.top + "px";
        imgElement.style.left = videoRect.left + "px";
        imgElement.style.width = videoRect.width + "px";
        imgElement.style.height = videoRect.height + "px";
        imgElement.src = imgData;
      }
      
      var preCapture = Date.now();
      var result = await new Promise(function(resolve, reject) {
        pendingResolve = resolve;
      });
      shutdown = false;
      
      return {'create': preShow - preCreate, 
              'show': preCapture - preShow, 
              'capture': Date.now() - preCapture,
              'img': result};
    }
    ''')

  # Emit the script into the current cell's output so eval_js can call stream_frame.
  display(js)
  
def video_frame(label, bbox):
  """
  Request the next webcam frame from the browser.

  Params:
          label: text shown in the status label (empty string leaves it unchanged)
          bbox:  data-URL string of the overlay image (empty string leaves it unchanged)
  Returns:
          The value produced by the JavaScript ``stream_frame`` call: an object
          with the keys 'create', 'show', 'capture' and 'img', or an empty
          string once the user has stopped the stream.
  """
  import json

  # json.dumps yields properly quoted and escaped JavaScript string literals, so
  # quotes, backslashes or newlines in the arguments cannot break the call.
  call = 'stream_frame({}, {})'.format(json.dumps(str(label)), json.dumps(str(bbox)))
  return eval_js(call)

In [None]:
# Haar-cascade face detector used by this demo. It was referenced below but never
# created in this notebook; the frontal-face model ships with opencv-python.
face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
if face_cascade.empty():
    raise IOError('Cannot load Haar cascade: haarcascade_frontalface_default.xml')

# start streaming video from webcam
video_stream()
# label for video
label_html = 'Capturing...'
# initialize bounding box overlay to empty
bbox = ''
count = 0
while True:
    js_reply = video_frame(label_html, bbox)
    # an empty reply means the user clicked to stop the stream
    if not js_reply:
        break

    # convert JS response to OpenCV Image
    img = js_to_image(js_reply["img"])
    if img is None:
        # the frame could not be decoded; request the next one
        continue

    # create transparent overlay for bounding box, sized like the captured frame
    frame_h, frame_w = img.shape[:2]
    bbox_array = np.zeros([frame_h, frame_w, 4], dtype=np.uint8)

    # grayscale image for face detection (js_to_image returns BGR)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # get face region coordinates
    faces = face_cascade.detectMultiScale(gray)
    # draw each face box on the overlay; dedicated names keep the globals
    # `w` and `h` (video size) from being overwritten
    for (fx, fy, fw, fh) in faces:
      top_left = (int(fx), int(fy))
      bottom_right = (int(fx + fw), int(fy + fh))
      bbox_array = cv2.rectangle(bbox_array, top_left, bottom_right, (255,0,0), 2)

    # make only the drawn pixels opaque
    bbox_array[:,:,3] = (bbox_array.max(axis = 2) > 0 ).astype(np.uint8) * 255
    # convert overlay of bbox into bytes
    bbox_bytes = bbox_to_bytes(bbox_array)
    # update bbox so next frame gets new overlay
    bbox = bbox_bytes

In [None]:
# `getArch` was called but never defined or imported in this notebook; define it
# here unless another cell has already provided it.
if 'getArch' not in globals():
    def getArch(arch, bins):
        """Build the L2CS network for the requested ResNet backbone.

        NOTE(review): assumes the constructor signature
        `L2CS(block, layers, num_bins)` -- confirm against model.py.

        Raises:
            ValueError: if `arch` is not a supported backbone name.
        """
        resnet = torchvision.models.resnet
        layouts = {
            'ResNet18': (resnet.BasicBlock, [2, 2, 2, 2]),
            'ResNet34': (resnet.BasicBlock, [3, 4, 6, 3]),
            'ResNet50': (resnet.Bottleneck, [3, 4, 6, 3]),
            'ResNet101': (resnet.Bottleneck, [3, 4, 23, 3]),
            'ResNet152': (resnet.Bottleneck, [3, 8, 36, 3]),
        }
        if arch not in layouts:
            raise ValueError('Unsupported architecture: {}'.format(arch))
        block, layers = layouts[arch]
        return L2CS(block, layers, bins)

# ---- Model / detector configuration -------------------------------------
cudnn.enabled = True
arch= 'ResNet50'
batch_size = 1
cam = 0
gpu = select_device(0, batch_size=batch_size)
snapshot_path = 'models/L2CSNet_gaze360.pkl'

transformations = transforms.Compose([
    transforms.Resize(448),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

model=getArch(arch, 90)
print('Loading snapshot.')
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
saved_state_dict = torch.load(snapshot_path)
model.load_state_dict(saved_state_dict)
model.cuda(gpu)
model.eval()


softmax = nn.Softmax(dim=1)
detector = RetinaFace(gpu_id=0)
# Bin indices 0..89, used to turn per-bin probabilities into an expected value.
idx_tensor = [idx for idx in range(90)]
idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu)
x=0

# start streaming video from webcam
video_stream()
# label for video
label_html = 'Capturing...'
# initialize overlay to empty
bbox = ''
count = 0


with torch.no_grad():
    while True:

        # get the video frame; an empty reply means the user stopped the stream
        js_reply = video_frame(label_html, bbox)
        if not js_reply:
            break

        # convert JS response to OpenCV Image
        frame = js_to_image(js_reply["img"])
        if frame is None:
            # the frame could not be decoded; request the next one
            continue

        # draw the face and gaze on a copy so the untouched frame stays
        # available for comparison
        start_fps = time.time()
        annotated = draw_gaze_on_frame(frame.copy())
        elapsed = time.time() - start_fps

        myFPS = 1.0 / elapsed if elapsed > 0 else 0.0
        cv2.putText(annotated, 'FPS: {:.1f}'.format(myFPS), (10, 20),cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 255, 0), 1, cv2.LINE_AA)

        # Build the transparent overlay from the pixels the annotation changed.
        # Previously the overlay was never drawn on, so nothing was displayed.
        changed = np.any(annotated != frame, axis=2)
        frame_h, frame_w = frame.shape[:2]
        bbox_array = np.zeros([frame_h, frame_w, 4], dtype=np.uint8)
        # OpenCV frames are BGR; the overlay is encoded as RGBA
        bbox_array[changed, :3] = annotated[changed][:, ::-1]
        bbox_array[changed, 3] = 255

        # convert overlay into bytes
        bbox_bytes = bbox_to_bytes(bbox_array)
        # update bbox so next frame gets new overlay
        bbox = bbox_bytes




# Haar-cascade variant of the webcam demo. The original loop never requested a new
# frame, had no exit condition and used an undefined `face_cascade`; it now
# fetches a frame per iteration and stops when the user ends the stream.
# NOTE(review): this looks like a leftover copy of the earlier Haar demo cell --
# consider deleting it if only the gaze demo is wanted here.
face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
if face_cascade.empty():
    raise IOError('Cannot load Haar cascade: haarcascade_frontalface_default.xml')

# start with an empty overlay so no stale drawing is shown
label_html = 'Capturing...'
bbox = ''
while True:

    # get the video frame; an empty reply means the user stopped the stream
    js_reply = video_frame(label_html, bbox)
    if not js_reply:
        break

    # convert JS response to OpenCV Image
    img = js_to_image(js_reply["img"])
    if img is None:
        # the frame could not be decoded; request the next one
        continue

    # create transparent overlay for bounding box, sized like the captured frame
    frame_h, frame_w = img.shape[:2]
    bbox_array = np.zeros([frame_h, frame_w, 4], dtype=np.uint8)

    # grayscale image for face detection (js_to_image returns BGR)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # get face region coordinates
    faces = face_cascade.detectMultiScale(gray)

    # draw each face box on the overlay
    for (fx, fy, fw, fh) in faces:
      top_left = (int(fx), int(fy))
      bottom_right = (int(fx + fw), int(fy + fh))
      bbox_array = cv2.rectangle(bbox_array, top_left, bottom_right, (255,0,0), 2)

    # make only the drawn pixels opaque
    bbox_array[:,:,3] = (bbox_array.max(axis = 2) > 0 ).astype(np.uint8) * 255
    # convert overlay of bbox into bytes
    bbox_bytes = bbox_to_bytes(bbox_array)
    # update bbox so next frame gets new overlay
    bbox = bbox_bytes