In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
from pathlib import Path
import numpy as np
from PIL import Image
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
import torchvision.transforms.functional as TF
from torchvision import models
from torch.utils.data import Dataset
from torch.utils.data import random_split, DataLoader
#from torchmetrics.functional import accuracy
from torch.nn.utils.rnn import pad_sequence
import torchvision.transforms as T
import argparse
import imutils
import time
import dlib
import cv2
from google.colab.patches import cv2_imshow
import os
import math
import csv

import sys
sys.path.append('/content/drive/MyDrive/NAPOLI/code/mtcnn/')
from mtcnn import MTCNN

# **Head pose estimation**

In [None]:
eyes_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_eye.xml")

class PoseEstimator:
    def __init__(self, weights=None):
      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
      self.detector =  MTCNN()

    def detect_faces(self, image, image_shape_max=640):
      '''
      Performs face detection using retinaface method with speed boost and initial quality checks based on whole image size

      Parameters
      ----------
      image : uint8
          image for face detection.
      image_shape_max : int, optional
          maximum size (in pixels) of image. The default is None.

      Returns
      -------
      float array
          bounding boxes and score.
      float array
          landmarks.

      '''
      image_shape = image.shape[:2]

      # perform image resize for faster detection
      if image_shape_max:
          scale_factor = max([1, max(image_shape) / image_shape_max])
      else:
          scale_factor = 1

      if scale_factor > 1:
          scaled_image = cv2.resize(image, (0, 0), fx = 1/scale_factor, fy = 1/scale_factor)

          start = time.time()
          print("[INFO[ performing face detection with MTCNN...")
          boxes, points = self.detector.detect_faces(scaled_image)
          end = time.time()
          print("[INFO] face detection took {:.4f} seconds".format(end - start))

          boxes[:,:4] *= scale_factor
          points *= scale_factor
      else:
          start = time.time()
          print("[INFO[ performing face detection with MTCNN...")
          boxes, points = self.detector.detect_faces(image)
          end = time.time()
          print("[INFO] face detection took {:.4f} seconds".format(end - start))

      return boxes, points

    def draw_landmarks(self, image, boxes, points):
      '''
      Parameters
      ----------
      frame : TYPE
          RGB image
      bb : TYPE - Array of float64, Size = (5,)
          coordinates of bounding box for the selected face.
      points : TYPE - Array of float32, Size = (10,)
          coordinates of landmarks for the selected faces.

      Returns
      -------
      None.

      '''
      font = cv2.FONT_HERSHEY_COMPLEX # Text in video
      font_size = 0.6
      blue = (0, 0, 255)
      green = (0,128,0)
      red = (255, 0, 0)

      #boxes, points = self.detect_faces(image)

      boxes = boxes.astype(int)
      points = points.astype(int)
      # draw rectangle and landmarks on face
      cv2.rectangle(image, (boxes[0], boxes[1]), (boxes[2], boxes[3]), red, 1)
      cv2.circle(image, (int(points[0]), int(points[5])), 2, blue, 2)# left eye
      cv2.circle(image, (int(points[1]), int(points[6])), 2, blue, 2)# right eye
      cv2.circle(image, (int(points[2]), int(points[7])), 2, blue, 2)# nose
      cv2.circle(image, (int(points[3]), int(points[8])), 2, blue, 2)# mouth - left
      cv2.circle(image, (int(points[4]), int(points[9])), 2, blue, 2)# mouth - right

      cv2_imshow(image)

      #w = int(boxes[2])-int(boxes[0]) # width
      #h = int(boxes[3])-int(boxes[1]) # height


    def one_face(self, frame, bbs, pointss):
      """
      Parameters
      ----------
      frame : TYPE
          RGB image (numpy array).
      bbs : TYPE - Array of flaot64, Size = (N, 5)
          coordinates of bounding boxes for all detected faces.
      pointss : TYPE - Array of flaot32, Size = (N, 10)
          coordinates of landmarks for all detected faces.
      Returns
      -------
      bb : TYPE - Array of float 64, Size = (5,)
          coordinates of bounding box for the selected face.
      points : TYPE
          coordinates of five landmarks for the selected face.
      """
      # select only process only one face (center ?)
      offsets = [(bbs[:,0]+bbs[:,2])/2-frame.shape[1]/2,
                (bbs[:,1]+bbs[:,3])/2-frame.shape[0]/2]
      offset_dist = np.sum(np.abs(offsets),0)
      index = np.argmin(offset_dist)
      bb = bbs[index]
      points = pointss[:,index]
      return bb, points

    def find_smile(self, points):
      dx_eyes = points[1] - points[0]# between pupils
      dx_mout = points[4] - points[3]# between mouth corners
      smile_ratio = dx_mout/dx_eyes
      return smile_ratio

    def find_roll(self, points):
      """
      Parameters
      ----------
      points : TYPE - Array of float32, Size = (10,)
          coordinates of landmarks for the selected faces.
      Returns
      -------
      TYPE
          roll of face.

      """
      return points[6] - points[5]

    def find_yaw(self, points):
      """
      Parameters
      ----------
      points : TYPE - Array of float32, Size = (10,)
          coordinates of landmarks for the selected faces.
      Returns
      -------
      TYPE
          yaw of face.

      """
      le2n = points[2] - points[0]
      re2n = points[1] - points[2]
      return le2n - re2n

    def find_pitch(self, points):
      """
      Parameters
      ----------
      points : TYPE - Array of float32, Size = (10,)
          coordinates of landmarks for the selected faces.
      Returns
      -------
      Pitch
      """
      eye_y = (points[5] + points[6]) / 2
      mou_y = (points[8] + points[9]) / 2
      e2n = eye_y - points[7]
      n2m = points[7] - mou_y
      return e2n / n2m


# **Gaze positionn RESNET**
using the head pose


In [None]:
#preprocessing DATA per training della SVM

class dataset(Dataset):

  def __init__(self, csv_file):
    self.sample_elements = []
    # sample_elements = [(sample_1), (sample_2),..]
    # sample_1 = [(FEATURES OCCHI, POSIZIONE TESTA), LABLE)] etc.
    self.eyes_features = []
    self.roll_pitch_yaw = []
    self.pupil_coords = []
    self.paths = []
    self.labels = []
    self.faces_cropped = []
    self.names = []
    self.csv_file = csv_file
    self.model = models.resnet50(weights='DEFAULT')
    self.extract_path_label_from_csv()


  def extract_path_label_from_csv(self):

    #read CSV
    with open(self.csv_file, mode ='r') as file:

      csvFile = csv.reader(file)

      for lines in csvFile:
        if '0_0' not in lines[0]:
          id_class = lines[1]
          name = lines[0]
          self.paths.append("/content/drive/MyDrive/NAPOLI/DATASET_FACE/PNG/"+name[:3]+"/"+name+".png")
          self.labels.append(id_class)
          self.names.append(name)

    print(self.paths) #[0])
    print(self.labels) #[0])

    return

  def extract_eyestensor_headpose(self):
    #print(self.path)
    est = PoseEstimator()

    for i in self.paths:
      print(i)
      rpy = []
      pupils = []
      image = cv2.imread(i)

      image = imutils.resize(image, width=600)
      image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)# convert to rgb
      image_rgb = cv2.flip(image_rgb, 1)# flip for user friendliness

      bounding_boxes, landmarks = est.detect_faces(image_rgb)

      #print(bounding_boxes)

      if len(bounding_boxes) > 0:
        if len(bounding_boxes) >= 2:
          bounding_boxes, landmarks = est.one_face(image_rgb, bounding_boxes, landmarks)
          roll = est.find_roll(landmarks)
          pitch = est.find_pitch(landmarks)
          yaw = est.find_yaw(landmarks)
          lx = landmarks[0]
          ly = landmarks[5]
          rx = landmarks[1]
          ry = landmarks[6]

          '''
          bounding_boxes = bounding_boxes.astype(int)
          for (x, y, w, h) in bounding_boxes[0]:
            face_crop = image[y:y + h, x:x + w]

          self.faces_cropped.append(face_crop)
          print(face_crop)
          print(len(face_crop))
          '''
        else:
          roll = est.find_roll(landmarks)
          pitch = est.find_pitch(landmarks)
          yaw = est.find_yaw(landmarks)
          lx = landmarks[0]
          ly = landmarks[5]
          rx = landmarks[1]
          ry = landmarks[6]

          print(bounding_boxes)

          '''
          #bounding_boxes = bounding_boxes.astype(int)
          for (x, y, w, h) in bounding_boxes[0]:
            face_crop = image[y:y + h, x:x + w]

          print(face_crop)
          print(len(face_crop))
          self.faces_cropped.append(face_crop)
          '''

      else:
        print("NO FACE FOUND")
        roll = 0.0
        pitch = 0.0
        yaw = 0.0
        lx = 0.0
        ly = 0.0
        rx = 0.0
        ry = 0.0


      gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)# convert to rgb
      eyes = eyes_cascade.detectMultiScale(gray, 1.1, 5, minSize = [40,40])
      #print(eyes)

      # qui prendo un occhio alla volta, chiamo la extract_features
      # appendo in eyes_tensor direttamente le features estratte da entrambi gli occhi

      actual_eyes = []

      for (x, y, w, h) in eyes:

        eye_crop = image[y:y + h, x:x + w]

        cv2.rectangle(gray, (x, y), (x + w, y + h), (0, 255, 0), 2)

        eye_crop = TF.resize(Image.fromarray(eye_crop), size=[224, 224])
        eye_crop = TF.to_tensor(eye_crop)
        eye_crop = TF.normalize(eye_crop, [0.5], [0.5])

        features = self.extract_eyes_features(eye_crop)

        actual_eyes.append(features)

      # show the output image
      #if len(eyes) > 2:
      #  cv2_imshow(gray)

      #QUI DOBBIAMO COMBINARE LE FEATURES SALVATE IN ACTUAL EYES

      if len(actual_eyes) >= 1:
        if len(actual_eyes) == 1:
          final_features = actual_eyes[0]
        elif len(actual_eyes) > 1:
          final_features = actual_eyes[0] + actual_eyes[1]
          final_features = final_features/2
      else:
        final_features = torch.zeros(1, 2048, 1, 1)

      #print(final_features)
      print(final_features.size())

      #ESTRAZIONE HEAD POSE
      rpy = [roll, pitch, yaw]
      pupils = [lx, ly, rx, ry]

      self.eyes_features.append(final_features)
      self.roll_pitch_yaw.append(rpy)
      self.pupil_coords.append(pupils)


  def extract_eyes_features(self, eye):
    with torch.no_grad():
      # strip the last layer
      feature_extractor = torch.nn.Sequential(*list(self.model.children())[:-1])

      features = feature_extractor(eye.unsqueeze(0)) # output now has the features corresponding to input eye
      #print(features.shape)

    return features


  def process_data(self):

    print(len(self.labels))
    for i in range(len(self.labels)):
      j = 0
      eyes = []
      #print(len(self.eyes_features[i][0]))
      while j < 2048:
        eyes.append(self.eyes_features[i][0][j][0][0].numpy().tolist())
        j += 1
      head_pose = np.array(self.roll_pitch_yaw[i])
      pupil_pose = np.array(self.pupil_coords[i])

    #  QUI DOBBIAMO CREARE I SAMPLE DEL DATASET

      #print(len(eyes))
      self.sample_elements.append([np.array(eyes), np.array(head_pose), np.array(pupil_pose)])

    return self.sample_elements, self.labels

  def __len__(self):
    # returns the number of samples in our dataset
    return len(self.sample_elements)

  def __getitem__(self, i):
    return self.sample_elements[i]

In [None]:
data = dataset('/content/drive/MyDrive/NAPOLI/Dataset_volti_num_ultime.csv')

In [None]:
data.extract_eyestensor_headpose()

In [None]:
samples, labels = data.process_data()

In [None]:
#WRITE THE CSV FILE

#write the header
header = []
for i in range(2048):
  header.append("feature_"+str(i))

header.append("roll")
header.append("pitch")
header.append("yawn")
header.append("lx")
header.append("ly")
header.append("rx")
header.append("ry")
header.append("label")
print(header)
print(len(header))


f = "/content/drive/MyDrive/NAPOLI/featuresClassNet.csv"

with open(f, 'w') as csvfile:
  writer = csv.writer(csvfile)

  writer.writerow(header)

  j = 0
  for i in samples:
    m = np.concatenate((i[0], i[1], i[2], labels[j]), axis=None)
    j += 1

    writer.writerow(m)
    print(m)
    print(len(m))

    #print(len(i[0]))
    #print(len(i[1]))
  print("------------------------------")
  print(labels)


# **NN con features**

In [7]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/NAPOLI/features.csv", delimiter=',', header=0)

In [8]:
df.shape
print(df.shape)
print(df)

(906, 2056)
     feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0     0.124908   0.110323   0.090570   0.070253   0.069563   0.097745   
1     0.040399   0.099569   0.150302   0.008295   0.073564   0.049669   
2     0.062410   0.062787   0.170935   0.019593   0.029291   0.125141   
3     0.076095   0.096624   0.081543   0.003994   0.122282   0.130735   
4     0.056082   0.173097   0.077761   0.025570   0.066399   0.192416   
..         ...        ...        ...        ...        ...        ...   
901   0.016053   0.039013   0.024190   0.000000   0.062924   0.013192   
902   0.000000   0.066819   0.064590   0.000000   0.000000   0.038171   
903   0.001461   0.093059   0.045031   0.016148   0.007072   0.002039   
904   0.029964   0.091694   0.055376   0.029828   0.029236   0.056840   
905   0.023906   0.224821   0.141547   0.000098   0.002153   0.060928   

     feature_6  feature_7  feature_8  feature_9  ...  feature_2046  \
0     0.061416   0.173676   0.073394   0.

In [9]:
from torch.nn import Module, Conv2d, Linear, MaxPool2d, ReLU, LogSoftmax
from torch import flatten
# set the matplotlib backend so figures can be saved in the background
import matplotlib
matplotlib.use("Agg")
# import the necessary packages
from sklearn.metrics import classification_report
from torch.utils.data import random_split
import torch.utils.data as data_utils
from torchvision.transforms import ToTensor
from torchvision.datasets import KMNIST
from torch.optim import Adam, Adagrad
from torch import nn
import matplotlib.pyplot as plt
import numpy as np
import argparse
import torch
import time
from torchvision import transforms
from sklearn.preprocessing import normalize


class ClassNet(Module):

  def __init__(self, numChannels, classes):
		# call the parent constructor

    super(ClassNet, self).__init__()  #input(30, 1, 1, 2055) 7
		# initialize first set of CONV => RELU => POOL layers    [(I - K +2 *P) / S] +1
    self.conv1 = Conv2d(in_channels=numChannels, out_channels=20, kernel_size=(1, 3))  #(30, 20, 1, 2053) 5
    self.relu1 = ReLU()
    self.maxpool1 = MaxPool2d(kernel_size=(1, 2), stride=(2, 2))  #(30, 20, 1, 1026)   #nn.AdaptiveMaxPool2d
		# initialize second set of CONV => RELU => POOL layers
    self.conv2 = Conv2d(in_channels=20, out_channels=50, kernel_size=(1, 1))  #(30, 50, 1, 1022)  5
    self.relu2 = ReLU()
    self.maxpool2 = MaxPool2d(kernel_size=(1, 2), stride=(2, 2))  #(30, 50, 1, 511)
		# initialize first (and only) set of FC => RELU layers
    self.fc1 = Linear(in_features=50, out_features=50)  #30, 25550   25550
    self.relu3 = ReLU()
		# initialize our softmax classifier
    self.fc2 = Linear(in_features=50, out_features=classes)
    self.logSoftmax = LogSoftmax(dim=1)

  def forward(self, x):
		# pass the input through our first set of CONV => RELU => POOL layers
    x = self.conv1(x)
    x = self.relu1(x)
    x = self.maxpool1(x)
		# pass the output from the previous layer through the second
		# set of CONV => RELU => POOL layers
    x = self.conv2(x)
    x = self.relu2(x)
    x = self.maxpool2(x)
		# flatten the output from the previous layer and pass it
		# through our only set of FC => RELU layers
    x = flatten(x, 1)
    x = self.fc1(x)
    x = self.relu3(x)
    # pass the output to our softmax classifier to get our output predictions
    x = self.fc2(x)
    output = self.logSoftmax(x)
		# return the output predictions
    return output

In [10]:
k = np.zeros(9)
print(k)

k[3] = 1.
print(k)

[0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0.]


In [11]:
# define training hyperparameters
INIT_LR = 1e-3
BATCH_SIZE = 3
EPOCHS = 500
# define the train and val splits
TRAIN_SPLIT = 0.75
VAL_SPLIT = 1 - TRAIN_SPLIT
# set the device we will be using to train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df_shuffled = df.sample(frac=1).reset_index()

# Creating np arrays
target = df_shuffled['label'].values
#target = [str(i) for i in target]
features = df_shuffled[['roll', 'pitch', 'yawn', 'lx', 'ly', 'rx', 'ry']].values       #.drop(['label','index'], axis=1)

features_norm = normalize(features, norm='l2', axis=1, copy=True, return_norm=False)
print(features)


onek_target = []
for i in target:
  k = np.zeros(9)
  k[i-1] = 1.
  onek_target.append(k)

onek_target = np.array(onek_target)
print(onek_target)

features_tensor = torch.tensor(features_norm)
target_tensor = torch.tensor(onek_target)

# Passing to DataLoader
dataset = data_utils.TensorDataset(features_tensor, target_tensor)
#data_loader = data_utils.DataLoader(train, batch_size=10, shuffle=True)

print("[INFO] generating the train/validation split...")
numTrain = int(len(target) * TRAIN_SPLIT)+1
print(numTrain)
numVal = int(len(target) * VAL_SPLIT)
print(numVal)

(trainData, valData) = random_split(dataset, [numTrain, numVal], generator=torch.Generator().manual_seed(42))

trainDataLoader = data_utils.DataLoader(trainData, shuffle=True, batch_size=BATCH_SIZE)
valDataLoader = data_utils.DataLoader(valData, batch_size=BATCH_SIZE)

[[ 5.3335266e+00  1.1416677e+00 -4.4258120e+00 ...  4.0548608e+02
   3.4222220e+02  4.1081960e+02]
 [-2.0569763e+00  1.3807433e+00  7.0079895e+01 ...  4.7759680e+02
   4.0881192e+02  4.7553983e+02]
 [ 6.5351870e+00  1.0055187e+00 -5.8905640e+00 ...  4.3758722e+02
   3.8277118e+02  4.4412240e+02]
 ...
 [ 6.5588380e+00  1.4125171e+00 -4.9880676e+00 ...  4.4116254e+02
   3.6130273e+02  4.4772137e+02]
 [-7.3913574e-02  1.3413517e+00 -6.5923157e+00 ...  4.3330432e+02
   3.3666098e+02  4.3323040e+02]
 [ 7.8082275e-01  1.2093598e+00 -2.0735764e+01 ...  4.2114685e+02
   3.3470090e+02  4.2192767e+02]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[INFO] generating the train/validation split...
680
226


In [12]:
# initialize the model
print("[INFO] initializing the ClassNet model...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ClassNet(numChannels=1, classes=9).to(device)
# initialize our optimizer and loss function
opt = Adam(model.parameters(), lr=INIT_LR)   #Adam
lossFn = nn.MSELoss()

# measure how long training is going to take
print("[INFO] training the network...")
startTime = time.time()

[INFO] initializing the ClassNet model...
[INFO] training the network...


In [13]:
# loop over our epochs
for e in range(0, EPOCHS):
	# set the model in training mode
	model.train()
	# initialize the total training and validation loss
	totalTrainLoss = 0
	totalValLoss = 0
	# initialize the number of correct predictions in the training
	# and validation step
	trainCorrect = 0
	valCorrect = 0

  # loop over the training set
	for (x, y) in trainDataLoader:
		# send the input to the device
		x = x.to(dtype=torch.float32).unsqueeze(1).unsqueeze(1)
		y = y.to(dtype=torch.float32)
		#print(x.shape)
		#print(x,y)
		(x, y) = (x.to(device), y.to(device))
		# perform a forward pass and calculate the training loss
		pred = model(x)
		#print(y.size)
		#print(pred.size)
		#print(y)
		#print(pred)
		loss = lossFn(pred, y)
		# zero out the gradients, perform the backpropagation step,
		# and update the weights
		opt.zero_grad()
		loss.backward()
		opt.step()
		# add the loss to the total training loss so far and
		# calculate the number of correct predictions
		totalTrainLoss += loss
		print(pred.argmax(1))
		#print(y.argmax(1))
		trainCorrect += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()

  # switch off autograd for evaluation
	with torch.no_grad():
		# set the model in evaluation mode
		model.eval()
		# loop over the validation set
		for (x, y) in valDataLoader:
			x = x.to(dtype=torch.float32).unsqueeze(1).unsqueeze(1)
			y = y.to(dtype=torch.float32)
	 		# send the input to the device
			(x, y) = (x.to(device), y.to(device))
			# make the predictions and calculate the validation loss
			pred = model(x)
			totalValLoss += lossFn(pred, y)
			# calculate the number of correct predictions
			valCorrect += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()


	# calculate the training and validation accuracy
	trainCorrect = trainCorrect / len(trainDataLoader.dataset)
	valCorrect = valCorrect / len(valDataLoader.dataset)
	# update our training history

	print(trainCorrect)
	print(valCorrect)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
tensor([7, 1, 2], device='cuda:0')
tensor([1, 5, 3], device='cuda:0')
tensor([8, 7, 6], device='cuda:0')
tensor([4, 4, 4], device='cuda:0')
tensor([0, 3, 6], device='cuda:0')
tensor([1, 6, 8], device='cuda:0')
tensor([8, 5, 4], device='cuda:0')
tensor([3, 3, 7], device='cuda:0')
tensor([5, 7, 3], device='cuda:0')
tensor([7, 6, 7], device='cuda:0')
tensor([3, 8, 8], device='cuda:0')
tensor([1, 6, 5], device='cuda:0')
tensor([4, 3, 4], device='cuda:0')
tensor([3, 6, 5], device='cuda:0')
tensor([1, 3, 4], device='cuda:0')
tensor([3, 3, 4], device='cuda:0')
tensor([8, 4, 2], device='cuda:0')
tensor([6, 4, 6], device='cuda:0')
tensor([5, 2, 2], device='cuda:0')
tensor([8, 4, 3], device='cuda:0')
tensor([4, 4, 4], device='cuda:0')
tensor([8, 6, 3], device='cuda:0')
tensor([7, 0, 7], device='cuda:0')
tensor([4, 8, 5], device='cuda:0')
tensor([7, 6, 2], device='cuda:0')
tensor([8, 8, 1], device='cuda:0')
tensor([5, 7, 2], device=