In [1]:
from __future__ import print_function, division

import math
#import os
import torch
import pandas as pd
from skimage import transform
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import cv2
from tqdm import tqdm

In [2]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [3]:
class GazeEstimationDataset(Dataset):
    """Full-face gaze dataset indexed by a CSV file.

    Rows of the CSV are read by position:
      * column 0 - image path, handed to ``cv2.imread`` unchanged;
      * columns 1 and 2 - gaze target ``x`` and ``y``, returned as stored;
      * column 3 - face box, a bracketed comma-separated integer string;
      * columns 4 and 5 - used for the optical-flow field (see the NOTE in
        ``__getitem__``).

    Args:
        csv_file: path of the CSV index, loaded with ``pandas.read_csv``.
        root_dir: stored on the instance; it is not used to resolve image paths.
        trans: optional callable applied to the sample dict before returning it.
    """

    def __init__(self, csv_file, root_dir, trans=None):
        self.root_dir = root_dir
        self.trans = trans
        self.frame = pd.read_csv(csv_file)

    def __len__(self):
        """Number of rows in the CSV index."""
        return len(self.frame)

    @staticmethod
    def _parse_bracketed_ints(text):
        """Turn a string such as ``"[1, 2, 3]"`` into a 1-D integer array."""
        # Drop the first and last character (the brackets) before parsing.
        return np.fromstring(text[1:-1], sep=',', dtype=int)

    def __getitem__(self, idx):
        """Return a dict with keys ``'face'``, ``'opt_flow'``, ``'x'``, ``'y'``.

        Raises:
            FileNotFoundError: if ``cv2.imread`` cannot read the image path.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = self.frame.iloc[idx, 0]
        img = cv2.imread(img_name)
        if img is None:
            # cv2.imread signals failure by returning None; fail here with the
            # offending path instead of deep inside cv2.normalize.
            raise FileNotFoundError(f"cv2.imread could not read image {img_name!r}")
        # Min-max scale pixel values into [0, 1] as float32.
        img_norm = cv2.normalize(img, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)

        box = self._parse_bracketed_ints(self.frame.iloc[idx, 3])
        # NOTE(review): box[0]/box[2] index rows here, whereas
        # GazeEstimationDatasetEyes uses roi[0]/roi[2] for columns - confirm the
        # stored order of the face box.
        face_img = img_norm[box[0]: box[0] + box[2],
                            box[1]: box[1] + box[3], :]

        # NOTE(review): if column 5 holds a plain string, ``[0]`` selects a single
        # character and the slice below is empty, so this array has no elements.
        # The slice end is also computed from column 4, not column 5. Left
        # unchanged because the intended column format is not visible here -
        # TODO confirm.
        flow_field = self.frame.iloc[idx, 5]
        slice_end = int(len(self.frame.iloc[idx, 4]) - 1)
        opt_flow_face = np.fromstring(flow_field[0][1:slice_end], sep=',', dtype=int)

        sample = {'face': face_img, 'opt_flow': opt_flow_face,
                  'x': self.frame.iloc[idx, 1], 'y': self.frame.iloc[idx, 2]}
        if self.trans:
            sample = self.trans(sample)
        return sample


class Rescale(object):
    """Resize the ``'face'`` image of a sample to a fixed size.

    Args:
        output_size: an ``int`` produces a square ``output_size x output_size``
            image; a ``(height, width)`` tuple produces exactly that shape.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def _target_shape(self):
        """Return the requested ``(height, width)`` as a pair of ints."""
        if isinstance(self.output_size, tuple):
            # A tuple passes the constructor check, so it has to be usable here.
            new_h, new_w = self.output_size
        else:
            new_h = new_w = self.output_size
        return int(new_h), int(new_w)

    def __call__(self, sample):
        """Return a new sample dict whose ``'face'`` entry has been resized.

        ``'opt_flow'``, ``'x'`` and ``'y'`` are copied over unchanged.
        """
        img = transform.resize(sample['face'], self._target_shape())
        return {'face': img, 'opt_flow': sample['opt_flow'],
                'x': sample['x'], 'y': sample['y']}

class ToTensor(object):
    """Convert a face sample made of NumPy data into float64 tensors.

    The gaze target is rescaled as ``(value + offset) / scale``. With the
    defaults (800 and 1600) a coordinate in [-800, 800] maps into [0, 1].

    Args:
        offset: value added to each raw coordinate before scaling.
        scale: divisor applied after the offset.
    """

    def __init__(self, offset=800, scale=1600):
        self.offset = offset
        self.scale = scale

    def __call__(self, sample):
        """Return a dict with tensor entries ``'face'``, ``'opt_flow'``, ``'gt_coor'``."""
        # Move the last image axis to the front (channels-first layout).
        img = sample['face'].transpose((2, 0, 1))
        x = (sample['x'] + self.offset) / self.scale
        y = (sample['y'] + self.offset) / self.scale
        return {'face': torch.from_numpy(img).to(torch.float64),
                'opt_flow': torch.from_numpy(sample['opt_flow']).to(torch.float64),
                # Build the target directly in float64: going through the default
                # float32 first would round the coordinates before the cast.
                'gt_coor': torch.tensor([x, y], dtype=torch.float64)}

"""
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 15, 3)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(15, 5, 8)
        self.pool2 = nn.MaxPool2d(3,3)
        self.conv3 = nn.Conv2d(5, 3, 16)
        self.fc1 = nn.Linear(59536, 120)
        self.d1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(120, 84)
        self.d2 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(84, 2)

    def forward(self, x):
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool1(F.relu(self.conv2(x)))
        x = self.pool2(F.relu(self.conv3(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
"""

class NetFace(nn.Module):
    """Small CNN that regresses a 2-value gaze target from a face image.

    Two conv + max-pool stages are followed by three fully connected layers.
    The first linear layer expects 59536 = 16 * 61 * 61 features, which is the
    flattened conv output for a 3 x 256 x 256 input.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.fc1 = nn.Linear(in_features=59536, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=2)

    def forward(self, x):
        """Map a batch of images to a ``(batch, 2)`` tensor of predictions."""
        # Convolutional feature extractor: conv -> ReLU -> shared 2x2 max-pool.
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        # Keep the batch axis, collapse everything else.
        x = x.flatten(start_dim=1)
        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))
        # Output layer has no activation.
        return self.fc3(x)

In [4]:
class GazeEstimationDatasetEyes(Dataset):
    """Eye-region gaze dataset indexed by a CSV file.

    Rows of the CSV are read by position:
      * column 0 - image path, handed to ``cv2.imread`` unchanged;
      * columns 1 and 2 - gaze target ``x`` and ``y``, rescaled here with
        ``(value + 800) / 1600``;
      * column 4 - eye box, a bracketed comma-separated string read as
        ``x, y, w, h``;
      * column 5 - used for the optical-flow field (see the NOTE in
        ``__getitem__``).

    Args:
        csv_file: path of the CSV index, loaded with ``pandas.read_csv``.
        root_dir: stored on the instance; it is not used to resolve image paths.
        trans: optional callable applied to the sample dict before returning it.
    """

    def __init__(self, csv_file, root_dir, trans=None):
        self.frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.trans = trans

    def __len__(self):
        """Number of rows in the CSV index."""
        return len(self.frame)

    @staticmethod
    def _parse_bracketed_ints(text):
        """Turn a string such as ``"[1, 2, 3]"`` into a 1-D integer array."""
        # Drop the first and last character (the brackets) before parsing.
        return np.fromstring(text[1:-1], sep=',', dtype=int)

    def __getitem__(self, idx):
        """Return a dict with keys ``'eyes_img'``, ``'opt_flow'``, ``'x'``, ``'y'``.

        Raises:
            FileNotFoundError: if ``cv2.imread`` cannot read the image path.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = self.frame.iloc[idx, 0]
        roi_text = self.frame.iloc[idx, 4]

        # NOTE(review): if column 5 holds a plain string, ``[1]`` selects a single
        # character and the slice below is empty, so this array has no elements.
        # The slice end is also computed from column 4, not column 5. Left
        # unchanged because the intended column format is not visible here -
        # TODO confirm.
        flow_field = self.frame.iloc[idx, 5]
        opt_flow_eyes = np.fromstring(flow_field[1][1:int(len(roi_text) - 1)],
                                      sep=',', dtype=int)

        img = cv2.imread(img_name)
        if img is None:
            # cv2.imread signals failure by returning None; fail here with the
            # offending path instead of deep inside cv2.normalize.
            raise FileNotFoundError(f"cv2.imread could not read image {img_name!r}")
        # Min-max scale pixel values into [0, 1] as float32.
        img_norm = cv2.normalize(img, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)

        eyes_roi = self._parse_bracketed_ints(roi_text)
        x_eye, y_eye, w, h = eyes_roi[0], eyes_roi[1], eyes_roi[2], eyes_roi[3]
        # Rows are indexed by y, columns by x.
        eyes_img = img_norm[y_eye: y_eye + h, x_eye: x_eye + w]

        # With these constants a raw coordinate in [-800, 800] maps into [0, 1].
        x_gt = (self.frame.iloc[idx, 1] + 800) / 1600
        y_gt = (self.frame.iloc[idx, 2] + 800) / 1600

        sample = {'eyes_img': eyes_img, 'opt_flow': opt_flow_eyes, 'x': x_gt, 'y': y_gt}

        if self.trans:
            sample = self.trans(sample)

        return sample

class RescaleEyes(object):
    """Resize the ``'eyes_img'`` image of a sample to a fixed size.

    Args:
        output_size: an ``int`` produces a square ``output_size x output_size``
            image; a ``(height, width)`` tuple produces exactly that shape.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def _target_shape(self):
        """Return the requested ``(height, width)`` as a pair of ints."""
        if isinstance(self.output_size, tuple):
            # A tuple passes the constructor check, so it has to be usable here.
            new_h, new_w = self.output_size
        else:
            new_h = new_w = self.output_size
        return int(new_h), int(new_w)

    def __call__(self, sample):
        """Return a new sample dict whose ``'eyes_img'`` entry has been resized.

        ``'opt_flow'``, ``'x'`` and ``'y'`` are copied over unchanged.
        """
        img = transform.resize(sample['eyes_img'], self._target_shape())
        return {'eyes_img': img, 'opt_flow': sample['opt_flow'],
                'x': sample['x'], 'y': sample['y']}

class ToTensorEyes(object):
    """Convert an eye sample made of NumPy data into float64 tensors.

    Unlike the face pipeline, ``'x'`` and ``'y'`` are used exactly as they
    arrive in the sample; no rescaling happens here.
    """

    def __call__(self, sample):
        """Return a dict with tensor entries ``'eyes_img'``, ``'opt_flow'``, ``'gt_coor'``."""
        # Move the last image axis to the front (channels-first layout).
        img = sample['eyes_img'].transpose((2, 0, 1))
        return {'eyes_img': torch.from_numpy(img).to(torch.float64),
                'opt_flow': torch.from_numpy(sample['opt_flow']).to(torch.float64),
                # Build the target directly in float64: going through the default
                # float32 first would round the coordinates before the cast.
                'gt_coor': torch.tensor([sample['x'], sample['y']], dtype=torch.float64)}

class NetEyes(nn.Module):
    """Small CNN that regresses a 2-value gaze target from an eye-region image.

    Two conv + max-pool stages are followed by three fully connected layers.
    The first linear layer expects 13456 = 16 * 29 * 29 features, which is the
    flattened conv output for a 3 x 128 x 128 input.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(13456, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 2)

    def forward(self, x1, x2=None):
        """Predict gaze coordinates from a batch of eye images.

        Args:
            x1: batch of eye images.
            x2: optional second input. No layer consumes it; when supplied it is
                handed back untouched next to the prediction.

        Returns:
            The ``(batch, 2)`` prediction when ``x2`` is omitted, otherwise the
            tuple ``(prediction, x2)``.
        """
        x1 = self.pool(F.relu(self.conv1(x1)))
        x1 = self.pool(F.relu(self.conv2(x1)))
        x1 = torch.flatten(x1, 1)  # flatten all dimensions except batch
        x1 = F.relu(self.fc1(x1))
        x1 = F.relu(self.fc2(x1))
        x1 = self.fc3(x1)

        if x2 is None:
            # Single-input call: return a plain tensor so the result can be fed
            # straight into a loss function.
            return x1
        return x1, x2


In [5]:
# One CSV index feeds every dataset variant below.
csv_path = "full_face/total_opt_flow.csv"

# Eye samples with no transform attached.
dataset = GazeEstimationDatasetEyes(csv_file=csv_path, root_dir="")

# Face samples: resize to 256, then convert to tensors.
face_pipeline = transforms.Compose([Rescale(256), ToTensor()])
transformed_dataset = GazeEstimationDataset(csv_file=csv_path, root_dir="", trans=face_pipeline)

# Eye samples: resize to 128, then convert to tensors.
eyes_pipeline = transforms.Compose([RescaleEyes(128), ToTensorEyes()])
transformed_dataset_eyes = GazeEstimationDatasetEyes(csv_file=csv_path, root_dir="", trans=eyes_pipeline)

Datasets containing full-face pictures

In [11]:
# Seed the split so train/val/test membership is identical on every run.
# Without it, a restarted kernel draws a new split, and a checkpoint loaded
# from disk would be evaluated on images it was trained on.
SPLIT_SEED = 0
split_rng = torch.Generator().manual_seed(SPLIT_SEED)

# 80 % train+val / 20 % test.
train_size = int(0.8 * len(transformed_dataset))
test_size = len(transformed_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    transformed_dataset, [train_size, test_size], generator=split_rng)

# Of the remaining part, 95 % train / 5 % validation.
train_size = int(0.95 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    train_dataset, [train_size, val_size], generator=split_rng)

In [12]:
batch_size = 15

# Training and validation batches are shuffled; the test loader keeps dataset order.
trainloader = DataLoader(dataset=train_dataset, batch_size=batch_size,
                         shuffle=True, num_workers=0)
valloader = DataLoader(dataset=val_dataset, batch_size=batch_size,
                       shuffle=True, num_workers=0)
testloader = DataLoader(dataset=test_dataset, batch_size=batch_size,
                        shuffle=False, num_workers=0)

# Face model on the selected device, with float64 parameters.
net = NetFace().to(device).double()
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(params=net.parameters(), lr=0.001)

In [13]:
n = 7  # number of training epochs
min_valid_loos = np.inf  # never updated below; no best-checkpoint tracking happens

# Batches are built from NumPy arrays on the CPU; move each one to the device
# that holds the model weights so the forward pass also works on a GPU.
model_device = next(net.parameters()).device

for epoch in range(n):
    # The validation pass below switches the model to eval mode, so training
    # mode has to be restored at the start of every epoch.
    net.train()
    with tqdm(trainloader, unit="batch") as tepoch:
        running_loss = 0.0
        for data in tepoch:
            tepoch.set_description(f"Training {epoch}")
            faces = data['face'].to(model_device)
            labels = data['gt_coor'].to(model_device)

            optimizer.zero_grad()

            output = net(faces)

            train_loss = criterion(output, labels)
            train_loss.backward()
            optimizer.step()

            # Mean Euclidean distance between prediction and target over the
            # batch; detached because it is only a progress metric.
            distance = torch.linalg.norm(output.detach() - labels, dim=1).mean().item()
            running_loss += train_loss.item()
            tepoch.set_postfix(train_loss=train_loss.item(), distance=100*distance)

    net.eval()
    with tqdm(valloader, unit="batch") as tepoch:
        with torch.no_grad():
            for data in tepoch:
                tepoch.set_description(f"Validation {epoch}")
                faces = data['face'].to(model_device)
                labels = data['gt_coor'].to(model_device)
                output = net(faces)
                loss = criterion(output, labels)
                # Per-batch value (mean loss times batch size), not an epoch
                # total; the progress bar ends up showing the last batch.
                val_loss = loss.item()*faces.size(0)
                # Despite the 'val_acc' label this is a mean distance (lower is better).
                val_dist = torch.linalg.norm(output - labels, dim=1).mean().item()
                tepoch.set_postfix(val_loss=val_loss, val_acc=100*val_dist)


print('done')
path = './trained_face.pth'
torch.save(net.state_dict(), path)

Training 0: 100%|██████████| 212/212 [09:13<00:00,  2.61s/batch, distance=14.8, train_loss=0.0128] 
Validation 0: 100%|██████████| 12/12 [00:18<00:00,  1.57s/batch, val_acc=15.2, val_loss=0.0361]
Training 1: 100%|██████████| 212/212 [08:15<00:00,  2.34s/batch, distance=7.01, train_loss=0.00278]
Validation 1: 100%|██████████| 12/12 [00:19<00:00,  1.60s/batch, val_acc=12.3, val_loss=0.0315]
Training 2: 100%|██████████| 212/212 [08:15<00:00,  2.34s/batch, distance=9.33, train_loss=0.0063] 
Validation 2: 100%|██████████| 12/12 [00:23<00:00,  1.92s/batch, val_acc=5.72, val_loss=0.00511]
Training 3: 100%|██████████| 212/212 [08:19<00:00,  2.36s/batch, distance=6.38, train_loss=0.00232] 
Validation 3: 100%|██████████| 12/12 [00:19<00:00,  1.58s/batch, val_acc=7.55, val_loss=0.00929]
Training 4: 100%|██████████| 212/212 [08:17<00:00,  2.34s/batch, distance=6.81, train_loss=0.00279]
Validation 4: 100%|██████████| 12/12 [00:20<00:00,  1.74s/batch, val_acc=6.73, val_loss=0.00725]
Training 5: 100%

done





In [15]:
total = 0    # not used below
correct = 0  # not used below
path = './trained_face.pth'

# Load the weights onto whichever device the model currently lives on, so a
# checkpoint written on a GPU can also be read on a CPU-only machine.
model_device = next(net.parameters()).device
net.load_state_dict(torch.load(path, map_location=model_device))
net.eval()  # evaluation must not depend on the mode the previous cell left behind

distances = []  # one mean Euclidean distance per test batch
with torch.no_grad():
    with tqdm(testloader, unit="batch") as tepoch:
        for data in tepoch:
            image = data['face'].to(model_device)
            labels = data['gt_coor'].to(model_device)
            output = net(image)

            # Mean Euclidean distance between prediction and target in this batch.
            distance = torch.linalg.norm(output - labels, dim=1).mean().item()
            distances.append(distance)
            tepoch.set_postfix(distance=100*distance)
print("done")
print(distances)

100%|██████████| 56/56 [01:50<00:00,  1.98s/batch, distance=3.36]

done
[0.05949966309459023, 0.05008026359080435, 0.049768659299329845, 0.04446968858562382, 0.06018011717998062, 0.06520673701226878, 0.054877263733902705, 0.05257904073971813, 0.0654016836113503, 0.04631345636011396, 0.037437151873058856, 0.05287306299666801, 0.06530292640592636, 0.05040494480239445, 0.0625805197268185, 0.041152478204878454, 0.05512800461471632, 0.05201234359598573, 0.054557489941675294, 0.056298208386637634, 0.052464191443160585, 0.05000741437636013, 0.06917310803758644, 0.0524028067617749, 0.06694999963731285, 0.04598571104888542, 0.05056822905929117, 0.06839889129801113, 0.0596348074286696, 0.050054309815522016, 0.048046268244693234, 0.048398270123249106, 0.04453860711864628, 0.04768868600521359, 0.04796741313275347, 0.08302831311896339, 0.05876924737572374, 0.0480315858407602, 0.04482895411587731, 0.06476792094868924, 0.04908412966978647, 0.049750267276777685, 0.052077996718446144, 0.04105272163071614, 0.04605363337644323, 0.06619465842824361, 0.06746140996674176, 




Dataset containing only the eye region (40 FPS)

In [6]:
# Seed the split so train/val/test membership is identical on every run.
# Without it, a restarted kernel draws a new split, and a checkpoint loaded
# from disk would be evaluated on images it was trained on.
SPLIT_SEED_EYES = 0
split_rng_eyes = torch.Generator().manual_seed(SPLIT_SEED_EYES)

# 80 % train+val / 20 % test.
train_size = int(0.8 * len(transformed_dataset_eyes))
test_size = len(transformed_dataset_eyes) - train_size
train_dataset_eyes, test_dataset_eyes = torch.utils.data.random_split(
    transformed_dataset_eyes, [train_size, test_size], generator=split_rng_eyes)

# Of the remaining part, 95 % train / 5 % validation.
train_size = int(0.95 * len(train_dataset_eyes))
val_size = len(train_dataset_eyes) - train_size
train_dataset_eyes, val_dataset_eyes = torch.utils.data.random_split(
    train_dataset_eyes, [train_size, val_size], generator=split_rng_eyes)

In [7]:
batch_size = 15

# Training and validation batches are shuffled; the test loader keeps dataset order.
trainloader_eyes = DataLoader(dataset=train_dataset_eyes, batch_size=batch_size,
                              shuffle=True, num_workers=0)
valloader_eyes = DataLoader(dataset=val_dataset_eyes, batch_size=batch_size,
                            shuffle=True, num_workers=0)
testloader_eyes = DataLoader(dataset=test_dataset_eyes, batch_size=batch_size,
                             shuffle=False, num_workers=0)

# Eye model on the selected device, with float64 parameters.
net = NetEyes().to(device).double()
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(params=net.parameters(), lr=0.001)

In [8]:
n = 7  # number of training epochs
min_valid_loos = np.inf  # never updated below; no best-checkpoint tracking happens

# Batches are built from NumPy arrays on the CPU; move each one to the device
# that holds the model weights so the forward pass also works on a GPU.
model_device = next(net.parameters()).device

for epoch in range(n):
    # The validation pass below switches the model to eval mode, so training
    # mode has to be restored at the start of every epoch.
    net.train()
    with tqdm(trainloader_eyes, unit="batch") as tepoch:
        running_loss = 0.0
        for data in tepoch:
            tepoch.set_description(f"Training {epoch}")
            inputs = data['eyes_img'].to(model_device)
            labels = data['gt_coor'].to(model_device)
            optical_flow = data['opt_flow'].to(model_device)

            optimizer.zero_grad()

            # Given two inputs, NetEyes returns (prediction, optical_flow);
            # only the prediction goes into the loss.
            output, _ = net(inputs, optical_flow)

            train_loss = criterion(output, labels)
            train_loss.backward()
            optimizer.step()

            # Mean Euclidean distance between prediction and target over the
            # batch; detached because it is only a progress metric.
            distance = torch.linalg.norm(output.detach() - labels, dim=1).mean().item()
            running_loss += train_loss.item()
            tepoch.set_postfix(train_loss=train_loss.item(), distance=100*distance)

    net.eval()
    with tqdm(valloader_eyes, unit="batch") as tepoch:
        with torch.no_grad():
            for data in tepoch:
                tepoch.set_description(f"Validation {epoch}")
                inputs = data['eyes_img'].to(model_device)
                labels = data['gt_coor'].to(model_device)
                optical_flow = data['opt_flow'].to(model_device)
                output, _ = net(inputs, optical_flow)
                loss = criterion(output, labels)
                # Per-batch value (mean loss times batch size), not an epoch
                # total; the progress bar ends up showing the last batch.
                val_loss = loss.item()*inputs.size(0)
                # Despite the 'val_acc' label this is a mean distance (lower is better).
                val_dist = torch.linalg.norm(output - labels, dim=1).mean().item()
                tepoch.set_postfix(val_loss=val_loss, val_acc=100*val_dist)


print('done')
path = './trained_eyes.pth'
torch.save(net.state_dict(), path)

Training 0: 100%|██████████| 212/212 [03:13<00:00,  1.10batch/s, distance=27.9, train_loss=0.0491]
Validation 0: 100%|██████████| 12/12 [00:08<00:00,  1.48batch/s, val_acc=33.1, val_loss=0.171]
Training 1: 100%|██████████| 212/212 [02:57<00:00,  1.20batch/s, distance=17.2, train_loss=0.0172] 
Validation 1: 100%|██████████| 12/12 [00:07<00:00,  1.64batch/s, val_acc=16.6, val_loss=0.0697]
Training 2: 100%|██████████| 212/212 [02:54<00:00,  1.21batch/s, distance=13.4, train_loss=0.0199] 
Validation 2: 100%|██████████| 12/12 [00:07<00:00,  1.64batch/s, val_acc=11.5, val_loss=0.0228]
Training 3: 100%|██████████| 212/212 [02:53<00:00,  1.22batch/s, distance=14.5, train_loss=0.0139] 
Validation 3: 100%|██████████| 12/12 [00:07<00:00,  1.70batch/s, val_acc=6.84, val_loss=0.0101]
Training 4: 100%|██████████| 212/212 [02:54<00:00,  1.22batch/s, distance=20.1, train_loss=0.0293] 
Validation 4: 100%|██████████| 12/12 [00:07<00:00,  1.63batch/s, val_acc=7.96, val_loss=0.0116]
Training 5: 100%|█████

done





In [10]:
total = 0    # not used below
correct = 0  # not used below
path = './trained_eyes.pth'

# Load the weights onto whichever device the model currently lives on, so a
# checkpoint written on a GPU can also be read on a CPU-only machine.
model_device = next(net.parameters()).device
net.load_state_dict(torch.load(path, map_location=model_device))
net.eval()  # evaluation must not depend on the mode the previous cell left behind

distances = []  # one mean Euclidean distance per test batch
with torch.no_grad():
    with tqdm(testloader_eyes, unit="batch") as tepoch:
        for data in tepoch:
            image = data['eyes_img'].to(model_device)
            labels = data['gt_coor'].to(model_device)
            optical_flow = data['opt_flow'].to(model_device)
            # Given two inputs, NetEyes returns (prediction, optical_flow);
            # only the prediction is scored.
            output, _ = net(image, optical_flow)

            # Mean Euclidean distance between prediction and target in this batch.
            distance = torch.linalg.norm(output - labels, dim=1).mean().item()
            distances.append(distance)
            tepoch.set_postfix(distance=100*distance)
print("done")
print(distances)

100%|██████████| 56/56 [01:03<00:00,  1.13s/batch, distance=11.4]

done
[0.12739867741249106, 0.09682358642166573, 0.08164905252296582, 0.09630584700019643, 0.09587479534256883, 0.11953552710988823, 0.0636588508440594, 0.09676803806236373, 0.11630367745271228, 0.11247561559901084, 0.11312693093211175, 0.11385132228717781, 0.09680331447751142, 0.0957849750326049, 0.07190886332558279, 0.06707805867741085, 0.09860276822285488, 0.10981700551410885, 0.11797860999200772, 0.10199024381985942, 0.08362565417991831, 0.09790807099261754, 0.06609095476282197, 0.10382791859815178, 0.08434410285654992, 0.10175774664616283, 0.07944261670650951, 0.09306411229481222, 0.09180763790448282, 0.11642367890964543, 0.0844357677636685, 0.10486390031321458, 0.10970180833165312, 0.16795082698862393, 0.12877746632424933, 0.10772881507813083, 0.1005329992585542, 0.09109110041059862, 0.08288591814377742, 0.07724711224557818, 0.1399541185519757, 0.1074644732561188, 0.10683463557550178, 0.11445796315709313, 0.09572891366128175, 0.1485456672980298, 0.08762541677775527, 0.088449074554


