In [16]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [17]:
!pip install tensorboardX
!pip install --quiet pytorch-lightning>=1.5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [18]:
import os
import collections
import numpy as np
import random

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.loggers.csv_logs import CSVLogger

%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

import sys
from tensorboardX import SummaryWriter
import torch
import torch.nn as nn
import torch.utils.data as data
from torch.optim.lr_scheduler import ExponentialLR
import numpy as np
import time
import shutil
import time
import datetime
import argparse
import os
import torch.nn.init as init
import torch.nn.functional as F
from math import floor
from math import ceil
import math

In [19]:
os.chdir('/content/gdrive/MyDrive/gaze_project')
os.getcwd()

'/content/gdrive/MyDrive/gaze_project'

In [20]:
# create dataset
class MyDataset(data.Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __getitem__(self, index):
        feature, target = self.features[index], self.labels[index]
        return feature, target
    
    def __len__(self):
        return len(self.features)

    
# load data.    
def LoadData(dataset_dir, batch_size):

    print("\nLoading the training dataset")
    trainingX = torch.from_numpy(np.load(dataset_dir + 'trainingX.npy')).float()
    trainingY = torch.from_numpy(np.load(dataset_dir + 'trainingY.npy')).float()    
    print('\nTraining Data Size: {}'.format(list(trainingX.size())))

    train_dataset = MyDataset(trainingX, trainingY)

    train_loader = data.DataLoader(dataset=train_dataset, num_workers=8, batch_size=batch_size, shuffle=True)

    print("\nLoading the testing dataset")
    testX = torch.from_numpy(np.load(dataset_dir + 'testX.npy')).float()
    testY = torch.from_numpy(np.load(dataset_dir + 'testY.npy')).float()

    test_size = testX.size()
    test_new_size = test_size[0]//2

    test_X = testX[0:test_new_size, :]
    test_Y = testY[0:test_new_size, :]

    print('\nTest Data Size: {}'.format(list(test_X.size())))
    test_dataset = MyDataset(test_X, test_Y)

    test_loader = data.DataLoader(dataset=test_dataset, num_workers=8, batch_size=batch_size, shuffle=False)

    print("\nLoading the validation dataset")
    validationX = testX[test_new_size:, :]
    validationY = testY[test_new_size:, :]

    print('\nValidation Data Size: {}'.format(list(validationX.size())))
    validation_dataset = MyDataset(validationX, validationY)

    validation_loader = data.DataLoader(dataset=validation_dataset, num_workers=8, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader, validation_loader

In [21]:
def scaled_dot_product(q, k, v, mask=None):
    d_k = q.size()[-1]
    attn_logits = torch.matmul(q, k.transpose(-2, -1))
    attn_logits = attn_logits / math.sqrt(d_k)
    if mask is not None:
        attn_logits = attn_logits.masked_fill(mask == 0, -9e15)
    attention = F.softmax(attn_logits, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention

class MultiheadAttention(nn.Module):
    
    def __init__(self, input_dim, embed_dim, n_heads):
        super().__init__()
        assert embed_dim % n_heads == 0, "Embedding dimension must be 0 modulo number of heads."
        
        self.embed_dim = embed_dim
        self.n_heads = n_heads
        self.head_dim = embed_dim // n_heads
        
        self.qkv_proj = nn.Linear(input_dim, 3*embed_dim)
        self.o_proj = nn.Linear(embed_dim, embed_dim)
        
        self._reset_parameters()

    def _reset_parameters(self):
        nn.init.xavier_uniform_(self.qkv_proj.weight)
        self.qkv_proj.bias.data.fill_(0)
        nn.init.xavier_uniform_(self.o_proj.weight)
        self.o_proj.bias.data.fill_(0)

    def forward(self, x, mask=None, return_attention=False):
        batch_size, seq_length, embed_dim = x.size()
        qkv = self.qkv_proj(x)
        
        qkv = qkv.reshape(batch_size, seq_length, self.n_heads, 3*self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3) # [Batch, Head, SeqLen, Dims]
        q, k, v = qkv.chunk(3, dim=-1)
        
        values, attention = scaled_dot_product(q, k, v, mask=mask)
        values = values.permute(0, 2, 1, 3) # [Batch, SeqLen, Head, Dims]
        values = values.reshape(batch_size, seq_length, embed_dim)
        o = self.o_proj(values)
        
        if return_attention:
            return o, attention
        else:
            return o

In [22]:
class EncoderBlock(nn.Module):
    
    def __init__(self, input_dim, n_heads, dim_ff, dropout=0.0):

        super().__init__()
        
        # Attention layer
        self.self_attn = MultiheadAttention(input_dim, input_dim, n_heads)
        
        # Two-layer MLP
        self.linear_net = nn.Sequential(
            nn.Linear(input_dim, dim_ff),
            nn.Dropout(dropout),
            nn.ReLU(inplace=True),
            nn.Linear(dim_ff, input_dim)
        )
        
        # Layers to apply in between the main layers
        self.norm1 = nn.LayerNorm(input_dim)
        self.norm2 = nn.LayerNorm(input_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):

        attn_out = self.self_attn(x, mask=mask)
        x = x + self.dropout(attn_out)
        x = self.norm1(x)
        
        linear_out = self.linear_net(x)
        x = x + self.dropout(linear_out)
        x = self.norm2(x)
        
        return x

class TransformerEncoder(nn.Module):
    
    def __init__(self, n_layers, **block_args):
        super().__init__()
        self.layers = nn.ModuleList([EncoderBlock(**block_args) for _ in range(n_layers)])

    def forward(self, x, mask=None):
        for l in self.layers:
            x = l(x, mask=mask)
        return x

class PositionalEncoding(nn.Module):

    def __init__(self, d_model, max_len=5000):

        super().__init__()

        pos_enc = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pos_enc[:, 0::2] = torch.sin(position * div_term)
        pos_enc[:, 1::2] = torch.cos(position * div_term)
        pos_enc = pos_enc.unsqueeze(0)
        
        self.register_buffer('pos_enc', pos_enc, persistent=False)

    def forward(self, x):
        x = x + self.pos_enc[:, :x.size(1)]
        return x

In [23]:
class PatchEmbed(nn.Module):

    def __init__(self, img_size, patch_size, in_chans, embed_dim):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.n_patches = (img_size // patch_size) ** 2


        self.proj = nn.Conv2d(
                in_chans,
                embed_dim,
                kernel_size=patch_size,
                stride=patch_size,
        )

    def forward(self, x):

        x = self.proj(
                x
            )  
        x = x.flatten(2)  
        x = x.transpose(1, 2)

        return x


class Attention(nn.Module):

    def __init__(self, dim, n_heads=12, qkv_bias=True, attn_p=0., proj_p=0.):
        super().__init__()
        self.n_heads = n_heads
        self.dim = dim
        self.head_dim = dim // n_heads
        self.scale = self.head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_p)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_p)

    def forward(self, x):

        n_samples, n_tokens, dim = x.shape

        if dim != self.dim:
            raise ValueError

        qkv = self.qkv(x)  # (n_samples, n_patches + 1, 3 * dim)
        qkv = qkv.reshape(
                n_samples, n_tokens, 3, self.n_heads, self.head_dim
        )  # (n_smaples, n_patches + 1, 3, n_heads, head_dim)
        qkv = qkv.permute(
                2, 0, 3, 1, 4
        )  # (3, n_samples, n_heads, n_patches + 1, head_dim)

        q, k, v = qkv[0], qkv[1], qkv[2]
        k_t = k.transpose(-2, -1)  # (n_samples, n_heads, head_dim, n_patches + 1)
        dp = (
           q @ k_t
        ) * self.scale # (n_samples, n_heads, n_patches + 1, n_patches + 1)
        attn = dp.softmax(dim=-1)  # (n_samples, n_heads, n_patches + 1, n_patches + 1)
        attn = self.attn_drop(attn)

        weighted_avg = attn @ v  # (n_samples, n_heads, n_patches +1, head_dim)
        weighted_avg = weighted_avg.transpose(
                1, 2
        )  # (n_samples, n_patches + 1, n_heads, head_dim)
        weighted_avg = weighted_avg.flatten(2)  # (n_samples, n_patches + 1, dim)

        x = self.proj(weighted_avg)  # (n_samples, n_patches + 1, dim)
        x = self.proj_drop(x)  # (n_samples, n_patches + 1, dim)

        return x


class MLP(nn.Module):

    def __init__(self, in_features, hidden_features, out_features, p=0.3):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(p)

    def forward(self, x):

        x = self.fc1(
                x
        ) # (n_samples, n_patches + 1, hidden_features)
        x = self.act(x)  # (n_samples, n_patches + 1, hidden_features)
        x = self.drop(x)  # (n_samples, n_patches + 1, hidden_features)
        x = self.fc2(x)  # (n_samples, n_patches + 1, out_features)
        x = self.drop(x)  # (n_samples, n_patches + 1, out_features)

        return x


class Block(nn.Module):

    def __init__(self, dim, n_heads, mlp_ratio=4.0, qkv_bias=True, p=0.3, attn_p=0.):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
        self.attn = Attention(
                dim,
                n_heads=n_heads,
                qkv_bias=qkv_bias,
                attn_p=attn_p,
                proj_p=p
        )
        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
        hidden_features = int(dim * mlp_ratio)
        self.mlp = MLP(
                in_features=dim,
                hidden_features=hidden_features,
                out_features=dim,
        )

    def forward(self, x):

        x = x + self.attn(self.norm1(x))
        x = x + self.mlp(self.norm2(x))

        return x


class VisionTransformer(nn.Module):

    def __init__(
            self,
            img_size,
            patch_size,
            in_chans,
            n_output,
            embed_dim,
            depth,
            n_heads,
            mlp_ratio=4.,
            qkv_bias=True,
            p=0.3,
            attn_p=0.3,
    ):
        super().__init__()

        self.patch_embed = PatchEmbed(
                img_size=img_size,
                patch_size=patch_size,
                in_chans=in_chans,
                embed_dim=embed_dim,
        )
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(
                torch.zeros(1, 1 + self.patch_embed.n_patches, embed_dim)
        )
        self.pos_drop = nn.Dropout(p=p)

        self.blocks = nn.ModuleList(
            [
                Block(
                    dim=embed_dim,
                    n_heads=n_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    p=p,
                    attn_p=attn_p,
                )
                for _ in range(depth)
            ]
        )

        self.norm = nn.LayerNorm(embed_dim, eps=1e-6)
        self.head = nn.Linear(embed_dim, n_output)


    def forward(self, x):

        n_samples = x.shape[0]
        x = self.patch_embed(x)

        cls_token = self.cls_token.expand(
                n_samples, -1, -1
        )  # (n_samples, 1, embed_dim)
        x = torch.cat((cls_token, x), dim=1)  # (n_samples, 1 + n_patches, embed_dim)
        x = x + self.pos_embed  # (n_samples, 1 + n_patches, embed_dim)
        x = self.pos_drop(x)

        for block in self.blocks:
            x = block(x)

        x = self.norm(x)

        cls_token_final = x[:, 0]
        x = self.head(cls_token_final)

        return x

In [24]:
class CosineWarmupScheduler(torch.optim.lr_scheduler._LRScheduler):
    
    def __init__(self, optimizer, warm_up, max_iters):
        self.warm_up = warm_up
        self.max_n_iters = max_iters
        super().__init__(optimizer)
        
    def get_lr(self):
        lr_factor = self.get_lr_factor(epoch=self.last_epoch)
        return [base_lr * lr_factor for base_lr in self.base_lrs]
    
    def get_lr_factor(self, epoch):
        lr_factor = 0.5 * (1 + np.cos(np.pi * epoch / self.max_n_iters))
        if epoch <= self.warm_up:
            lr_factor *= epoch * 1.0 / self.warm_up
        return lr_factor

In [25]:
class GazeTransformer(pl.LightningModule):
    def __init__(self, 
                 input_dim_h, 
                 model_dim_h,
                 num_out_h, 
                 input_dim_s, 
                 model_dim_s,
                 num_out_s, 
                 n_heads, 
                 n_layers_h, 
                 n_layers_s,
                 lr, 
                 seq_length,
                 warm_up, 
                 max_iters,
                 criterion, 
                 dropout=0.0, 
                 input_dropout=0.0,):
        super().__init__()

        # model params
        self.input_dim_h = input_dim_h
        self.model_dim_h =model_dim_h
        self.num_out_h = num_out_h
        self.input_dim_s = input_dim_s
        self.model_dim_s = model_dim_s
        self.num_out_s = num_out_s
        self.n_heads = n_heads
        self.n_layers_h = n_layers_h
        self.n_layers_s = n_layers_s
        self.lr = lr
        self.seq_length = seq_length
        self.warm_up = warm_up
        self.max_iters = max_iters
        self.dropout = dropout
        self.input_dropout = input_dropout
        self.criterion = criterion

        self.seq_size = self.seq_length * self.input_dim_h

        prd_fc_linear_size_1 = 128
        prd_fc_input_size = self.num_out_s + self.num_out_h

        # Encoder for head object sequence
        self.input_net_h = nn.Sequential(
            nn.Dropout(self.input_dropout),
            nn.Linear(self.input_dim_h, self.model_dim_h)
        )
        # Positional encoding for sequences
        self.positional_encoding_h = PositionalEncoding(d_model=self.model_dim_h)
        # Transformer
        self.transformer_h = TransformerEncoder(n_layers=self.n_layers_h,
                                              input_dim=self.model_dim_h,
                                              dim_ff=2*self.model_dim_h,
                                              n_heads=self.n_heads,
                                              dropout=self.dropout)
        # Output classifier per sequence lement
        self.output_net_h = nn.Sequential(
            nn.Linear(self.model_dim_h, self.model_dim_h),
            nn.LayerNorm(self.model_dim_h),
            nn.ReLU(inplace=True),
            nn.Dropout(self.dropout),
            nn.Linear(self.model_dim_h, self.num_out_h)
        )


        # Encoder for saliency features
        self.saliencyTransformer = VisionTransformer(img_size= 24,
                                                    in_chans= 2,
                                                    patch_size=4,
                                                    embed_dim= 32,
                                                    depth= self.n_layers_s,
                                                    n_heads =8,
                                                    qkv_bias= True,
                                                    mlp_ratio= 4,
                                                    n_output=32)

        self.prd_fc = nn.Sequential(
            nn.Linear(prd_fc_input_size, prd_fc_linear_size_1),
            nn.BatchNorm1d(prd_fc_linear_size_1),
            nn.ReLU(),
            nn.Dropout(p = self.dropout)
        )

        self.prd_cnn = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=2,kernel_size=128, padding='valid'),
             )
        
    def forward(self, x, mask=None, add_positional_encoding=True):

        head_obj_seq = x[:, 0:self.seq_size]       
        sal_features = x[:, self.seq_size:]

        head_obj_seq = head_obj_seq.reshape(-1, self.seq_length, self.input_dim_h)

        head_obj_seq = self.input_net_h(head_obj_seq)

        if add_positional_encoding:
            head_obj_seq = self.positional_encoding_h(head_obj_seq)
        head_obj_seq = self.transformer_h(head_obj_seq, mask=mask)
        head_obj_seq = head_obj_seq[:, -1, :]
        head_obj_seq = self.output_net_h(head_obj_seq)

        sal_features = sal_features.reshape(-1, 2, 24, 24)

        sal_features = self.saliencyTransformer(sal_features)
        
        batch, _ = sal_features.shape

        prd_inp = torch.cat((head_obj_seq, sal_features), 1)

        fc_out = self.prd_fc(prd_inp)
        fc_out = fc_out.view(batch, 1, -1)
        out = self.prd_cnn(fc_out).view(batch, -1)

        return out
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        # Apply lr scheduler per step
        lr_scheduler = CosineWarmupScheduler(optimizer, 
                                             warm_up=self.warm_up, 
                                             max_iters=self.max_iters)
        return [optimizer], [{'scheduler': lr_scheduler, 'interval': 'step'}]

    def training_step(self, batch, batch_idx):
        features, labels = batch

        features = features.to(device)
        labels = labels.to(device)

        output = self(features)

        loss = self.criterion(output, labels)

        self.log('train_loss', loss)

        return loss

    def validation_step(self, batch, batch_idx):
        features, labels = batch
        features = features.to(device)
        labels = labels.to(device)

        output = self(features)

        loss = self.criterion(output, labels)

        # Calling self.log will surface up scalars for you in TensorBoard
        self.log("val_loss", loss, prog_bar=True)
        return loss


    def test_step(self, batch, batch_idx):
        features, labels = batch
        features = features.to(device)
        labels = labels.to(device)

        output = self(features)

        loss = self.criterion(output, labels)

        prd_error = 0
        ver_error = 0
        hor_error = 0

        for i in range(output.size(0)):
            prd_error += CalAngularDist(labels[i, 0:2], output[i, 0:2])
            ver_error += abs(labels[i, 0] - output[i, 0])
            hor_error += abs(labels[i, 1] - output[i, 1])

        mean_ver_error = ver_error / output.size(0)
        mean_hor_error = hor_error / output.size(0)
        mean_prd_error = prd_error / output.size(0)

        pixel_pred = AngularCoord2PixelCoord(output[0])
        pixel_gth = AngularCoord2PixelCoord(labels[0])

        prd_x.append(pixel_pred[0])
        prd_y.append(pixel_pred[1])
        gth_x.append(pixel_gth[0])
        gth_y.append(pixel_gth[1])

        self.log("test_loss", loss, prog_bar=True)
        self.log("test_ang_error", mean_prd_error, prog_bar=True)
        self.log("mean_ver_error", mean_ver_error, prog_bar=True)
        self.log("mean_hor_error", mean_hor_error, prog_bar=True)
        return loss



In [26]:
def get_args(train=True):
    args = dict()
    args['seq_length'] = 50
    args['seq_feature_num'] = 11
    args['sal_feature_num'] = 576
    # the dropout rate of the model.
    args['dropout_rate'] = 0.5   
    args['gradient_clip'] = 0.1  
    # the directory that saves the dataset.
    args['datasetDir'] = 'DGaze_TrainTest/'
    # the number of total epochs to run
    args['epochs'] = 30
    # the batch size
    args['batch_size'] = 64
    # the initial learning rate.
    args['lr'] = 5e-4
    args['model_dim_h'] = 32
    args['model_dim_s'] = 32
    args['n_heads'] = 8
    args['n_layers_h'] = 3
    args['n_layers_s'] = 3
    args['num_out_h'] = 128
    args['num_out_s'] = 32
    args['warm_up'] = 2
    return args

In [27]:
def CalAngularDist(gth, prd):

	vertical_fov = math.pi*110/180;

	screen_w = 1080
	screen_h = 1200
	screen_center_x = 0.5*screen_w
	screen_center_y = 0.5*screen_h

	screen_dist = 0.5* screen_h/math.tan(vertical_fov/2)
	

	gth = AngularCoord2ScreenCoord(gth)
	prd = AngularCoord2ScreenCoord(prd)

	gth[0] = gth[0]*screen_w
	gth[1] = gth[1]*screen_h
	prd[0] = prd[0]*screen_w
	prd[1] = prd[1]*screen_h
	
	#the distance between eye and gth.
	eye2gth = np.sqrt(np.square(screen_dist) + np.square(gth[0] - screen_center_x) + np.square(gth[1] - screen_center_y))
	#the distance between eye and prd.
	eye2prd = np.sqrt(np.square(screen_dist) + np.square(prd[0] - screen_center_x) + np.square(prd[1] - screen_center_y))
	#the distance between gth and prd.
	gth2prd = np.sqrt(np.square(prd[0] - gth[0]) + np.square(prd[1] - gth[1]))
	
	#the angular distance between gth and prd.
	angular_dist = 180/math.pi*math.acos((np.square(eye2gth) + np.square(eye2prd) - np.square(gth2prd))/(2*eye2gth*eye2prd))
	return angular_dist

def AngularCoord2PixelCoord(angular_coord):
	screen_w = 1080
	screen_h = 1200

	screen_coord = AngularCoord2ScreenCoord(angular_coord);

	pixel_coord = np.zeros(2)

	pixel_coord[0] = screen_coord[0]*screen_w
	pixel_coord[1] = screen_coord[1]*screen_h

	return pixel_coord
	
def AngularCoord2ScreenCoord(angular_coord):

	vertical_fov = math.pi*110/180

	screen_w = 1080
	screen_h = 1200

	screen_dist = 0.5* screen_h/math.tan(vertical_fov/2)
	
	screen_coord = np.zeros(2)

	screen_coord[0] = (screen_dist * math.tan(math.pi*angular_coord[0] / 180) + 0.5*screen_w) / screen_w

	screen_coord[1] = (screen_dist * math.tan(-math.pi*angular_coord[1] / 180) + 0.5*screen_h) / screen_h
	return screen_coord

In [28]:
def main(args, train=True):

    # Load dataset
    train_loader, test_loader, validation_loader = LoadData(args['datasetDir'], args['batch_size'])

    # Create the model.
    root_dir = os.getcwd()

    print('\n==> Starting...')

    csv_logger = CSVLogger('./', name='final', version='3'),


    trainer = Trainer(
        default_root_dir=root_dir,
        max_epochs=args['epochs'],
        logger=csv_logger,
        gpus=1,
        log_every_n_steps=1,
        gradient_clip_val=args['gradient_clip'],
        progress_bar_refresh_rate=1
    )
    
    model = GazeTransformer(
        input_dim_h =args['seq_feature_num'], 
        model_dim_h=args['model_dim_h'],
        num_out_h=args['num_out_h'], 
        input_dim_s=args['sal_feature_num'], 
        model_dim_s=args['model_dim_s'],
        num_out_s=args['num_out_s'], 
        seq_length = args['seq_length'],
        n_heads= args['n_heads'], 
        n_layers_h = args['n_layers_h'], 
        n_layers_s = args['n_layers_s'], 
        lr= args['lr'], 
        warm_up= args['warm_up'], 
        max_iters = args['epochs'],
        criterion = nn.L1Loss(), 
        dropout=args['dropout_rate'], 
        input_dropout=0.0
    )

    if train:
      print('\n==> Training...')
      trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=validation_loader)

      # plot 
      metrics = pd.read_csv('./final/0/metrics.csv')
      train_loss = metrics[['train_loss', 'step', 'epoch']][~np.isnan(metrics['train_loss'])]
      val_loss = metrics[['val_loss', 'epoch']][~np.isnan(metrics['val_loss'])]

      fig, axes = plt.subplots(1, 2, figsize=(16, 5), dpi=100)
      axes[0].set_title('Train loss per batch')
      axes[0].plot(train_loss['step'][::2000], train_loss['train_loss'][::2000])
      axes[1].set_title('Validation loss per epoch')
      axes[1].plot(val_loss['epoch'], val_loss['val_loss'], color='orange')
      plt.show(block = True)

      print(f"Train loss: {train_loss['train_loss'].iloc[-1]:.3f}")
      print(f"Val loss:   {val_loss['val_loss'].iloc[-1]:.3f}")

    else:
      print('\n==> Testing...')
      chk_path = "./final/1/checkpoints/epoch=3-step=65356.ckpt"
      model2 = model.load_from_checkpoint(chk_path,         
                                          input_dim_h =args['seq_feature_num'], 
                                          model_dim_h=args['model_dim_h'],
                                          num_out_h=args['num_out_h'], 
                                          input_dim_s=args['sal_feature_num'], 
                                          model_dim_s=args['model_dim_s'],
                                          num_out_s=args['num_out_s'], 
                                          n_heads= args['n_heads'], 
                                          n_layers_h = args['n_layers_h'], 
                                          n_layers_s = args['n_layers_s'],  
                                          lr= args['lr'], 
                                          seq_length = args['seq_length'],
                                          warm_up= args['warm_up'], 
                                          max_iters = args['epochs'],
                                          criterion = nn.L1Loss(), 
                                          dropout=args['dropout_rate'], 
                                          input_dropout=0.0)

      trainer.test(model=model2, dataloaders=test_loader)



In [29]:
def plotComparisonGraph(gth_x, gth_y, prd_x, prd_y):
  s = [5] * len(prd_x)
  plt.figure(figsize=(12,9))

  gth = plt.scatter(gth_x, gth_y, s, color = '#88c999')

  prd = plt.scatter(prd_x, prd_y, s, color = 'hotpink')
  plt.xlim(0, 1080)
  plt.ylim(0, 1200)

  plt.title("Predicted gaze positions versus ground truth", fontsize=16)

  plt.xlabel("Horizontal /pixel", fontsize=16)
  plt.ylabel("Vertical /pixel", fontsize=16)

  plt.legend((prd, gth),
            ('Predicted', 'Ground Truth'),
            scatterpoints=1,
            loc='lower left',
            ncol=3,
            fontsize=12)

  plt.show()

In [30]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)

# set the random seed to ensure reproducibility
seed_everything(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

train = True

prd_x = []
prd_y = []
gth_x = []
gth_y = []

args = get_args()
main(args, train)

if not train:
  plotComparisonGraph(gth_x, gth_y, prd_x, prd_y)

Global seed set to 42


Device: cuda

Loading the training dataset

Training Data Size: [1045654, 1702]

Loading the testing dataset


  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs



Test Data Size: [349152, 1702]

Loading the validation dataset

Validation Data Size: [349152, 1702]

==> Starting...

==> Training...


  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                  | Type               | Params
-------------------------------------------------------------
0 | criterion             | L1Loss             | 0     
1 | input_net_h           | Sequential         | 384   
2 | positional_encoding_h | PositionalEncoding | 0     
3 | transformer_h         | TransformerEncoder | 25.6 K
4 | output_net_h          | Sequential         | 5.3 K 
5 | saliencyTransformer   | VisionTransformer  | 41.5 K
6 | prd_fc                | Sequential         | 20.9 K
7 | prd_cnn               | Sequential         | 258   
-------------------------------------------------------------
94.0 K    Trainable params
0         Non-trainable params
94.0 K    Total params
0.376     Total estimated model params size (MB)
  f"Experiment logs directory {self.log_dir} exists and is not empty."


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


KeyError: ignored