## Importing Needed Packages
If any of the packages can't be imported, installation must be done first.

In [1]:
import pandas as pd
import numpy as np
import math
import json
import pickle
import itertools

from scipy.io import arff
import matplotlib.pyplot as plt

import glob
import os

## Data Overview

Number of classes to be predicted: 4
- Fixation
- Smooth Pursuit
- Saccade
- Noise

In [2]:
## Loading the arff file.
## The loaded object is indexed below: element 1 exposes the attribute names via .names(),
## and element 0 is what later gets wrapped in a DataFrame.
file_dir = 'Dataset/GazeCom/GazeCom_ground_truth/beach/AAF_beach.arff' ## Replace with the directory of the selected file in the dataset
arff_overview = arff.loadarff(file_dir)

## Checking the attributes of the file (bare expression, so the list is shown as the cell output)
arff_overview[1].names()

['time',
 'x',
 'y',
 'confidence',
 'handlabeller1',
 'handlabeller2',
 'handlabeller_final']

Dataset description:
- label to use: handlabeller_final
- values:
- 1.0 = Fixation
- 2.0 = Saccades
- 3.0 = Smooth Pursuit
- 4.0 = Noise


In [4]:
## Reading the arff file using Pandas DataFrame
## (element 0 of the loaded arff object holds the records; the columns are the attributes listed above)
df_overview = pd.DataFrame(arff_overview[0])
## Bare expression: the notebook renders the frame as the cell output
df_overview

Unnamed: 0,time,x,y,confidence,handlabeller1,handlabeller2,handlabeller_final
0,1000.0,590.9,5.2,1.0,4.0,4.0,4.0
1,5000.0,590.9,5.2,1.0,4.0,4.0,4.0
2,9000.0,590.6,5.0,1.0,4.0,4.0,4.0
3,13000.0,590.4,5.0,1.0,4.0,4.0,4.0
4,17000.0,589.8,5.2,1.0,4.0,4.0,4.0
...,...,...,...,...,...,...,...
5003,20124000.0,707.6,665.2,1.0,1.0,1.0,1.0
5004,20128000.0,706.0,668.1,1.0,1.0,1.0,1.0
5005,20132000.0,706.7,670.2,1.0,1.0,1.0,1.0
5006,20136000.0,707.6,675.2,1.0,1.0,1.0,1.0


## Feature Engineering

This step is meant to:
- Get new features from the dataset: speed, acceleration, direction, standard deviation, and displacement over several temporal window sizes
- Convert the .arff files, add the new features, and save them in .csv format.

In [284]:
def get_start_end(i, step, window, conf_min, df):
    """Pick the two row labels that bound the window around row ``i``.

    The window opens ``step`` rows before ``i``. It closes at ``i`` itself
    when ``step == window`` (which is the case for a window size of 1) and
    ``step`` rows after ``i`` otherwise. A boundary that falls outside the
    frame, or whose ``confidence`` value is below ``conf_min``, is replaced
    by ``i``.

    Parameters
    ----------
    i : int
        Row being processed.
    step : int
        Offset from ``i`` to each window boundary.
    window : int
        Window size; only compared against ``step``.
    conf_min : float
        Minimum ``confidence`` a boundary sample must have.
    df : pandas.DataFrame
        Frame with a ``confidence`` column, looked up by label.

    Returns
    -------
    tuple
        ``(start, end)``; both equal ``i`` when no usable boundary remains.
    """
    def _below_threshold(row):
        ## Looked up lazily so out-of-range rows are never indexed
        return df['confidence'][row] < conf_min

    start = i - step
    end = i if step == window else i + step

    if start < 0 or _below_threshold(start):
        start = i
    if end >= len(df) or _below_threshold(end):
        end = i

    return start, end

In [285]:
def get_velocity(df, speed_col_name, direction_col_name, window):
    """Add gaze speed and movement-direction columns for one window size.

    For every row whose ``confidence`` is at least 0.75, the window
    boundaries are obtained from ``get_start_end`` and the displacement
    between the two boundary samples is converted into a speed (coordinate
    units per second) and a direction (``atan2`` of the y and x differences,
    in radians). Rows that are skipped keep the value 0.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``x``, ``y``, ``time`` and ``confidence`` columns. Rows
        are addressed by position ``0 .. len(df) - 1``, which coincides with
        the labels of a default ``RangeIndex``. The frame is modified in
        place (two columns are added or overwritten).
    speed_col_name, direction_col_name : str
        Names of the columns to write.
    window : int
        Window size in samples; the boundary offset is ``ceil(window / 2)``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two new columns.
    """
    step = math.ceil(window / 2)
    conf_min = 0.75
    n_rows = len(df)

    ## Pull the columns out once instead of doing a Series lookup per row
    x = df['x'].to_numpy()
    y = df['y'].to_numpy()
    time = df['time'].to_numpy()
    confidence = df['confidence'].to_numpy()

    ## Results are collected in float arrays and assigned as whole columns:
    ## element-wise `df[col][i] = value` is chained assignment, which pandas
    ## does not guarantee to write through to `df`.
    speed = np.zeros(n_rows, dtype=float)
    direction = np.zeros(n_rows, dtype=float)

    for i in range(n_rows):
        if confidence[i] < conf_min:
            continue
        start, end = get_start_end(i, step, window, conf_min, df)
        if start == end:
            continue
        difference_x = x[end] - x[start]
        difference_y = y[end] - y[start]

        hypotenuse = math.sqrt(difference_x**2 + difference_y**2)
        time_delta = (time[end] - time[start])/1000000 ## time in microseconds, convert to seconds

        speed[i] = hypotenuse/time_delta
        direction[i] = math.atan2(difference_y, difference_x) ## arctangent of delta_y/delta_x

    df[speed_col_name] = speed
    df[direction_col_name] = direction

    return df

In [286]:
def get_std(df, std_col_name, displacement_col_name, window):
    """Add positional spread and displacement columns for one window size.

    For every row whose ``confidence`` is at least 0.75, the window
    boundaries are obtained from ``get_start_end``. Over the samples from
    ``start`` to ``end`` (inclusive) two values are computed:

    * the mean of the standard deviations of ``x`` and of ``y``;
    * the length of the vector obtained by summing the successive
      sample-to-sample steps in ``x`` and in ``y`` (signed sums, so this is
      the net displacement rather than the travelled path length).

    Rows that are skipped keep the value 0.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``x``, ``y`` and ``confidence`` columns. Rows are
        addressed by position ``0 .. len(df) - 1``, which coincides with the
        labels of a default ``RangeIndex``. The frame is modified in place
        (two columns are added or overwritten).
    std_col_name, displacement_col_name : str
        Names of the columns to write.
    window : int
        Window size in samples; the boundary offset is ``ceil(window / 2)``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two new columns.
    """
    step = math.ceil(window / 2)
    conf_min = 0.75
    n_rows = len(df)

    ## Pull the columns out once instead of doing a Series lookup per sample
    x = df['x'].to_numpy()
    y = df['y'].to_numpy()
    confidence = df['confidence'].to_numpy()

    ## Results are collected in float arrays and assigned as whole columns:
    ## element-wise `df[col][i] = value` is chained assignment, which pandas
    ## does not guarantee to write through to `df`.
    std_values = np.zeros(n_rows, dtype=float)
    displacement_values = np.zeros(n_rows, dtype=float)

    for i in range(n_rows):
        if confidence[i] < conf_min:
            continue
        start, end = get_start_end(i, step, window, conf_min, df)
        if start == end:
            continue
        window_x = x[start:end + 1]
        window_y = y[start:end + 1]

        ## Sum the successive steps left to right (built-in sum, starting
        ## from 0) so the floating-point result matches a plain running total
        displacement_x = sum(np.diff(window_x))
        displacement_y = sum(np.diff(window_y))
        hypotenuse = math.sqrt(displacement_x**2 + displacement_y**2)

        ## Standard deviation of x's and y's over the full window, then the mean of the two
        std_x = np.std(window_x)
        std_y = np.std(window_y)

        std_values[i] = np.mean([std_x, std_y])
        displacement_values[i] = hypotenuse

    df[std_col_name] = std_values
    df[displacement_col_name] = displacement_values

    return df

In [287]:
def get_acceleration(df, accl_col_name, speed_col_name, direction_col_name, window):
    """Add an acceleration-magnitude column derived from speed and direction.

    For every row whose ``confidence`` is at least 0.75, the velocity vectors
    at the two boundary rows are rebuilt from the speed and direction columns
    (``vx = v * cos(direction)``, ``vy = v * sin(direction)``). Their
    difference is divided by the elapsed time, and the length of the
    resulting vector is stored. Rows that are skipped keep the value 0.0.

    The boundaries are always computed with a window size of 1, i.e. between
    a row and its predecessor. The ``window`` argument does not influence the
    boundaries; the window size enters only through which speed/direction
    columns the caller passes in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``time`` and ``confidence`` columns plus the two columns
        named by ``speed_col_name`` and ``direction_col_name``. Rows are
        addressed by position ``0 .. len(df) - 1``, which coincides with the
        labels of a default ``RangeIndex``. The frame is modified in place
        (one column is added or overwritten).
    accl_col_name : str
        Name of the column to write.
    speed_col_name, direction_col_name : str
        Names of the existing columns to read.
    window : int
        Accepted for a uniform signature; not used for the boundaries.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new column.
    """
    ## Finite difference between neighbouring rows, whatever `window` says
    neighbour_window = 1

    conf_min = 0.75
    step = math.ceil(neighbour_window / 2)
    n_rows = len(df)

    ## Pull the columns out once instead of doing a Series lookup per row
    speed = df[speed_col_name].to_numpy()
    direction = df[direction_col_name].to_numpy()
    time = df['time'].to_numpy()
    confidence = df['confidence'].to_numpy()

    ## Results are collected in a float array and assigned as a whole column:
    ## element-wise `df[col][i] = value` is chained assignment, which pandas
    ## does not guarantee to write through to `df`.
    acceleration = np.zeros(n_rows, dtype=float)

    for i in range(n_rows):
        if confidence[i] < conf_min:
            continue
        start, end = get_start_end(i, step, neighbour_window, conf_min, df)
        if start == end:
            continue
        ## Vx = Vo*cos(alpha)
        v_start_x = speed[start]*math.cos(direction[start])
        v_end_x = speed[end]*math.cos(direction[end])

        ## Vy = Vo*sin(alpha)
        v_start_y = speed[start]*math.sin(direction[start])
        v_end_y = speed[end]*math.sin(direction[end])

        time_delta = (time[end] - time[start])/1000000 ## time in microseconds, convert to seconds

        ## Acceleration along each dimension, then the vector length
        accl_x = (v_end_x - v_start_x)/time_delta
        accl_y = (v_end_y - v_start_y)/time_delta
        acceleration[i] = math.sqrt(accl_x**2 + accl_y**2)

    df[accl_col_name] = acceleration

    return df

In [288]:
def preprocessing(df, window_size):
    """Return a copy of ``df`` enriched with the per-window features.

    For each entry of ``window_size``, the speed/direction, acceleration and
    std/displacement helpers are applied in that order. Each produces columns
    named ``<feature>_<window>``. The input frame itself is not modified; all
    work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed on to the feature helpers.
    window_size : iterable
        Window sizes to compute features for.

    Returns
    -------
    pandas.DataFrame
        The enriched copy.
    """
    enriched = df.copy()
    for size in window_size:
        suffix = str(size)
        speed_col = 'speed_' + suffix
        direction_col = 'direction_' + suffix

        ## Acceleration reads the speed/direction columns, so velocity goes first
        enriched = get_velocity(enriched, speed_col, direction_col, size)
        enriched = get_acceleration(enriched, 'acceleration_' + suffix, speed_col, direction_col, size)
        enriched = get_std(enriched, 'std_' + suffix, 'displacement_' + suffix, size)

    return enriched

In [5]:
## Get a list of all the video names in the dataset. The path can be changed depending on where the file is located.
## The file is opened in a `with` block so the handle is closed as soon as the JSON has been parsed.
with open('Dataset/GazeCom/GazeCom_video_parameters.json') as param_file:
    video_param = json.load(param_file)
video_names = video_param['video_names']
video_names

['beach',
 'breite_strasse',
 'bridge_1',
 'bridge_2',
 'bumblebee',
 'doves',
 'ducks_boat',
 'ducks_children',
 'golf',
 'holsten_gate',
 'koenigstrasse',
 'puppies',
 'roundabout',
 'sea',
 'st_petri_gate',
 'st_petri_market',
 'st_petri_mcdonalds',
 'street']

In [290]:
## set window sizes (in samples); one set of feature columns is produced per entry
window_size = [1, 2, 4, 8, 16, 32, 64, 128] 

In [291]:
## Adding new features to all .arff files in the ground_truth folder.
## One .csv is written per .arff file, under a per-video sub-folder of GazeCom_preprocessed_final.
for name in video_names:
    print("Preprocessing folder: " + str(name))
    print('')

    ## to_csv does not create missing folders, so make sure the target exists first
    output_dir = f'Dataset/GazeCom/GazeCom_preprocessed_final/{name}'
    os.makedirs(output_dir, exist_ok=True)

    ## sorted() makes the processing order deterministic (glob order is arbitrary)
    for filename in sorted(glob.glob(r'Dataset/GazeCom/GazeCom_ground_truth/' + str(name) + '/*.arff')):
        arff_file = arff.loadarff(filename)
        print("Preprocessing file: " + str(filename))
        df = pd.DataFrame(arff_file[0])

        df = preprocessing(df, window_size)

        ## Save to new folder, replacing the .arff extension with .csv
        file_stem = os.path.splitext(os.path.basename(filename))[0]
        df.to_csv(f'{output_dir}/{file_stem}.csv', index = True)

Preprocessing folder: st_petri_mcdonalds

Preprocessing file: Dataset/GazeCom/GazeCom_ground_truth/st_petri_mcdonalds\AAF_st_petri_mcdonalds.arff
Preprocessing file: Dataset/GazeCom/GazeCom_ground_truth/st_petri_mcdonalds\ALK_st_petri_mcdonalds.arff
Preprocessing file: Dataset/GazeCom/GazeCom_ground_truth/st_petri_mcdonalds\APS_st_petri_mcdonalds.arff
Preprocessing file: Dataset/GazeCom/GazeCom_ground_truth/st_petri_mcdonalds\C1K_st_petri_mcdonalds.arff
Preprocessing file: Dataset/GazeCom/GazeCom_ground_truth/st_petri_mcdonalds\CCB_st_petri_mcdonalds.arff
Preprocessing file: Dataset/GazeCom/GazeCom_ground_truth/st_petri_mcdonalds\CCE_st_petri_mcdonalds.arff
Preprocessing file: Dataset/GazeCom/GazeCom_ground_truth/st_petri_mcdonalds\CCF_st_petri_mcdonalds.arff
Preprocessing file: Dataset/GazeCom/GazeCom_ground_truth/st_petri_mcdonalds\CCK_st_petri_mcdonalds.arff
Preprocessing file: Dataset/GazeCom/GazeCom_ground_truth/st_petri_mcdonalds\CCM_st_petri_mcdonalds.arff
Preprocessing file: Da

## Data Augmentation

This step is meant to:
- Combine all the .csv files in the new folder into a single file (named with an `.h5` extension, but written with `pickle` in the last cell).
- Before combining, a feature selection step is employed.

`keys_to_keep`: The selected features. The speed and acceleration columns among them (and `x`/`y`, if selected) are the ones divided by the PPD constant.

`keys_all`: All the features that will be stored in the output file

In [15]:
features_to_keep = ['speed', 'direction', 'acceleration']
windows_to_keep = 7

## Window sizes retained for the selected features (the first `windows_to_keep` of the full set)
_kept_windows = (1, 2, 4, 8, 16, 32, 64, 128)[:windows_to_keep]

## (feature name as written in `features_to_keep`, column-name prefix), in output order
_feature_prefixes = (
    ('speed', 'speed'),
    ('direction', 'direction'),
    ('acceleration', 'acceleration'),
    ('stddev', 'std'),
    ('displacement', 'displacement'),
)

## Raw coordinates come first when requested, then one column per feature and window
keys_to_keep = ['x', 'y'] if 'xy' in features_to_keep else []
for _feature, _prefix in _feature_prefixes:
    if _feature in features_to_keep:
        keys_to_keep.extend('{}_{}'.format(_prefix, _size) for _size in _kept_windows)

In [293]:
windows_all = 8

## Raw coordinates first, then every feature for every window size, grouped by feature
keys_all = ['x', 'y']
for _prefix in ('speed', 'direction', 'acceleration', 'std', 'displacement'):
    for _size in (1, 2, 4, 8, 16, 32, 64, 128)[:windows_all]:
        keys_all.append(f'{_prefix}_{_size}')

In [17]:
## Set of kept columns to divide by the PPD constant: x/y (when kept) plus every kept speed and acceleration column
keys_to_convert_to_degrees = {
    k for k in keys_to_keep
    if k in ('x', 'y') or 'speed_' in k or 'acceleration_' in k
}

In [19]:
## Screen geometry attributes; these are general to the GazeCom dataset and might differ for other eye movement datasets
width_px, height_px = 1280, 720
width_mm, height_mm = 400, 225.0
distance = 450.0


def calculate_ppd(width_px, height_px, width_mm, height_mm, distance):
    """Return the pixels-per-degree (PPD) constant, averaged over both axes.

    For each axis the angle subtended by the screen is
    ``2 * atan(extent / (2 * distance))``, converted to degrees. The pixel
    count of that axis divided by the angle gives its PPD.

    Parameters
    ----------
    width_px, height_px : number
        Screen size in pixels.
    width_mm, height_mm : number
        Physical screen size.
    distance : number
        Viewing distance, in the same unit as ``width_mm``/``height_mm``
        (only their ratio is used).

    Returns
    -------
    float
        Mean of the horizontal and vertical PPD.
    """
    def _subtended_degrees(extent):
        ## Angle covered by a screen edge of the given physical length
        return 2 * math.atan(extent / (2 * distance)) * 180. / math.pi

    horizontal_ppd = width_px / _subtended_degrees(width_mm)
    vertical_ppd = height_px / _subtended_degrees(height_mm)

    return (horizontal_ppd + vertical_ppd) / 2


ppd = calculate_ppd(width_px, height_px, width_mm, height_mm, distance)

In [20]:
## Checking the value of the PPD constant (bare expression, so the value is shown as the cell output)
ppd

26.178149399649236

In [299]:
## np.eye(num_classes) is indexed with the integer labels below, so it needs at least
## (largest label + 1) rows; with the documented labels 1.0-4.0 that is 5, and row 0 is
## not selected by any of those labels.
num_classes = 5
## NOTE(review): the concatenation yields a doubled '/' before the video name; presumably harmless for glob, confirm.
preprocessed_location = 'Dataset/GazeCom/GazeCom_preprocessed_final/' + '/{}/*.csv'
## Nested lists: one inner list per video, one entry per recording file of that video
x = []
y = []
y_onehot = []

## Augmenting process
for i in video_names:
    ## Sorted so the order of recordings inside each video is deterministic
    fnames = sorted(glob.glob(preprocessed_location.format(i)))

    x.append([])
    y.append([])
    y_onehot.append([])
    for f in fnames:
        ## Column 0 of the csv is the row index that was written alongside the data
        df = pd.read_csv(f, index_col = 0)
        df_label = df['handlabeller_final']
        ## Keep only the feature columns
        df = df.drop(columns = ['time', 'confidence', 'handlabeller1', 'handlabeller2', 'handlabeller_final'])
        ## NOTE(review): only the columns in keys_to_convert_to_degrees (a subset of keys_to_keep)
        ## are divided by ppd, while every column in keys_all is stacked below. With the settings
        ## above, x, y, speed_128 and acceleration_128 therefore stay unscaled; confirm this is intended.
        for k in keys_to_convert_to_degrees:
            df[k] /= ppd
        ## Feature matrix for this recording: one column per key in keys_all, in that order
        x[-1].append(np.hstack([np.reshape(np.array(df[key]), (-1, 1)) for key in keys_all]))
        assert x[-1][-1].dtype == np.float64
        y[-1].append(df_label.astype('int64'))
        ## Row `label` of the identity matrix is the one-hot encoding of that label
        y_onehot[-1].append(np.eye(num_classes)[y[-1][-1]])
## Shows the feature columns of the last file that was read
print(df.columns)

Index(['x', 'y', 'speed_1', 'direction_1', 'acceleration_1', 'std_1',
       'displacement_1', 'speed_2', 'direction_2', 'acceleration_2', 'std_2',
       'displacement_2', 'speed_4', 'direction_4', 'acceleration_4', 'std_4',
       'displacement_4', 'speed_8', 'direction_8', 'acceleration_8', 'std_8',
       'displacement_8', 'speed_16', 'direction_16', 'acceleration_16',
       'std_16', 'displacement_16', 'speed_32', 'direction_32',
       'acceleration_32', 'std_32', 'displacement_32', 'speed_64',
       'direction_64', 'acceleration_64', 'std_64', 'displacement_64',
       'speed_128', 'direction_128', 'acceleration_128', 'std_128',
       'displacement_128'],
      dtype='object')


In [300]:
## Saving the augmented data.
## NOTE: the file carries an .h5 extension but is written with pickle, so it has to be read
## back with pickle.load (only from a trusted source), not with an HDF5 reader.
path_h5 = 'Dataset/GazeCom/GazeCom_h5/GazeCom_preprocessed_fixed_ppd_720.h5'

## open() does not create missing folders, so make sure the target exists first
os.makedirs(os.path.dirname(path_h5), exist_ok=True)

## The `with` block guarantees the file is flushed and closed once the dump is done
with open(path_h5, 'wb') as h5_file:
    pickle.dump({'data_X': x, 'data_Y': y, 'data_Y_one_hot': y_onehot}, h5_file)