# Analyse the difference between the hand-labeled data and the classification algorithm

Many parts of this code have been written by Pelin Kömürlüoğlu.

Import the packages

In [None]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics 
import seaborn as sns

# Folder holding the algorithm's output tables; the files read from it below are
# named `correTS_mad_wobig_<recording id>.csv`.
PATH_FOREYE = "./data/processed/MAD_sacc" 

# Compute durations and reject outliers:

In [None]:
# Read the hand-labeled table and discard the columns 'Unnamed: 0' and 'Var1',
# which are not used anywhere below, then show the result.
for_eye = pd.read_csv('extendedCombinedTable.csv').drop(columns=['Unnamed: 0', 'Var1'])
display(for_eye)

''' 
The data contains the results of the hand-labeled data:
- time, valid, xcood, coord, zcoord: used to labeld the data
- validity_handLabel: 1 if valid, 0 if not
- handLabel: either 2 for gaze or 1 for saccade
- validity_algoLabel: 1 if valid, 0 if not 
- handLabel2: combine validity_handLabel & handLabel
- handFixationOnset: 2.0 for each gaze onset
'''

In [None]:
''' 
Code adjusted from Pelin Kömürlüoğlu.

Define gaze and saccade onsets and offsets.
This is done to calculate the durations more easily.
'''

# first: split the samples by the hand-labeled validity flag
validity = for_eye['validity_handLabel']
# valid samples are renumbered 0..n-1; their previous row labels are kept in the
# new 'index' column created by reset_index()
for_eye_rej = for_eye.loc[validity == 1].reset_index()
# invalid samples are stored separately, with their row labels unchanged
for_eye_inv = for_eye.loc[validity == 0]



# second: find the first and the last sample of every saccade, i.e. of every run
# of consecutive samples with handLabel2 == 1. The row positions are stored so
# that the matching time points can be looked up when durations are computed.
#
# The previous loop tested `j == len(...)` for the last row, which is never true
# inside `range(len(...))`, and then looked up row j+1; data ending in a saccade
# therefore raised a KeyError. Padding with False on both sides treats the
# missing neighbour of the first/last row as "not a saccade", so a run that
# touches either border is opened/closed correctly.
is_saccade = (for_eye_rej['handLabel2'] == 1).to_numpy()
saccade_padded = np.concatenate(([False], is_saccade, [False]))
before_is_saccade = saccade_padded[:-2]  # label of the previous sample
after_is_saccade = saccade_padded[2:]    # label of the next sample

# saccade start: this sample is a saccade, the one before it is not
saccade_start = np.flatnonzero(is_saccade & ~before_is_saccade).tolist()

# saccade end: this sample is a saccade, the one after it is not
saccade_end = np.flatnonzero(is_saccade & ~after_is_saccade).tolist()

# Gaze (fixation) runs: consecutive samples with handLabel2 == 2.
is_fixation = (for_eye_rej['handLabel2'] == 2).to_numpy()
# pad with False so the first and the last row have a "non-gaze" neighbour
fixation_padded = np.concatenate(([False], is_fixation, [False]))

# gaze onset: this sample is a gaze sample, the previous one is not
fixation_start = np.flatnonzero(is_fixation & ~fixation_padded[:-2]).tolist()

# gaze end: this sample is a gaze sample, the next one is not
fixation_end = np.flatnonzero(is_fixation & ~fixation_padded[2:]).tolist()




Calculate the durations of fixations and saccades

In [None]:
# The duration of an event is the time stamp of its last sample minus the time
# stamp of its first sample, using the onsets and offsets found above.
timestamps = for_eye_rej['time']

# saccades
duration_saccade = [
    timestamps[last] - timestamps[first]
    for first, last in zip(saccade_start, saccade_end)
]

# gazes
duration_fixation = [
    timestamps[last] - timestamps[first]
    for first, last in zip(fixation_start, fixation_end)
]

# one table per event type with one row per event: first sample, last sample, duration
int_sacc = pd.DataFrame(
    list(zip(saccade_start, saccade_end, duration_saccade)),
    columns=["saccade_start", "saccade_end", "duration_saccade"],
)
int_gaze = pd.DataFrame(
    list(zip(fixation_start, fixation_end, duration_fixation)),
    columns=["fixation_start", "fixation_end", "duration_fixation"],
)

In [None]:
# Bring the two event tables (gazes + saccades) back to sample level:
# - `dur` gets, for every sample, the duration of the event it belongs to
# - `onset` marks the first sample of each event (2.0 = gaze, 1.0 = saccade)
#   and stays NaN for all other samples
dur = []
onset = [np.nan] * len(for_eye_rej)
# position of the next unfinished event in each table
gz_cnt = 0
sc_cnt = 0
n_gaze = len(int_gaze)
n_sacc = len(int_sacc)

for i in for_eye_rej.index.tolist():
    # the counters only move at the end of an iteration, so these hold for all checks
    gaze_left = gz_cnt < n_gaze
    sacc_left = sc_cnt < n_sacc

    # event onsets
    if gaze_left and i == int_gaze["fixation_start"][gz_cnt]:
        onset[i] = 2.0
    if sacc_left and i == int_sacc["saccade_start"][sc_cnt]:
        onset[i] = 1.0

    # event offsets: once an event is complete, write its duration once per sample
    if gaze_left and i == int_gaze["fixation_end"][gz_cnt]:
        n_samples = 1 + int_gaze["fixation_end"][gz_cnt] - int_gaze["fixation_start"][gz_cnt]
        dur.extend([int_gaze["duration_fixation"][gz_cnt]] * n_samples)
        gz_cnt += 1
    elif sacc_left and i == int_sacc["saccade_end"][sc_cnt]:
        n_samples = 1 + int_sacc["saccade_end"][sc_cnt] - int_sacc["saccade_start"][sc_cnt]
        dur.extend([int_sacc["duration_saccade"][sc_cnt]] * n_samples)
        sc_cnt += 1

# add both results to the common df
for_eye_rej['durations'] = dur
for_eye_rej['onsets'] = onset


In [None]:
''' 
Outlier calculation - This code is adapted from '7v_ET_classification_algorithm.ipynb'

Long events are rejected
'''

# Event-level statistics: every event contributes one duration, taken from its
# onset sample.
gaze = for_eye_rej[for_eye_rej["onsets"] == 2.0]
sacc = for_eye_rej[for_eye_rej["onsets"] == 1.0]
g_len = gaze["durations"].tolist()
s_len = sacc["durations"].tolist()

# median duration per event type
g_len_med = np.nanmedian(g_len)
s_len_med = np.nanmedian(s_len)

# median absolute deviation around that median
# (* 1.4826 when assuming normal distribution)
gaze_mad = np.nanmedian(np.abs(np.asarray(g_len) - g_len_med)) * 1.4826
sacc_mad = np.nanmedian(np.abs(np.asarray(s_len) - s_len_med)) * 1.4826

# Sample level: now every sample of a gaze / saccade is used, not only the onsets
gaze = for_eye_rej[for_eye_rej["handLabel2"] == 2]
sacc = for_eye_rej[for_eye_rej["handLabel2"] == 1]

# deviation of each sample's event duration from the median, in units of the MAD
gaze_dev = np.abs(gaze["durations"].to_numpy() - g_len_med) / gaze_mad
sacc_dev = np.abs(sacc["durations"].to_numpy() - s_len_med) / sacc_mad

# deviations above 3.5 are outliers and become NaN; all others keep their value
gaze_mad_z = np.where(gaze_dev > 3.5, np.nan, gaze_dev)
sacc_mad_z = np.where(sacc_dev > 3.5, np.nan, sacc_dev)

# stack gaze and saccade samples, attach the column in the same order
# (NaN = outlier, otherwise the deviation) and restore the sample order
for_eye_rej = pd.concat([gaze, sacc])
for_eye_rej["long_events"] = np.concatenate([gaze_mad_z, sacc_mad_z])
for_eye_rej = for_eye_rej.sort_index()

In [None]:
# Put the valid samples back on the row labels stored in their 'index' column,
# then merge them with the invalid samples and sort, so one df is left.
for_eye_rej = for_eye_rej.set_index('index')
for_eye_fin = pd.concat([for_eye_rej, for_eye_inv], sort=False).sort_index()

# Compare the hand-labeled data and the algorithm classification

In [None]:
'''
For easier comparison, we create a df for the algorithm and hand-labeled data,
that consist of:
- 0 == invalid samples
- 1 == saccade samples
- 2 == gaze samples
- 3 == outliers
'''

# Algorithm:
# load the classification result of the first recording listed in the recordings file
recordings = pd.read_csv("./recordings_village.csv", index_col=0)
ids = recordings.index.tolist()
idd = ids[:1]
for_eye = pd.read_csv(f"{PATH_FOREYE}/correTS_mad_wobig_{idd[0]}.csv", index_col=0)

# invalid samples are set aside, only valid ones are processed further
invalid = for_eye[for_eye['valid'] == 0.0]
for_eye = for_eye[for_eye['valid'] == 1.0]

# outliers: valid samples without a value in "long_events"
is_outlier = for_eye["long_events"].isna()
outliers = for_eye[is_outlier]
for_eye = for_eye[~is_outlier]

# remaining samples: gaze where "isFix" has a value, saccade where it is missing
no_fix = for_eye["isFix"].isna()
gaze = for_eye[~no_fix]
sacc = for_eye[no_fix]

# stack the four groups and give every sample the code of its group
for_eye = pd.concat([invalid, outliers, gaze, sacc])
all_events = []
for code, part in ((0, invalid), (3, outliers), (2, gaze), (1, sacc)):
    all_events.extend([code] * len(part))
for_eye["all_events"] = all_events

# back to the original sample order
for_eye = for_eye.sort_index()


# Hand Labels:
# invalid samples are set aside, only valid ones are processed further
hand_validity = for_eye_fin['validity_handLabel']
invalid = for_eye_fin[hand_validity == 0]
for_eye_fin = for_eye_fin[hand_validity == 1]

# outliers: valid samples without a value in "long_events"
hand_outlier = for_eye_fin["long_events"].isna()
outliers = for_eye_fin[hand_outlier]
for_eye_fin = for_eye_fin[~hand_outlier]

# remaining samples, split by hand label
gaze = for_eye_fin[for_eye_fin["handLabel2"] == 2]
sacc = for_eye_fin[for_eye_fin["handLabel2"] == 1]

# stack the four groups and code them: 0 invalid, 3 outlier, 2 gaze, 1 saccade
for_eye_rej = pd.concat([invalid, outliers, gaze, sacc])
all_events_hand = []
for code, part in ((0, invalid), (3, outliers), (2, gaze), (1, sacc)):
    all_events_hand.extend([code] * len(part))
for_eye_rej["all_events_hand"] = all_events_hand

# back to the original sample order
for_eye_rej = for_eye_rej.sort_index()


# Keep both codings in one df: copy the algorithm codes into the hand-labeled df.
# The values are attached by position, not by row label, so both dfs need the
# same number of rows in the same order.
for_eye_rej['all_events_algo'] = for_eye["all_events"].to_numpy()

## Sample-by-sample comparison (Confusion Matrix)

In [None]:
# Sample-by-sample agreement: rows are the hand labels (y_true), columns are
# the algorithm's labels (y_pred).
# The class order is passed explicitly (0 invalid, 1 saccade, 2 gaze, 3 outlier),
# so the matrix is always 4x4 in that order. Without `labels`, a class missing
# from both label vectors would shrink the matrix and shift the meaning of the
# rows/columns relative to fixed tick labels.
confusion_matrix = metrics.confusion_matrix(
    for_eye_rej["all_events_hand"].to_numpy(),
    for_eye_rej['all_events_algo'].to_numpy(),
    labels=[0, 1, 2, 3],
)
# use heatmap to plot the results (the trailing ';' hides the returned object)
sns.heatmap(confusion_matrix, annot=True, fmt='g');
# important, the order corresponds to the number (so 1 == saccade and not outlier)

## Number of gazes & saccades

In [None]:
# Hand label:
# keep valid, non-outlier samples and count the event onsets among them
hand_keep = (for_eye_rej['validity_handLabel'] == 1) & for_eye_rej["long_events"].notna()
valid = for_eye_rej[hand_keep]
gazes = valid[valid["onsets"] == 2.0]
sacc = valid[valid["onsets"] == 1.0]
print("Hand Labeling:")
print(f"nr gazes: {len(gazes)}")
print(f"nr saccades: {len(sacc)}")
print()

# Algorithm:
# reload the classification of the first recording and apply the same selection;
# here the onsets are read from the 'events' column (2.0 = gaze, 1.0 = saccade)
recordings = pd.read_csv("./recordings_village.csv", index_col=0)
ids = recordings.index.tolist()
idd = ids[:1]
valid = pd.read_csv(f"{PATH_FOREYE}/correTS_mad_wobig_{idd[0]}.csv", index_col=0)
algo_keep = (valid['valid'] == 1.0) & valid["long_events"].notna()
valid = valid[algo_keep]
gazes = valid[valid['events'] == 2.0]
sacc = valid[valid['events'] == 1.0]
print("Algorithm Classification:")
print(f"nr gazes: {len(gazes)}")
print(f"nr saccades: {len(sacc)}")

## Compare jitter in onsets

In [None]:
# Hand labels:
# row labels of all gaze onsets that are valid and not rejected as outliers
hand_keep = (for_eye_rej['validity_handLabel'] == 1) & for_eye_rej["long_events"].notna()
valid = for_eye_rej[hand_keep]
gaze_hand = valid.loc[valid["onsets"] == 2.0].index.tolist()  # only the indices are needed

# Algorithm:
# reload the classification of the first recording and select its gaze onsets
# in the same way
recordings = pd.read_csv("./recordings_village.csv", index_col=0)
ids = recordings.index.tolist()
idd = ids[:1]
valid = pd.read_csv(f"{PATH_FOREYE}/correTS_mad_wobig_{idd[0]}.csv", index_col=0)
algo_keep = (valid['valid'] == 1.0) & valid["long_events"].notna()
valid = valid[algo_keep]
gaze_algo = valid.loc[valid['events'] == 2.0].index.tolist()  # only the indices are needed

# For every gaze onset found by the algorithm, find the closest hand-labeled
# gaze onset and count the signed offset in samples (algorithm minus hand label).

# offsets that get their own bin
counters = list(np.linspace(-10, 10, num=21,dtype=int))
# number of onsets per offset - used for the plot; key 100 collects every
# algorithm onset without a hand-labeled onset within the binned range
counts = dict.fromkeys(range(-10, 11), 0)
counts[100] = 0
long_counts = []  # the raw offsets, used for the median and IQR calculation

for gz in gaze_algo:
    # signed distance in samples to each gaze onset in the hand-labeled data
    dist = [gz - i for i in gaze_hand]
    if not dist:
        # no hand-labeled onset at all: nothing to match, count as too far away
        counts[100] += 1
        continue

    # closest hand-labeled onset (smallest absolute distance)
    closest = min(dist, key=abs)
    # the second closest one is needed to detect a tie between an earlier and a
    # later onset; it does not exist if there is only one hand-labeled onset
    dist.remove(closest)
    second_closest = min(dist, key=abs) if dist else None

    # distance too big for the bins
    if closest not in counters:
        counts[100] += 1
        continue

    if second_closest is not None and abs(second_closest) == abs(closest):
        # tie (- and + have the same distance): each side gets half a count
        counts[closest] += 0.5
        counts[second_closest] += 0.5
        # and both offsets go into the list used for the median
        long_counts += [closest, second_closest]
    else:
        # unique closest onset: one full count
        counts[closest] += 1
        # added twice, so it weighs as much as the two entries of a tie
        long_counts += [closest, closest]

# we create a df with one row per offset
counts = pd.DataFrame(counts, index = ['nr shifts']).transpose()
# for plotting, we remove the counter with the distances too far away.
counts_assigned = counts.drop([100])


In [None]:
# bar chart of the number of onsets per offset (horizontal tick labels)
ax = counts_assigned.plot(kind="bar", rot=0, width=0.7)

In [None]:
# median of the onset offsets
median_shift = np.median(long_counts)
print(f'median difference : {median_shift}')
# interquartile range: 75th minus 25th percentile (NaN values are ignored)
q25, q75 = np.nanpercentile(long_counts, [25, 75])
iqr_dist = q75 - q25
print(f"IQR gaze distance: {iqr_dist} ({q25}:{q75})")

## Make a combined plot: confusion matrix + bar plot

In [None]:
# important, this code relies on all the previous cells, so they have to be run first
# (it uses `confusion_matrix` and `counts_assigned`)

gaze_color_1 = "#066da8"

# set plotting parameters for:
labelsize = 40 # text
legendsize = 40 # legend
ticksize = 30 # ticks
numbersize = 60 # A, B etc.
fname = "Arial" # font name
ms_per_frame = 11 # factor used to convert offsets from frames to ms

# prepare the figure: two panels side by side with one empty grid column between them
plt.figure(figsize=(32, 15), constrained_layout=True)
sns.set_style("white")
plt.rcParams["font.family"] = fname
ax1 = plt.subplot2grid(shape=(3, 25), loc=(0, 0), rowspan=3, colspan=12)
ax2 = plt.subplot2grid(shape=(3, 25), loc=(0, 13), rowspan=3, colspan=12)

# Panel A: heatmap - for the sample-by-sample comparison
# tick labels in the order of the class codes 0, 1, 2, 3
axis_labels = ['invalid','saccade','gaze','outlier']
sns.heatmap(
    confusion_matrix,
    cmap="YlGnBu",
    annot=True,
    annot_kws={"fontsize":ticksize,"fontname":fname},
    fmt='g',
    ax=ax1,
    xticklabels=axis_labels,
    yticklabels=axis_labels,
    cbar=False,
)
ax1.set_ylabel("Hand-Labeled", fontsize=labelsize, fontname=fname)
ax1.set_xlabel("Algorithm", fontsize=labelsize, fontname=fname)

# Panel B: barplot - for the gaze onset accuracy
# the offsets are stored in frames; convert them to ms for the x axis
frame_to_ms = [x * ms_per_frame for x in counts_assigned.index.tolist()]
# draw on ax2 explicitly instead of relying on pyplot's "current axes"
ax2.bar(frame_to_ms, counts_assigned['nr shifts'], color=gaze_color_1, width=10)
ax2.set_xticks([-110,-55,0,55,110])
ax2.set_xlabel("Temporal Offset (ms)", fontsize=labelsize,fontname=fname)
ax2.set_ylabel("Count", fontsize=labelsize,fontname=fname)

# same tick font and tick size for both panels
for panel in (ax1, ax2):
    for label in panel.get_xticklabels() + panel.get_yticklabels():
        label.set_fontproperties(fname) # change tick font
    panel.yaxis.set_tick_params(labelsize=ticksize) # change tick size
    panel.xaxis.set_tick_params(labelsize=ticksize)

# set plot labels
ax1.set_title("A", fontsize=numbersize, fontweight="bold",loc="left", x=-0.08, y=1.05, pad=-30, fontname=fname)
ax2.set_title("B", fontsize=numbersize, fontweight="bold",loc="left", x=-0.08, y=1.05, pad=-30, fontname=fname)
plt.show()