## 1. **Dataset Preprocessing:** Load, Clean, Preprocess data

In [None]:
# =======================================================
# =========== Load the dataset ==========================
# =======================================================
import pandas as pd
import os

import warnings 
warnings.filterwarnings('ignore')
directory = 'dataset/' # local directory
directory = '/kaggle/input/jules-varne/dataset/' # kaggle directory

df = pd.DataFrame()

for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        # Read the CSV file into a DataFrame
        currFile = pd.read_csv(os.path.join(directory, filename))
        # Concatenate the DataFrame to the existing one
        df = pd.concat([df, currFile], ignore_index=True)


# =======================================================
# =============== Cleaning the dataset ==================
# =======================================================
# Remove columns

columns = [
    "RecordingTime [ms]", # Done - no null values actually there
    "Time of Day [h:m:s:ms]", # 
    "Participant", # Done: Remove Missing Value, Remove unusables, null, undefined and such values, 
    "Pupil Diameter Right [mm]", # Code is there, just put it together.
    "Pupil Diameter Left [mm]", # Code is there, just put it together.
    "Point of Regard Right X [px]", # ??
    "Point of Regard Right Y [px]", # ??
    "Point of Regard Left X [px]", # ??
    "Point of Regard Left Y [px]", # ??
    "Category Right", # Done - Encoding, Missing Value
    "Category Left", # DONE - Encoding, Missing Value
]


df_0_columns_separated = df[columns]

# --------------------------------------------------

df = df_0_columns_separated 
df["RecordingTime [ms]"].describe()

is_null_in_column = df['RecordingTime [ms]'].isnull().any()
is_null_in_column # No null values then!

df_1_rec_time_processed = df

# --------------------------------------------------


df = df_1_rec_time_processed
df["Time of Day [h:m:s:ms]"].describe()

is_null_in_column = df['Time of Day [h:m:s:ms]'].isnull().any()
is_null_in_column # same, no null

df_2_time_of_day = df
# --------------------------------------------------
# print("Before: ")
# print(df["Participant"].unique())

# Issue: some are in string type, some are in number type

df = df_2_time_of_day
df = df[(df['Participant'] != 'Unidentified(Neg)') & (df['Participant'] != 'Unidentified(Pos)')]
df["Participant"] = pd.to_numeric(df["Participant"], errors='coerce')

# print("After: ")
# print(df_processed_test["Participant"].unique())

df_3_participant = df
# --------------------------------------------------
df = df_3_participant


column_name = "Pupil Diameter Right [mm]"

df[column_name] = df[column_name].astype(str) 
df = df[pd.to_numeric(df[column_name], errors='coerce').notnull()]
df[column_name] = pd.to_numeric(df[column_name], errors='coerce') 
df = df[df[column_name] != 0] 

column_name = "Pupil Diameter Left [mm]"

df[column_name] = df[column_name].astype(str)
df = df[pd.to_numeric(df[column_name], errors='coerce').notnull()]  
df[column_name] = pd.to_numeric(df[column_name], errors='coerce') 
df = df[df[column_name] != 0] 


df_5_pupil_both = df
# --------------------------------------------------
point_of_regard_idx = [
    "Point of Regard Right X [px]",
    "Point of Regard Right Y [px]",
    "Point of Regard Left X [px]",
    "Point of Regard Left Y [px]",
]


for point in point_of_regard_idx:
    # drop null
    df.dropna(subset=[point], inplace=True)
    df = df.drop(df[df[point] == '-'].index)
    a = pd.to_numeric(df[point])

df_9_point_of_regard = df_5_pupil_both
# --------------------------------------------------
# Processing - Category Left, Category Right

stays = [
    "Fixation",
    "Saccade",
    "Blink"
]

goes = [
    "Separator",
    "-",
    "Left Click",
    # Null
]

# 1. Remove missing data
# 2. Convert to numeric values??

df = df_9_point_of_regard

df = df[df['Category Left'].isin(stays)]
df = df[df['Category Right'].isin(stays)]

df = pd.get_dummies(df, columns=['Category Left'], prefix=['Category Left'])
df = pd.get_dummies(df, columns=['Category Right'], prefix=['Category Right'])
df.head()
# df.had()

df_final = df
df = df_final
# --------------------------------------------------
# mapping subId to ASD / TD 
metadata_dir = "./Metadata_Participants.csv" #local dir
metadata_dir = "/kaggle/input/junes-verne-metadata/Metadata_Participants.csv" #kaggle dir

metadata = pd.read_csv(metadata_dir)
subjectClass = dict()
for idx, row in metadata.iterrows():
    subject = (row["ParticipantID"])
    category = (row["Class"])
    subjectClass[subject] = 1 if category == 'ASD' else 0

def getClass(subjectId):
    return subjectClass[subjectId]

# for index, row in df.iterrows():
#     subId = row["Participant"]
#     print(subId)
# --------------------------------------------------

After grouping by sessions is done, we can do this.

- Consider each individual session as a singular data unit.
- From each units of data, compute all the feautures, and feed it into the classifier for proper patterns recognition
- Goal is to find as many features, correlated to ASD vs TD as possible.
- Hypothesis is there should be subtle patterns in eye movement speeds, fixation duration, and other similar charecteristics
- Eye pupil diameters reveal one's interest in a certain object / event / topic. Based on this, we can calculate how long a subject is able to hold his/her interest, and possibility is, that people with ASD will have different duration they can hold interests, ideally more on objects, less on people, and maybe similar to ADHD, there is some other patterns that otherwise would've been overlooked, in the topic of interest. Possible patterns:
    - How long can a subject hold interest
    - How often does he/she change it
    - Fatigue? How long till he loses interest, in a given session (is this correlated?)
    - Saccade movement, and pupil diameter, is there any correlation / patterns
- Eye movement is typically faster in ASD (ref. - )
- Ability to focus on an object right after fast switching of gaze, is slower among ASD (ref. - )
- Eye movements should be far less among TD than ASD, ASD people's gaze movement should be faster, little erratic, and possibly a bit jittery. Unable to focus.
- 

## 2. **Feature Engineering**: Separate Sessions, Compute Outputs, Compute Features, Remove Unnecessary Columns

In [12]:
# =======================================================
# =========== The Feature Compute Functions =============
# =======================================================

import math
import numpy as np
import time

def getDistanceBetweenPoints(row, prevRow):
    leftX = float(row["Point of Regard Left X [px]"])
    leftY = float(row["Point of Regard Left Y [px]"])
    rightX = float(row["Point of Regard Right X [px]"])
    rightY = float(row["Point of Regard Right Y [px]"])

    x1 = (leftX + rightX) / 2
    y1 = (leftY + rightY) / 2

    prevLeftX = float(prevRow["Point of Regard Left X [px]"])
    prevLeftY = float(prevRow["Point of Regard Left Y [px]"])
    prevRightX = float(prevRow["Point of Regard Right X [px]"])
    prevRightY = float(prevRow["Point of Regard Right Y [px]"])

    x2 = (prevLeftX + prevRightX) / 2
    y2 = (prevLeftY + prevRightY) / 2

    distance = math.sqrt((x2 - x1)**2 + (y2 - y1)**2)

    return distance


def getChangesInDistanceBetweenPoints(currDist, prevDist):
    return abs(currDist - prevDist)

def getGazeSpeed(dist, duration):
    return dist / duration


# ==========================================================
# ================== Separate Sessions =====================
# ==========================================================

allSessions = []
y = []
currSession = pd.DataFrame()

prevId = -1
cnt = 0

start_time = time.time()

for index, row in df.iterrows():
    if row["Participant"] != prevId:
        allSessions.append(currSession.T)
        y.append(row["Participant"])
        currSession = pd.DataFrame()
    prevId = row["Participant"]
    currSession = pd.concat([currSession, row], axis=1)

print("--- %s seconds ---" % (time.time() - start_time))

# allSessions = pd.DataFrame(allSessions)
# allSessions


# ISSUE: This cell takes too lonAg to run. Maybe I should compute this and save a version on the storage, as a local file?
#        Then I can open it without having to compute the thing each time?

# ==================================================================
# ============== Compute the output variables ======================
# ==================================================================

y = []
for x in X:
    y.append(getClass(x[0][0]))
    continue
len(y)

# ==================================================================
# ============== Compute the features, and add them ================
# ============== Add remove the unnecessary features ===============
# ==================================================================



for j in range(len(X)):
    df = X[j]    
    # ---------------- Step 1 ---------------        
    prevRow = None
    prevDist = 0
    
    feat1_dist = []
    feat2_dist_diffs = []

    for i, row in df.iterrows():
        if prevRow is None:
            prevRow = row
        dist = getDistanceBetweenPoints(row, prevRow)
        dist_diff = getChangesInDistanceBetweenPoints(dist, prevDist)
        # speed = 

        prevRow = row
        prevDist = dist

        feat1_dist.append(dist)
        feat2_dist_diffs.append(dist_diff)
    # ---------------- Step 2 ---------------
    # To DataFrame
    feat1_dist_df = pd.DataFrame(feat1_dist)
    feat2_dist_diffs_df = pd.DataFrame(feat2_dist_diffs)

    # Set column names / feature names
    feat1_dist_df.columns = ["Distance"]
    feat2_dist_diffs_df.columns = ["Distance Difference"]
    
    # ---------------- Step 3 ---------------
    as_is = [
        "Participant",
        "Category Left_Blink",
        "Category Left_Fixation",
        "Category Left_Saccade",
        "Category Right_Blink",
        "Category Right_Fixation",
        "Category Right_Saccade",
        "Pupil Diameter Right [mm]",
        "Pupil Diameter Left [mm]",
    ]

    df_as_is = df[as_is] # First taking the as is ones

    # Combine all the features into the single input variable. 
    result = pd.concat([df_as_is, feat1_dist_df, feat2_dist_diffs_df], axis=1, join='outer')
    X[j] = result
    
    
# =========================================================
# =============== Pad the dataset, to make  ===============
# =============== all of them same sized    ===============
# =========================================================

for i in range(len(X)):
    X[i] = np.array(X[i]

def pad_dataset(data, target_length):
    num_padding_rows = target_length - data.shape[0]
    padding = np.zeros((num_padding_rows, data.shape[1]))
    padded_data = np.concatenate((data, padding))
    return padded_data

max_shape = max([data.shape[0] for data in X])
padded_X = [pad_dataset(data, max_shape) for data in X]
padded_X = np.array(padded_X)

X = np.array(padded_X)
y = np.array(y)

--- 1142.4740943908691 seconds ---


In [13]:
X = allSessions[:]

So there are 568 Sessions. Not bad. Here's the output shape:



## **NOTE**: Generating output data (expected categories, ASD vs TD)
ASD = 0

TD = 1

In [95]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Ensure correct data types
X_train = np.array(X_train, dtype=np.float32)
y_train = np.array(y, dtype=np.int32)

# Check for NaN or Inf values
# assert not np.any(np.isnan(X_train)), "X contains NaN values"
# assert not np.any(np.isinf(X_train)), "X contains Inf values"
# assert not np.any(np.isnan(y_train)), "y contains NaN values"
# assert not np.any(np.isinf(y_train)), "y contains Inf values"

# Define the input shape
input_shape = (24110, 11)

# Create the model
model = Sequential()

# Add an LSTM layer
model.add(LSTM(64, input_shape=input_shape))

# Add a dense output layer with sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

# Fit the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Save the model
model.save('lstm_model.h5')


Epoch 1/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 11s/step - accuracy: 0.6890 - loss: nan
Epoch 2/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 10s/step - accuracy: 0.7133 - loss: nan
Epoch 3/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 10s/step - accuracy: 0.6939 - loss: nan
Epoch 4/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 10s/step - accuracy: 0.6843 - loss: nan
Epoch 5/10
[1m 4/18[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m2:16[0m 10s/step - accuracy: 0.6875 - loss: nan

KeyboardInterrupt: 

In [114]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler


# Ensure correct data types
X_train = np.array(X, dtype=np.float32)
y_train = np.array(y, dtype=np.int32)

# Check for NaN or Inf values
# assert not np.any(np.isnan(X_train)), "X contains NaN values"
# assert not np.any(np.isinf(X_train)), "X contains Inf values"
# assert not np.any(np.isnan(y_train)), "y contains NaN values"
# assert not np.any(np.isinf(y_train)), "y contains Inf values"

# Define the input shape
input_shape = (24110, 11)

# Create the model
model = Sequential()

# Add LSTM layers
model.add(LSTM(128, return_sequences=True, input_shape=input_shape))
model.add(Dropout(0.2))  # Add dropout for regularization
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.2))  # Add dropout for regularization
model.add(LSTM(32))
model.add(Dropout(0.2))  # Add dropout for regularization

# Add a dense output layer with sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))

# Define the learning rate scheduler
def lr_scheduler(epoch, lr):
    if epoch % 5 == 0 and epoch != 0:
        return lr * 0.9  # Reduce learning rate by 10% every 5 epochs
    else:
        return lr

# Compile the model with Adam optimizer and learning rate scheduling
opt = Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

# Define learning rate scheduler callback
lr_schedule = LearningRateScheduler(lr_scheduler)

# Fit the model with learning rate scheduler callback
model.fit(X_train, y_train, epochs=20, batch_size=32, callbacks=[lr_schedule])


# Save the model
model.save('lstm_model.h5')


Epoch 1/20
[1m 6/18[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m1:08:06[0m 341s/step - accuracy: 0.6608 - loss: nan

KeyboardInterrupt: 

### Compute feature for each different session

In [None]:
# Compute the features
tempX = X
feat1 = []
feat2 = []

for i in range(len(tempX)):
    currX = tempX[i]

    feat1_dist = []
    feat2_dist_diffs = []
    # feat3_speed = []

    prevRow = None
    prevDist = 0
    for i, row in df.iterrows():
        if prevRow is None:
            prevRow = row
        dist = getDistanceBetweenPoints(row, prevRow)
        dist_diff = getChangesInDistanceBetweenPoints(dist, prevDist)
        # speed = 

        prevRow = row
        prevDist = dist

        feat1_dist.append(dist)
        feat2_dist_diffs.append(dist_diff)
    
    # TODO: add each feat as row to each feat of tempX[i]
    feat1_df = pd.DataFrame(feat1_dist)
    feat2_df = pd.DataFrame(feat2_dist_diffs)
    tempX[i] = pd.concat([tempX[i], feat1_df], axis=1)
    tempX[i] = pd.concat([tempX[i], feat2_df], axis=1)

    
    # print(feat1_dist)
    # print(feat2_dist_diffs)
    # feat1.append(feat1_dist)
    # feat2.append(feat2_dist_diffs)

feat1