In [None]:
import pandas as pd
import os

import warnings
warnings.filterwarnings('ignore')

# Load every CSV export found in the dataset directory into one frame.
directory = 'dataset/'

# os.listdir returns entries in arbitrary order; sort the names so the row
# order (which later cells rely on when comparing consecutive rows) is the
# same on every run and every machine.
csv_files = sorted(name for name in os.listdir(directory) if name.endswith(".csv"))

# Read each file once and concatenate in a single call. Growing a DataFrame
# with pd.concat inside the loop re-copies all earlier rows on every pass.
frames = [pd.read_csv(os.path.join(directory, name)) for name in csv_files]

# pd.concat raises on an empty list, so keep the original behaviour of ending
# up with an empty frame when the directory holds no CSV files.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Keep only the columns used by the rest of the preprocessing; every other
# column of the raw export is dropped here.

columns = [
    # Timing columns: checked in the following cells, no nulls reported.
    "RecordingTime [ms]",
    "Time of Day [h:m:s:ms]",
    # Participant ID: unidentified entries are removed and IDs are made
    # numeric in a later cell.
    "Participant",
    # Pupil diameters: non-numeric and zero readings are removed in a later cell.
    "Pupil Diameter Right [mm]",
    "Pupil Diameter Left [mm]",
    # Gaze coordinates: nulls and '-' placeholders are handled in a later cell.
    "Point of Regard Right X [px]",
    "Point of Regard Right Y [px]",
    "Point of Regard Left X [px]",
    "Point of Regard Left Y [px]",
    # Event category per eye: filtered and one-hot encoded in a later cell.
    "Category Right",
    "Category Left",
]

df_0_columns_separated = df.loc[:, columns]

In [None]:
import matplotlib.pyplot as plt

# Stage 1 - "RecordingTime [ms]": look at the summary statistics and check
# for missing values. The column itself is passed through unchanged.
df = df_0_columns_separated
recording_time = df["RecordingTime [ms]"]
recording_time.describe()

# The original author noted that this check reports no null values.
is_null_in_column = recording_time.isnull().any()
is_null_in_column

df_1_rec_time_processed = df

In [None]:


import matplotlib.pyplot as plt

# Stage 2 - "Time of Day [h:m:s:ms]": same inspection as for the recording
# time. The column is passed through unchanged.
df = df_1_rec_time_processed
time_of_day = df["Time of Day [h:m:s:ms]"]
time_of_day.describe()

# The original author noted that this check also reports no null values.
is_null_in_column = time_of_day.isnull().any()
is_null_in_column

df_2_time_of_day = df

In [None]:
# print("Before: ")
# print(df["Participant"].unique())

# Stage 3 - "Participant": IDs arrive as a mix of strings and numbers, and
# some rows carry one of the placeholder labels below instead of an ID.
UNIDENTIFIED_PARTICIPANTS = ['Unidentified(Neg)', 'Unidentified(Pos)']

# Take an explicit copy so the assignment below writes into its own frame,
# not into a filtered view of the previous stage.
is_unidentified = df_2_time_of_day['Participant'].isin(UNIDENTIFIED_PARTICIPANTS)
df = df_2_time_of_day[~is_unidentified].copy()
df["Participant"] = pd.to_numeric(df["Participant"], errors='coerce')

# errors='coerce' silently turns any remaining non-numeric ID into NaN.
# Drop those rows: a NaN participant cannot be mapped to a class label later.
df = df[df["Participant"].notna()].copy()

df_3_participant = df

In [None]:
def _keep_valid_pupil_rows(frame, column):
    """Return a copy of ``frame`` keeping only rows with a usable ``column``.

    A value is usable when its string form parses as a number and that
    number is not 0. The returned frame stores ``column`` as numeric.

    Parameters:
        frame: DataFrame containing ``column``; it is not modified.
        column: name of the pupil-diameter column to clean.

    Returns:
        A new DataFrame with the unusable rows removed.
    """
    # Going through str first mirrors the original cleaning: every value is
    # judged by whether its text form is a number ('-' and 'nan' are not).
    as_numbers = pd.to_numeric(frame[column].astype(str), errors='coerce')
    usable = as_numbers.notnull() & (as_numbers != 0)

    cleaned = frame[usable].copy()
    cleaned[column] = as_numbers[usable]
    return cleaned


# Stage 4 - pupil diameters: apply the same cleaning to both eyes. Starting
# from df_3_participant and copying inside the helper leaves that stage intact.
df = df_3_participant
for column_name in ("Pupil Diameter Right [mm]", "Pupil Diameter Left [mm]"):
    df = _keep_valid_pupil_rows(df, column_name)

df_5_pupil_both = df

In [None]:
# Stage 5 - point of regard: drop rows without a usable gaze coordinate and
# store the coordinates as numbers.
point_of_regard_idx = [
    "Point of Regard Right X [px]",
    "Point of Regard Right Y [px]",
    "Point of Regard Left X [px]",
    "Point of Regard Left Y [px]",
]

# Start explicitly from the previous stage instead of whatever `df` happens
# to be bound to, and never modify that stage in place.
df = df_5_pupil_both
for point in point_of_regard_idx:
    # Remove nulls and the '-' placeholder used for missing coordinates.
    has_value = df[point].notna() & (df[point] != '-')
    df = df[has_value].copy()
    # Keep the converted values (previously the result was discarded).
    # Any other non-numeric text raises here rather than slipping through.
    df[point] = pd.to_numeric(df[point])

# Hand the cleaned frame to the next stage (previously this re-exported
# df_5_pupil_both, discarding the cleaning above).
df_9_point_of_regard = df

In [None]:
# Processing - Category Left, Category Right

# Event labels that are kept and one-hot encoded.
stays = [
    "Fixation",
    "Saccade",
    "Blink"
]

# Labels that are discarded. Kept for documentation only: rows are selected
# through `stays`, which also removes null categories.
goes = [
    "Separator",
    "-",
    "Left Click",
    # Null
]

category_columns = ['Category Left', 'Category Right']

# 1. Remove rows whose category (for either eye) is not one of `stays`.
df = df_9_point_of_regard
for category_column in category_columns:
    df = df[df[category_column].isin(stays)]
df = df.copy()

# 2. One-hot encode. Declaring the full category set first makes get_dummies
#    emit a column for every label in `stays`, even one that no longer occurs
#    after filtering; later cells select all six encoded columns by name.
#    Sorting keeps the alphabetical column order (Blink, Fixation, Saccade).
for category_column in category_columns:
    df[category_column] = pd.Categorical(df[category_column], categories=sorted(stays))

df = pd.get_dummies(df, columns=category_columns, prefix=category_columns)

df_final = df
df.head()

In [None]:
# Participant -> label lookup built from the metadata file:
# 1 when the participant's "Class" is 'ASD', 0 for any other value.
metadata = pd.read_csv("./Metadata_Participants.csv")
subjectClass = {
    participant_id: int(group == 'ASD')
    for participant_id, group in zip(metadata["ParticipantID"], metadata["Class"])
}

def getClass(subjectId):
    """Return the label stored in ``subjectClass`` for ``subjectId``.

    Raises:
        KeyError: if ``subjectId`` is not a key of ``subjectClass``.
    """
    label = subjectClass[subjectId]
    return label

# for index, row in df.iterrows():
#     subId = row["Participant"]
#     print(subId)

In [None]:
# Assigning class: one label per row, looked up from the row's participant.
y = [getClass(participant) for participant in df["Participant"]]

# NOTE(review): X is the whole frame, so it still contains "Participant",
# the column the labels were derived from - confirm this is intended.
X = df

In [None]:
# df.to_csv('dataset_preprocessed.csv', index=False)

After grouping by sessions is done, we can do this.

- Consider each individual session as a singular data unit.
- From each unit of data, compute all the features, and feed them into the classifier for pattern recognition.
- Goal is to find as many features, correlated to ASD vs TD as possible.
- The hypothesis is that there should be subtle patterns in eye-movement speed, fixation duration, and other similar characteristics.
- Pupil diameter reveals a subject's interest in a certain object / event / topic. Based on this, we can estimate how long a subject is able to hold his/her interest. The possibility is that people with ASD hold their interest for different durations (ideally longer on objects, shorter on people) and that, perhaps similar to ADHD, there are other patterns in the topic of interest that would otherwise be overlooked. Possible patterns:
    - How long can a subject hold interest
    - How often does he/she change it
    - Fatigue? How long till he loses interest, in a given session (is this correlated?)
    - Saccade movement, and pupil diameter, is there any correlation / patterns
- Eye movement is typically faster in ASD (ref. - )
- Ability to focus on an object right after fast switching of gaze, is slower among ASD (ref. - )
- Eye movements should be far fewer among TD than ASD subjects; the gaze of ASD subjects should be faster, a little erratic, and possibly a bit jittery, with difficulty holding focus.
- 

# Feature Engineering
Features to compute:
- [ ] Gaze Speed
- [ ] Fixation Duration

As is:

- [ ] Pupil Diameter
- [ ] Category left, right
- [ ] 

Not sure if necessary:
- [ ] Point of regard
- [ ] Time of day (Can be used for separating sessions)
- [ ] Recording Time (can be used for separating sessions)

---

After:

- Distance between points
- Gaze Speed
- Fixation duration
- Eye movement type (Fixation, Saccade, Blink) - 6 Cols, after encoding
- Pupil Diameter (Left, Right)
- Changes / Difference in fixation duration between consecutive points
- Changes / Difference in distance between consecutive points


In [None]:
# Split the cleaned rows into sessions: a session is a maximal run of
# consecutive rows that share the same participant.
#
# This replaces a row-by-row loop that referenced the undefined names
# `df1`/`df2`, and that would also have stored an empty first session and
# never stored the last one.
participant = df["Participant"]

# True on the first row of every run; the cumulative sum numbers the runs.
session_number = participant.ne(participant.shift()).cumsum()

# Group by position (plain array) so the result does not depend on index
# labels; sort=False keeps the sessions in order of appearance.
allSessions = [
    session for _, session in df.groupby(session_number.to_numpy(), sort=False)
]



In [None]:
import math 

def _binocularGazePoint(sample):
    """Return the (x, y) midpoint of the left- and right-eye points of regard."""
    leftX = float(sample["Point of Regard Left X [px]"])
    leftY = float(sample["Point of Regard Left Y [px]"])
    rightX = float(sample["Point of Regard Right X [px]"])
    rightY = float(sample["Point of Regard Right Y [px]"])
    return (leftX + rightX) / 2, (leftY + rightY) / 2


def getDistanceBetweenPoints(row, prevRow):
    """Return the Euclidean distance between the gaze points of two samples.

    Each sample's gaze point is the midpoint of its left- and right-eye
    "Point of Regard" coordinates; the coordinates are converted with float().

    Parameters:
        row: mapping-like sample (e.g. a DataFrame row) with the four
            "Point of Regard ... [px]" entries.
        prevRow: the sample to measure the distance from, same layout.

    Returns:
        The distance between the two gaze points, as a float.
    """
    currX, currY = _binocularGazePoint(row)
    prevX, prevY = _binocularGazePoint(prevRow)
    return math.sqrt((prevX - currX)**2 + (prevY - currY)**2)


def getChangesInDistanceBetweenPoints(currDist, prevDist):
    """Return the absolute difference between two distances."""
    delta = currDist - prevDist
    return abs(delta)

def getGazeSpeed(dist, duration):
    """Return ``dist`` divided by ``duration``.

    No guard is applied for a zero ``duration``; with plain Python numbers
    that raises ZeroDivisionError.
    """
    speed = dist / duration
    return speed

In [None]:
# Per-row gaze features, computed column-wise instead of with iterrows().
# The arithmetic is the same as getDistanceBetweenPoints /
# getChangesInDistanceBetweenPoints applied to each row and its predecessor.
#
# NOTE(review): the previous row is used even when it belongs to a different
# participant, so the first row of each participant gets a cross-participant
# distance - confirm whether that is intended.

# Gaze point = midpoint of the left- and right-eye points of regard.
gaze_x = (df["Point of Regard Left X [px]"].astype(float)
          + df["Point of Regard Right X [px]"].astype(float)) / 2
gaze_y = (df["Point of Regard Left Y [px]"].astype(float)
          + df["Point of Regard Right Y [px]"].astype(float)) / 2

# Distance travelled since the previous row. The first row has no
# predecessor and is defined as 0 (it was compared with itself before).
step = (gaze_x.diff() ** 2 + gaze_y.diff() ** 2) ** 0.5
if len(step) > 0:
    step.iloc[0] = 0.0

# Absolute change of that distance versus the previous row; the first row
# is compared with an initial distance of 0, which gives 0.
step_change = step.diff().abs()
if len(step_change) > 0:
    step_change.iloc[0] = 0.0

feat1_dist = step.tolist()
feat2_dist_diffs = step_change.tolist()
# feat3_speed is not computed yet.

# Show a short preview only; the full lists have one entry per row.
print(feat1_dist[:10])
print(feat2_dist_diffs[:10])

In [None]:
# To DataFrame: wrap each feature list in a single-column frame, reusing the
# same variable names.
feat1_dist, feat2_dist_diffs = pd.DataFrame(feat1_dist), pd.DataFrame(feat2_dist_diffs)

In [None]:
# Give the single column of each feature frame a readable name.
for feature_frame, feature_label in (
    (feat1_dist, "Distance"),
    (feat2_dist_diffs, "Distance Difference"),
):
    feature_frame.columns = [feature_label]

In [None]:
# Columns copied into the feature table unchanged.
as_is = [
    "Participant",
    "Category Left_Blink",
    "Category Left_Fixation",
    "Category Left_Saccade",
    "Category Right_Blink",
    "Category Right_Fixation",
    "Category Right_Saccade",

    "Pupil Diameter Right [mm]",
    "Pupil Diameter Left [mm]",
]

# First take the as-is columns. `df` still carries the index labels of the
# rows that survived filtering, while the feature frames are numbered
# 0..n-1. pd.concat(axis=1) aligns on index labels, so without resetting the
# index the features would be attached to the wrong rows and the outer join
# would add NaN-filled rows.
df_as_is = df[as_is].reset_index(drop=True)

feature_frames = [
    feat1_dist.reset_index(drop=True),
    feat2_dist_diffs.reset_index(drop=True),
]

# The features were computed one per row of `df`; a length mismatch means the
# cells were run out of order.
assert all(len(frame) == len(df_as_is) for frame in feature_frames), \
    "feature frames and df have different numbers of rows"

# Now the concat: rows are matched by position.
result = pd.concat([df_as_is, *feature_frames], axis=1, join='outer')
result


In [None]:
# Splitting Sessions
#
# A session is a maximal run of consecutive rows of `result` with the same
# participant. allSessions is a list of sessions; each session is a list of
# rows (Series), in their original order.

prevId = None
allSessions = []
currSession = []
for _, row in result.iterrows():
    participant = row["Participant"]
    # Close the running session when the participant changes. The
    # `currSession` guard stops an empty session from being stored before
    # the very first row.
    if currSession and participant != prevId:
        allSessions.append(currSession)
        currSession = []
    prevId = participant
    currSession.append(row)

# Store the final session as well; nothing follows it to trigger the check
# inside the loop.
if currSession:
    allSessions.append(currSession)

# Show the number of sessions rather than dumping every row.
len(allSessions)