In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score
import json

# Load the combined feature table, discard rows with any missing value, then
# shuffle with a fixed seed so the row order is the same on every run.
path = "data/Result1/Different_All.csv"
data = (
    pd.read_csv(path)
    .dropna(axis=0)
    .sample(frac=1, random_state=5)
    .reset_index(drop=True)
)

data.head()

Unnamed: 0,x1,y1,x2,y2,x3,y3,x4,y4,x5,y5,...,pupil_left_std,pupil_right_avg,pupil_right_max,pupil_right_min,pupil_right_std,pupil_together_avg,pupil_together_max,pupil_together_min,pupil_together_std,participant
0,927.0,577.0,929.0,582.0,930.0,584.0,930.0,588.0,932.0,587.0,...,0.126596,3.477593,3.774,3.204,0.180669,3.479151,3.653,3.249,0.147083,4
1,973.0,486.0,975.0,488.0,976.0,487.0,975.0,489.0,975.0,489.0,...,0.067162,3.122516,3.24,3.004,0.084039,3.190429,3.283,3.092,0.074853,21
2,751.0,470.0,757.0,467.0,746.0,461.0,745.0,466.0,749.0,455.0,...,0.087482,3.860538,4.007,3.722,0.064207,3.860484,3.953,3.72,0.071442,10
3,898.0,288.0,896.0,277.0,903.0,287.0,905.0,291.0,906.0,292.0,...,0.056034,3.334779,3.424,3.21,0.05219,3.453826,3.53,3.37,0.050547,11
4,922.0,352.0,929.0,348.0,923.0,342.0,922.0,344.0,920.0,344.0,...,0.063451,3.255419,3.33,3.141,0.056137,3.282686,3.356,3.191,0.05682,27


In [19]:
class LoadResult1Data:
    """Load a Result1 feature table and evaluate a RandomForest on it.

    The table is read from CSV, rows containing missing values are dropped and
    the remaining rows are shuffled with a fixed seed. The ``participant``
    column is the classification target; the feature columns returned by
    ``take_x`` are every column not listed in ``self.exceptlist``.
    """

    # Name of the target column; it is never returned as a feature.
    _LABEL_COLUMN = 'participant'
    # Suffixes of the per-feature summary columns selected by set_domain().
    _STAT_SUFFIXES = ('_avg', '_max', '_min', '_std')

    def __init__(self, path) -> None:
        """Read the CSV at ``path``, drop incomplete rows and shuffle the rest.

        The shuffle uses ``random_state=5`` so the row order (and therefore the
        fold assignment made by ``split_data``) is reproducible.
        """
        data = pd.read_csv(path)
        data = data.dropna(axis=0)
        data = data.sample(frac=1, random_state=5).reset_index(drop=True)

        self.data = data
        # Columns that take_x() must NOT return. Initially only the label.
        self.exceptlist = ['participant']

    def set_domain(self, domain_name: str):
        """Restrict the feature columns returned by ``take_x`` to one domain.

        Args:
            domain_name: ``'all'`` keeps every column except the label;
                ``'mfcc'`` keeps ``mfcc1`` .. ``mfcc12``; any other name keeps
                the column with that exact name plus its ``_avg``, ``_max``,
                ``_min`` and ``_std`` variants, whichever of them exist.

        Raises:
            ValueError: if an ``mfcc`` column is missing, or if no column of
                the table matches ``domain_name``.
        """
        columns = self.data.columns.to_list()

        if domain_name == 'all':
            # Reset explicitly so an earlier, narrower selection does not leak
            # into this one.
            self.exceptlist = [self._LABEL_COLUMN]
            return

        if domain_name == 'mfcc':
            wanted = [domain_name + str(i) for i in range(1, 13)]
            missing = [name for name in wanted if name not in columns]
            if missing:
                raise ValueError(f"missing mfcc columns: {missing}")
        else:
            candidates = [domain_name]
            candidates += [domain_name + suffix for suffix in self._STAT_SUFFIXES]
            # The label is never allowed to become a feature.
            wanted = [
                name for name in candidates
                if name in columns and name != self._LABEL_COLUMN
            ]
            if not wanted:
                raise ValueError(
                    f"no columns found for domain {domain_name!r}"
                )

        # Everything that is not part of the requested domain is excluded.
        self.exceptlist = [name for name in columns if name not in wanted]

    def take_x(self, data=None):
        """Return the feature columns of ``data`` (or of the loaded table).

        Args:
            data: optional DataFrame to select from. ``None`` or an empty
                frame means "use the table loaded in ``__init__``".

        Returns:
            DataFrame holding every column not listed in ``self.exceptlist``.
        """
        frame = self.data if data is None or data.empty else data
        # Build the mask from the frame actually being sliced so that column
        # order/count differences from self.data cannot select wrong columns.
        return frame.loc[:, ~frame.columns.isin(self.exceptlist)]

    def take_y(self, data=None):
        """Return the ``participant`` column of ``data`` (or of the loaded table).

        ``None`` or an empty frame means "use the table loaded in ``__init__``".
        """
        frame = self.data if data is None or data.empty else data
        return frame['participant']

    def get_data(self):
        """Return the full loaded (cleaned and shuffled) DataFrame."""
        return self.data

    @staticmethod
    def _fold_boundaries(train_data_ratio, valid_data_ratio, test_data_ratio, n_splits=10):
        """Convert split ratios into fold boundaries ``(train_end, valid_end)``.

        Folds ``[0, train_end)`` are training folds, ``[train_end, valid_end)``
        validation folds and ``[valid_end, n_splits)`` test folds.

        Raises:
            ValueError: if a ratio is negative or the ratios do not sum to 1.
        """
        ratios = (train_data_ratio, valid_data_ratio, test_data_ratio)
        if any(ratio < 0 for ratio in ratios):
            raise ValueError("train, valid and test ratios must be non-negative.")
        # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 is 0.9999999999999999
        # in binary floating point, which an exact `!= 1` test would reject.
        if abs(sum(ratios) - 1) > 1e-9:
            raise ValueError("train + valid + test should be equal 1.")

        # Nudge before truncating so that 8.999999999999998 becomes 9, not 8,
        # while genuinely fractional fold counts are still rounded down.
        eps = 1e-9
        train_end = int(train_data_ratio * n_splits + eps)
        valid_end = int((train_data_ratio + valid_data_ratio) * n_splits + eps)
        return train_end, valid_end

    def split_data(self, train_data_ratio, valid_data_ratio, test_data_ratio, n_splits=10):
        """Split row positions into train/valid/test using stratified folds.

        The rows are partitioned into ``n_splits`` stratified folds (stratified
        on ``participant``); consecutive folds are then assigned to train,
        validation and test according to the given ratios.

        Args:
            train_data_ratio: share of folds used for training.
            valid_data_ratio: share of folds used for validation (may be 0).
            test_data_ratio: share of folds used for testing.
            n_splits: number of stratified folds (default 10).

        Returns:
            Three lists of integer row positions: ``(train, valid, test)``.

        Raises:
            ValueError: if a ratio is negative or the ratios do not sum to 1.
        """
        train_end, valid_end = self._fold_boundaries(
            train_data_ratio, valid_data_ratio, test_data_ratio, n_splits
        )

        kf = StratifiedKFold(n_splits=n_splits)
        x_data = self.take_x().to_numpy()
        y_data = self.take_y().to_numpy()
        # The "test" part of every split is one disjoint fold of row positions.
        index_blocks = [fold.tolist() for _, fold in kf.split(x_data, y_data)]

        def merge(blocks):
            return [index for block in blocks for index in block]

        train_data_block = merge(index_blocks[:train_end])
        valid_data_block = merge(index_blocks[train_end:valid_end])
        test_data_block = merge(index_blocks[valid_end:])
        return train_data_block, valid_data_block, test_data_block

    def ml_train(self, train_data_block):
        """Fit a ``RandomForestClassifier(random_state=0)`` on the given rows.

        Args:
            train_data_block: row positions (as returned by ``split_data``).

        Returns:
            The fitted classifier.
        """
        x_train_data = self.take_x().to_numpy()[train_data_block]
        y_train_data = self.take_y().to_numpy()[train_data_block]
        rf_model = RandomForestClassifier(random_state=0)  # Random Forest
        rf_model.fit(x_train_data, y_train_data)
        return rf_model

    def ml_test(self, model: "RandomForestClassifier", test_data_block):
        """Score ``model`` on the given rows.

        Args:
            model: a fitted classifier exposing ``predict``.
            test_data_block: row positions (as returned by ``split_data``).

        Returns:
            ``{"acc": accuracy, "f1": macro-averaged F1}``.
        """
        test_x = self.take_x().to_numpy()[test_data_block]
        test_y = self.take_y().to_numpy()[test_data_block]
        pred_y = model.predict(test_x)
        accuracy = accuracy_score(test_y, pred_y)
        f1 = f1_score(test_y, pred_y, average='macro')
        results = {"acc": accuracy, "f1": f1}
        return results

    def get_indexlist(self):
        """Return the index labels of the loaded table as a list."""
        return self.data.index.to_list()

In [20]:
# Smoke test: load the table referenced by `path` (assigned in an earlier cell),
# restrict the features to the mfcc domain and display the resulting feature frame.
test1 = LoadResult1Data(path)
test1.set_domain("mfcc")
test1.take_x()

Unnamed: 0,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,mfcc8,mfcc9,mfcc10,mfcc11,mfcc12
0,540492.388143,301065.056335,175705.949479,-23433.816881,-8703.613937,80033.564828,34278.115101,193.161685,-17377.442583,-8091.885376,463.990473,-6629.912974
1,428247.585872,253669.088271,114074.590541,7332.881977,-259.127266,9158.536101,28090.178949,1621.189095,-8855.748057,822.617423,725.226580,8608.905349
2,665844.602931,309549.556252,157585.130819,-44480.185153,14682.319897,63519.250222,-6122.357675,-21235.430671,-13242.129214,-30301.820380,-28204.055364,-40920.208176
3,272217.012224,57878.969666,29878.048661,13877.215061,25931.636403,-10061.717459,-4425.320890,1131.410250,-13236.426911,6379.289146,4070.461031,-2426.862106
4,471461.674071,208665.197740,99125.963057,46989.094344,21204.330549,-27138.593894,-19474.258639,17668.386988,35426.906157,7677.679485,34201.705315,103690.926724
...,...,...,...,...,...,...,...,...,...,...,...,...
32368,231199.781142,73294.473495,-5631.945029,-12361.869496,-21038.575734,83.492805,41736.617457,51597.926025,56054.420137,9537.258336,6322.634531,-3489.388947
32369,332343.141670,204802.736966,106861.131448,44151.780603,6872.771359,2590.120038,-15334.449328,-27094.857242,-15385.919045,-29751.454854,-7320.503832,82782.856395
32370,260620.639778,124496.358946,69565.956169,5664.623612,-1641.467686,-21615.976832,8702.900479,10079.431902,-3414.769237,-30468.914403,-16780.569104,-22730.464742
32371,398042.009666,293943.081846,152312.342337,161108.707635,129540.350056,77701.020281,27174.062795,11851.603989,9761.860139,25960.463781,2285.360637,30750.454981


In [23]:
# Feature domains to evaluate one at a time; each name is passed to
# LoadResult1Data.set_domain(). The final entry, 'all', uses every feature column.
gazeFeatureList = ['gaze_motion', 'gaze_velocity', 'gaze_rotation', 'reaction_time', 'fixation_duration', 
                   'fixation_dispersion', 'fixation_count', 'saccade_duration', 'saccade_velocity',
                   'saccade_amplitude', 'saccade_dispersion', 'saccade_count', 'mfcc', 'pupil_left',
                   'pupil_right', 'pupil_together', 'all']
path = "data/Result1/Different_All.csv"
# 90 % of the folds for training, none for validation, 10 % for testing.
train_data_ratio = 0.9
valid_data_ratio = 0
test_data_ratio = 0.1

# NOTE: `gazeFeature` stays defined after the loop with its last value ('all').
for gazeFeature in gazeFeatureList:
    print("========")
    print(gazeFeature)
    # The CSV is re-read on every iteration, so each domain starts from a fresh loader.
    differentResult1 = LoadResult1Data(path)
    differentResult1.set_domain(gazeFeature)
    # vdb (validation rows) is returned but not used below.
    trb, vdb, teb = differentResult1.split_data(train_data_ratio, valid_data_ratio, test_data_ratio)
    model = differentResult1.ml_train(trb)
    # print("train done")
    testResult = differentResult1.ml_test(model, teb)
    this_acc = testResult['acc']
    this_f1 = testResult['f1']
    print(f"Acc: {this_acc}, F1: {this_f1}")

gaze_motion
Acc: 0.05872602415344542, F1: 0.058337808106777966
gaze_velocity
Acc: 0.18233483305706844, F1: 0.17873416868591963
gaze_rotation
Acc: 0.0639355908122188, F1: 0.06373176010658285
reaction_time
Acc: 0.07695950745915227, F1: 0.06586245639987988
fixation_duration
Acc: 0.10158654984608098, F1: 0.09964146663030206
fixation_dispersion
Acc: 0.09069381955955481, F1: 0.08950243086155549
fixation_count
Acc: 0.07814349988160076, F1: 0.0285268187836659
saccade_duration
Acc: 0.13544873312810798, F1: 0.11694945240369403
saccade_velocity
Acc: 0.08453705896282264, F1: 0.07238464995550416
saccade_amplitude
Acc: 0.09306180440445182, F1: 0.08136289340130842
saccade_dispersion
Acc: 0.1004025574236325, F1: 0.09720227105794713
saccade_count
Acc: 0.07388112716078617, F1: 0.027866216602870513
mfcc
Acc: 0.18422922093298602, F1: 0.17281048077526986
pupil_left
Acc: 0.20909306180440446, F1: 0.20421781830464644
pupil_right
Acc: 0.22969452995500828, F1: 0.22870680278035777
pupil_together
Acc: 0.206488278

In [24]:
# Evaluate one RandomForest per feature-group file, using every feature column
# contained in that file.
pathList = ["EyeMovement", 'Fixation', "MFCC", 'Pupil', "RawGaze", "Saccade"]
train_data_ratio = 0.9
valid_data_ratio = 0
test_data_ratio = 0.1

for pathName in pathList:
    print("========")
    print(pathName)
    path = f"data/Result1/Different_{pathName}.csv"
    differentResult1 = LoadResult1Data(path)
    # Select all feature columns explicitly. This previously read a loop
    # variable left over from another cell, which only worked when that cell
    # had been run first and had finished on 'all'.
    differentResult1.set_domain("all")
    # vdb (validation rows) is empty for a 0.9 / 0 / 0.1 split and is not used.
    trb, vdb, teb = differentResult1.split_data(train_data_ratio, valid_data_ratio, test_data_ratio)
    model = differentResult1.ml_train(trb)
    testResult = differentResult1.ml_test(model, teb)
    this_acc = testResult['acc']
    this_f1 = testResult['f1']
    print(f"Acc: {this_acc}, F1: {this_f1}")


EyeMovement
Acc: 0.24494485294117646, F1: 0.23679851856524756
Fixation
Acc: 0.22977941176470587, F1: 0.22552786011120624
MFCC
Acc: 0.19140625, F1: 0.17911995574555029
Pupil
Acc: 0.5050911674165285, F1: 0.49929289588253783
RawGaze
Acc: 0.48253676470588236, F1: 0.47374495022463764
Saccade
Acc: 0.23713235294117646, F1: 0.22805306621512897


Feature Selection by Sequential Feature Selector

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier

# Sequential feature selection over every feature column of the combined table.
path = "data/Result1/Different_All.csv"
thisData = LoadResult1Data(path)
thisData.set_domain("all")
X = thisData.take_x()
y = thisData.take_y()

rf_model = RandomForestClassifier(random_state=0)  # Random Forest
# NOTE(review): every candidate feature is scored with 10-fold CV of a random
# forest at each selection step, so this cell is presumably very slow on the
# full table - confirm the runtime before including it in a Run-All.
sfs = SequentialFeatureSelector(rf_model, n_features_to_select="auto", cv=10, n_jobs=-1)
sfs.fit(X, y)

# get_support() returns a boolean mask over X's columns. Map it to column names
# and print them; as a bare mid-cell expression its result was never displayed.
selected_features = X.columns[sfs.get_support()].to_list()
print(f"Selected {len(selected_features)} of {X.shape[1]} features:")
print(selected_features)
sfs.transform(X).shape