In [3]:
import glob
import pandas as pd

# Combine each participant's feature files into one file per data type

In [42]:
# Feature kinds available per recording device. The device name is used as the
# folder name and the feature kind as the sub-folder name when globbing below.
data_types = dict(
    HEADSET=['EEG', 'head motion'],
    EYETRACKER=['PUPIL', 'GAZE'],
    WRISTBAND=['EDA', 'ACC'],
)

# Metadata (non-feature) columns; these are left untouched when feature
# columns are renamed.
info_cols = [
    'tw', 'n', 'task', 'name', 'durations', 'start', 'end',
    'stage', 'Topic', 'Input', 'Output', 'start_sec', 'end_sec', 'PID',
]

def format_name(c, prefix):
    """Return column name `c` namespaced as '<prefix>:<name>'.

    A name that already starts with `prefix` is returned unchanged.
    Otherwise every ':_' and then every remaining ':' in the name is replaced
    by '_' before the prefix and a single ':' separator are prepended.
    """
    if c.startswith(prefix):
        return c
    # Order matters: collapse ':_' first so it does not become '__'.
    cleaned = c.replace(':_', '_').replace(':', '_')
    return prefix + ':' + cleaned

# For every time window and every (device folder, feature kind) pair, stack the
# per-participant feature CSVs into one combined CSV next to the kind's folder.
for tw in ['4-2', '2-1', '1-0']:
    for folder, types in data_types.items():
        for typ in types:
            # NOTE: glob is not called with recursive=True, so '**' behaves
            # exactly like a single '*' here.
            files = glob.glob(f'../data/MUWIS-dataset/{folder}/features/{typ}/**features({tw}).csv')
            print(f'Found {len(files)} files in total for data type: {typ}')

            # Fail with a real exception before touching files[0]; raising a
            # plain string is itself a TypeError in Python 3.
            if not files:
                raise FileNotFoundError(
                    f'Error No Data Read: no {typ} feature files found for time window {tw}'
                )
            print(f'Reading {files[0]} to combine')

            # Read everything first and concatenate once instead of growing
            # the frame file by file.
            frames = []
            for file in files:
                df = pd.read_csv(file, low_memory=False)
                print(df.head(2))
                frames.append(df)
            data = pd.concat(frames, ignore_index=True)

            # Only these feature kinds get their feature columns namespaced.
            prefix_by_type = {'EEG': 'EEG', 'PUPIL': 'EYE'}
            if typ in prefix_by_type:
                # 'PID' is added explicitly so the rename stays correct even if
                # `info_cols` has been rebound by another cell.
                meta_cols = set(info_cols) | {'PID'}
                rename_cols = {
                    c: format_name(c, prefix_by_type[typ])
                    for c in data.columns
                    if c not in meta_cols
                }
                data = data.rename(columns=rename_cols)

            out_f = f'../data/MUWIS-dataset/{folder}/features/{typ}_features_combined{tw}.csv'
            data.to_csv(out_f, index=False)
            print('DONE', out_f)

Found 20 files in total for data type: EEG
Reading ../data/MUWIS-dataset/HEADSET/features/EEG/PA19_EEG_features(4-2).csv to combine
   n  hjorthActivity:_AF3  hjorthActivity:_F7  hjorthActivity:_F3  \
0  0            -7.341109          -17.207206            0.947003   
1  1            -5.987132          -21.940290            2.917019   

   hjorthActivity:_FC5  hjorthActivity:_T7  hjorthActivity:_P7  \
0            -66.47854          -11.591927           69.411316   
1            -68.65510          -10.981664           96.786028   

   hjorthActivity:_O1  hjorthActivity:_O2  hjorthActivity:_P8  ...  task  \
0           -6.279556           -4.327094          -42.704262  ...     1   
1           -4.344129           -5.145281          -22.924096  ...     1   

               name  durations                             start  \
0  information need     49.094  2023-09-01 09:52:52.962000+10:00   
1  information need     49.094  2023-09-01 09:52:52.962000+10:00   

                           

In [37]:
# Stack the per-participant session-level gaze feature CSVs into one file.
# NOTE: glob is not called with recursive=True, so '**' behaves like '*'.
files = glob.glob('../data/MUWIS-dataset/EYETRACKER/features/GAZE/**_gaze_features(session).csv')
print(f'Found {len(files)} files in total')

# Check for an empty match before indexing files[0] or using `data`, and raise
# a real exception (raising a plain string is a TypeError in Python 3).
if not files:
    raise FileNotFoundError('Error No Data Read: no session gaze feature files found')
print(f'Reading {files[0]} to combine')

# Read all files, then concatenate once with a fresh 0..n-1 index.
data = pd.concat(
    [pd.read_csv(file, low_memory=False) for file in files],
    ignore_index=True,
)

# index=False matches the per-time-window combine cell and avoids writing the
# meaningless row index as an extra 'Unnamed: 0' column.
data.to_csv('../data/MUWIS-dataset/EYETRACKER/features/GAZE_features_combined.csv', index=False)

# Group the features and prepare them for training

In [31]:
def convert(df, columns):
    """Select the rows with a known target label and encode that label.

    Relies on the module-level names `target_name` (label column),
    `target_set` (labels to keep) and `le4` (fitted label encoder).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `target_name` and every column in `columns`.
    columns : list
        Columns to keep; must include `target_name`, otherwise the encoding
        step raises KeyError.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) restricted to `columns`, with
        `target_name` replaced by its encoded value.
    """
    # Single .loc selection plus an explicit copy: assigning into the result
    # of chained indexing (df[mask][columns]) triggers SettingWithCopyWarning.
    X = df.loc[df[target_name].isin(target_set), columns].copy()
    print(X.shape)
    X[target_name] = le4.transform(X[target_name])
    print(X[target_name].value_counts())
    return X


def fill_nan_with_mean(df, feature_cols):
    """Fill missing feature values with the mean of the same 'PID' group.

    Each column in `feature_cols` is processed independently: within every
    'PID' group, NaNs are replaced by that group's mean for the column.
    `df` is modified in place and also returned.
    """
    def _fill_with_group_mean(values):
        return values.fillna(values.mean())

    for col in feature_cols:
        per_pid = df.groupby('PID')[col]
        df[col] = per_pid.transform(_fill_with_group_mean)
    return df
    
def read_data(file):
    """Load a combined feature CSV and average it per PID/task/stage/Topic/name.

    Steps, in order:
      1. read the CSV;
      2. drop the time-window columns ('tw', 'n') that are present, the timing
         columns in `remove_cols`, and any column whose name contains 'Unnamed';
      3. discard rows whose stage is a gap ('+1'..'+5') or 'Break';
      4. cast 'task' and 'Topic' to strings;
      5. fill missing feature values with the per-PID mean;
      6. average all remaining columns within each
         (PID, task, stage, Topic, name) group.

    A diagnostic is printed about the number of grouped rows per (PID, task).

    Parameters
    ----------
    file : str
        Path of the combined feature CSV.

    Returns
    -------
    pandas.DataFrame
        One row per (PID, task, stage, Topic, name) combination.

    Raises
    ------
    KeyError
        If any column in `remove_cols` is missing from the file.
    """
    gaps = ['+1', '+2', '+3', '+4', '+5']
    info_cols = ['task', 'stage', 'Topic', 'name']
    remove_cols = ['start', 'end', 'start_sec', 'end_sec', 'durations']
    time_window_cols = ['tw', 'n']
    id_col = ['PID']
    # Presumably one grouped row per stage of interest within a task
    # -- TODO confirm against the study protocol.
    expected_rows_per_task = 6

    # low_memory=False matches the combining cells and avoids chunk-wise dtype
    # inference (the mixed-dtype warning seen when reading these files).
    df = pd.read_csv(file, low_memory=False)

    # Drop whichever time-window columns exist, independently of each other,
    # so a file with only one of them neither crashes nor keeps it as a feature.
    present_tw_cols = [c for c in time_window_cols if c in df.columns]
    if present_tw_cols:
        df = df.drop(columns=present_tw_cols)
    df = df.drop(columns=remove_cols)

    # Leftover index columns from CSVs that were written with their index.
    empty_cols = [col for col in df.columns if 'Unnamed' in col]
    print('empty columns', empty_cols)
    if empty_cols:
        df = df.drop(columns=empty_cols)

    # Explicit copy: the columns assigned below would otherwise be written
    # into a filtered view and trigger SettingWithCopyWarning.
    df = df[~df.stage.isin(gaps + ['Break'])].copy()
    for c in ['task', 'Topic']:
        df[c] = df[c].apply(str)

    feature_cols = [c for c in df.columns if c not in id_col + info_cols]
    df = fill_nan_with_mean(df, feature_cols)

    df = df.groupby(id_col + info_cols).mean().reset_index()

    # Validate every (PID, task) count rather than their mean: a mean of 6 can
    # hide counts such as 5 and 7. The mean is still reported for diagnosis.
    rows_per_task = df[['PID', 'task']].value_counts()
    check = rows_per_task.mean()
    if len(rows_per_task) > 0 and (rows_per_task == expected_rows_per_task).all():
        print('valid data')
    else:
        print(check, 'Might have grouping error!', file)
    return df

In [44]:
import re
from sklearn.preprocessing import LabelEncoder

# Set to 1 to write the prepared training CSVs; any other value is a dry run.
EXPORT = 1

# NOTE: this rebinds `info_cols`, which the feature-combining cell earlier in
# the notebook defines with a longer list.
info_cols = ['task', 'stage', 'Topic', 'PID']
id_col = ['PID']
unwanted = ['tw', 'n', 'name', 'start', 'end', 'start_sec', 'end_sec', 'durations']

target_name = 'stage'
target_set = ['IN', 'QF', 'LISTEN', 'READ', 'TYPE', 'SPEAK']
le4 = LabelEncoder().fit(target_set)

# Substrings looked up in the file path to decide how a file is processed.
# Order matters: the first marker found in the path wins.
_KNOWN_KINDS = ['EEG', 'EDA', 'PUPIL', 'ACC', 'head motion', 'GAZE']

# Raw strings so that '\d' is a regex digit class, not a string escape.
_TW_PATTERN = re.compile(r'.*(\d-\d).*')
_EEG_CURVE_LENGTH_PATTERN = re.compile(r'EEG:curve_length_([A-Z]{1,2}\d)_0')
_EEG_STAT_PATTERN = re.compile(r'EEG:(?:(.*)_(?:([A-Z]{1,2}\d)))$')
_EEG_FREQ_PATTERN = re.compile(r'EEG:(?:(.*)_(?:[A-Z]{1,2}\d_[a-z]{4,}))$')
_EEG_STAT_NAMES = ['mean', 'std', 'skewness', 'kurtosis', 'npeaks',
                   'wavelet_entropy', 'curve_length', 'zero_crossings']
_EEG_FREQ_NAMES = ['bandPower_PSD_norm']


def _feature_columns(frame):
    """Return the columns of `frame` that are not metadata, id or unwanted columns."""
    excluded = info_cols + id_col + unwanted
    return [c for c in frame.columns.values if c not in excluded]


def _captured_name(pattern, column):
    """Return group 1 of `pattern` matched against `column`, or None if no match."""
    found = pattern.match(column)
    return found.group(1) if found else None


data_files = glob.glob('../data/MUWIS-dataset/features/**')
print(data_files)

# A work queue rather than a plain for-loop: the motion branch removes the
# partner file from the queue so each ACC/head-motion pair is processed once.
while data_files:
    file = data_files.pop(0)

    print('='*15)
    print(file)
    print('='*15)

    # Time window such as '4-2' taken from the path; '' when there is none.
    tw_match = _TW_PATTERN.match(file)
    tw = tw_match.group(1) if tw_match else ''

    # Skip files of an unknown kind up front. Otherwise `features_cols` and
    # `outf` would be undefined, or left over from the previous file, and the
    # export below could overwrite the previous output.
    kind = next((k for k in _KNOWN_KINDS if k in file), None)
    if kind is None:
        print('SKIP!! unrecognised feature file', file)
        continue

    df = read_data(file)

    if kind == 'EEG':
        # Strip the trailing '_0' from 'EEG:curve_length_<channel>_0' columns.
        df.rename(columns={c: c[:-2] for c in df.columns.values
                           if _EEG_CURVE_LENGTH_PATTERN.match(c)}, inplace=True)
        features_cols = _feature_columns(df)
        print('Extracted', len(features_cols), 'features in total', features_cols[0])

        gf = [f for f in features_cols if 'inter' in f or 'intra' in f]
        print(len(gf), 'group EEG features')
        sf = [f for f in features_cols
              if _captured_name(_EEG_STAT_PATTERN, f) in _EEG_STAT_NAMES]
        print(len(sf), 'statistic EEG features')
        ff = [f for f in features_cols
              if _captured_name(_EEG_FREQ_PATTERN, f) in _EEG_FREQ_NAMES]
        print(len(ff), 'frequency EEG features')
        selected = gf + sf + ff
        print(len(selected), 'EEG features')

        features_cols = selected

        outf = f'trainning data/EEG(6 {tw}).csv'

    elif kind in ('EDA', 'PUPIL', 'GAZE'):
        # These kinds keep every feature column and differ only in file name.
        features_cols = _feature_columns(df)
        print('Extracted', len(features_cols), 'features in total', features_cols[0])
        print(features_cols)
        outf = f'trainning data/{kind}(6 {tw}).csv'

    else:  # kind is 'ACC' or 'head motion': merged into one MOTION file
        if kind == 'ACC':
            f2 = file.replace('ACC', 'head motion')
        else:
            f2 = file.replace('head motion', 'ACC')

        if f2 in data_files:
            data_files.remove(f2)
            print('to combine with', f2)
            df2 = read_data(f2)
            print('before merge:', df.shape, df2.shape)
        else:
            print('ERROR!!', f2, 'not exist')
            continue

        # Give both sources distinct prefixes before merging:
        # 'E4:' -> 'MOTION:WR_' and 'MOTION:' -> 'MOTION:HD_'.
        rename = {}
        for c in list(df.columns) + list(df2.columns):
            if 'E4' in c:
                rename[c] = c.replace('E4:', 'MOTION:WR_')
            elif 'MOTION' in c:
                rename[c] = c.replace('MOTION:', 'MOTION:HD_')
        df.rename(columns=rename, inplace=True)
        df2.rename(columns=rename, inplace=True)

        df = pd.merge(left=df, right=df2, on=['PID', 'task', 'stage', 'Topic', 'name'], how='inner')
        print('after merge:', df.shape)

        features_cols = _feature_columns(df)
        print('Extracted', len(features_cols), 'features in total', features_cols[0])
        print(features_cols)
        outf = f'trainning data/MOTION(6 {tw}).csv'

    X = convert(df, features_cols + info_cols)
    if EXPORT == 1:
        X.to_csv(outf, index=False)
        print('DONE!!', outf)
        print('!'*15)

['../data/MUWIS-dataset/features/GAZE_features_combined.csv', '../data/MUWIS-dataset/features/EEG_features_combined2-1.csv', '../data/MUWIS-dataset/features/EEG_features_combined1-0.csv', '../data/MUWIS-dataset/features/EEG_features_combined4-2.csv', '../data/MUWIS-dataset/features/GAZE_features_combined4-2.csv', '../data/MUWIS-dataset/features/GAZE_features_combined1-0.csv', '../data/MUWIS-dataset/features/GAZE_features_combined2-1.csv', '../data/MUWIS-dataset/features/PUPIL_features_combined1-0.csv', '../data/MUWIS-dataset/features/EDA_features_combined1-0.csv', '../data/MUWIS-dataset/features/PUPIL_features_combined4-2.csv', '../data/MUWIS-dataset/features/EDA_features_combined4-2.csv', '../data/MUWIS-dataset/features/EDA_features_combined2-1.csv', '../data/MUWIS-dataset/features/PUPIL_features_combined2-1.csv', '../data/MUWIS-dataset/features/ACC_features_combined2-1.csv', '../data/MUWIS-dataset/features/head motion_features_combined4-2.csv', '../data/MUWIS-dataset/features/head mo

  df = pd.read_csv(file)


empty columns []
5.686274509803922 Might have grouping error! ../data/MUWIS-dataset/features/EEG_features_combined2-1.csv
Extracted 460 features in total EEG:hjorthActivity_AF3
12 group EEG features
112 statistic EEG features
56 frequency EEG features
180 EEG features
(940, 184)
stage
0    235
2    235
5    119
3    118
1    117
4    116
Name: count, dtype: int64
DONE!! trainning data/EEG(6 2-1).csv
!!!!!!!!!!!!!!!
../data/MUWIS-dataset/features/EEG_features_combined1-0.csv


  df = pd.read_csv(file)


empty columns []
5.686274509803922 Might have grouping error! ../data/MUWIS-dataset/features/EEG_features_combined1-0.csv
Extracted 460 features in total EEG:hjorthActivity_AF3
12 group EEG features
112 statistic EEG features
56 frequency EEG features
180 EEG features
(940, 184)
stage
0    235
2    235
5    119
3    118
1    117
4    116
Name: count, dtype: int64
DONE!! trainning data/EEG(6 1-0).csv
!!!!!!!!!!!!!!!
../data/MUWIS-dataset/features/EEG_features_combined4-2.csv
empty columns []
5.666666666666667 Might have grouping error! ../data/MUWIS-dataset/features/EEG_features_combined4-2.csv
Extracted 460 features in total EEG:hjorthActivity_AF3
12 group EEG features
112 statistic EEG features
56 frequency EEG features
180 EEG features
(935, 184)
stage
0    235
2    235
5    119
3    118
1    117
4    111
Name: count, dtype: int64
DONE!! trainning data/EEG(6 4-2).csv
!!!!!!!!!!!!!!!
../data/MUWIS-dataset/features/GAZE_features_combined4-2.csv
empty columns []
5.397379912663755 Might 