In [1]:
import os
import math
import numpy as np
import shutil
import pandas as pd
import matplotlib.pyplot as plt
from BCI2kReader import BCI2kReader


# Root folders used by the cells below (all relative to the working directory).
# NOTE: the raw-dataset folder name is spelled exactly as the directory on disk;
# it is a runtime path, so do not "correct" the spelling here.
directory_dataset = "datasets/splited dataset autiism"  # raw .dat recordings, read by convert_to_csv
directory_csv = "datasets/csv"  # per-recording CSV files written by convert_to_csv
directory_segmented = "datasets/segmented"  # prefix of the "<prefix>_<N> seconds" segment folders
directory_grouped = "datasets/grouped_csv"  # CSV files regrouped per participant by group_by_name


In [2]:
# Load one recording to inspect its parameter dictionary and signal dimensions.
sample_path = "datasets/splited dataset autiism/autism/Mada_Autism_26_5_2011S001R02.dat"
reader = BCI2kReader.BCI2kReader(sample_path)
data = reader.signals
header = reader.parameters

# Show the acquisition parameters first, then the shape of the signal array.
for summary in (header, data.shape):
    print(summary)

{'SourceCh': 16, 'SampleBlockSize': 8, 'SamplingRate': 256, 'ChannelNames': ['Fp1', 'F3', 'F7', 'T3', 'T5', 'O1', 'C4', 'FP2', 'Fz', 'F4', 'F8', 'C3', 'Cz', 'Pz', 'Oz', 'O2'], 'SourceChOffset': (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), 'SourceChGain': (0.033, 0.033, 0.033, 0.033, 0.033, 0.033, 0.033, 0.033, 0.033, 0.033, 0.033, 0.033, 0.033, 0.033, 0.033, 0.033), 'DataDirectory': '..\\data', 'SubjectName': 'Mada_Autism_26_5_2011', 'SubjectSession': '001', 'SubjectRun': '02', 'ID_System': '', 'ID_Amp': '', 'ID_Montage': '', 'VisualizeTiming': 1, 'VisualizeSource': 1, 'VisualizeSourceDecimation': 1, 'VisualizeSourceTime': 2000, 'SourceMin': -100, 'SourceMax': 100, 'SourceChList': [], 'SourceChDevices': (16,), 'NumBuffers': 2, 'DeviceIDMaster': 'auto', 'FilterEnabled': 1, 'FilterHighPass': 0.1, 'FilterLowPass': 60.0, 'FilterModelOrder': 8, 'FilterType': 1, 'NotchEnabled': 1, 'NotchHighPass': 58.0, 'NotchLowPass': 62.0, 'NotchModelOrder': 4, 'NotchTy

## Convert to CSV

In [3]:
def convert_to_csv(sourcedir: str, desdir: str):
    """Convert every ``.dat`` recording below ``sourcedir`` into a CSV file.

    ``sourcedir`` is scanned one level deep: each sub-folder is processed on
    its own, and every ``.dat`` file inside it is written to
    ``desdir/<sub-folder>/<same name>.csv`` with one column per channel name.
    The parameters of all recordings in a sub-folder are collected into
    ``<sub-folder>_metadata_.csv``.

    Args:
        sourcedir: Folder containing one sub-folder of ``.dat`` files per group.
        desdir: Folder that receives the CSV files (created on demand).

    Notes:
        The metadata CSV is written to the current working directory, not to
        ``desdir``; this location is kept unchanged for existing consumers.
    """
    # Sorted iteration keeps the metadata row order reproducible across platforms.
    for directory in sorted(os.listdir(sourcedir)):
        class_dir = os.path.join(sourcedir, directory)
        if not os.path.isdir(class_dir):
            # Stray files next to the group folders are not recordings.
            continue

        des_path = os.path.join(desdir, directory)
        metadata_frames = []
        for files in sorted(os.listdir(class_dir)):
            stem, extension = os.path.splitext(files)
            if extension.lower() != ".dat":
                # Skip anything the reader cannot parse (e.g. OS metadata files).
                continue

            source_path = os.path.join(class_dir, files)
            print(source_path)

            # The context manager closes the underlying file handle even if
            # reading or writing fails part-way through.
            with BCI2kReader.BCI2kReader(source_path) as reader:
                # Data read
                data_np = reader.signals
                parameters = reader.parameters
                df = pd.DataFrame(columns=parameters["ChannelNames"], data=data_np.T)

                # Metadata
                df_m = pd.DataFrame(list(parameters.values())).T
                df_m.columns = list(parameters.keys())

            metadata_frames.append(df_m)
            print(data_np.shape)

            os.makedirs(des_path, exist_ok=True)
            # Only the extension is swapped; a ".dat" elsewhere in the name is kept.
            df.to_csv(os.path.join(des_path, stem + ".csv"), index=False)

        # Concatenate once instead of growing a DataFrame inside the loop.
        metadata_df = pd.concat(metadata_frames) if metadata_frames else pd.DataFrame()
        metadata_df.to_csv(f"{directory}_metadata_.csv", index=False)

In [4]:
# Convert the raw recordings to CSV; this also writes one "<folder>_metadata_.csv"
# per sub-folder of the dataset directory.
convert_to_csv(directory_dataset, directory_csv)

datasets/splited dataset autiism\autism\Bader_Autism_24_11_2011S001R01.dat
(16, 47088)
datasets/splited dataset autiism\autism\Bader_Autism_24_11_2011S001R05.dat
(16, 12344)
datasets/splited dataset autiism\autism\Bader_Autism_24_11_2011S001R07.dat
(16, 17232)
datasets/splited dataset autiism\autism\Bader_Autism_24_11_2011S001R08.dat
(16, 18096)
datasets/splited dataset autiism\autism\Bader_Autism_24_11_2011S001R09.dat
(16, 212144)
datasets/splited dataset autiism\autism\Bader_Autism_24_11_2011S001R10.dat
(16, 88024)
datasets/splited dataset autiism\autism\Deena_Autism_23_4_2011S001R01.dat
(16, 44896)
datasets/splited dataset autiism\autism\Duaa_Autism_23_4_2011S001R01.dat
(16, 107744)
datasets/splited dataset autiism\autism\Mada_Autism_26_5_2011S001R01.dat
(16, 221624)
datasets/splited dataset autiism\autism\Mada_Autism_26_5_2011S001R02.dat
(16, 9216)
datasets/splited dataset autiism\autism\Mohammed_Autism_9_11_2011S001R01.dat
(16, 283440)
datasets/splited dataset autiism\autism\Nour_

## Group data by participant name

In [4]:
def group_by_name(sourcedir: str, list_name: list, desdir: str):
    """Copy the files in ``sourcedir`` into one sub-folder of ``desdir`` per name.

    A file is copied to ``desdir/<name>/`` for every ``name`` in ``list_name``
    that occurs as a substring of the file name (case-sensitive). Files that
    match no name are left alone; a file matching several names is copied to
    each of them.

    Args:
        sourcedir: Folder holding the files to distribute.
        list_name: Names used both as match patterns and as sub-folder names.
        desdir: Destination root; it and the per-name folders are created.
    """
    # Create the folder layout once up front instead of re-checking it for
    # every (file, name) pair.
    os.makedirs(desdir, exist_ok=True)
    for name in list_name:
        os.makedirs(os.path.join(desdir, name), exist_ok=True)

    for files in os.listdir(sourcedir):
        source_path = os.path.join(sourcedir, files)
        if not os.path.isfile(source_path):
            # shutil.copy cannot copy directories; ignore them.
            continue
        for name in list_name:
            if name in files:
                shutil.copy(source_path, os.path.join(desdir, name, files))

In [16]:
# Participant names per group; each name is matched against the CSV file names.
list_autism = ["Bader", "Deena", "Duaa", "Mada", "Mohammed", "Nour", "Omar", "Saud", "Shahad", "Yahia", "Zyad"]
list_normal = ["Amer", "Dhelal", "Mahmud", "Majdolin", "Omran"]

# Regroup the converted CSV files of each group into one folder per participant.
for group_label, participants in (("autism", list_autism), ("normal", list_normal)):
    group_by_name(
        os.path.join(directory_csv, group_label),
        participants,
        os.path.join(directory_grouped, group_label),
    )

## Segment data into overlapping windows

In [5]:
class SegmentLengthError(Exception):
    """Raised when a segment is longer than the data it should be cut from."""

class MaxOverlapError(Exception):
    """Raised when neighbouring segments would overlap by more than 50%."""

def find_segments(num_segments, segment_length, data_length):
    """Compute evenly spaced, equally long segment boundaries.

    Segment ``i`` starts at ``i * step`` where
    ``step = (data_length - segment_length) // (num_segments - 1)``, so
    neighbouring segments overlap whenever ``step < segment_length``.

    Args:
        num_segments: Number of segments requested.
        segment_length: Length of every segment, in samples.
        data_length: Total number of samples available.

    Returns:
        A list of ``(start_index, end_index)`` tuples; ``end_index`` is
        inclusive. Because ``step`` is rounded down, the last segment may end
        before the final sample.

    Raises:
        SegmentLengthError: If ``segment_length`` exceeds ``data_length``.
        MaxOverlapError: If ``step`` is smaller than half of ``segment_length``.
    """
    if segment_length > data_length:
        raise SegmentLengthError("Segment length cannot be greater than data length.")

    # A single segment has no neighbour to overlap with. Without this guard the
    # step below would be 0 and the overlap check would reject a valid request.
    if num_segments == 1:
        return [(0, segment_length - 1)]

    max_overlap = segment_length // 2  # Maximum allowed overlap is 50% of segment length
    segments = []
    step = (data_length - segment_length) // (num_segments - 1) if num_segments > 1 else 0
    # Check if the calculated step leads to more than 50% overlap
    if step < max_overlap:
        raise MaxOverlapError("The overlap between segments exceeds 50% of the segment length.")

    last_index = data_length - 1
    for i in range(num_segments):
        start_index = i * step
        end_index = start_index + segment_length - 1

        # Defensive clamp: never let a segment run past the end of the data.
        if end_index >= data_length:
            end_index = last_index
            start_index = end_index - segment_length + 1
        segments.append((start_index, end_index))

        # Stop as soon as the data is exhausted.
        if end_index == last_index:
            break

    return segments

In [6]:
# Sanity-check find_segments on a single recording using 256-sample segments.
test_segment_length = 256 * 1

df_test = pd.read_csv("datasets/grouped_csv/autism/Bader/Bader_Autism_24_11_2011S001R01.csv")
test_row_count = df_test.shape[0]
test_seg_num = math.ceil(test_row_count / test_segment_length)
test_seg_index = find_segments(test_seg_num, test_segment_length, test_row_count)

# Show the computed boundaries, then how many segments were produced.
for summary in (test_seg_index, len(test_seg_index)):
    print(summary)

[(0, 255), (255, 510), (510, 765), (765, 1020), (1020, 1275), (1275, 1530), (1530, 1785), (1785, 2040), (2040, 2295), (2295, 2550), (2550, 2805), (2805, 3060), (3060, 3315), (3315, 3570), (3570, 3825), (3825, 4080), (4080, 4335), (4335, 4590), (4590, 4845), (4845, 5100), (5100, 5355), (5355, 5610), (5610, 5865), (5865, 6120), (6120, 6375), (6375, 6630), (6630, 6885), (6885, 7140), (7140, 7395), (7395, 7650), (7650, 7905), (7905, 8160), (8160, 8415), (8415, 8670), (8670, 8925), (8925, 9180), (9180, 9435), (9435, 9690), (9690, 9945), (9945, 10200), (10200, 10455), (10455, 10710), (10710, 10965), (10965, 11220), (11220, 11475), (11475, 11730), (11730, 11985), (11985, 12240), (12240, 12495), (12495, 12750), (12750, 13005), (13005, 13260), (13260, 13515), (13515, 13770), (13770, 14025), (14025, 14280), (14280, 14535), (14535, 14790), (14790, 15045), (15045, 15300), (15300, 15555), (15555, 15810), (15810, 16065), (16065, 16320), (16320, 16575), (16575, 16830), (16830, 17085), (17085, 17340),

In [7]:
# Display the rows of segment 183; the end index is inclusive, hence the +1.
seg_start, seg_end = test_seg_index[183]
df_test.iloc[seg_start:seg_end + 1]

Unnamed: 0,Fp1,F3,F7,T3,T5,O1,C4,FP2,Fz,F4,F8,C3,Cz,Pz,Oz,O2
46665,8.382,16.532999,36.135,20.328,24.453000,21.549000,29.039999,19.800,32.670,30.789,66.462,18.942,31.020,37.686,39.699000,17.754000
46666,3.135,14.223000,32.934,18.018,20.889000,19.932000,25.476000,18.447,28.281,31.416,58.674,13.068,25.476,33.363,33.990000,16.730999
46667,-1.683,10.395000,26.136,17.688,15.873000,16.764000,22.737000,15.807,24.156,30.228,54.285,6.435,20.196,28.413,29.997000,16.104000
46668,-3.828,6.336000,17.655,17.094,12.573000,16.598999,22.968000,13.596,22.209,30.228,50.721,1.683,18.084,26.895,31.581000,19.272000
46669,-5.709,1.716000,9.999,11.715,9.306000,20.723999,24.387000,9.702,20.295,30.591,46.332,-2.343,16.302,28.281,35.640000,24.882000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46916,-31.350,-36.300000,-18.249,-5.412,-23.001000,-30.195000,-19.536000,-42.636,-57.255,-31.218,-22.176,-30.888,-43.296,-23.760,-19.536000,-8.910000
46917,-34.848,-39.765000,-21.285,-9.669,-24.420000,-31.284000,-21.912000,-37.884,-58.905,-27.918,-19.569,-34.089,-46.596,-27.291,-20.657999,-13.365000
46918,-38.841,-45.507000,-26.235,-16.170,-27.159000,-34.188000,-27.390000,-39.600,-65.505,-26.070,-22.671,-40.920,-54.318,-31.383,-25.443000,-18.150000
46919,-41.481,-50.325000,-32.340,-24.618,-29.171999,-34.221000,-32.307000,-46.332,-71.115,-26.202,-24.057,-44.715,-59.631,-31.581,-27.126000,-19.437000


In [8]:
def segments_with_overlap(sourcedir: str, desdir: str, sampling_freq: int, seg_time: int, excluded_name=None):
    """Cut each participant's recordings into fixed-length CSV segments.

    For every sub-folder of ``sourcedir`` all CSV files are read, concatenated
    in file-name order and split into windows of ``sampling_freq * seg_time``
    rows using ``find_segments``. Each window is written to
    ``desdir/<sub-folder>/segment_<k>.csv`` with ``k`` starting at 1.

    Args:
        sourcedir: Folder with one sub-folder of CSV files per participant.
        desdir: Output root; one sub-folder per participant is created.
        sampling_freq: Number of rows per second of signal.
        seg_time: Segment duration in seconds.
        excluded_name: Optional collection of sub-folder names to skip.

    Notes:
        Recordings are concatenated before segmentation, so a segment can
        span the boundary between two recordings.
    """
    # ``None`` replaces the former mutable ``[]`` default; behaviour is the same.
    excluded = () if excluded_name is None else excluded_name
    segment_length = sampling_freq * seg_time

    # Sorted iteration makes the concatenation order, and therefore the
    # segment contents, reproducible across platforms.
    for name in sorted(os.listdir(sourcedir)):
        if name in excluded:
            continue
        participant_dir = os.path.join(sourcedir, name)
        if not os.path.isdir(participant_dir):
            continue

        frames = []
        for files in sorted(os.listdir(participant_dir)):
            file_path = os.path.join(participant_dir, files)
            print(file_path)
            frames.append(pd.read_csv(file_path))
        # Concatenate once instead of growing a DataFrame inside the loop.
        merged_df = pd.concat(frames) if frames else pd.DataFrame()
        print(merged_df.shape)

        n_samples = merged_df.shape[0]
        if n_samples < segment_length:
            # Not enough data for even one segment: report it and carry on
            # with the remaining participants instead of aborting the batch.
            print(f"Skipping {name}: {n_samples} samples is fewer than one segment ({segment_length}).")
            continue

        seg_num = math.ceil(n_samples / segment_length)
        try:
            seg_index = find_segments(seg_num, segment_length, n_samples)
        except MaxOverlapError:
            # Too much overlap: retry with one segment fewer.
            seg_num = seg_num - 1
            if seg_num <= 1:
                # Only a single window fits; it cannot overlap anything.
                seg_index = [(0, segment_length - 1)]
            else:
                seg_index = find_segments(seg_num, segment_length, n_samples)

        des_path = os.path.join(desdir, name)
        os.makedirs(des_path, exist_ok=True)
        for i, (start, end) in enumerate(seg_index):
            # ``end`` is inclusive, hence the +1 for the positional slice.
            merged_df.iloc[start:end + 1].to_csv(os.path.join(des_path, f"segment_{i+1}.csv"), index=False)

In [None]:
SAMPLING_FREQ = 256 # Hz
SEGMENT_TIME = 1 # s

# Segment every group folder into "<directory_segmented>_<SEGMENT_TIME> seconds/<group>".
segmented_root = f"{directory_segmented}_{SEGMENT_TIME} seconds"
for folder in os.listdir(directory_grouped):
    grouped_folder = os.path.join(directory_grouped, folder)
    segmented_folder = os.path.join(segmented_root, folder)
    segments_with_overlap(grouped_folder, segmented_folder, SAMPLING_FREQ, SEGMENT_TIME)

In [6]:
# Participant folders left out of the segment counts computed below.
# NOTE(review): the reason for excluding "Zyad" is not recorded in this notebook — confirm and document it.
excluded_name = ["Zyad"]

In [7]:
# Segment durations (in seconds) for which a segmented output folder is expected to exist.
segment_times = [1, 10, 20, 30, 50, 60, 90]

for segment in segment_times:
    segment_root = f"{directory_segmented}_{segment} seconds"
    for folder in os.listdir(segment_root):
        group_dir = os.path.join(segment_root, folder)
        # Total number of segment files over all participants that are not excluded.
        count = sum(
            len(os.listdir(os.path.join(group_dir, name)))
            for name in os.listdir(group_dir)
            if name not in excluded_name
        )
        print(f"{segment} second(s), Segments for {folder}: ", count)
    print("\n")
    

# for folder in os.listdir(f"{directory_segmented}_{SEGMENT_TIME} seconds"):
#     count = 0
#     for name in os.listdir(os.path.join(f"{directory_segmented}_{SEGMENT_TIME} seconds", folder)):
#         if name in excluded_name:
#             continue
#         count = count + len(os.listdir(os.path.join(f"{directory_segmented}_{SEGMENT_TIME} seconds", folder, name))) 
#     print(f"{SEGMENT_TIME} second(s), Segments for {folder}: ", count)

1 second(s), Segments for autism:  10579
1 second(s), Segments for normal:  4855


10 second(s), Segments for autism:  1064
10 second(s), Segments for normal:  488


20 second(s), Segments for autism:  535
20 second(s), Segments for normal:  245


30 second(s), Segments for autism:  358
30 second(s), Segments for normal:  164


50 second(s), Segments for autism:  217
50 second(s), Segments for normal:  101


60 second(s), Segments for autism:  181
60 second(s), Segments for normal:  83


90 second(s), Segments for autism:  122
90 second(s), Segments for normal:  57




In [None]:
# for folder in os.listdir(f"{directory_segmented}_{SEGMENT_TIME} seconds"):
#     count = 0
#     for name in os.listdir(os.path.join(f"{directory_segmented}_{SEGMENT_TIME} seconds", folder)):
#         count = count + len(os.listdir(os.path.join(f"{directory_segmented}_{SEGMENT_TIME} seconds", folder, name))) 
#     print(f"{SEGMENT_TIME} second(s), Segments for {folder}: ", count)