In [4]:
import os

# Move two directory levels up from the notebook's directory; presumably this
# is the project root that the relative 'datasets/...' paths and `src` imports
# in later cells expect.
# The starting directory is remembered the first time this cell runs, so
# re-running the cell does not climb two more levels on every execution.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, '..', '..'))
print("Current working directory:", os.getcwd())

Current working directory: /dmj/fizmed/kchorzela/licencjat/eeg-site-effects


In [5]:
from joblib import Parallel, delayed
import pandas as pd
from tqdm import tqdm

from src.utils.utils import get_feat_names
from src.data.feature_extraction import map_edf_to_samples_with_idx

In [6]:
# Relative path to the filtered ELM19 recording-info CSV.
# NOTE(review): this variable is not referenced by the cells visible here; the
# load cell reads 'datasets/ELM19/ELM19_enriched_info_filtered.csv' (without the
# 'filtered/' directory) as a literal — confirm which path is intended.
ELM19_info_filtered_path = 'datasets/ELM19/filtered/ELM19_enriched_info_filtered.csv'

In [7]:
# Load the recording metadata and show its schema before filtering.
df_info = pd.read_csv('datasets/ELM19/ELM19_enriched_info_filtered.csv')
df_info.info()

# Keep only recordings whose `original_sfreq` is within 0.1 of 250, then
# renumber the rows 0..n-1 so the index is contiguous again.
sfreq_offset = (df_info['original_sfreq'] - 250).abs()
df_info = df_info.loc[sfreq_offset <= 0.1].reset_index(drop=True)
df_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54779 entries, 0 to 54778
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   examination_id       54779 non-null  object 
 1   patient_original_id  54779 non-null  object 
 2   age_dec              54779 non-null  float64
 3   patient_sex          54779 non-null  object 
 4   institution_id       54779 non-null  object 
 5   classification       54779 non-null  object 
 6   original_sfreq       54779 non-null  float64
 7   n_channels_original  54779 non-null  int64  
 8   highpass_filter_acq  54779 non-null  float64
 9   lowpass_filter_acq   54779 non-null  float64
 10  n_samples            54779 non-null  int64  
 11  duration_sec         54779 non-null  float64
 12  recording_date       54779 non-null  object 
dtypes: float64(5), int64(2), object(6)
memory usage: 5.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47310 entries, 0 to 47309
Dat

In [4]:
import logging

# Configure the root logger to emit WARNING and above.
# Note: logging.basicConfig does nothing if the root logger already has
# handlers configured (unless force=True is passed).
logging.basicConfig(level=logging.WARNING)

In [None]:
# Extract features for every recording listed in df_info, in parallel.
# Each task receives the examination id, the institution id, the row index,
# the directory holding the EDF files and the dataset name.
# NOTE(review): assumes each result is indexable as (idx, features, ...) where
# idx is the row index passed in and features is None when extraction failed —
# confirm against map_edf_to_samples_with_idx.
num_workers = 10
results = Parallel(n_jobs=num_workers)(
    delayed(map_edf_to_samples_with_idx)(df_info.examination_id.iloc[idx],
                                         df_info.institution_id.iloc[idx],
                                         idx,
                                         "datasets/ELM19/raw/ELM19/ELM19_edfs",
                                         "ELM19")
    for idx in tqdm(df_info.index)
)

results.sort(key=lambda x: x[0])

# Keep one feature row per recording. Dropping failed recordings would shift
# every later row, and downstream cells select features by df_info's index
# labels, so failures are kept as all-NaN rows instead of being removed.
succeeded = [x for x in results if x[1] is not None]
n_failed = len(results) - len(succeeded)
if n_failed:
    print(f"Feature extraction returned no features for {n_failed} of "
          f"{len(results)} recordings; their rows are kept as NaN.")

df_feats = pd.DataFrame(
    [x[1] for x in succeeded],
    index=[x[0] for x in succeeded],
    columns=get_feat_names(),
).reindex(df_info.index)
# index=False: rows are identified by position, which matches df_info's
# 0..n-1 index after the reindex above.
df_feats.to_csv("datasets/ELM19/processed/ELM19_features_250", index=False)

# Dataset preparation

In [None]:
# Reload the extracted features from disk (the file is written without a .csv
# extension by the extraction cell, so the same extension-less name is used).
df_feats = pd.read_csv("datasets/ELM19/processed/ELM19_features_250")

In [11]:
# Summarise the feature table (row count, column count, dtypes, memory).
df_feats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47310 entries, 0 to 47309
Columns: 2850 entries, coh_Fp2_Fp1_0_2 to cov_O2_O2
dtypes: float64(2850)
memory usage: 1.0 GB


In [12]:
# Summarise the metadata table so its row count can be compared with df_feats.
df_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47310 entries, 0 to 47309
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   examination_id       47310 non-null  object 
 1   patient_original_id  47310 non-null  object 
 2   age_dec              47310 non-null  float64
 3   patient_sex          47310 non-null  object 
 4   institution_id       47310 non-null  object 
 5   classification       47310 non-null  object 
 6   original_sfreq       47310 non-null  float64
 7   n_channels_original  47310 non-null  int64  
 8   highpass_filter_acq  47310 non-null  float64
 9   lowpass_filter_acq   47310 non-null  float64
 10  n_samples            47310 non-null  int64  
 11  duration_sec         47310 non-null  float64
 12  recording_date       47310 non-null  object 
dtypes: float64(5), int64(2), object(6)
memory usage: 4.7+ MB


In [16]:
# Take the metadata rows whose classification is exactly 'norm' (as an
# independent copy, original index labels retained) and save them to CSV.
is_norm = df_info['classification'].eq('norm')
df_info_norm = df_info.loc[is_norm].copy()
df_info_norm.to_csv('datasets/ELM19/filtered/ELM19_info_filtered_norm_250.csv', index=False)

In [17]:
# Select the feature rows belonging to the 'norm' recordings and save them.
# The selection uses df_info_norm's index labels, which is only correct when
# df_feats and df_info are row-aligned (same index, same order). Fail loudly
# if they are not, rather than writing features paired with the wrong metadata.
assert df_feats.index.equals(df_info.index), (
    f"df_feats ({len(df_feats)} rows) and df_info ({len(df_info)} rows) are not "
    "row-aligned; cannot select features by df_info's index"
)
df_feats_norm = df_feats.loc[df_info_norm.index]
df_feats_norm.to_csv("datasets/ELM19/processed/ELM19_features_norm_250.csv", index=False)

In [20]:
# Summarise the 'norm' feature subset (row count, column count, dtypes, memory).
df_feats_norm.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22732 entries, 0 to 47306
Columns: 2850 entries, coh_Fp2_Fp1_0_2 to cov_O2_O2
dtypes: float64(2850)
memory usage: 494.5 MB


In [19]:
# Summarise the 'norm' metadata subset; its row count should match df_feats_norm.
df_info_norm.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22732 entries, 0 to 47306
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   examination_id       22732 non-null  object 
 1   patient_original_id  22732 non-null  object 
 2   age_dec              22732 non-null  float64
 3   patient_sex          22732 non-null  object 
 4   institution_id       22732 non-null  object 
 5   classification       22732 non-null  object 
 6   original_sfreq       22732 non-null  float64
 7   n_channels_original  22732 non-null  int64  
 8   highpass_filter_acq  22732 non-null  float64
 9   lowpass_filter_acq   22732 non-null  float64
 10  n_samples            22732 non-null  int64  
 11  duration_sec         22732 non-null  float64
 12  recording_date       22732 non-null  object 
dtypes: float64(5), int64(2), object(6)
memory usage: 2.4+ MB
