# Explore 'old' dataset

### Import packages

In [1]:
# Import packages
import os
import sys

# Put the parent of the current working directory first on the module search
# path. NOTE(review): presumably so project-local modules one level above the
# notebook folder can be imported -- confirm against the repository layout.
sys.path.insert(0, os.path.dirname(os.getcwd()))

import numpy as np
import h5py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mne

# Apply seaborn's "whitegrid" theme to all figures and render them inline.
sns.set_theme(style="whitegrid")
%matplotlib inline
# %matplotlib qt

In [2]:
import fnmatch
import warnings
import re
# NOTE(review): this installs a catch-all "ignore" filter, so every warning
# (including deprecation and runtime warnings) is hidden from here on.
warnings.filterwarnings('ignore')

# Project-local module. NOTE(review): presumably resolved through the sys.path
# entry added in the first cell -- confirm.
import helper_functions

In [3]:
#from config_for_repro_prepro  import PATH_RAW_DATA, PATH_METADATA, PATH_DATA_PROCESSED_ML

### Now let's get all our old cnt files together

In [4]:
# Folder that holds one sub-folder per age group.
# NOTE(review): absolute, machine-specific path -- consider reading it from a
# shared config module so the notebook runs on other machines.
DATA_ROOT = 'C:/Projects/EEG_explorer/Data'

# Age-group key -> name of the sub-folder with that group's recordings
# (keys are presumably ages in months -- confirm).
dir_names = {11: '11mnd mmn',
             17: '17mnd mmn',
             23: '23mnd mmn',
             29: '29mnd mmn',
             35: '35mnd mmn',
             41: '41mnd mmn',
             47: '47mnd mmn'}

df_list = []

for age_group, directory in dir_names.items():  # Go into every age group folder
    dir_path = os.path.join(DATA_ROOT, directory)

    # One (code, full path, file name without extension) row per .cnt file.
    records = []
    for cnt_name in fnmatch.filter(os.listdir(dir_path), "*.cnt"):
        cnt_path = os.path.join(dir_path, cnt_name)
        cnt_stem = os.path.splitext(cnt_name)[0]

        # The code is the first run of digits in the file name. Fail with a
        # message that names the file instead of an AttributeError on None.
        code_match = re.search(r'\d+', cnt_stem)
        if code_match is None:
            raise ValueError(f"Cannot extract a numeric code from file name: {cnt_path}")

        records.append((int(code_match.group()), cnt_path, cnt_stem))

    df = pd.DataFrame(records, columns=['code', 'cnt_path', 'cnt_file'])
    df['age_group'] = age_group
    df_list.append(df)

# Stack the per-folder tables; each keeps its own 0-based index.
cnt_files = pd.concat(df_list)

In [5]:
# Show the combined table of .cnt recordings (one row per file, all age groups).
cnt_files

Unnamed: 0,code,cnt_path,cnt_file,age_group
0,1,C:/Projects/EEG_explorer/Data\11mnd mmn\001_11...,001_11_jc_mmn36_wk_mmn25_wk_mmn47_wk_mmn58_wk,11
1,3,C:/Projects/EEG_explorer/Data\11mnd mmn\003_11...,003_11_jc_mmn36_slp_mmn25_wk_mmn47_slp_mmn58_slp,11
2,4,C:/Projects/EEG_explorer/Data\11mnd mmn\004_11...,004_11_mc_mmn,11
3,5,C:/Projects/EEG_explorer/Data\11mnd mmn\005_11...,005_11_jc_mmn2,11
4,7,C:/Projects/EEG_explorer/Data\11mnd mmn\007_11...,007_11_jc_mmn2_36_wk,11
...,...,...,...,...
39,704,C:/Projects/EEG_explorer/Data\47mnd mmn\704-03...,704-032-47m-jr-mmn36,47
40,705,C:/Projects/EEG_explorer/Data\47mnd mmn\705-05...,705-050-47m-jr-mmn36,47
41,709,C:/Projects/EEG_explorer/Data\47mnd mmn\709-07...,709-078-47m-jr-mmn36,47
42,710,C:/Projects/EEG_explorer/Data\47mnd mmn\710-07...,710-078-47m-jr-mmn36,47


### Now let's get all our old edf files together

In [16]:
# Age-group key -> name of the sub-folder that is scanned for .edf files.
dir_names = {11: '11mnd mmn',
             17: '17mnd mmn',
             23: '23mnd mmn',
             29: '29mnd mmn',
             35: '35mnd mmn',
             41: '41mnd mmn',
             47: '47mnd mmn'}

edf_df_list = []

for directory in dir_names.values():  # Visit every age group folder
    dir_path = os.path.join('C:/Projects/EEG_explorer/Data', directory)

    # Full paths of all *.edf files that sit directly inside this folder.
    edf_paths = [
        os.path.join(dir_path, entry)
        for entry in fnmatch.filter(os.listdir(dir_path), "*.edf")
    ]

    # One (path, file name minus its 4-character extension) row per file.
    # No numeric code or age group is stored for the .edf files.
    rows = [(path, os.path.basename(path)[:-4]) for path in edf_paths]

    edf_df_list.append(pd.DataFrame(rows, columns=['edf_path', 'edf_file']))

# Stack the per-folder tables; each keeps its own 0-based index.
edf_files = pd.concat(edf_df_list)

In [17]:
# Show the combined table of .edf recordings (one row per file).
edf_files

Unnamed: 0,edf_path,edf_file
0,C:/Projects/EEG_explorer/Data\11mnd mmn\001_11...,001_11_jc_mmn36_wk_mmn25_wk_mmn47_wk_mmn58_wk-...
1,C:/Projects/EEG_explorer/Data\11mnd mmn\003_11...,003_11_jc_mmn36_slp_mmn25_wk_mmn47_slp_mmn58_s...
2,C:/Projects/EEG_explorer/Data\11mnd mmn\004_11...,004_11_mc_mmn-cleaned
3,C:/Projects/EEG_explorer/Data\11mnd mmn\005_11...,005_11_jc_mmn2-cleaned
4,C:/Projects/EEG_explorer/Data\11mnd mmn\007_11...,007_11_jc_mmn2_36_wk-cleaned
...,...,...
71,C:/Projects/EEG_explorer/Data\17mnd mmn\312_17...,312_17_jc_mmn25_2_slp-cleaned
72,C:/Projects/EEG_explorer/Data\17mnd mmn\317_17...,317_17_mc_mmn25_2_wk-cleaned
73,C:/Projects/EEG_explorer/Data\17mnd mmn\323_17...,323_17_mc_mmn25_2_slp-cleaned
74,C:/Projects/EEG_explorer/Data\17mnd mmn\345_17...,345_17_mc_mmn25_wk-cleaned


### We can now check which files have "cleaned" in their name, although we cannot be sure what other changes were made to them

In [20]:
# List every .cnt path and single out the ones whose path contains 'cleaned'.

# All .cnt paths, in table order.
files_list = list(cnt_files['cnt_path'])

# This list has to be filled for the emptiness check below to mean anything;
# otherwise the "no cleaned files" message would be printed unconditionally.
cleaned_files_list = [path for path in files_list if 'cleaned' in path]

if len(cleaned_files_list) == 0:
    print("No .cnt files have cleaned in the name")
else:
    print(len(cleaned_files_list), "files appear to have been cleaned, out of", len(files_list))
    # Also list the matching paths so they can be inspected.
    for path in cleaned_files_list:
        print(path)
    


No .cnt files have cleaned in the name


In [23]:
# All .edf paths, and the subset whose path contains the marker 'cleaned'.
edf_files_list = list(edf_files['edf_path'])
edf_cleaned_files_list = [edf_path for edf_path in edf_files_list if 'cleaned' in edf_path]

# Report how many of the .edf files carry the marker.
if not edf_cleaned_files_list:
    print("No .edf files have cleaned in the name")
else:
    print(len(edf_cleaned_files_list), "files appear to have been cleaned, out of", len(edf_files_list) )


157 files appear to have been cleaned, out of 157


In [None]:
### So all the .edf files have "cleaned" in their names, but were they only preprocessed, or what process generated them?