# Explore 'old' dataset

### Import packages

In [1]:
# Import packages
import os
import sys

sys.path.insert(0, os.path.dirname(os.getcwd()))

import numpy as np
import h5py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mne

sns.set_theme(style="whitegrid")
%matplotlib inline
# %matplotlib qt

In [2]:
import fnmatch
import warnings
import re
warnings.filterwarnings('ignore')

import helper_functions

In [3]:
#from config_for_repro_prepro  import PATH_RAW_DATA, PATH_METADATA, PATH_DATA_PROCESSED_ML

## Let's see what was on the SURFDrive so we can compare it to what was published

### SURF first

### Now let's get all our old cnt files together

In [4]:
# Sub-folder of the data root for each age group (keys are the age-group labels,
# presumably the age in months).
dir_names = {11: '11mnd mmn',
             17: '17mnd mmn',
             23: '23mnd mmn',
             29: '29mnd mmn',
             35: '35mnd mmn',
             41: '41mnd mmn',
             47: '47mnd mmn'}

# Root folder that contains the age-group sub-folders.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
data_root = 'C:/Projects/EEG_explorer/Data'

df_list = []

for age_group, directory in dir_names.items():  # Go into every age group folder
    dir_path = os.path.join(data_root, directory)
    file_names = os.listdir(dir_path)

    # Full paths of all .cnt files found directly in this directory.
    cnt_paths = [os.path.join(dir_path, file_name) for file_name in fnmatch.filter(file_names, "*.cnt")]

    # File names without the extension. Stored under its own name so the list does
    # not share the name `cnt_files` with the combined DataFrame built after the loop.
    cnt_stems = [os.path.splitext(os.path.basename(path))[0] for path in cnt_paths]

    # Numeric code = first run of digits in the file name (\d+ matches one or more digits).
    codes = []
    for stem in cnt_stems:
        match = re.search(r'\d+', stem)
        if match is None:
            # Fail with the offending file name instead of an opaque AttributeError on None.
            raise ValueError(f"No digits found in .cnt file name {stem!r}; cannot derive a code")
        codes.append(int(match.group()))

    df = pd.DataFrame(list(zip(codes, cnt_paths, cnt_stems)), columns=['code', 'cnt_path', 'cnt_file'])
    df['age_group'] = age_group
    df_list.append(df)

# One row per .cnt file. Each per-folder frame keeps its own 0..n-1 index, so index
# labels repeat across age groups; call .reset_index(drop=True) if unique labels are needed.
cnt_files = pd.concat(df_list)

In [24]:
# Report how many .cnt recordings were found across all age-group folders.
n_cnt = len(cnt_files)
print("We have", n_cnt, ".cnt files")

We have 2149 .cnt files


Now we see something odd already: when a former grad student pre-processed all the .cnt files he got 2084, so we have a discrepancy. Please note that I have checked, and this is not due to the inclusion of a 5-month-old folder; as far as I can see, that data was always excluded.

![title](./bjornprepro.png)

### Now let's look at all our .edf files together

In [16]:
# Sub-folder of the data root for each age group (keys are the age-group labels,
# presumably the age in months).
dir_names = {11: '11mnd mmn',
             17: '17mnd mmn',
             23: '23mnd mmn',
             29: '29mnd mmn',
             35: '35mnd mmn',
             41: '41mnd mmn',
             47: '47mnd mmn'}

edf_df_list = []

for age_group, directory in dir_names.items():  # Go into every age group folder
    dir_path = os.path.join('C:/Projects/EEG_explorer/Data', directory)
    file_names = os.listdir(dir_path)

    # Full paths of all .edf files found directly in this directory.
    edf_paths = [os.path.join(dir_path, file_name) for file_name in fnmatch.filter(file_names, "*.edf")]

    # File names without the '.edf' extension. Stored under its own name so the list
    # does not share the name `edf_files` with the combined DataFrame built after the loop.
    edf_stems = [os.path.basename(path)[:-4] for path in edf_paths]

    df_edf = pd.DataFrame(list(zip(edf_paths, edf_stems)), columns=['edf_path', 'edf_file'])
    # Record which age-group folder each file came from, matching the .cnt table.
    df_edf['age_group'] = age_group
    edf_df_list.append(df_edf)

# One row per .edf file. Each per-folder frame keeps its own 0..n-1 index, so index
# labels repeat across age groups.
edf_files = pd.concat(edf_df_list)

In [25]:
# Report how many .edf recordings were found across all age-group folders.
n_edf = len(edf_files)
print("We have", n_edf, ".edf files")

We have 157 .edf files


### We can now separate out the "cleaned" files, but we cannot be sure about certain other changes

In [26]:
# Collect every .cnt path and, separately, the paths that contain 'cleaned'.
files_list = []
cleaned_files_list = []
for cnt_path in cnt_files['cnt_path']:
    files_list.append(cnt_path)
    if 'cleaned' in cnt_path:
        # Record the match; only printing it would leave the list empty and make
        # the "no cleaned files" message below appear unconditionally.
        cleaned_files_list.append(cnt_path)

if len(cleaned_files_list) == 0:
    print("No .cnt files have cleaned in the name")
else:
    print(len(cleaned_files_list), "files appear to have been cleaned or have it in the name, out of", len(files_list))
    


No .cnt files have cleaned in the name


In [28]:
# All .edf paths, and the subset whose path contains 'cleaned'.
edf_files_list = list(edf_files['edf_path'])
edf_cleaned_files_list = [edf_path for edf_path in edf_files_list if 'cleaned' in edf_path]

# Summarise: either nothing matched, or report how many matched out of the total.
if edf_cleaned_files_list:
    print(len(edf_cleaned_files_list), "files appear to have been cleaned or have it in the name, out of", len(edf_files_list))
else:
    print("No .edf files have cleaned in the name")


157 files appear to have been cleaned or have it in the name, out of 157


### So we can see the edf files are cleaned, but were they just preprocessed, or what process generated them?

## Let's also examine the metadata:

In [29]:
# Directory that holds the metadata files; the listing below shows what it contains.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
metadata_dir = 'C:/Projects/EEG_explorer/ePODIUM_metadata/'
os.listdir(metadata_dir)

['108casesvanHerten.xls',
 'ages',
 'AvgRef_N1_allesites_missingDELETE_Min30trials.xlsx',
 'CDIinfo.xlsx',
 'CDIinfo_combined.csv',
 'info_DDP_set1.txt',
 'metadata.xlsx',
 'Screening_children5a_summary_new.txt']

In [62]:
# Load '108casesvanHerten.xls' (legacy .xls workbook, read with the xlrd engine) and display it.
van_herten_xls = os.path.join(metadata_dir, '108casesvanHerten.xls')
hundred_and_eight_casesvanHerten = pd.read_excel(van_herten_xls, engine='xlrd')
hundred_and_eight_casesvanHerten

Unnamed: 0,File
0,015_thomas_mmn36w
1,030_04_jc_mmn36_wk_1
2,031_04_mc_mmn36_wk_1
3,034_17_mc_mmn36_wk
4,036_17_mc_mmn36_wk
...,...
103,757-487-17m-jr-mmn36
104,758-465-17m-mr-mmn36
105,Copy of lola_mr_mmn3-6
106,Copy of timo_jr_mmn36


So we have this file with 108 'special cases'. Interesting? 
Let's look at what is in the age files.

In [63]:
# The 'ages' sub-folder of the metadata directory; list the files it contains.
ages_metadata_dir = os.path.join(metadata_dir, 'ages')
os.listdir(ages_metadata_dir)

['ages_11mnths.txt',
 'ages_17mnths.txt',
 'ages_23mnths.txt',
 'ages_29mnths.txt',
 'ages_35mnths.txt',
 'ages_41mnths.txt',
 'ages_47mnths.txt',
 'ages_5mnths.txt']

In [64]:
# Read the tab-separated 'ages_11mnths.txt' file into a DataFrame.
ages_11_txt = os.path.join(ages_metadata_dir, 'ages_11mnths.txt')
eleven_months = pd.read_csv(ages_11_txt, sep='\t')

In [65]:
# Display the table read from 'ages_11mnths.txt' as the cell output.
eleven_months

Unnamed: 0,code,age_days,age_months,age_years
0,1,329,10.966667,0.913889
1,3,336,11.200000,0.933333
2,5,329,10.966667,0.913889
3,6,319,10.633333,0.886111
4,7,329,10.966667,0.913889
...,...,...,...,...
261,753,331,11.033333,0.919444
262,754,332,11.066667,0.922222
263,755,327,10.900000,0.908333
264,757,326,10.866667,0.905556


So the files in the `ages` folder are where we get the exact ages.

In [50]:
# Read the 'AvgRef_N1_allesites_missingDELETE_Min30trials.xlsx' workbook into a DataFrame.
avgref_xlsx = os.path.join(metadata_dir, 'AvgRef_N1_allesites_missingDELETE_Min30trials.xlsx')
AvgRef_N1_allesites_missingDELETE_Min30trials = pd.read_excel(avgref_xlsx)

In [51]:
# Display the table read from the AvgRef workbook as the cell output.
AvgRef_N1_allesites_missingDELETE_Min30trials

Unnamed: 0,file,nr,group,c4_n1_l,c4_n1_v,c3_n1_l,c3_n1_v,f4_n1_l,f4_n1_v,f8_n1_l,...,fz_n1_v,n1_lh3,n1_rh3,n1latera,lhgroter,laterality,n1_mid3,n1_LH6,n1_RH6,n1_mid6
0,031_04_mc_mmn36_wk_1,1,1,164,0.726,170,0.779,184,1.714,186,...,0.092,0.27850,1.46875,-1.19025,1,1,-0.885333,3.90500,-3.47500,-3.654333
1,034_17_mc_mmn36_wk,2,1,154,-2.198,160,-0.337,192,-1.912,218,...,-2.992,-1.53400,-2.99400,1.46000,0,2,-0.322000,0.41300,-0.71775,-2.874000
2,036_17_mc_mmn36_wk,3,1,112,-1.687,164,-1.631,160,0.007,118,...,-0.220,-1.84000,-1.57875,-0.26125,1,2,-0.496667,0.52950,-0.17775,1.743333
3,039_04_jc_mnn_wk,4,1,150,-3.832,174,-0.776,204,0.938,218,...,1.334,-0.37900,-1.55200,1.17300,0,2,1.649667,-0.88700,-0.22425,0.876333
4,305_17_jc_mmn36_wk,5,1,166,-1.877,138,-1.567,218,-1.902,250,...,-1.651,-1.33750,-1.36175,0.02425,0,2,-0.846667,-5.07925,0.87350,-0.164333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,749-461-17m-jr-mmn36,31,2,160,-3.168,156,-1.473,170,0.686,246,...,0.542,-0.32950,-1.42400,1.09450,0,2,-0.677667,-3.10550,-1.43550,-0.542000
62,751-542-17m-jr-mmn36,32,2,150,-1.852,166,-2.152,204,-2.581,256,...,-4.480,-2.73725,-2.79450,0.05725,0,2,-3.297000,-4.43825,-3.00650,-4.056000
63,754-472-17m-jr-mmn36,33,2,112,-0.849,176,0.060,178,-1.133,192,...,-1.898,-0.41575,-0.50225,0.08650,0,2,-0.584000,-2.13550,-2.46400,-4.198667
64,757-487-17m-jr-mmn36,34,2,180,-0.605,178,-1.297,184,-0.567,234,...,-0.742,-0.45825,-0.58350,0.12525,0,2,-0.589000,0.88400,-2.46400,4.828000


`CDIinfo = pd.read_excel(os.path.join(metadata_dir, 'CDIinfo.xlsx'))`

The CDIinfo file can be opened by running the line above as code; however, it contains sensitive information (i.e. names) and is therefore not shown here.

In [59]:
# Read 'CDIinfo_combined.csv' into a DataFrame.
cdi_combined_csv = os.path.join(metadata_dir, 'CDIinfo_combined.csv')
CDIinfo_combined = pd.read_csv(cdi_combined_csv)

In [61]:
# Display the table read from 'CDIinfo_combined.csv' as the cell output.
CDIinfo_combined

Unnamed: 0,code,age_months,productive,receptive
0,0,17,45,177
1,1,29,506,580
2,1,35,608,674
3,3,29,357,512
4,4,17,79,215
...,...,...,...,...
754,646,23,444,547
755,646,29,596,612
756,646,35,646,658
757,708,17,79,238


In [68]:
# Print the contents of the plain-text note 'info_DDP_set1.txt'.
info_DDP_set1 = os.path.join(metadata_dir, 'info_DDP_set1.txt')

# Decode explicitly as UTF-8: with the platform default encoding the dashes in the
# note come out as mojibake ("â€“") in the printed text.
with open(info_DDP_set1, 'r', encoding='utf-8') as text:
    textfile = text.read()

print(textfile)

Beste Frank, Hugo,
Hierbij de publicaties.
Wat betreft aantallen van volledig opgeschoonde en publiceerbare data:
1. Been et al. 2008: 5 maanden: n=121 FR; n=73 controls
2. van Herten et al. 2008; van Herten first submission: 17 maanden [FR n= 60; controls n=48]
[de analyse over n=108 kinderen is in het definitieve artikel gesneuveld, maar de EEG-registraties zijn goed].
3. zie lijst: n=64 kinderen zowel 17 als 29 maanden; [FR n= 36; controls n=28]
[Die 1/3 – 1/3 – 1/3 die ik noemde betrof 12-jarigen; afhankelijk van de selectie (en de gehanteerde criteria) heeft 30% – 40% van de FR kinderen een diagnose dyslexie, en 5% - 8% van de controle-kinderen].
HG,
Ben

----NB. publicaties staan in folder 'Literature'


So was that about some old publications in the literature, or about our kids? It may not be of much interest.

In [70]:
# Read the 'metadata.xlsx' workbook into a DataFrame and display it.
metadata_xlsx = os.path.join(metadata_dir, 'metadata.xlsx')
metadata_xl = pd.read_excel(metadata_xlsx)
metadata_xl

Unnamed: 0,file,nr,group,c4_n1_l,c4_n1_v,c3_n1_l,c3_n1_v,f4_n1_l,f4_n1_v,f8_n1_l,...,fz_n1_v,n1_lh3,n1_rh3,n1latera,lhgroter,laterality,n1_mid3,n1_LH6,n1_RH6,n1_mid6
0,031_04_mc_mmn36_wk_1,1,1,164,0.726,170,0.779,184,1.714,186,...,0.092,0.27850,1.46875,-1.19025,1,1,-0.885333,3.90500,-3.47500,-3.654333
1,034_17_mc_mmn36_wk,2,1,154,-2.198,160,-0.337,192,-1.912,218,...,-2.992,-1.53400,-2.99400,1.46000,0,2,-0.322000,0.41300,-0.71775,-2.874000
2,036_17_mc_mmn36_wk,3,1,112,-1.687,164,-1.631,160,0.007,118,...,-0.220,-1.84000,-1.57875,-0.26125,1,2,-0.496667,0.52950,-0.17775,1.743333
3,039_04_jc_mnn_wk,4,1,150,-3.832,174,-0.776,204,0.938,218,...,1.334,-0.37900,-1.55200,1.17300,0,2,1.649667,-0.88700,-0.22425,0.876333
4,305_17_jc_mmn36_wk,5,1,166,-1.877,138,-1.567,218,-1.902,250,...,-1.651,-1.33750,-1.36175,0.02425,0,2,-0.846667,-5.07925,0.87350,-0.164333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,749-461-17m-jr-mmn36,31,2,160,-3.168,156,-1.473,170,0.686,246,...,0.542,-0.32950,-1.42400,1.09450,0,2,-0.677667,-3.10550,-1.43550,-0.542000
62,751-542-17m-jr-mmn36,32,2,150,-1.852,166,-2.152,204,-2.581,256,...,-4.480,-2.73725,-2.79450,0.05725,0,2,-3.297000,-4.43825,-3.00650,-4.056000
63,754-472-17m-jr-mmn36,33,2,112,-0.849,176,0.060,178,-1.133,192,...,-1.898,-0.41575,-0.50225,0.08650,0,2,-0.584000,-2.13550,-2.46400,-4.198667
64,757-487-17m-jr-mmn36,34,2,180,-0.605,178,-1.297,184,-0.567,234,...,-0.742,-0.45825,-0.58350,0.12525,0,2,-0.589000,0.88400,-2.46400,4.828000


In [73]:
# 'Screening_children5a_summary_new.txt'

# Read the tab-separated screening summary into a DataFrame and display it.
screening_txt = os.path.join(metadata_dir, 'Screening_children5a_summary_new.txt')
Screening_children5a_summary_new = pd.read_csv(screening_txt, sep='\t')
Screening_children5a_summary_new

Unnamed: 0,id_child,groupDDP,atRiskOrNotDDP,dyslexicAtMidGroup3DDP,assignment1,assignment2,assignment3,assignment4,childInfoPresent,relativeInfoPresent,mmr_2mth,mmr_5mth,mmr_11mth,mmr_17mth,mmr_23mth,mmr_29mth,mmr_35mth,mmr_41mth,mmr_47mth
0,001,4,unclear,1,notEnoughInfo,notEnoughInfo,notEnoughInfo,notEnoughInfo,1,1,1,0,1,1,1,1,1,1,1
1,002,missing,missing,missing,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,0,1,0,0,0,0,1,1,1,0,0
2,003,3Ctrl,notAtRisk,0,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,1,1,1,0,1,0,1,0,1,1,0
3,004,missing,missing,missing,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,1,1,1,1,1,0,1,1,0,1,0
4,005,3Ctrl,notAtRisk,0,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,1,1,1,0,1,1,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,020,missing,missing,missing,missing,missing,missing,missing,0,0,1,0,0,0,0,0,0,0,0
332,037,missing,missing,missing,missing,missing,missing,missing,0,0,1,0,0,0,0,0,0,0,0
333,315,missing,missing,missing,missing,missing,missing,missing,0,0,1,0,0,0,0,0,0,0,0
334,349,missing,missing,missing,missing,missing,missing,missing,0,0,1,1,0,0,0,0,0,0,0


## Now let's compare to the published data available from DANS-DAWS

![title](./dans.png)

You will have to trust me: the DANS site required a one-by-one download of hundreds of files. I emailed them, and there is no other way to get that data. It was practically inaccessible, and therefore we made a shell script. The way the data was organized did not lend itself to research. Also, there were only 17-month and 29-month groups. We should publish the data on Zenodo or another open platform in such a way that it can be downloaded easily, not one file at a time.

The whole DANS site goes down for days at a time, and contact is not easy.

# The 'new data'

New data was gathered in 2021 in a different file format (.bdf) from participants who again varied in age and presumed predisposition to dyslexia. The new data gathering has been completed, but not all files have been sent to us.