# Magics, Imports, and Versions

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import collections
import json
import os
from pathlib import Path
import pyedflib
import re
from subprocess import call
import sys
import typing
import urllib

In [3]:
print(f'__Python VERSION: {sys.version}')

try:
    print(f'__pyTorch VERSION: {torch.__version__}')
    PYTORCH = True
except: 
    print("Pytorch Not Installed")
    PYTORCH = False

try:
    print(f'__fastai VERSION: {fastai.__version__}')
except:
    print("fastai Not Installed")
    
print('__CUDA VERSION')

! nvcc --version

if PYTORCH:
    print(f'__CUDNN VERSION: {torch.backends.cudnn.version()}')
    print(f'__Number CUDA Devices: {torch.cuda.device_count()}')
    
print(f'__Devices')

try:
    call(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"])
    print(f'Active CUDA Device: GPU {torch.cuda.current_device()}')

    print (f'Available devices: {torch.cuda.device_count()}')
    print (f'Current cuda device {torch.cuda.current_device()}')
except:
    print("No GPUs Found")

__Python VERSION: 3.7.3 (default, Mar 27 2019, 22:11:17) 
[GCC 7.3.0]
Pytorch Not Installed
fastai Not Installed
__CUDA VERSION
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130
__Devices
No GPUs Found


# Paths

In [4]:
here = Path(f'{os.getcwd()}')
data_path = here.parent/'data'
raw_path = data_path/'raw'/'v1.5.0/edf'

# Build List of Files

In [5]:
working_files = []
for (dir_path, dir_names, file_names) in os.walk(raw_path):
    max_eeg = [re.search('t(\d*)',file.split('/')[-1])[1] for file in file_names]
    m = max(max_eeg) if len(max_eeg) >= 1 else 0
    for file in file_names:
        if file.endswith('.edf'):
            working_files.append((dir_path + '/' + file, m))

In [6]:
working_files[:5]

[('/data2/dcgrzenda/tuh_eeg/data/raw/v1.5.0/edf/dev_test/01_tcp_ar/085/00008544/s011_2012_01_08/00008544_s011_t008.edf',
  '011'),
 ('/data2/dcgrzenda/tuh_eeg/data/raw/v1.5.0/edf/dev_test/01_tcp_ar/085/00008544/s011_2012_01_08/00008544_s011_t004.edf',
  '011'),
 ('/data2/dcgrzenda/tuh_eeg/data/raw/v1.5.0/edf/dev_test/01_tcp_ar/085/00008544/s011_2012_01_08/00008544_s011_t010.edf',
  '011'),
 ('/data2/dcgrzenda/tuh_eeg/data/raw/v1.5.0/edf/dev_test/01_tcp_ar/085/00008544/s011_2012_01_08/00008544_s011_t006.edf',
  '011'),
 ('/data2/dcgrzenda/tuh_eeg/data/raw/v1.5.0/edf/dev_test/01_tcp_ar/085/00008544/s011_2012_01_08/00008544_s011_t001.edf',
  '011')]

In [7]:
# make sure that they all have the same structure
nums = []
for file, m in working_files:
    nums.append(len(file.split('/')))
collections.Counter(nums)

Counter({14: 5610})

# Process File List

In [8]:
d = collections.defaultdict(list)
for file, m in working_files:
    split = file.split('/')
    data_type = split[8]
    config = split[9].split('_')[-1]
    patient_id = split[11]
    long = split[12].split('_')
    session = long[0].strip('s')
    date = f'{long[1]}/{long[2]}/{long[3].strip(patient_id)}' 
    segment = re.search('t(\d*)',split[13])[1]
    f = pyedflib.EdfReader(file)
    channels = str(f.signals_in_file)
    num_obs = str(f.getNSamples()[0])
    label_file = file[:-3]+'tse'
    with open(label_file,'r') as f:
        labels, durations = [], []
        for line in f.readlines()[2:]:
            labels.append(line.split(' ')[2])
            durations.append(line.split(' ')[1])
    sample_rate = str(float(num_obs) / float(durations[-1]))
    d[patient_id].append({'patient':patient_id,
                          'date':date, 
                          'session':session, 
                          'segment':segment, 
                          'total_segments':m, 
                          'labels':labels,
                          'durations': durations,
                          'channels':channels,
                          'number_obs': num_obs,
                          'sample_rate': sample_rate,
                          'config':config, 
                          'type':data_type, 
                          'loc':file,
                          'label_file': label_file
                         })

In [17]:
d['00008544']

[{'patient': '00008544',
  'date': '2012/01/',
  'session': '011',
  'segment': '008',
  'total_segments': '011',
  'labels': ['bckg'],
  'durations': ['601.0000'],
  'channels': '34',
  'number_obs': '153856',
  'sample_rate': '256.0',
  'config': 'ar',
  'type': 'dev_test',
  'loc': '/data2/dcgrzenda/tuh_eeg/data/raw/v1.5.0/edf/dev_test/01_tcp_ar/085/00008544/s011_2012_01_08/00008544_s011_t008.edf',
  'label_file': '/data2/dcgrzenda/tuh_eeg/data/raw/v1.5.0/edf/dev_test/01_tcp_ar/085/00008544/s011_2012_01_08/00008544_s011_t008.tse'},
 {'patient': '00008544',
  'date': '2012/01/',
  'session': '011',
  'segment': '004',
  'total_segments': '011',
  'labels': ['bckg'],
  'durations': ['601.0000'],
  'channels': '34',
  'number_obs': '153856',
  'sample_rate': '256.0',
  'config': 'ar',
  'type': 'dev_test',
  'loc': '/data2/dcgrzenda/tuh_eeg/data/raw/v1.5.0/edf/dev_test/01_tcp_ar/085/00008544/s011_2012_01_08/00008544_s011_t004.edf',
  'label_file': '/data2/dcgrzenda/tuh_eeg/data/raw/v1.

# Save and Load Dictionary

In [10]:
# save the dictionary to json
with open(here.parent/'data_dict.json', 'w') as file:
     file.write(json.dumps(d))

In [11]:
# load the file back in 
with open(here.parent/'data_dict.json', 'r') as file:
     data_dict = json.load(file)

In [18]:
# make sure they are the same
data_dict['00004151'] == d['00004151']

True