# Magics, Imports, and Versions

In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# source files on disk are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import collections
import importlib
import json
import operator
import os
import sys
import typing
import urllib
from pathlib import Path
from subprocess import call

import pyedflib

In [3]:
print(f'__Python VERSION: {sys.version}')

try:
    print(f'__pyTorch VERSION: {torch.__version__}')
    PYTORCH = True
except: 
    print("Pytorch Not Installed")
    PYTORCH = False

try:
    print(f'__fastai VERSION: {fastai.__version__}')
except:
    print("fastai Not Installed")
    
print('__CUDA VERSION')

! nvcc --version

if PYTORCH:
    print(f'__CUDNN VERSION: {torch.backends.cudnn.version()}')
    print(f'__Number CUDA Devices: {torch.cuda.device_count()}')
    
print(f'__Devices')

try:
    call(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"])
    print(f'Active CUDA Device: GPU {torch.cuda.current_device()}')

    print (f'Available devices: {torch.cuda.device_count()}')
    print (f'Current cuda device {torch.cuda.current_device()}')
except:
    print("No GPUs Found")

__Python VERSION: 3.7.3 (default, Mar 27 2019, 22:11:17) 
[GCC 7.3.0]
Pytorch Not Installed
fastai Not Installed
__CUDA VERSION
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130
__Devices
No GPUs Found


# Load in Data Dict

In [4]:
# Locate the data folders relative to the notebook's current working directory.
here = Path(os.getcwd())
data_path = here.parent / 'data'
raw_path = data_path.joinpath('raw', 'v1.5.0/edf')

In [5]:
# Read the metadata dictionary back from the JSON file that sits in the
# parent of the working directory.
data_dict_path = here.parent / 'data_dict.json'
with data_dict_path.open('r') as file:
    data_dict = json.load(file)

In [6]:
# Display the list of records stored under one key to see the record schema.
data_dict['00004151']

[{'patient': '00004151',
  'date': '2007/09/28',
  'session': '002',
  'segment': '001',
  'total_segments': '003',
  'labels': ['bckg'],
  'durations': ['490.0000'],
  'channels': '41',
  'number_obs': '122500',
  'sample_rate': '250.0',
  'config': 'le',
  'type': 'dev_test',
  'loc': '/data2/dcgrzenda/tuh_eeg/data/raw/v1.5.0/edf/dev_test/02_tcp_le/041/00004151/s002_2007_09_28/00004151_s002_t001.edf',
  'label_file': '/data2/dcgrzenda/tuh_eeg/data/raw/v1.5.0/edf/dev_test/02_tcp_le/041/00004151/s002_2007_09_28/00004151_s002_t001.tse'},
 {'patient': '00004151',
  'date': '2007/09/28',
  'session': '002',
  'segment': '003',
  'total_segments': '003',
  'labels': ['bckg'],
  'durations': ['443.0000'],
  'channels': '41',
  'number_obs': '110750',
  'sample_rate': '250.0',
  'config': 'le',
  'type': 'dev_test',
  'loc': '/data2/dcgrzenda/tuh_eeg/data/raw/v1.5.0/edf/dev_test/02_tcp_le/041/00004151/s002_2007_09_28/00004151_s002_t003.edf',
  'label_file': '/data2/dcgrzenda/tuh_eeg/data/raw

# Observations per Electrode Config

In [7]:
# Flatten the 'config' field of every record into one list.
configs = [
    record['config']
    for records in data_dict.values()
    for record in records
]

In [8]:
# Tally how many records use each electrode configuration value.
collections.Counter(configs)

Counter({'ar': 4074, 'le': 535, 'a': 1001})

## Time per Electrode Config as well

In [9]:
# Group the last 'durations' value of every record by the record's 'config'.
configs = collections.defaultdict(list)
for records in data_dict.values():
    for record in records:
        final_duration = float(record['durations'][-1])
        configs[record['config']].append(final_duration)

In [10]:
# Total the collected durations per configuration, inserting keys in sorted
# order, then print one "key total" line per configuration.
summary = collections.defaultdict(
    None,
    {config: sum(times) for config, times in sorted(configs.items())},
)
for config, config_total in summary.items():
    print(config, config_total)

a 452039.0
ar 2376345.0
le 493132.0


In [11]:
# Print each summary entry as a percentage of the overall total, largest first.
total = sum(summary.values())
by_amount_desc = sorted(summary.items(), key=lambda item: item[1], reverse=True)
for name, amount in by_amount_desc:
    print(f'{name} {(amount / total) * 100:.2f}%')

ar 71.54%
le 14.85%
a 13.61%


# Duration Breakdown

In [12]:
# Count how many records share each final duration value.
duration = collections.defaultdict(int)
for records in data_dict.values():
    for record in records:
        final_duration = float(record['durations'][-1])
        duration[final_duration] += 1

# Smallest and largest (duration, count) pairs; tuples compare on duration first.
(min(duration.items()), max(duration.items()))

((11.0, 2), (3598.0, 1))

# Time Breakdown for each Label

In [13]:
# For every record, pair each label with its entry in 'durations' and store
# the difference from the previous entry (starting from 0) under that label.
time_breakdown = collections.defaultdict(list)
for records in data_dict.values():
    for record in records:
        last = 0
        for label, curr in zip(record['labels'], record['durations']):
            stop = float(curr)
            time_breakdown[label].append(stop - last)
            last = stop

In [14]:
# Add up the collected time values for each key of time_breakdown.
summary = collections.defaultdict(
    None,
    {label: sum(spans) for label, spans in time_breakdown.items()},
)

# Show the totals ordered by key.
sorted(summary.items())

[('absz', 851.9839999999986),
 ('bckg', 3092339.931200001),
 ('cpsz', 35342.82400000001),
 ('fnsz', 122307.4608999996),
 ('gnsz', 60024.92230000003),
 ('mysz', 1312.0),
 ('spsz', 2145.8244000000004),
 ('tcsz', 5879.5585),
 ('tnsz', 1311.4947000000002)]

In [15]:
# Report the grand total, then each entry's share of it in descending order.
total = sum(summary.values())
print(f'Total Time: {total}')
ranked = sorted(summary.items(), key=lambda pair: pair[1], reverse=True)
for label, amount in ranked:
    print(f'{label} {(amount / total) * 100:.2f}%')

Total Time: 3321516.000000001
bckg 93.10%
fnsz 3.68%
gnsz 1.81%
cpsz 1.06%
tcsz 0.18%
spsz 0.06%
mysz 0.04%
tnsz 0.04%
absz 0.03%


# Label Breakdown

Is there ever more than one type of seizure in a session?

In [16]:
# Count records by the distinct set of labels each one contains.
label_sets = collections.defaultdict(int)
for records in data_dict.values():
    for record in records:
        label_sets[frozenset(record['labels'])] += 1

for combo, count in label_sets.items():
    print(f'{combo} : {count}')

frozenset({'bckg'}) : 4457
frozenset({'bckg', 'cpsz'}) : 162
frozenset({'fnsz', 'bckg'}) : 649
frozenset({'fnsz', 'bckg', 'cpsz'}) : 5
frozenset({'bckg', 'gnsz'}) : 218
frozenset({'fnsz', 'bckg', 'gnsz'}) : 23
frozenset({'bckg', 'mysz'}) : 3
frozenset({'tcsz', 'bckg'}) : 30
frozenset({'tcsz', 'bckg', 'fnsz'}) : 3
frozenset({'bckg', 'tnsz'}) : 31
frozenset({'bckg', 'cpsz', 'gnsz'}) : 1
frozenset({'bckg', 'spsz', 'gnsz'}) : 2
frozenset({'fnsz', 'bckg', 'spsz'}) : 1
frozenset({'absz', 'bckg'}) : 20
frozenset({'bckg', 'spsz'}) : 5


Can we relabel the ones with 3 labels (background plus two seizure types)? Do people usually have different seizure types back to back?

Could fnsz or gnsz be relabeled as the other seizure label in those recordings?

# Channel Breakdown

In [17]:
# Group every record's final duration value by its integer channel count.
time_breakdown = collections.defaultdict(list)
for records in data_dict.values():
    for record in records:
        channel_count = int(record['channels'])
        time_breakdown[channel_count].append(float(record['durations'][-1]))

In [18]:
# Sum the grouped durations for each key of time_breakdown.
summary = collections.defaultdict(
    None,
    {group: sum(times) for group, times in time_breakdown.items()},
)

# Show the totals ordered by key.
sorted(summary.items())

[(25, 6007.0),
 (26, 4079.0),
 (27, 322983.0),
 (28, 114145.0),
 (29, 315438.0),
 (30, 322669.0),
 (31, 243163.0),
 (32, 326016.0),
 (33, 550259.0),
 (34, 572917.0),
 (35, 5879.0),
 (36, 242326.0),
 (41, 281907.0),
 (128, 1250.0),
 (129, 12478.0)]

In [19]:
# Share of the overall total held by each summary entry, largest share first.
total = sum(summary.values())

descending = sorted(summary.items(), key=lambda kv: kv[1], reverse=True)
for group, amount in descending:
    print(f'{group} {(amount / total) * 100:.2f}%')

34 17.25%
33 16.57%
32 9.82%
27 9.72%
30 9.71%
29 9.50%
41 8.49%
31 7.32%
36 7.30%
28 3.44%
129 0.38%
25 0.18%
35 0.18%
26 0.12%
128 0.04%


# Hertz Breakdown

In [20]:
# Count records per sample rate, truncating the stored string to an integer.
hertz = collections.defaultdict(int)
for records in data_dict.values():
    for record in records:
        rate = int(float(record['sample_rate']))
        hertz[rate] += 1
hertz.items()

dict_items([(256, 3750), (250, 1045), (400, 637), (512, 127), (1000, 51)])

In [21]:
# Percentage of records at each sample rate, most common rate first.
total_hertz = sum(hertz.values())
rates_by_count = sorted(hertz.items(), key=lambda kv: kv[1], reverse=True)
for rate, count in rates_by_count:
    print(f'{rate} {(count / total_hertz) * 100:.2f}%')

256 66.84%
250 18.63%
400 11.35%
512 2.26%
1000 0.91%


# Size Estimation based on Time

In [22]:
# Accumulate the final duration of every record into total_time, and also
# into image_time for the records stored under key '00004151'.
total_time = 0
image_time = 0
for key, records in data_dict.items():
    is_sample_key = key == '00004151'
    for record in records:
        length = float(record['durations'][-1])
        if is_sample_key:
            image_time += length
        total_time += length

print(image_time, total_time)
# Fraction of the total time contributed by the sample key.
ratio = image_time / total_time
print(ratio)

2612.0 3321516.0
0.0007863879023915585


In [23]:
# Size figure that the following cell divides by `ratio`.
# NOTE(review): presumably the on-disk size of the data generated for the
# sample patient; the unit is not stated in this notebook — TODO confirm.
regular_size = 20.47
# Unfilled placeholders for compressed-size measurements.
# NOTE: `7z_size` is not a valid Python identifier (it starts with a digit)
# and has to be renamed before these lines can be uncommented.
# zipped_size = 
# gzip_size = 
# 7z_size = 
# zopfli_size = 

In [24]:
# Scale the sample size figure up by the sample's share of the total time.
projected_regular = regular_size / ratio
print(f'Regular: {projected_regular:0.2f}')
# The compressed variants stay disabled until their sizes are filled in.
# print(f'Zipped: {zipped_size/ratio:0.2f}')
# print(f'Gzip: {gzip_size/ratio:0.2f}')
# print(f'7z: {7z_size/ratio:0.2f}')
# print(f'Zopfli: {zopfli_size/ratio:0.2f}')

Regular: 26030.41


# EEG Channel Breakdown

In [25]:
# Count, over every record in data_dict, how many EDF files contain each
# signal label, and how many records there are in total.
channel_names = collections.defaultdict(int)
total_eeg_count = 0
for key, value in data_dict.items():
    total_eeg_count += len(value)
    for entry in value:
        # Open the reader as a context manager so the file handle is released
        # before the next file is opened; the original never closed its
        # readers, leaking one open handle per EDF file.
        with pyedflib.EdfReader(entry['loc']) as f:
            signal_labels = f.getSignalLabels()
        for label in signal_labels:
            channel_names[label] += 1

In [26]:
# Display the total number of records counted while scanning the EDF files.
total_eeg_count

5610

In [27]:
# List every signal label with the number of files containing it, most common first.
labels_by_count = sorted(channel_names.items(), key=lambda pair: pair[1], reverse=True)
for label, count in labels_by_count:
    print(f'{label:20}{count:>5}')

EEG FP1-REF          5075
EEG FP2-REF          5075
EEG F3-REF           5075
EEG F4-REF           5075
EEG C3-REF           5075
EEG C4-REF           5075
EEG P3-REF           5075
EEG P4-REF           5075
EEG O1-REF           5075
EEG O2-REF           5075
EEG F7-REF           5075
EEG F8-REF           5075
EEG T3-REF           5075
EEG T4-REF           5075
EEG T5-REF           5075
EEG T6-REF           5075
EEG FZ-REF           5075
EEG CZ-REF           5075
EEG PZ-REF           5075
EEG T1-REF           4536
EEG T2-REF           4536
IBI                  4438
BURSTS               4438
SUPPR                4438
EEG EKG1-REF         4423
EEG A1-REF           4074
EEG A2-REF           4074
EEG 31-REF           2927
EEG 32-REF           2927
EEG C3P-REF          2232
EEG C4P-REF          2232
EEG SP1-REF          2171
EEG SP2-REF          2083
EMG-REF              1583
EEG 29-REF           1249
EEG 30-REF           1249
EEG ROC-REF          1058
EEG LOC-REF          1058
PHOTIC-REF  

In [28]:
# Same ranking as the raw counts, shown as a percentage of all scanned records.
labels_by_count = sorted(channel_names.items(), key=lambda pair: pair[1], reverse=True)
for label, count in labels_by_count:
    print(f'{label:20}{(count / total_eeg_count) * 100:>5.2f}%')

EEG FP1-REF         90.46%
EEG FP2-REF         90.46%
EEG F3-REF          90.46%
EEG F4-REF          90.46%
EEG C3-REF          90.46%
EEG C4-REF          90.46%
EEG P3-REF          90.46%
EEG P4-REF          90.46%
EEG O1-REF          90.46%
EEG O2-REF          90.46%
EEG F7-REF          90.46%
EEG F8-REF          90.46%
EEG T3-REF          90.46%
EEG T4-REF          90.46%
EEG T5-REF          90.46%
EEG T6-REF          90.46%
EEG FZ-REF          90.46%
EEG CZ-REF          90.46%
EEG PZ-REF          90.46%
EEG T1-REF          80.86%
EEG T2-REF          80.86%
IBI                 79.11%
BURSTS              79.11%
SUPPR               79.11%
EEG EKG1-REF        78.84%
EEG A1-REF          72.62%
EEG A2-REF          72.62%
EEG 31-REF          52.17%
EEG 32-REF          52.17%
EEG C3P-REF         39.79%
EEG C4P-REF         39.79%
EEG SP1-REF         38.70%
EEG SP2-REF         37.13%
EMG-REF             28.22%
EEG 29-REF          22.26%
EEG 30-REF          22.26%
EEG ROC-REF         18.86%
E