In [34]:
import scipy.io as sio
import h5py
import mat4py # even more convenient than scipy.io.loadmat for v5
import numpy as np
import yaml
import json

In [2]:
import glob

In [3]:
def asbytes_latin1(s):
    """Encode the text ``s`` to bytes with the latin-1 codec."""
    return s.encode('latin1')  # may want utf-8


def asstr_latin1(b):
    """Decode the bytes ``b`` to text with the latin-1 codec."""
    return b.decode('latin1')

def ndarray_uint16_to_string(arr):
    """Convert an array of 16-bit character codes into a Python string.

    The codes are interpreted as UTF-16 code units, so characters above
    U+00FF survive intact instead of losing their high byte. For codes in
    the range 0-255 the result equals a latin-1 decode of the low bytes.

    Parameters
    ----------
    arr : array-like of unsigned 16-bit integers
        Character codes; any shape is accepted and read in C order
        (e.g. the ``(n, 1)`` column h5py returns for a char field).

    Returns
    -------
    str
        The decoded text. Malformed sequences (such as an unpaired
        surrogate) are replaced with U+FFFD rather than raising.
    """
    # Pin little-endian 16-bit units so the byte layout matches the codec
    # regardless of the platform's native byte order.
    code_units = np.asarray(arr).astype('<u2')
    return code_units.tobytes().decode('utf-16-le', errors='replace')


In [4]:
# Collect every .mat file in the working directory whose name starts with 'a'.
annotation_files = glob.glob('a*.mat')

In [5]:
# Sort in place (plain string order, so 'annot_10' sorts before 'annot_2')
# and keep the first file for a closer look.
annotation_files.sort()
afile = annotation_files[0]

In [6]:
# try a single file
# Open explicitly read-only: the annotation files are only inspected here, and
# the default mode of h5py.File has differed between h5py releases.
h = h5py.File(afile, 'r')

In [7]:
# Attributes attached to the file's root object.
list(h.attrs.items())

[]

In [8]:
# Names of the top-level objects stored in the file.
list(h.keys())

['#refs#', 'annot_1']

In [9]:
# Grab the annotation object stored under 'annot_1'.
a = h['annot_1']

In [11]:
# The attributes mark this object as a MATLAB struct and list its field names,
# each stored as an array of single bytes (|S1 dtype).
list(a.attrs.items())

[('MATLAB_class', b'struct'),
 ('MATLAB_fields',
  array([array([b'p', b'a', b't', b'i', b'e', b'n', b't'], dtype='|S1'),
         array([b's', b'z', b'_', b's', b't', b'a', b'r', b't'], dtype='|S1'),
         array([b's', b'z', b'_', b's', b't', b'o', b'p'], dtype='|S1'),
         array([b't', b'y', b'p', b'e'], dtype='|S1'),
         array([b'd', b'a', b't', b'a', b'_', b's', b't', b'a', b'r', b't'],
               dtype='|S1')                                                 ,
         array([b'd', b'a', b't', b'a', b'_', b's', b't', b'o', b'p'], dtype='|S1'),
         array([b'i', b'i', b'_', b's', b't', b'a', b'r', b't'], dtype='|S1'),
         array([b'i', b'i', b'_', b's', b't', b'o', b'p'], dtype='|S1')],
        dtype=object))]

In [12]:
# One HDF5 dataset per struct field; shows each field's shape and stored type.
list(a.items())

[('data_start', <HDF5 dataset "data_start": shape (1, 1), type "<f8">),
 ('data_stop', <HDF5 dataset "data_stop": shape (1, 1), type "<f8">),
 ('ii_start', <HDF5 dataset "ii_start": shape (3, 1), type "<f8">),
 ('ii_stop', <HDF5 dataset "ii_stop": shape (3, 1), type "<f8">),
 ('patient', <HDF5 dataset "patient": shape (7, 1), type "<u2">),
 ('sz_start', <HDF5 dataset "sz_start": shape (2, 1), type "<f8">),
 ('sz_stop', <HDF5 dataset "sz_stop": shape (2, 1), type "<f8">),
 ('type', <HDF5 dataset "type": shape (1, 1), type "<f8">)]

In [13]:
# Read every field of the annotation into memory, printing it as we go.
# NOTE: the f-string's `{vv[:]=}` form prints the expression text itself, so the
# loop variable names are part of the printed output.
datasets = {}
for kk,vv  in a.items():
    print(f"{kk}: {vv[:]=}")
    datasets[kk] = vv[:]  # slicing with [:] reads the whole dataset as an array

data_start: vv[:]=array([[24000.]])
data_stop: vv[:]=array([[54000.]])
ii_start: vv[:]=array([[24000.],
       [24846.],
       [51868.]])
ii_stop: vv[:]=array([[24742.],
       [51771.],
       [54000.]])
patient: vv[:]=array([[82],
       [73],
       [68],
       [48],
       [48],
       [54],
       [48]], dtype=uint16)
sz_start: vv[:]=array([[24742.],
       [51771.]])
sz_stop: vv[:]=array([[24846.],
       [51868.]])
type: vv[:]=array([[4.]])


In [17]:
# Display the arrays collected from the first annotation file.
#import tables
datasets

{'data_start': array([[24000.]]),
 'data_stop': array([[54000.]]),
 'ii_start': array([[24000.],
        [24846.],
        [51868.]]),
 'ii_stop': array([[24742.],
        [51771.],
        [54000.]]),
 'patient': array([[82],
        [73],
        [68],
        [48],
        [48],
        [54],
        [48]], dtype=uint16),
 'sz_start': array([[24742.],
        [51771.]]),
 'sz_stop': array([[24846.],
        [51868.]]),
 'type': array([[4.]])}

In [18]:
# Inspect the '#refs#' group that sits next to the annotation in the file.
list(h['#refs#'].items())

[('a', <HDF5 dataset "a": shape (2,), type "<u8">)]

In [19]:
# Load every annotation file that h5py can open into `ann`, keyed by the
# integer taken from the file name. Files that h5py cannot open are recorded
# in `errorfiles` so they can be loaded another way afterwards.
ann = {}
errorfiles = []

for afile in annotation_files:
    # Names are expected to look like 'annot_<N>.mat'; the object inside the
    # file is looked up under the file's base name.
    sp = afile.split('_')
    num_str = sp[1][:-4]
    annot_name = afile[:-4]
    num = int(num_str)
    try:
        # Read-only, and the handle is closed when the block exits, even if
        # reading a dataset fails part-way through.
        with h5py.File(afile, 'r') as h:
            a = h[annot_name]
            # vv[:] reads each dataset into an in-memory array, so the values
            # remain usable after the file has been closed.
            datasets = {kk: vv[:] for kk, vv in a.items()}
    except OSError:
        print(f"problems opening {afile}")
        errorfiles.append(afile)
    else:
        ann[num] = datasets

problems opening annot_6.mat
problems opening annot_92.mat
problems opening annot_93.mat
problems opening annot_94.mat


In [20]:
# Number of annotation files loaded successfully through h5py.
len(ann)

90

In [21]:
# fix those annotations
# Drop the singleton dimensions of every array and turn the uint16 'patient'
# character codes into a str. Only ndarray values are touched, so the cell can
# be re-run safely and leaves entries that are already plain Python values
# (strings, lists, numbers) unchanged.
for aa in ann.values():
    for jj, vv in aa.items():
        if isinstance(vv, np.ndarray):
            aa[jj] = np.squeeze(vv)
    patient = aa.get('patient')
    if isinstance(patient, np.ndarray):
        aa['patient'] = ndarray_uint16_to_string(patient)

ann[1]

{'data_start': array(24000.),
 'data_stop': array(54000.),
 'ii_start': array([24000., 24846., 51868.]),
 'ii_stop': array([24742., 51771., 54000.]),
 'patient': 'RID0060',
 'sz_start': array([24742., 51771.]),
 'sz_stop': array([24846., 51868.]),
 'type': array(4.)}

In [22]:
# Patient identifier of annotation 1 (not referenced again in the cells shown here).
pt = ann[1]['patient']

In [23]:
# Second pass: read the files h5py could not open with mat4py instead and add
# them to `ann` under the number taken from the file name.
for fn in errorfiles:
    annot_name = fn[:-4]                  # file name without '.mat'
    num = int(fn.split('_')[1][:-4])      # digits between '_' and '.mat'
    print(f"{fn=},{num=}, {annot_name=}")
    loaded = mat4py.loadmat(fn)
    print(loaded)
    ann[num] = loaded[annot_name]

fn='annot_6.mat',num=6, annot_name='annot_6'
{'annot_6': {'patient': 'RID0065', 'sz_start': [13709, 15433, 18216, 19625], 'sz_stop': [13827, 15581, 18368, 19763], 'type': 4, 'ii_start': [13000, 13827, 15581, 18368, 19763], 'ii_stop': [13709, 15433, 18216, 19625, 20000]}}
fn='annot_92.mat',num=92, annot_name='annot_92'
{'annot_92': {'patient': 'ICUDataRedux_0003', 'ii_start': [25000, 26019], 'ii_stop': [25923, 30000], 'sz_start': 25923, 'sz_stop': 26019}}
fn='annot_93.mat',num=93, annot_name='annot_93'
{'annot_93': {'patient': 'ICUDataRedux_0004', 'sz_start': 23125, 'sz_stop': 23136, 'ii_start': [20000, 23136], 'ii_stop': [23125, 30000]}}
fn='annot_94.mat',num=94, annot_name='annot_94'
{'annot_94': {'patient': 'ICUDataRedux_0006', 'sz_start': 24807, 'sz_stop': 25389, 'ii_start': [20600, 25389], 'ii_stop': [24807, 25600]}}


In [24]:
# Snapshot of `ann` while the h5py-derived entries still hold numpy values.
# The context manager closes (and flushes) the file as soon as the dump is done.
with open('annotations_numpy.yaml', 'w+') as fh:
    yaml.dump(ann, fh)

## Important
The MAT v5 files (the ones loaded with mat4py above) do not contain `data_start`/`data_stop` annotations.

In [25]:
# An entry loaded through mat4py: plain Python lists and ints.
ann[6]

{'patient': 'RID0065',
 'sz_start': [13709, 15433, 18216, 19625],
 'sz_stop': [13827, 15581, 18368, 19763],
 'type': 4,
 'ii_start': [13000, 13827, 15581, 18368, 19763],
 'ii_stop': [13709, 15433, 18216, 19625, 20000]}

In [26]:
# An entry loaded through h5py: values are still numpy arrays at this point.
ann[1]

{'data_start': array(24000.),
 'data_stop': array(54000.),
 'ii_start': array([24000., 24846., 51868.]),
 'ii_stop': array([24742., 51771., 54000.]),
 'patient': 'RID0060',
 'sz_start': array([24742., 51771.]),
 'sz_stop': array([24846., 51868.]),
 'type': array(4.)}

In [27]:
# now we may as well convert things to python lists given how short these arrays are
# isinstance (rather than an exact type comparison) is the idiomatic check and
# also covers ndarray subclasses. tolist() turns a 0-d array into a plain
# Python scalar and a 1-d array into a list.
for data in ann.values():
    for ii, vv in data.items():
        if isinstance(vv, np.ndarray):
            data[ii] = vv.tolist()


In [28]:
# Same entry after the conversion: plain floats and lists of floats.
ann[1]

{'data_start': 24000.0,
 'data_stop': 54000.0,
 'ii_start': [24000.0, 24846.0, 51868.0],
 'ii_stop': [24742.0, 51771.0, 54000.0],
 'patient': 'RID0060',
 'sz_start': [24742.0, 51771.0],
 'sz_stop': [24846.0, 51868.0],
 'type': 4.0}

In [29]:
# Save the list/float version of the annotations. The context manager closes
# (and flushes) the file as soon as the dump is done.
with open('annotations_floats.yaml', 'w+') as fh:
    yaml.dump(ann, fh)

In [30]:
#annpy = yaml.safe_load(open('annotations_floats.yaml'))

In [31]:
# Convert float values (and lists of them) to int so the exported files hold
# integers. int() truncates toward zero; the values displayed above are all
# whole numbers. Strings such as 'patient' are left untouched.
for data in ann.values():
    for ii, vv in data.items():
        if isinstance(vv, list):
            data[ii] = [int(val) for val in vv]
        elif isinstance(vv, float):
            data[ii] = int(vv)

In [32]:
# Final YAML export. safe_dump only accepts plain Python types, which `ann`
# holds after the conversions above. The context manager closes (and flushes)
# the file as soon as the dump is done.
with open('annotations.yaml', 'w+') as fh:
    yaml.safe_dump(ann, fh)


In [35]:
# Final JSON export. JSON object keys are always strings, so the integer
# annotation numbers are written as "1", "10", ... and come back as str on load.
# The context manager closes (and flushes) the file as soon as the dump is done.
with open('annotations.json', 'w+') as fh:
    json.dump(ann, fh)

In [36]:
# Display the complete annotation dictionary (long output).
ann

{1: {'data_start': 24000,
  'data_stop': 54000,
  'ii_start': [24000, 24846, 51868],
  'ii_stop': [24742, 51771, 54000],
  'patient': 'RID0060',
  'sz_start': [24742, 51771],
  'sz_stop': [24846, 51868],
  'type': 4},
 10: {'ii_start': [3200,
   5139,
   5201,
   5276,
   5338,
   5436,
   5609,
   5730,
   5763,
   5862,
   5949],
  'ii_stop': [5128,
   5187,
   5263,
   5322,
   5420,
   5593,
   5720,
   5754,
   5846,
   5940,
   6145],
  'patient': 'RID0069',
  'sz_start': [5128,
   5187,
   5263,
   5322,
   5420,
   5593,
   5720,
   5754,
   5846,
   5940,
   6145],
  'sz_stop': [5139,
   5201,
   5276,
   5338,
   5436,
   5609,
   5730,
   5763,
   5862,
   5949,
   6157],
  'type': 1},
 100: {'iic_start': 2000, 'iic_stop': 12000, 'patient': 'ICUDataRedux_0012'},
 101: {'iic_start': 1000, 'iic_stop': 11000, 'patient': 'ICUDataRedux_0013'},
 102: {'iic_start': 3000, 'iic_stop': 13000, 'patient': 'ICUDataRedux_0014'},
 103: {'iic_start': 2000, 'iic_stop': 12000, 'patient': 'ICU

In [38]:
import hdf5storage

In [37]:
#json.dump(annpy, open('annotations.json', 'w'))
# yaml.safe_dump(annpy, open('annotations.yaml', 'w'))

In [39]:
# `afile` still holds the last file name visited by the loading loop.
afile

'annot_99.mat'

In [44]:
# hdf5storage is based upon h5py and is supposed to support the MATLAB 7.3 format;
# let's see what it does with one of the annotation files.
# NOTE(review): `cl` is not used in the cells shown here, and numpy is already
# imported at the top of the notebook.
import collections as cl
import numpy as np
options = hdf5storage.Options(matlab_compatible=True)
matfile = hdf5storage.read(filename=afile, options=options)

In [45]:
# The result is a numpy structured array with one field per stored variable.
matfile

array([([([['ICUDataRedux_0011']], [[14000.]], [[4000.]])],)],
      dtype=[('annot_99', [('patient', '<U17', (1, 1)), ('iic_stop', '<f8', (1, 1)), ('iic_start', '<f8', (1, 1))], (1,))])

In [46]:
# Select the annotation struct; its own fields are nested one level down.
matfile['annot_99']

array([[([['ICUDataRedux_0011']], [[14000.]], [[4000.]])]],
      dtype=[('patient', '<U17', (1, 1)), ('iic_stop', '<f8', (1, 1)), ('iic_start', '<f8', (1, 1))])

In [48]:
# The dtype spells out the nested field names, element types and shapes.
matfile.dtype

dtype([('annot_99', [('patient', '<U17', (1, 1)), ('iic_stop', '<f8', (1, 1)), ('iic_start', '<f8', (1, 1))], (1,))])

In [59]:
# Squeeze away the singleton dimensions to get the patient id as a 0-d string array.
np.squeeze(matfile['annot_99']['patient']) # dtype='<U17' means a 17 character unicode string, so it handles unicode better

array('ICUDataRedux_0011', dtype='<U17')

In [53]:
# Without squeezing, the scalar comes back wrapped in several singleton dimensions.
matfile['annot_99']['iic_stop']

array([[[[14000.]]]])

In [56]:
# Squeezing reduces the nested value to a 0-d array.
np.squeeze(matfile['annot_99']['iic_start'])

array(4000.)