HDF5 for Python: http://docs.h5py.org/en/latest/

## Load modules

In [14]:
import h5py 
import os
import itertools
import re

## Define helper functions

In [223]:
def get_filenames(path):
    """Recursively list the files under ``path`` as a nested list.

    Every directory becomes a list. Inside it, each sub-directory becomes a
    nested list and every other entry becomes the string
    ``path + "/" + name``. The nesting therefore mirrors the directory tree;
    flatten it with ``unlist`` once per directory level.

    Entries are visited in name order. ``os.scandir`` itself yields entries
    in arbitrary order, which would make positional lookups on the result
    (such as "the first file") differ between runs and machines.

    Parameters
    ----------
    path : str
        Directory to scan.

    Returns
    -------
    list
        Nested list of file-path strings, ordered by entry name at each level.
    """
    # sorted() consumes the scandir iterator completely before any recursion
    # starts, so at most one directory handle is open at a time.
    entries = sorted(os.scandir(path), key=lambda entry: entry.name)
    return [
        get_filenames(path + "/" + entry.name)
        if entry.is_dir()
        else path + "/" + entry.name
        for entry in entries
    ]
def unlist(alist):
    """Flatten ``alist`` by exactly one level of nesting.

    Each element of ``alist`` must itself be iterable; its items are appended,
    in order, to a single new list. Deeper nesting is left untouched, and a
    string element is expanded into its individual characters.
    """
    flattened = []
    for inner in alist:
        flattened.extend(inner)
    return flattened
def check_HDF5_Group(Group):
    """Print every member name of ``Group``, then every member object.

    ``Group`` only needs to provide ``keys()`` and ``values()``. All names
    are printed first (one ``key: ...`` line each), followed by all member
    objects (one ``value: ...`` line each).
    """
    for name in Group.keys():
        print('key:', name)
    for member in Group.values():
        print('value:', member)

def check_HDF5_Dataset(Dataset):
    """Print the shape, dtype, in-memory type and full contents of a dataset.

    ``Dataset`` needs ``shape`` and ``dtype`` attributes and must support
    indexing with an empty tuple (``Dataset[()]``), which for an h5py dataset
    reads the whole dataset into memory.

    The contents are read once and reused for both the type line and the
    value line.
    """
    print('shape:',Dataset.shape)
    print('dtype:',Dataset.dtype)
    # ``Dataset.value`` was deprecated in h5py 2.x and removed in h5py 3.0;
    # ``Dataset[()]`` is the supported way to read the entire dataset.
    contents = Dataset[()]
    # The label text is kept unchanged so the printed output matches earlier runs.
    print('type(bc.value):',type(contents))
    print('value:',contents)

## Get list of filenames

In [17]:
# NOTE(review): machine-specific absolute path; point it at the local copy of
# the data before running.
path = "/Users/David/Dropbox/Data/MillionSongSubset/data"
# get_filenames() adds one level of list nesting per directory level, so the
# three unlist() calls assume the files sit exactly three directories below
# `path` -- TODO confirm against the actual layout.
x = unlist(unlist(unlist(get_filenames(path))))
# Raw string: "\." in a plain string literal is an invalid escape sequence
# (DeprecationWarning since Python 3.6, SyntaxWarning since Python 3.12).
p = re.compile(r"\.h5$")
# Sorted so that y[0] is the same file on every run; the directory listing
# underneath get_filenames() does not guarantee any particular order.
y = sorted(filename for filename in x if p.search(filename))

## Read in the first HDF5 file, which is y[0].

In [218]:
# Open the first matching file read-only. The handle `f` is deliberately left
# open because the following cells keep reading from it.
# NOTE(review): nothing in the visible part of the notebook closes it.
f = h5py.File(y[0], "r")
print(f)

<HDF5 file "TRAAAAW128F429D538.h5" (mode r+)>


Note that it is of type `File`. The other HDF5 types are `Group` and `Dataset`.

In [71]:
# Confirm that the opened handle is an instance of h5py.File.
isinstance(f, h5py.File)

True

In [228]:
# Print the File object and the group obtained by indexing it with "/",
# so their reprs can be compared side by side.
print(f)
print(f["/"])
# Last expression of the cell: whether the File compares equal to f["/"].
f == f["/"]

<HDF5 file "TRAAAAW128F429D538.h5" (mode r+)>
<HDF5 group "/" (3 members)>


True

The hierarchy stored in the file starts at the root `/`. Beneath it are objects of type `Group`, possibly nested at multiple levels, with objects of type `Dataset` at the leaves.

In [229]:
# Print the member names and member objects found directly under "/".
check_HDF5_Group(f["/"])

key: analysis
key: metadata
key: musicbrainz
value: <HDF5 group "/analysis" (16 members)>
value: <HDF5 group "/metadata" (5 members)>
value: <HDF5 group "/musicbrainz" (3 members)>


In [231]:
# Print the member names and member objects of the "/analysis" group.
check_HDF5_Group(f["/analysis"])

key: bars_confidence
key: bars_start
key: beats_confidence
key: beats_start
key: sections_confidence
key: sections_start
key: segments_confidence
key: segments_loudness_max
key: segments_loudness_max_time
key: segments_loudness_start
key: segments_pitches
key: segments_start
key: segments_timbre
key: songs
key: tatums_confidence
key: tatums_start
value: <HDF5 dataset "bars_confidence": shape (83,), type "<f8">
value: <HDF5 dataset "bars_start": shape (83,), type "<f8">
value: <HDF5 dataset "beats_confidence": shape (344,), type "<f8">
value: <HDF5 dataset "beats_start": shape (344,), type "<f8">
value: <HDF5 dataset "sections_confidence": shape (10,), type "<f8">
value: <HDF5 dataset "sections_start": shape (10,), type "<f8">
value: <HDF5 dataset "segments_confidence": shape (971,), type "<f8">
value: <HDF5 dataset "segments_loudness_max": shape (971,), type "<f8">
value: <HDF5 dataset "segments_loudness_max_time": shape (971,), type "<f8">
value: <HDF5 dataset "segments_loudness_start

## Check out /analysis/songs

It looks like this dataset contains the variables that have a single value per song.

In [234]:
# Print shape, dtype, in-memory type and contents of "/analysis/songs".
check_HDF5_Dataset(f["/analysis/songs"])

shape: (1,)
dtype: [('analysis_sample_rate', '<i4'), ('audio_md5', 'S32'), ('danceability', '<f8'), ('duration', '<f8'), ('end_of_fade_in', '<f8'), ('energy', '<f8'), ('idx_bars_confidence', '<i4'), ('idx_bars_start', '<i4'), ('idx_beats_confidence', '<i4'), ('idx_beats_start', '<i4'), ('idx_sections_confidence', '<i4'), ('idx_sections_start', '<i4'), ('idx_segments_confidence', '<i4'), ('idx_segments_loudness_max', '<i4'), ('idx_segments_loudness_max_time', '<i4'), ('idx_segments_loudness_start', '<i4'), ('idx_segments_pitches', '<i4'), ('idx_segments_start', '<i4'), ('idx_segments_timbre', '<i4'), ('idx_tatums_confidence', '<i4'), ('idx_tatums_start', '<i4'), ('key', '<i4'), ('key_confidence', '<f8'), ('loudness', '<f8'), ('mode', '<i4'), ('mode_confidence', '<f8'), ('start_of_fade_out', '<f8'), ('tempo', '<f8'), ('time_signature', '<i4'), ('time_signature_confidence', '<f8'), ('track_id', 'S32')]
type(bc.value): <class 'numpy.ndarray'>
value: [ (22050, b'a222795e07cd65b7a530f1346f52

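Why are the values all in the first element of `value`? There is no second element. (The dataset has shape `(1,)` and a compound dtype, so `value` is a one-element structured array whose single record holds every field.)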

In [159]:
import numpy
# `bc` is not defined anywhere in the visible notebook, so this cell only ran
# thanks to leftover kernel state. Bind it explicitly so the cell works on a
# fresh kernel. The dtypes printed below match the fields of
# "/analysis/songs" one for one -- NOTE(review): confirm that this is the
# dataset `bc` originally referred to.
bc = f["/analysis/songs"]
# Print the dtype of each field of the compound dtype, in field order.
# A plain range() already yields ints, so no numpy.arange/int() round-trip.
for i in range(len(bc.dtype)):
    print(bc.dtype[i])

int32
|S32
float64
float64
float64
float64
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
float64
float64
int32
float64
float64
float64
int32
float64
|S32


In [181]:
# `.value` was removed in h5py 3.0; indexing with () reads the whole dataset.
# NOTE(review): "two elements" presumably refers to the two parts of the repr
# below (the single record and the compound dtype), not to the array length.
bc[()] # two elements here

array([ (22050, b'a222795e07cd65b7a530f1346f520649', 0.0, 218.93179, 0.247, 0.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0.736, -11.197, 0, 0.636, 218.932, 92.198, 4, 0.778, b'TRAAAAW128F429D538')], 
      dtype=[('analysis_sample_rate', '<i4'), ('audio_md5', 'S32'), ('danceability', '<f8'), ('duration', '<f8'), ('end_of_fade_in', '<f8'), ('energy', '<f8'), ('idx_bars_confidence', '<i4'), ('idx_bars_start', '<i4'), ('idx_beats_confidence', '<i4'), ('idx_beats_start', '<i4'), ('idx_sections_confidence', '<i4'), ('idx_sections_start', '<i4'), ('idx_segments_confidence', '<i4'), ('idx_segments_loudness_max', '<i4'), ('idx_segments_loudness_max_time', '<i4'), ('idx_segments_loudness_start', '<i4'), ('idx_segments_pitches', '<i4'), ('idx_segments_start', '<i4'), ('idx_segments_timbre', '<i4'), ('idx_tatums_confidence', '<i4'), ('idx_tatums_start', '<i4'), ('key', '<i4'), ('key_confidence', '<f8'), ('loudness', '<f8'), ('mode', '<i4'), ('mode_confidence', '<f8'), ('start_of_fade_out'

## Check out /analysis/tatums_start

In [180]:
# Print shape, dtype, in-memory type and contents of "/analysis/tatums_start".
check_HDF5_Dataset(f["/analysis/tatums_start"])

shape: (688,)
dtype: float64
type(bc.value): <class 'numpy.ndarray'>
value: [   0.28519    0.58521    0.89422    1.19196    1.49119    1.78893
    2.0828     2.37813    2.66761    2.94247    3.23074    3.50622
    3.79165    4.05077    4.3099     4.56902    4.82814    5.14371
    5.45437    5.76504    6.09135    6.41767    6.75551    7.08552
    7.41553    7.74554    8.07907    8.40279    8.73142    9.05841
    9.38051    9.70913   10.03123   10.36149   10.68195   11.01057
   11.34083   11.66783   11.99809   12.32182   12.65209   12.98399
   13.31099   13.63798   13.96498   14.28872   14.61572   14.94762
   15.27298   15.59835   15.92535   16.25463   16.58233   16.9132
   17.23796   17.56272   17.88748   18.21002   18.52945   18.85198
   19.17898   19.50271   19.83788   20.16161   20.49024   20.81724
   21.14751   21.47124   21.79988   22.12232   22.44802   22.76883
   23.08634   23.41491   23.74347   24.07314   24.39964   24.72931   25.051
   25.381     25.70102   26.053     26.40498 