HDF5 for Python: http://docs.h5py.org/en/latest/

## Load modules

In [77]:
import h5py 
import os
import itertools
import re

## Define helper functions

In [3]:
def get_filenames(path):
    """Recursively list the contents of ``path`` as a nested list.

    Each directory entry becomes one element of the result: a plain file
    contributes the string ``path + "/" + name``, while a sub-directory
    contributes the (nested) list produced by recursing into it. The
    nesting of the result therefore mirrors the directory tree, and
    callers must flatten it themselves.

    Args:
        path: directory to scan, as a string.

    Returns:
        A list whose items are either file-path strings or nested lists.
    """
    listing = []
    for entry in os.scandir(path):
        child = path + "/" + entry.name
        if entry.is_dir():
            # Keep the sub-tree as its own list so depth is preserved.
            listing.append(get_filenames(child))
        else:
            listing.append(child)
    return listing
def unlist(alist):
    """Flatten exactly one level of nesting.

    Every element of ``alist`` is iterated and its items are appended, in
    order, to a single new list. Note that a string element is iterated
    character by character.

    Args:
        alist: an iterable of iterables.

    Returns:
        A new flat list containing the items of each element of ``alist``.
    """
    flattened = []
    for inner in alist:
        flattened.extend(inner)
    return flattened
def check_HDF5_Group(Group):
    """Print every key, then every value, of a group-like object.

    Args:
        Group: any object exposing ``keys()`` and ``values()`` (the
            notebook passes h5py groups).

    Returns:
        None. One ``key: ...`` line is printed per key, followed by one
        ``value: ...`` line per value.
    """
    member_names = list(Group.keys())
    for member_name in member_names:
        print('key:', member_name)
    members = list(Group.values())
    for member in members:
        print('value:', member)

def check_HDF5_Dataset(Dataset):
    """Print the shape, dtype, in-memory type and contents of a dataset.

    Args:
        Dataset: an h5py dataset, or any object exposing ``shape``,
            ``dtype`` and empty-tuple indexing (``obj[()]``).

    Returns:
        None. Four labelled lines are printed.
    """
    print('shape:',Dataset.shape)
    print('dtype:',Dataset.dtype)
    # `Dataset.value` was deprecated and then removed in h5py 3.0; indexing
    # with an empty tuple is the supported way to read the whole dataset.
    # Read once and reuse the result instead of hitting the file twice.
    contents = Dataset[()]
    # The label text is kept as-is so output matches earlier recorded runs.
    print('type(bc.value):',type(contents))
    print('value:',contents)

## Get list of filenames

In [23]:
path = "/Users/David/Dropbox/Data/MillionSongSubset/data"
# get_filenames() mirrors the directory tree as nested lists; each unlist()
# removes one level. Three passes assume the track files sit three
# directories below `path` -- TODO confirm against the actual layout.
x = unlist(unlist(unlist(get_filenames(path))))
# Raw string: "\." in a plain literal is an invalid escape sequence and
# triggers a warning on recent Python versions.
p = re.compile(r"\.h5$")
# Keep only the paths that end in ".h5".
y = [filename for filename in x if p.search(filename)]

## Open the first HDF5 file y[0].

Each file contains data for a single track or song. (Need to figure this out.)

In [5]:
# Open the first matching track file read-only; the cells below read from `f`.
f = h5py.File(y[0], "r")

In [6]:
# Print the keys and values found in the root group of the open file.
check_HDF5_Group(f["/"])

key: analysis
key: metadata
key: musicbrainz
value: <HDF5 group "/analysis" (16 members)>
value: <HDF5 group "/metadata" (5 members)>
value: <HDF5 group "/musicbrainz" (3 members)>


## Look at the `/metadata` group

In [7]:
# Print the keys and values found in the /metadata group.
check_HDF5_Group(f["/metadata"])

key: artist_terms
key: artist_terms_freq
key: artist_terms_weight
key: similar_artists
key: songs
value: <HDF5 dataset "artist_terms": shape (37,), type "|S256">
value: <HDF5 dataset "artist_terms_freq": shape (37,), type "<f8">
value: <HDF5 dataset "artist_terms_weight": shape (37,), type "<f8">
value: <HDF5 dataset "similar_artists": shape (100,), type "|S20">
value: <HDF5 dataset "songs": shape (1,), type "|V5320">


Need to figure out what the `freq` and `weight` say about the `artist_terms`.

In [17]:
# `.value` was removed in h5py 3.0; `[()]` reads the whole dataset instead.
f["/metadata/artist_terms"][()]

array([b'hip hop', b'underground rap', b'g funk', b'alternative rap',
       b'gothic rock', b'west coast rap', b'rap', b'club dance',
       b'singer-songwriter', b'chill-out', b'underground hip hop', b'rock',
       b'gothic', b'san francisco bay area', b'indie', b'american',
       b'punk', b'california', b'industrial', b'new york', b'90s',
       b'latin', b'spanish', b'dark', b'ebm', b'underground', b'deathrock',
       b'west coast', b'san francisco', b'producer', b'oakland',
       b'catalan', b'barcelona', b'doomsdope', b'norcal',
       b'west coast hip hop', b'alternative rock'], 
      dtype='|S256')

In [18]:
# `.value` was removed in h5py 3.0; `[()]` reads the whole dataset instead.
f["/metadata/artist_terms_freq"][()]

array([ 1.        ,  0.77613623,  0.72966979,  0.68301072,  0.73013328,
        0.6715377 ,  0.80834839,  0.63365545,  0.72966979,  0.5663102 ,
        0.57592584,  0.76900909,  0.58208775,  0.47493557,  0.67402276,
        0.61640511,  0.64964936,  0.51614094,  0.6000771 ,  0.51980619,
        0.59353997,  0.56511801,  0.55205485,  0.53260669,  0.53958053,
        0.50931501,  0.45188402,  0.4484606 ,  0.43320144,  0.4322592 ,
        0.42699224,  0.40636693,  0.36872193,  0.35580011,  0.33148334,
        0.28803951,  0.3212017 ])

In [19]:
# `.value` was removed in h5py 3.0; `[()]` reads the whole dataset instead.
f["/metadata/artist_terms_weight"][()]

array([ 1.        ,  0.89793596,  0.88426185,  0.84262975,  0.84256301,
        0.83239282,  0.82577707,  0.79859195,  0.7431759 ,  0.73850237,
        0.72505245,  0.71389955,  0.67049417,  0.65697231,  0.65105613,
        0.65105612,  0.65105597,  0.65105592,  0.65105547,  0.65105532,
        0.65105508,  0.65105506,  0.65105461,  0.65105427,  0.65105376,
        0.65104997,  0.6364043 ,  0.63334971,  0.61973455,  0.61889383,
        0.61419433,  0.59579116,  0.56220197,  0.55067233,  0.52897541,
        0.49021215,  0.38341077])

Artist identifiers are strings that start with "AR".

In [234]:
# Print shape, dtype, in-memory type and contents of /analysis/songs.
check_HDF5_Dataset(f["/analysis/songs"])

shape: (1,)
dtype: [('analysis_sample_rate', '<i4'), ('audio_md5', 'S32'), ('danceability', '<f8'), ('duration', '<f8'), ('end_of_fade_in', '<f8'), ('energy', '<f8'), ('idx_bars_confidence', '<i4'), ('idx_bars_start', '<i4'), ('idx_beats_confidence', '<i4'), ('idx_beats_start', '<i4'), ('idx_sections_confidence', '<i4'), ('idx_sections_start', '<i4'), ('idx_segments_confidence', '<i4'), ('idx_segments_loudness_max', '<i4'), ('idx_segments_loudness_max_time', '<i4'), ('idx_segments_loudness_start', '<i4'), ('idx_segments_pitches', '<i4'), ('idx_segments_start', '<i4'), ('idx_segments_timbre', '<i4'), ('idx_tatums_confidence', '<i4'), ('idx_tatums_start', '<i4'), ('key', '<i4'), ('key_confidence', '<f8'), ('loudness', '<f8'), ('mode', '<i4'), ('mode_confidence', '<f8'), ('start_of_fade_out', '<f8'), ('tempo', '<f8'), ('time_signature', '<i4'), ('time_signature_confidence', '<f8'), ('track_id', 'S32')]
type(bc.value): <class 'numpy.ndarray'>
value: [ (22050, b'a222795e07cd65b7a530f1346f52

The `/metadata/songs` values correspond to the single value variables. Notice that the _file-song_ dataset contains multiple value variables. 

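Why are the values in the first element of the `value` object? There is no second element. At least, the other element (which seems to be named `dtype`) cannot be accessed as the second element or as an element named `dtype`. (Likely answer: `value` is a NumPy structured array of shape `(1,)`, so it holds exactly one record; `dtype` is an attribute of the array that describes the record's named fields, not a second element. It is reached as `value.dtype`, and a single field as, e.g., `value[0]['tempo']`.)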

In [159]:
import numpy  # kept in place: cells outside this view may rely on this import
# `bc` was never assigned anywhere in the notebook (it only existed as
# leftover kernel state), so bind it explicitly. The recorded output below
# matches the field dtypes of /analysis/songs.
bc = f["/analysis/songs"]
# Print the dtype of each field of the compound record, in field order.
for i in range(len(bc.dtype)):
    print(bc.dtype[i])

int32
|S32
float64
float64
float64
float64
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
float64
float64
int32
float64
float64
float64
int32
float64
|S32


In [181]:
# Read /analysis/songs directly: `bc` is not defined by any earlier cell and
# `.value` was removed in h5py 3.0. The result holds one record (shape (1,));
# the trailing `dtype=[...]` in the repr is an attribute of the array that
# names the record's fields, not a second element.
f["/analysis/songs"][()]

array([ (22050, b'a222795e07cd65b7a530f1346f520649', 0.0, 218.93179, 0.247, 0.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0.736, -11.197, 0, 0.636, 218.932, 92.198, 4, 0.778, b'TRAAAAW128F429D538')], 
      dtype=[('analysis_sample_rate', '<i4'), ('audio_md5', 'S32'), ('danceability', '<f8'), ('duration', '<f8'), ('end_of_fade_in', '<f8'), ('energy', '<f8'), ('idx_bars_confidence', '<i4'), ('idx_bars_start', '<i4'), ('idx_beats_confidence', '<i4'), ('idx_beats_start', '<i4'), ('idx_sections_confidence', '<i4'), ('idx_sections_start', '<i4'), ('idx_segments_confidence', '<i4'), ('idx_segments_loudness_max', '<i4'), ('idx_segments_loudness_max_time', '<i4'), ('idx_segments_loudness_start', '<i4'), ('idx_segments_pitches', '<i4'), ('idx_segments_start', '<i4'), ('idx_segments_timbre', '<i4'), ('idx_tatums_confidence', '<i4'), ('idx_tatums_start', '<i4'), ('key', '<i4'), ('key_confidence', '<f8'), ('loudness', '<f8'), ('mode', '<i4'), ('mode_confidence', '<f8'), ('start_of_fade_out'

## Check out /analysis/tatums_start

Note that we have a vector of `688` values for this variable, for this song.

In [7]:
# Print shape, dtype, in-memory type and contents of /analysis/tatums_start.
check_HDF5_Dataset(f["/analysis/tatums_start"])

shape: (688,)
dtype: float64
type(bc.value): <class 'numpy.ndarray'>
value: [   0.28519    0.58521    0.89422    1.19196    1.49119    1.78893
    2.0828     2.37813    2.66761    2.94247    3.23074    3.50622
    3.79165    4.05077    4.3099     4.56902    4.82814    5.14371
    5.45437    5.76504    6.09135    6.41767    6.75551    7.08552
    7.41553    7.74554    8.07907    8.40279    8.73142    9.05841
    9.38051    9.70913   10.03123   10.36149   10.68195   11.01057
   11.34083   11.66783   11.99809   12.32182   12.65209   12.98399
   13.31099   13.63798   13.96498   14.28872   14.61572   14.94762
   15.27298   15.59835   15.92535   16.25463   16.58233   16.9132
   17.23796   17.56272   17.88748   18.21002   18.52945   18.85198
   19.17898   19.50271   19.83788   20.16161   20.49024   20.81724
   21.14751   21.47124   21.79988   22.12232   22.44802   22.76883
   23.08634   23.41491   23.74347   24.07314   24.39964   24.72931   25.051
   25.381     25.70102   26.053     26.40498 

## Check out /analysis/segments_pitches

Note that we have a matrix of `971` rows and `12` columns for this variable, for this song. 

In [9]:
# Print shape, dtype, in-memory type and contents of /analysis/segments_pitches.
check_HDF5_Dataset(f["/analysis/segments_pitches"])

shape: (971, 12)
dtype: float64
type(bc.value): <class 'numpy.ndarray'>
value: [[ 0.946  0.684  0.679 ...,  0.732  1.     0.742]
 [ 0.01   0.054  0.015 ...,  0.008  0.012  0.017]
 [ 0.296  0.993  1.    ...,  0.105  0.064  0.19 ]
 ..., 
 [ 0.41   0.777  0.126 ...,  0.999  0.378  0.007]
 [ 0.373  0.697  1.    ...,  0.413  0.577  0.431]
 [ 0.877  1.     0.059 ...,  0.343  0.583  0.031]]


In [22]:
# Print the keys and values found in the /musicbrainz group.
check_HDF5_Group(f["/musicbrainz"])

key: artist_mbtags
key: artist_mbtags_count
key: songs
value: <HDF5 dataset "artist_mbtags": shape (0,), type "|S256">
value: <HDF5 dataset "artist_mbtags_count": shape (0,), type "<i4">
value: <HDF5 dataset "songs": shape (1,), type "|V8">


## Read in the "additional" HDF5 file

In [26]:
# Location and name of the summary file that accompanies the subset.
path_addl = "/Users/David/Dropbox/Data/MillionSongSubset/AdditionalFiles"
file_addl = "subset_msd_summary_file.h5"
# NOTE(review): this rebinds `f`, which until now referred to the per-track
# file opened from y[0]; that earlier handle is not closed here, and every
# cell below reads from the summary file instead.
f = h5py.File(path_addl+"/"+file_addl, "r")

In [28]:
# Echo the root group object of the summary file.
f["/"]

<HDF5 group "/" (3 members)>

In [29]:
# Print the keys and values found in the root group of the summary file.
check_HDF5_Group(f["/"])

key: analysis
key: metadata
key: musicbrainz
value: <HDF5 group "/analysis" (1 members)>
value: <HDF5 group "/metadata" (1 members)>
value: <HDF5 group "/musicbrainz" (1 members)>


## Look at `/metadata`

In [37]:
# Names of the members of /metadata in the summary file.
list(f["/metadata"].keys())

['songs']

We (someone) should compare these variables with those from the first file `y[0]` above. 

In [40]:
# Echo the dataset object itself; its repr reports shape and type.
f["/metadata/songs"]

<HDF5 dataset "songs": shape (10000,), type "|V5320">

In [76]:
# `.value` was removed in h5py 3.0; `[()]` reads the whole dataset instead.
# NOTE: `x` is reused here; it no longer holds the list of file names.
x = f["/metadata/songs"][()]
print(len(x)) # number of songs in the million song subset
print('the first item of x:', x[0])
x[0:1] # shows variable names and types

10000
the first item of x: (b'', 29785, 0.7804617487770407, 0.5742747305168561, b'ARMQHX71187B9890D3', nan, b'Atlanta, GA', nan, b'bc5e2ad6-0a4a-4d90-b911-e9a7e6861727', b'Mastodon', -1, b'', 0, 0, b'Call of the Mastodon', 223563, 0.5976407977147769, b'SOVLGJY12A8C13FBED', b'Deep Sea Creature', 2442524)


array([ (b'', 29785, 0.7804617487770407, 0.5742747305168561, b'ARMQHX71187B9890D3', nan, b'Atlanta, GA', nan, b'bc5e2ad6-0a4a-4d90-b911-e9a7e6861727', b'Mastodon', -1, b'', 0, 0, b'Call of the Mastodon', 223563, 0.5976407977147769, b'SOVLGJY12A8C13FBED', b'Deep Sea Creature', 2442524)], 
      dtype=[('analyzer_version', 'S32'), ('artist_7digitalid', '<i4'), ('artist_familiarity', '<f8'), ('artist_hotttnesss', '<f8'), ('artist_id', 'S32'), ('artist_latitude', '<f8'), ('artist_location', 'S1024'), ('artist_longitude', '<f8'), ('artist_mbid', 'S40'), ('artist_name', 'S1024'), ('artist_playmeid', '<i4'), ('genre', 'S1024'), ('idx_artist_terms', '<i4'), ('idx_similar_artists', '<i4'), ('release', 'S1024'), ('release_7digitalid', '<i4'), ('song_hotttnesss', '<f8'), ('song_id', 'S32'), ('title', 'S1024'), ('track_7digitalid', '<i4')])

## Look at `/analysis`

In [46]:
# Names of the members of /analysis in the summary file.
list(f["/analysis"].keys())

['songs']

In [49]:
# `.value` was removed in h5py 3.0; `[()]` reads the whole dataset instead.
x = f["/analysis/songs"][()]
print(len(x))
# First record of the array.
print(x[0])

10000
(22050, b'a600d65cf157a306be60f26ecbf218f4', 0.0, 280.21506, 0.238, 0.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0.555, -3.306, 1, 0.5, 275.528, 173.205, 5, 0.12, b'TRACCVZ128F4291A8A')


## Look at `/musicbrainz`

In [51]:
# Names of the members of /musicbrainz in the summary file.
list(f["/musicbrainz"].keys())

['songs']

In [53]:
# `.value` was removed in h5py 3.0; `[()]` reads the whole dataset instead.
x = f["/musicbrainz/songs"][()]
print(len(x))
# First two records, for a quick look at the record layout.
print(x[0],x[1])

10000
(0, 2001) (0, 1984)


In [63]:
# Look at one record several ways: the array shape, the raw record, the
# record converted to a list, and the second field of that list.
x.shape, x[0], list(x[0]), list(x[0])[1]

((10000,), (0, 2001), [0, 2001], 2001)

The first value of each of the 10,000 2-tuples is a zero.

In [66]:
# Distinct values taken by the first field across all records of `x`.
{list(record)[0] for record in x}

{0}

So it looks like `/musicbrainz/songs` contains only the year of the song.

Details to put somewhere else that shouldn't be forgotten:

- Artist IDs likely start with "AR"
- Track IDs likely start with "TR"
- Song IDs likely start with "SO"
- The `mb` in names such as `artist_mbid` and `artist_mbtags` likely indicates _MusicBrainz_ information.