# Million Song Subset - Variable Investigation

## Essential setup (can be ignored)

In [1]:
import h5py 
import os
import itertools
import re

In [2]:
def get_filenames(path):
    """Recursively list the entries under ``path`` as a nested list.

    Every directory becomes a sub-list (built by a recursive call) and every
    other entry becomes the string ``path + "/" + name``, so the nesting of
    the result mirrors the nesting of the directory tree.  Entries appear in
    the order in which ``os.scandir`` yields them.
    """
    listing = []
    for entry in os.scandir(path):
        child = path + "/" + entry.name
        if entry.is_dir():
            listing.append(get_filenames(child))
        else:
            listing.append(child)
    return listing
def unlist(alist):
    """Flatten ``alist`` by exactly one level and return a new list.

    Each element of ``alist`` that is itself an iterable is replaced by its
    items.  Strings and bytes are kept whole rather than being split into
    single characters, so a file name sitting one level shallower than its
    neighbours survives repeated ``unlist`` calls intact.
    """
    flattened = []
    for element in alist:
        if isinstance(element, (str, bytes)):
            # A string is iterable, but exploding it into characters is never
            # what a path-flattening caller wants.
            flattened.append(element)
        else:
            flattened.extend(element)
    return flattened
def check_HDF5_Group(Group):
    """Print every key of ``Group``, then every value, one per line.

    Keys are printed with a ``key:`` prefix and values with a ``value:``
    prefix; all keys are printed before any value.
    """
    for label, accessor in (('key:', 'keys'), ('value:', 'values')):
        # Materialise the view first, then print each member on its own line.
        for member in tuple(getattr(Group, accessor)()):
            print(label, member)

def check_HDF5_Dataset(Dataset):
    """Print the shape, dtype, in-memory type and contents of ``Dataset``.

    The contents are read once with ``Dataset[()]``.  The ``Dataset.value``
    property used previously was deprecated in h5py 2.x and removed in
    h5py 3.0, so it fails on current installations.
    """
    print('shape:',Dataset.shape)
    print('dtype:',Dataset.dtype)
    # Read the whole dataset a single time and reuse it for both lines below.
    contents = Dataset[()]
    # The label text is kept unchanged so output stays comparable with earlier runs.
    print('type(bc.value):',type(contents))
    print('value:',contents)

In [3]:
# NOTE(review): absolute, machine-specific path; point it at the local copy of the subset.
path = "/Users/David/Dropbox/Data/MillionSongSubset/data"
# get_filenames returns a nested list mirroring the directory tree; the three
# unlist passes flatten it, which assumes the files sit three directories below `path`.
x = unlist(unlist(unlist(get_filenames(path))))
# Raw string so that `\.` is a regex escape rather than an invalid string escape.
p = re.compile(r"\.h5$")
# Keep only the names that end in ".h5".
y = [filename for filename in x if p.search(filename)]

## Check out a single song

The following command opens the first file, which holds the information for a single song in the dataset.

In [4]:
# Open the first .h5 file found above, read-only; `f` is reused by the cells below.
f = h5py.File(y[0], "r")

The file has three groups of data: `analysis`, `metadata` and `musicbrainz`.

In [5]:
check_HDF5_Group(f["/"])

key: analysis
key: metadata
key: musicbrainz
value: <HDF5 group "/analysis" (16 members)>
value: <HDF5 group "/metadata" (5 members)>
value: <HDF5 group "/musicbrainz" (3 members)>


## Check out the `/metadata` group

In [7]:
check_HDF5_Group(f["/metadata"])

key: artist_terms
key: artist_terms_freq
key: artist_terms_weight
key: similar_artists
key: songs
value: <HDF5 dataset "artist_terms": shape (37,), type "|S256">
value: <HDF5 dataset "artist_terms_freq": shape (37,), type "<f8">
value: <HDF5 dataset "artist_terms_weight": shape (37,), type "<f8">
value: <HDF5 dataset "similar_artists": shape (100,), type "|S20">
value: <HDF5 dataset "songs": shape (1,), type "|V5320">


TODO: figure out what the `freq` and `weight` values say about the `artist_terms`.

In [6]:
f["/metadata/artist_terms"]

<HDF5 dataset "artist_terms": shape (37,), type "|S256">

In [17]:
# `Dataset.value` was removed in h5py 3.0; `[()]` reads the full dataset into a NumPy array.
f["/metadata/artist_terms"][()]

array([b'hip hop', b'underground rap', b'g funk', b'alternative rap',
       b'gothic rock', b'west coast rap', b'rap', b'club dance',
       b'singer-songwriter', b'chill-out', b'underground hip hop', b'rock',
       b'gothic', b'san francisco bay area', b'indie', b'american',
       b'punk', b'california', b'industrial', b'new york', b'90s',
       b'latin', b'spanish', b'dark', b'ebm', b'underground', b'deathrock',
       b'west coast', b'san francisco', b'producer', b'oakland',
       b'catalan', b'barcelona', b'doomsdope', b'norcal',
       b'west coast hip hop', b'alternative rock'], 
      dtype='|S256')

In [7]:
f["/metadata/artist_terms_freq"]

<HDF5 dataset "artist_terms_freq": shape (37,), type "<f8">

In [18]:
# `Dataset.value` was removed in h5py 3.0; `[()]` reads the full dataset into a NumPy array.
f["/metadata/artist_terms_freq"][()]

array([ 1.        ,  0.77613623,  0.72966979,  0.68301072,  0.73013328,
        0.6715377 ,  0.80834839,  0.63365545,  0.72966979,  0.5663102 ,
        0.57592584,  0.76900909,  0.58208775,  0.47493557,  0.67402276,
        0.61640511,  0.64964936,  0.51614094,  0.6000771 ,  0.51980619,
        0.59353997,  0.56511801,  0.55205485,  0.53260669,  0.53958053,
        0.50931501,  0.45188402,  0.4484606 ,  0.43320144,  0.4322592 ,
        0.42699224,  0.40636693,  0.36872193,  0.35580011,  0.33148334,
        0.28803951,  0.3212017 ])

In [19]:
# `Dataset.value` was removed in h5py 3.0; `[()]` reads the full dataset into a NumPy array.
f["/metadata/artist_terms_weight"][()]

array([ 1.        ,  0.89793596,  0.88426185,  0.84262975,  0.84256301,
        0.83239282,  0.82577707,  0.79859195,  0.7431759 ,  0.73850237,
        0.72505245,  0.71389955,  0.67049417,  0.65697231,  0.65105613,
        0.65105612,  0.65105597,  0.65105592,  0.65105547,  0.65105532,
        0.65105508,  0.65105506,  0.65105461,  0.65105427,  0.65105376,
        0.65104997,  0.6364043 ,  0.63334971,  0.61973455,  0.61889383,
        0.61419433,  0.59579116,  0.56220197,  0.55067233,  0.52897541,
        0.49021215,  0.38341077])

Artist identifiers are strings that start with "AR".

In [13]:
f["/metadata/songs"]

<HDF5 dataset "songs": shape (1,), type "|V5320">

In [17]:
# `Dataset.value` was removed in h5py 3.0; `[()]` reads the full dataset into a NumPy array.
f["/metadata/songs"][()]

array([ (b'', 165270, 0.5817937658450281, 0.4019975433642836, b'ARD7TVE1187B99BFB1', nan, b'California - LA', nan, b'e77e51a5-4761-45b3-9847-2051f811e366', b'Casual', 4479, b'', 0, 0, b'Fear Itself', 300848, 0.6021199899057548, b'SOMZWCG12A8C13C480', b"I Didn't Mean To", 3401791)], 
      dtype=[('analyzer_version', 'S32'), ('artist_7digitalid', '<i4'), ('artist_familiarity', '<f8'), ('artist_hotttnesss', '<f8'), ('artist_id', 'S32'), ('artist_latitude', '<f8'), ('artist_location', 'S1024'), ('artist_longitude', '<f8'), ('artist_mbid', 'S40'), ('artist_name', 'S1024'), ('artist_playmeid', '<i4'), ('genre', 'S1024'), ('idx_artist_terms', '<i4'), ('idx_similar_artists', '<i4'), ('release', 'S1024'), ('release_7digitalid', '<i4'), ('song_hotttnesss', '<f8'), ('song_id', 'S32'), ('title', 'S1024'), ('track_7digitalid', '<i4')])

The fields of `/metadata/songs` are the single-valued variables: one value per song. Notice that the per-song file also contains multi-valued variables, stored as separate datasets (for example `artist_terms` and `similar_artists`).

In [21]:
f["/metadata/similar_artists"]

<HDF5 dataset "similar_artists": shape (100,), type "|S20">

In [22]:
# `Dataset.value` was removed in h5py 3.0; `[()]` reads the full dataset into a NumPy array.
f["/metadata/similar_artists"][()]

array([b'ARV4KO21187FB38008', b'ARWHM281187FB3D381', b'ARJGOG11187B98D89F',
       b'AR9ODB41187FB459B2', b'ARXM6VQ1187FB5B1E0', b'ARNWZ1N1187B9B71BA',
       b'ARDWYZZ11F4C8413FA', b'ARTP3H51187B98FB75', b'ARWCDXN12454A4D1E8',
       b'ARJ54S61187B9ACD39', b'AR5PF241187B989C1D', b'ARR7MLL1187B99B636',
       b'ARLMHFV1187B9A3833', b'ARPRERY1187B99E2DC', b'AR34BCQ1187B9A68E4',
       b'ARFWBUC11F4C8413DA', b'ARPWGMN1187FB560E3', b'ARVCIVW12454A4D1E7',
       b'ARG89HY1187FB3CA15', b'AR9IGU51187FB40D6B', b'ARNNOYR11F4C845127',
       b'ARZMFNT11F4C8413DD', b'ARPR9W71187FB3723A', b'AR5VBGP1187B98EB43',
       b'ARFHDOI1187FB57230', b'ARBSQPF11F4C8413E0', b'AROYGID11F4C8413DB',
       b'ARDXUGZ11F4C84452F', b'ARMW4I01187B98AEF8', b'AR7AYQG1187B994B3F',
       b'ARHVZEM11F4C841FF9', b'ARP9H0U1187FB3FEA7', b'ARVSIGU11F4C8413E6',
       b'AROWKNS1187FB59ED5', b'ARUSTLW11F4C8413DE', b'ARSKPDX11F4C83D2A9',
       b'ARB4D891187B9954F7', b'ARRIWD31187B9A9B4A', b'ARNAAQH11F4C8413E1',
       b'ARV

## Check out the `analysis` group

In [24]:
check_HDF5_Group(f["/analysis"])

key: bars_confidence
key: bars_start
key: beats_confidence
key: beats_start
key: sections_confidence
key: sections_start
key: segments_confidence
key: segments_loudness_max
key: segments_loudness_max_time
key: segments_loudness_start
key: segments_pitches
key: segments_start
key: segments_timbre
key: songs
key: tatums_confidence
key: tatums_start
value: <HDF5 dataset "bars_confidence": shape (83,), type "<f8">
value: <HDF5 dataset "bars_start": shape (83,), type "<f8">
value: <HDF5 dataset "beats_confidence": shape (344,), type "<f8">
value: <HDF5 dataset "beats_start": shape (344,), type "<f8">
value: <HDF5 dataset "sections_confidence": shape (10,), type "<f8">
value: <HDF5 dataset "sections_start": shape (10,), type "<f8">
value: <HDF5 dataset "segments_confidence": shape (971,), type "<f8">
value: <HDF5 dataset "segments_loudness_max": shape (971,), type "<f8">
value: <HDF5 dataset "segments_loudness_max_time": shape (971,), type "<f8">
value: <HDF5 dataset "segments_loudness_start

## Check out the `/analysis/songs` dataset

This looks like a list of standard single value variables. 

In [20]:
f["/analysis/songs"]

<HDF5 dataset "songs": shape (1,), type "|V220">

In [25]:
check_HDF5_Dataset(f["/analysis/songs"])

shape: (1,)
dtype: [('analysis_sample_rate', '<i4'), ('audio_md5', 'S32'), ('danceability', '<f8'), ('duration', '<f8'), ('end_of_fade_in', '<f8'), ('energy', '<f8'), ('idx_bars_confidence', '<i4'), ('idx_bars_start', '<i4'), ('idx_beats_confidence', '<i4'), ('idx_beats_start', '<i4'), ('idx_sections_confidence', '<i4'), ('idx_sections_start', '<i4'), ('idx_segments_confidence', '<i4'), ('idx_segments_loudness_max', '<i4'), ('idx_segments_loudness_max_time', '<i4'), ('idx_segments_loudness_start', '<i4'), ('idx_segments_pitches', '<i4'), ('idx_segments_start', '<i4'), ('idx_segments_timbre', '<i4'), ('idx_tatums_confidence', '<i4'), ('idx_tatums_start', '<i4'), ('key', '<i4'), ('key_confidence', '<f8'), ('loudness', '<f8'), ('mode', '<i4'), ('mode_confidence', '<f8'), ('start_of_fade_out', '<f8'), ('tempo', '<f8'), ('time_signature', '<i4'), ('time_signature_confidence', '<f8'), ('track_id', 'S32')]
type(bc.value): <class 'numpy.ndarray'>
value: [ (22050, b'a222795e07cd65b7a530f1346f52

## Check out /analysis/tatums_start

What is a "tatum"? We have a __vector__ of `688` numeric decimal values for this variable, for this song. 

In [7]:
check_HDF5_Dataset(f["/analysis/tatums_start"])

shape: (688,)
dtype: float64
type(bc.value): <class 'numpy.ndarray'>
value: [   0.28519    0.58521    0.89422    1.19196    1.49119    1.78893
    2.0828     2.37813    2.66761    2.94247    3.23074    3.50622
    3.79165    4.05077    4.3099     4.56902    4.82814    5.14371
    5.45437    5.76504    6.09135    6.41767    6.75551    7.08552
    7.41553    7.74554    8.07907    8.40279    8.73142    9.05841
    9.38051    9.70913   10.03123   10.36149   10.68195   11.01057
   11.34083   11.66783   11.99809   12.32182   12.65209   12.98399
   13.31099   13.63798   13.96498   14.28872   14.61572   14.94762
   15.27298   15.59835   15.92535   16.25463   16.58233   16.9132
   17.23796   17.56272   17.88748   18.21002   18.52945   18.85198
   19.17898   19.50271   19.83788   20.16161   20.49024   20.81724
   21.14751   21.47124   21.79988   22.12232   22.44802   22.76883
   23.08634   23.41491   23.74347   24.07314   24.39964   24.72931   25.051
   25.381     25.70102   26.053     26.40498 

## Check out /analysis/segments_pitches

What is a "segment"? We have a __matrix__ of `971` rows and `12` columns for the `segments_pitches` variable, for this song.

The `segment` variables are:  

- segments_confidence
- segments_loudness_max
- segments_loudness_max_time
- segments_loudness_start
- segments_pitches
- segments_start
- segments_timbre

In [9]:
check_HDF5_Dataset(f["/analysis/segments_pitches"])

shape: (971, 12)
dtype: float64
type(bc.value): <class 'numpy.ndarray'>
value: [[ 0.946  0.684  0.679 ...,  0.732  1.     0.742]
 [ 0.01   0.054  0.015 ...,  0.008  0.012  0.017]
 [ 0.296  0.993  1.    ...,  0.105  0.064  0.19 ]
 ..., 
 [ 0.41   0.777  0.126 ...,  0.999  0.378  0.007]
 [ 0.373  0.697  1.    ...,  0.413  0.577  0.431]
 [ 0.877  1.     0.059 ...,  0.343  0.583  0.031]]


## Check out /analysis/bars_start

What is a "bar"? We have a __vector__ of `83` values for this variable, for this song. 

In [26]:
check_HDF5_Dataset(f["/analysis/bars_start"])

shape: (83,)
dtype: float64
type(bc.value): <class 'numpy.ndarray'>
value: [   0.58521    2.94247    5.14371    7.74554   10.36149   12.98399
   15.59835   18.21002   20.81724   23.41491   27.28786   29.56053
   31.67335   33.82211   36.44331   39.04965   41.66893   44.26169
   46.88745   49.50972   51.76914   54.3454    56.95219   59.55657
   62.15946   64.75832   67.3715    69.97649   72.5852    75.17954
   81.9098    84.48832   86.92448   89.5343    91.80534   94.42298
   96.99542   99.45001  101.99655  104.61578  107.12841  109.73635
  112.34487  114.94105  117.52123  120.13085  122.74322  125.3533   127.952
  130.55674  133.16574  135.77195  138.36015  140.97508  143.59784
  145.66061  148.15968  150.77104  153.38742  155.98586  158.56945
  161.13148  163.17129  165.50756  167.84934  170.20492  172.85136
  175.43549  177.91593  180.51087  183.14453  185.74844  188.37569
  190.98184  193.61642  196.2308   198.85633  201.56144  204.16946
  206.77992  209.38599  211.99695  214.60515]

## Check out /analysis/bars_confidence

We have a __vector__ of `83` values for this variable, for this song. 

In [27]:
check_HDF5_Dataset(f["/analysis/bars_confidence"])

shape: (83,)
dtype: float64
type(bc.value): <class 'numpy.ndarray'>
value: [ 0.643  0.746  0.722  0.095  0.091  0.362  0.465  0.204  0.129  0.618
  0.192  0.053  0.855  0.236  0.616  0.008  0.113  0.155  0.027  0.593
  0.707  0.453  0.434  0.059  0.994  0.727  0.442  0.489  0.861  0.007
  0.019  0.18   0.28   0.07   0.008  0.19   0.1    0.038  0.074  0.076
  0.185  0.002  0.023  0.074  0.393  0.163  0.119  0.048  0.119  0.006
  0.021  0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
  0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
  0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.   ]


## Check out /analysis/beats_start

What is a "beat"? We have a __vector__ of `344` values for this variable, for this song. 

In [29]:
check_HDF5_Dataset(f["/analysis/beats_start"])

shape: (344,)
dtype: float64
type(bc.value): <class 'numpy.ndarray'>
value: [   0.58521    1.19196    1.78893    2.37813    2.94247    3.50622
    4.05077    4.56902    5.14371    5.76504    6.41767    7.08552
    7.74554    8.40279    9.05841    9.70913   10.36149   11.01057
   11.66783   12.32182   12.98399   13.63798   14.28872   14.94762
   15.59835   16.25463   16.9132    17.56272   18.21002   18.85198
   19.50271   20.16161   20.81724   21.47124   22.12232   22.76883
   23.41491   24.07314   24.72931   25.381     26.053     26.75695
   27.28786   27.82394   28.3656    28.90727   29.56053   30.13608
   30.65717   31.17374   31.67335   32.15958   32.62496   33.17838
   33.82211   34.47504   35.13502   35.7874    36.44331   37.10017
   37.74491   38.39728   39.04965   39.70366   40.35113   41.01003
   41.66893   42.32214   42.9643    43.61667   44.26169   44.91089
   45.5717    46.2311    46.88745   47.54129   48.18394   48.84123
   49.50972   50.1248    50.6717    51.22026   51.769

## Check out /analysis/beats_confidence

What is a "beat"? We have a __vector__ of `344` values for this variable, for this song. 

In [30]:
check_HDF5_Dataset(f["/analysis/beats_confidence"])

shape: (344,)
dtype: float64
type(bc.value): <class 'numpy.ndarray'>
value: [ 0.834  0.851  0.65   0.635  0.532  0.753  0.622  0.657  0.704  0.745
  0.765  0.626  0.748  0.743  0.772  0.641  0.765  0.688  0.47   0.832
  0.468  0.359  0.26   0.81   0.694  0.745  0.52   0.607  0.392  0.782
  0.65   0.663  0.512  0.606  0.44   0.637  0.722  0.657  0.142  0.566
  0.362  0.532  0.607  0.674  0.736  0.519  0.631  0.619  0.654  0.547
  0.53   0.652  0.521  0.717  0.226  0.64   0.359  0.524  0.807  0.762
  0.247  0.63   0.61   0.216  0.22   0.332  0.731  0.684  0.347  0.606
  0.782  0.61   0.68   0.744  0.57   0.511  0.545  0.319  0.488  0.459
  0.565  0.456  0.67   0.332  0.603  0.569  0.751  0.427  0.727  0.531
  0.737  0.485  0.619  0.467  0.746  0.188  0.714  0.448  0.689  0.544
  0.743  0.563  0.738  0.513  0.762  0.615  0.787  0.525  0.719  0.498
  0.722  0.568  0.711  0.168  0.738  0.594  0.582  0.612  0.695  0.487
  0.722  0.683  0.719  0.226  0.353  0.783  0.774  0.823  0.645  0.618
 

## Check out `/musicbrainz`

In [22]:
check_HDF5_Group(f["/musicbrainz"])

key: artist_mbtags
key: artist_mbtags_count
key: songs
value: <HDF5 dataset "artist_mbtags": shape (0,), type "|S256">
value: <HDF5 dataset "artist_mbtags_count": shape (0,), type "<i4">
value: <HDF5 dataset "songs": shape (1,), type "|V8">


In [31]:
check_HDF5_Dataset(f["/musicbrainz/artist_mbtags"])

shape: (0,)
dtype: |S256
type(bc.value): <class 'numpy.ndarray'>
value: []


In [32]:
check_HDF5_Dataset(f["/musicbrainz/artist_mbtags_count"])

shape: (0,)
dtype: int32
type(bc.value): <class 'numpy.ndarray'>
value: []


In [33]:
check_HDF5_Dataset(f["/musicbrainz/songs"])

shape: (1,)
dtype: [('idx_artist_mbtags', '<i4'), ('year', '<i4')]
type(bc.value): <class 'numpy.ndarray'>
value: [(0, 0)]


# STOP HERE FOR TODAY

## Read in the "additional" HDF5 file

In [26]:
# Directory and file name of the additional HDF5 file.
path_addl = "/Users/David/Dropbox/Data/MillionSongSubset/AdditionalFiles"
file_addl = "subset_msd_summary_file.h5"
# Rebinds `f`: from here on it refers to the additional file, opened read-only.
f = h5py.File("/".join((path_addl, file_addl)), "r")

In [28]:
f["/"]

<HDF5 group "/" (3 members)>

In [29]:
check_HDF5_Group(f["/"])

key: analysis
key: metadata
key: musicbrainz
value: <HDF5 group "/analysis" (1 members)>
value: <HDF5 group "/metadata" (1 members)>
value: <HDF5 group "/musicbrainz" (1 members)>


## Look at `/metadata`

In [37]:
list(f["/metadata"].keys())

['songs']

TODO: compare these variables with those from the first file, `y[0]`, above.

In [40]:
f["/metadata/songs"]

<HDF5 dataset "songs": shape (10000,), type "|V5320">

In [76]:
# `[()]` replaces the `.value` property, which was removed in h5py 3.0.
x = f["/metadata/songs"][()]
print(len(x)) # number of songs in the million song subset
print('the first item of x:', x[0])
x[0:1] # shows variable names and types

10000
the first item of x: (b'', 29785, 0.7804617487770407, 0.5742747305168561, b'ARMQHX71187B9890D3', nan, b'Atlanta, GA', nan, b'bc5e2ad6-0a4a-4d90-b911-e9a7e6861727', b'Mastodon', -1, b'', 0, 0, b'Call of the Mastodon', 223563, 0.5976407977147769, b'SOVLGJY12A8C13FBED', b'Deep Sea Creature', 2442524)


array([ (b'', 29785, 0.7804617487770407, 0.5742747305168561, b'ARMQHX71187B9890D3', nan, b'Atlanta, GA', nan, b'bc5e2ad6-0a4a-4d90-b911-e9a7e6861727', b'Mastodon', -1, b'', 0, 0, b'Call of the Mastodon', 223563, 0.5976407977147769, b'SOVLGJY12A8C13FBED', b'Deep Sea Creature', 2442524)], 
      dtype=[('analyzer_version', 'S32'), ('artist_7digitalid', '<i4'), ('artist_familiarity', '<f8'), ('artist_hotttnesss', '<f8'), ('artist_id', 'S32'), ('artist_latitude', '<f8'), ('artist_location', 'S1024'), ('artist_longitude', '<f8'), ('artist_mbid', 'S40'), ('artist_name', 'S1024'), ('artist_playmeid', '<i4'), ('genre', 'S1024'), ('idx_artist_terms', '<i4'), ('idx_similar_artists', '<i4'), ('release', 'S1024'), ('release_7digitalid', '<i4'), ('song_hotttnesss', '<f8'), ('song_id', 'S32'), ('title', 'S1024'), ('track_7digitalid', '<i4')])

## Look at `/analysis`

In [46]:
list(f["/analysis"].keys())

['songs']

In [49]:
# `[()]` replaces the `.value` property, which was removed in h5py 3.0.
x = f["/analysis/songs"][()]
print(len(x))  # number of records
print(x[0])    # first record

10000
(22050, b'a600d65cf157a306be60f26ecbf218f4', 0.0, 280.21506, 0.238, 0.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0.555, -3.306, 1, 0.5, 275.528, 173.205, 5, 0.12, b'TRACCVZ128F4291A8A')


## Look at `/musicbrainz`

In [51]:
list(f["/musicbrainz"].keys())

['songs']

In [53]:
# `[()]` replaces the `.value` property, which was removed in h5py 3.0.
x = f["/musicbrainz/songs"][()]
print(len(x))      # number of records
print(x[0],x[1])   # first two records

10000
(0, 2001) (0, 1984)


In [63]:
# Shape of the array, its first record, that record as a list, and the record's second field.
x.shape, x[0], list(x[0]), list(x[0])[1]

((10000,), (0, 2001), [0, 2001], 2001)

The first value of each of the 10,000 2-tuples is a zero.

In [66]:
# Distinct values taken by the first field across all records.
{list(record)[0] for record in x}

{0}

So it looks like the only informative field in `/musicbrainz/songs` is the year of the song.

Details to put somewhere else that shouldn't be forgotten:

- Artist IDs likely start with "AR"
- Track IDs likely start with "TR"
- Song IDs likely start with "SO"
- The `mb` in names such as `artist_mbid` and `artist_mbtags` likely indicates _MusicBrainz_ information.