# Create a dataframe from the MSS dataset

This notebook creates a pandas dataframe from the `/metadata/songs` and `/analysis/songs` tables in the HDF5 files. 

### Load libraries

In [15]:
import os
import re
import itertools as it
import pandas as pd
import numpy as np
import operator 
import functools


### Define utility functions

The `get_filenames` function recursively gets the names of all files in a given directory `path` and all of its subdirectories. The function returns a multi-level list if `path` contains subdirectories. The `unlist` function flattens the list by removing one level. 

In [16]:
def get_filenames(path):
    """Recursively list the files under `path`, mirroring the directory tree.

    Parameters
    ----------
    path : str
        Directory to scan.

    Returns
    -------
    list
        One element per entry of `path`, ordered by entry name: the string
        ``path + "/" + name`` for a file, or a nested list (built by the same
        rule) for a subdirectory.
    """
    # `os.scandir` yields entries in arbitrary order. Sorting by name makes the
    # result reproducible, so positional indexing of the file list picks the
    # same files on every run and on every machine.
    ordered = sorted(os.scandir(path), key=lambda entry: entry.name)
    return([get_filenames(path+"/"+entry.name)
            if entry.is_dir()
            else path+"/"+entry.name
            for entry
            in ordered
           ])

In [17]:
def unlist(alist):
    """Flatten `alist` by exactly one level and return the result as a new list.

    Every element of `alist` must itself be iterable; its items are appended,
    in order, to the returned list.
    """
    flattened = []
    for sublist in alist:
        flattened.extend(sublist)
    return flattened

In [18]:
def var_list(base,numof):
    """Return the names `base0`, `base1`, ..., one per index in `range(numof)`.

    `base` is the string prefix and `numof` the number of names to generate.
    """
    names = []
    for position in range(numof):
        names.append(base + str(position))
    return names

In [19]:
def h1d_array(in_array,n):
    """Return the first `n` values of `in_array` as a one-row 2D array.

    The input is flattened in row-major (C) order, so for a 2D input the
    values are taken row by row.

    Parameters
    ----------
    in_array : array-like
        Array of any dimensionality.
    n : int
        Maximum number of columns to keep.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, min(n, number of elements))`` with the dtype of
        `in_array`.
    """
    # `reshape(1, -1)` lays every element out in a single row. Unlike wrapping
    # the raw buffer, it also works for non-contiguous, empty and
    # 0-dimensional input.
    flat_row = np.asarray(in_array).reshape(1, -1)
    return(flat_row[0:1,0:n])

The `make_1row_df` function returns a single row dataframe and takes the following input:

- `filename`: full path file name of an MSS HDF5 file containing data for a single song
- `metadata_vars`: list of variable names from `/metadata/songs`
- `analysis_vars`: list of variable names from `/analysis/songs`
- `remove`: accepted by the function but currently not used

See comments in the code for further details. 

In [37]:
def make_1row_df(filename='', metadata_vars=[], analysis_vars=[], remove=False):
    """Build a dataframe describing the song stored in the HDF5 file `filename`.

    Parameters
    ----------
    filename : str
        Full path of an MSS HDF5 file containing the data of a single song.
        The text between the last "/" and the first following "." becomes the
        value of the `track` column.
    metadata_vars : list of str
        Names of the `/metadata/songs` fields to keep, in output order.
        The list is only read, never modified.
    analysis_vars : list of str
        Names of the `/analysis/songs` fields to keep, in output order.
        The list is only read, never modified.
    remove : bool
        Accepted for backward compatibility; currently unused.

    Returns
    -------
    pandas.DataFrame
        The columns are, left to right: `track`, the `metadata_vars`, the
        `analysis_vars`, `sp_0`, `sp_1`, ... (first values of
        `/analysis/segments_pitches`) and `st_0`, `st_1`, ... (first values of
        `/analysis/segments_timbre`). Each source table is expected to hold
        one row, giving a single-row result.
    """
    # get track from filename,
    # e.g. ".../A/A/A/TRAAAAW128F429D538.h5" -> "TRAAAAW128F429D538"
    track = filename.split('/')[-1].split('.')[0]

    # Open `filename` read-only as a HDF5 file. The `with` block closes the
    # file even when one of the reads below raises.
    with pd.HDFStore(filename,"r") as store:
        # These variables:
        #      artist_terms, tatums, segments, beats, bars and section
        # can be summarized to produce a predetermined number of values
        # that will become columns of a table (which we can then analyze.)

        # We now only "summarize" the `segments_pitches` and `segments_timbre`
        # arrays by retrieving only the first 36 values (of each.)
        segments_pitches = h1d_array(store.root.analysis.segments_pitches.read(),36)
        segments_timbre  = h1d_array(store.root.analysis.segments_timbre.read(),36)

        # For now store these values as variables in single dataframes
        sp_df = pd.DataFrame(segments_pitches,
                             columns=var_list('sp_',segments_pitches.shape[1]))
        st_df = pd.DataFrame(segments_timbre,
                             columns=var_list('st_',segments_timbre.shape[1]))

        # merge these single dataframes into one single row dataframe
        ret = pd.concat([
                # make single row dataframe from track
                pd.DataFrame([track], columns=['track']),
                # retrieve a single row dataframe from `/metadata/songs`
                pd.DataFrame(store.root.metadata.songs.read(),
                             columns=metadata_vars),
                # retrieve a single row dataframe from `/analysis/songs`
                pd.DataFrame(store.root.analysis.songs.read(),
                             columns=analysis_vars),
                sp_df,
                st_df],
                axis=1) # `axis=1` means stack the dataframes horizontally
    return(ret)     # return the merged dataframe

### Get the list of (10,000) HDF5 (.h5) files

The `path` variable stores the root of the directory tree containing all of the song files. The function `get_filenames` returns a multi-level list, which is flattened using `unlist` and stored in variable `filenames` as a list of full-path filenames.

In [38]:
# Root of the directory tree that holds the song files (machine-specific absolute path).
path = "/home/jovyan/work/Dropbox/Data/MillionSongSubset/data"
# `get_filenames` returns a nested list and each `unlist` call removes one level
# of nesting; three calls match files stored three directory levels below `path`.
filenames = unlist(unlist(unlist(get_filenames(path))))
# Display the first two paths as a sanity check.
filenames[0:2]

['/home/jovyan/work/Dropbox/Data/MillionSongSubset/data/A/A/A/TRAAAAW128F429D538.h5',
 '/home/jovyan/work/Dropbox/Data/MillionSongSubset/data/A/A/A/TRAAABD128F429CF47.h5']

### Store in the `filenames` variable only the files with extension `.h5`

In [39]:
# Keep only the paths that end in ".h5".
# The pattern is a raw string so that `\.` reaches the regex engine unchanged;
# in a plain string literal `\.` is an invalid escape sequence.
p = re.compile(r"\.h5$")
filenames = [filename for filename
             in filenames if p.search(filename)]
# Display the first two remaining paths as a sanity check.
filenames[0:2]

['/home/jovyan/work/Dropbox/Data/MillionSongSubset/data/A/A/A/TRAAAAW128F429D538.h5',
 '/home/jovyan/work/Dropbox/Data/MillionSongSubset/data/A/A/A/TRAAABD128F429CF47.h5']

In [40]:
# Number of `.h5` files found under `path`.
len(filenames)

10000

### Get lists of variables from `/metadata/songs` and `/analysis/songs`

The two tables `/metadata/songs` and `/analysis/songs` provide data that is easy to load into a dataframe. Their variables are displayed below so we know which to choose or omit when creating the corresponding dataframes.

### `/metadata/songs`

In [41]:
# Print the field names and dtypes of the `/metadata/songs` table of one song file.
# The file is opened read-only (the `HDFStore` default mode, "a", opens it for
# writing), and the `with` block closes it even if the read fails.
with pd.HDFStore(filenames[1],"r") as tmp:
    print(tmp.root.metadata.songs.read().dtype)

[('analyzer_version', 'S32'), ('artist_7digitalid', '<i4'), ('artist_familiarity', '<f8'), ('artist_hotttnesss', '<f8'), ('artist_id', 'S32'), ('artist_latitude', '<f8'), ('artist_location', 'S1024'), ('artist_longitude', '<f8'), ('artist_mbid', 'S40'), ('artist_name', 'S1024'), ('artist_playmeid', '<i4'), ('genre', 'S1024'), ('idx_artist_terms', '<i4'), ('idx_similar_artists', '<i4'), ('release', 'S1024'), ('release_7digitalid', '<i4'), ('song_hotttnesss', '<f8'), ('song_id', 'S32'), ('title', 'S1024'), ('track_7digitalid', '<i4')]


### `/analysis/songs`

In [42]:
# Print the field names and dtypes of the `/analysis/songs` table of one song file.
# The file is opened read-only (the `HDFStore` default mode, "a", opens it for
# writing), and the `with` block closes it even if the read fails.
with pd.HDFStore(filenames[1],"r") as tmp:
    print(tmp.root.analysis.songs.read().dtype)

[('analysis_sample_rate', '<i4'), ('audio_md5', 'S32'), ('danceability', '<f8'), ('duration', '<f8'), ('end_of_fade_in', '<f8'), ('energy', '<f8'), ('idx_bars_confidence', '<i4'), ('idx_bars_start', '<i4'), ('idx_beats_confidence', '<i4'), ('idx_beats_start', '<i4'), ('idx_sections_confidence', '<i4'), ('idx_sections_start', '<i4'), ('idx_segments_confidence', '<i4'), ('idx_segments_loudness_max', '<i4'), ('idx_segments_loudness_max_time', '<i4'), ('idx_segments_loudness_start', '<i4'), ('idx_segments_pitches', '<i4'), ('idx_segments_start', '<i4'), ('idx_segments_timbre', '<i4'), ('idx_tatums_confidence', '<i4'), ('idx_tatums_start', '<i4'), ('key', '<i4'), ('key_confidence', '<f8'), ('loudness', '<f8'), ('mode', '<i4'), ('mode_confidence', '<f8'), ('start_of_fade_out', '<f8'), ('tempo', '<f8'), ('time_signature', '<i4'), ('time_signature_confidence', '<f8'), ('track_id', 'S32')]


### Run the `make_1row_df` function on the fourth file

We are currently only pulling data from `/metadata/songs` and `/analysis/songs`. 

Later we will pull additional data from the file. There are three types of data we can retrieve:

1. From `/metadata` there are three lists: `artist_terms`, `artist_terms_freq`, `artist_terms_weight`
1. From `/analysis` there is information about tatums, beats, bars, sections, segments (timbre and pitch.)
1. From `/musicbrainz`

In [43]:
# Build and display the one-row dataframe for the file at index 3 (the fourth file).
make_1row_df(
    filename=filenames[3],
    # Omit: genre
    metadata_vars=[
        'artist_familiarity', 'artist_hotttnesss', 'song_hotttnesss',
        'title', 'artist_name', 'artist_location', 'release',
        'artist_longitude', 'artist_latitude',
    ],
    # Omit: danceability, energy
    analysis_vars=[
        'duration', 'key', 'loudness', 'mode', 'tempo', 'time_signature',
    ],
    remove=False,
)

Unnamed: 0,track,artist_familiarity,artist_hotttnesss,song_hotttnesss,title,artist_name,artist_location,release,artist_longitude,artist_latitude,...,st_26,st_27,st_28,st_29,st_30,st_31,st_32,st_33,st_34,st_35
0,TRAAAEF128F4273421,0.630382,0.454231,,b'Something Girls',b'Adam Ant',"b'London, England'",b'Friend Or Foe',,,...,103.23,-17.005,-37.423,47.573,-0.734,25.383,-10.965,-44.947,10.023,-40.109


### There is more data in the file than the data in `/metadata/songs` and `/analysis/songs`. 

See the `MSS explore dataset` notebook. 

### Create a list of single row dataframes (one per song file; up to 10,000)

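The two lists of variables are retrieved from the two `Tables` displayed above (`remove=False` is passed, although `make_1row_df` currently ignores this argument). The result of this command is a list of single row dataframes with the columns indicated, one per file processed: 1,000 with the `filenames[0:1000]` slice used below, or 10,000 when all files are used.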

It may take up to twenty (20) minutes to create `mss_df_list` with the current set of variables. 

In [44]:
# Build one single-row dataframe per song file.
# `track_id` is a field of `/analysis/songs` and is absent from `/metadata/songs`,
# so it is requested through `analysis_vars`. Requesting it through
# `metadata_vars` makes pandas fill the column with missing values, because the
# name is not among the record fields. Listing it first keeps the column right
# after `song_id`.
mss_df_list = [make_1row_df(filename=filename,
                            metadata_vars=['artist_familiarity','artist_hotttnesss',
                                           'song_hotttnesss','title',
                                           'artist_name',
                                           'artist_location','release',
                                           'artist_longitude','artist_latitude',
                                           'artist_id','song_id'],
                            # Omit: genre
                            analysis_vars=['track_id',
                                           'duration','key','loudness','mode',
                                           'tempo','time_signature'],
                            # Omit: danceability, energy
                            remove=False
                           )
                for filename in filenames[0:1000] # get data from 1,000 files
              ]
# Number of dataframes built and the shape of the first one.
len(mss_df_list), mss_df_list[0].shape

(1000, (1, 91))

In [45]:
# Number of single-row dataframes that were created.
len(mss_df_list)

1000

### Merge all dataframes of `mss_df_list` into a single dataframe stored in `mss_df`.

In [47]:
# Stack the single-row frames vertically; `ignore_index=True` renumbers the
# rows 0..n-1 instead of keeping the repeated index 0 of every input frame.
mss_df = pd.concat(mss_df_list, axis=0, ignore_index=True)
# Display (rows, columns) of the merged table.
mss_df.shape

(1000, 91)

### Check the head of the table

In [48]:
# Display the first five rows of the merged table.
mss_df.head()

Unnamed: 0,track,artist_familiarity,artist_hotttnesss,song_hotttnesss,title,artist_name,artist_location,release,artist_longitude,artist_latitude,...,st_26,st_27,st_28,st_29,st_30,st_31,st_32,st_33,st_34,st_35
0,TRAAAAW128F429D538,0.581794,0.401998,0.60212,"b""I Didn't Mean To""",b'Casual',b'California - LA',b'Fear Itself',,,...,-159.915,-89.765,29.646,-45.432,15.733,29.094,-6.805,9.46,-15.33,-21.079
1,TRAAABD128F429CF47,0.63063,0.4175,,b'Soul Deep',b'The Box Tops',"b'Memphis, TN'",b'Dimensions',-90.04892,35.14968,...,5.4,59.208,-17.624,28.703,14.13,-0.71,34.62,-23.91,23.453,-5.048
2,TRAAADZ128F9348C2E,0.487357,0.343428,,b'Amor De Cabaret',b'Sonora Santanera',b'',b'Las Numero 1 De La Sonora Santanera',,,...,59.534,-17.441,-47.459,-19.073,3.268,9.741,16.689,-12.663,11.562,4.562
3,TRAAAEF128F4273421,0.630382,0.454231,,b'Something Girls',b'Adam Ant',"b'London, England'",b'Friend Or Foe',,,...,103.23,-17.005,-37.423,47.573,-0.734,25.383,-10.965,-44.947,10.023,-40.109
4,TRAAAFD128F92F423A,0.651046,0.401724,0.604501,b'Face the Ashes',b'Gob',b'',b'Muertos Vivos',,,...,-45.268,46.734,5.729,-30.722,-8.925,-6.816,13.172,-0.391,-29.761,8.708


### Check its dimensions (shape) and its variables.

In [15]:
# Print the (rows, columns) shape of `mss_df`, then the array of its column names.
print('shape  :',mss_df.shape)
print('columns:',mss_df.columns.values)

shape: (10000, 101)
columns: ['artist_familiarity' 'artist_hotttnesss' 'artist_id' 'artist_latitude'
 'artist_location' 'artist_longitude' 'artist_name' 'bc_0' 'bc_1' 'bc_2'
 'bc_3' 'bc_4' 'bc_5' 'bc_6' 'bc_7' 'bc_8' 'bc_9' 'duration' 'key'
 'loudness' 'mode' 'release' 'song_hotttnesss' 'song_id' 'sp_0' 'sp_1'
 'sp_10' 'sp_11' 'sp_12' 'sp_13' 'sp_14' 'sp_15' 'sp_16' 'sp_17' 'sp_18'
 'sp_19' 'sp_2' 'sp_20' 'sp_21' 'sp_22' 'sp_23' 'sp_24' 'sp_25' 'sp_26'
 'sp_27' 'sp_28' 'sp_29' 'sp_3' 'sp_30' 'sp_31' 'sp_32' 'sp_33' 'sp_34'
 'sp_35' 'sp_4' 'sp_5' 'sp_6' 'sp_7' 'sp_8' 'sp_9' 'st_0' 'st_1' 'st_10'
 'st_11' 'st_12' 'st_13' 'st_14' 'st_15' 'st_16' 'st_17' 'st_18' 'st_19'
 'st_2' 'st_20' 'st_21' 'st_22' 'st_23' 'st_24' 'st_25' 'st_26' 'st_27'
 'st_28' 'st_29' 'st_3' 'st_30' 'st_31' 'st_32' 'st_33' 'st_34' 'st_35'
 'st_4' 'st_5' 'st_6' 'st_7' 'st_8' 'st_9' 'tempo' 'time_signature' 'title'
 'track' 'track_id']


### Modify some of the variables

### Make  `key` and `time_signature` variables categorical

Leave `mode` as numeric. 

In [49]:
# Convert the three columns in one call: `mode` stays numeric (float64), while
# `key` and `time_signature` become categorical.
mss_df = mss_df.astype({'mode': 'float64',
                        'key': 'category',
                        'time_signature': 'category'})
# Display the resulting dtypes of the three columns.
mss_df['key'].dtype, mss_df['mode'].dtype, mss_df['time_signature'].dtype

(category, dtype('float64'), category)

### Create dummy variables from categorical variables `key` and `time_signature`

The `mode` variable is already binary. 

The `key` and `time_signature` variables are removed with this next command.

In [50]:

# Replace `key` and `time_signature` with indicator (dummy) columns named with
# the prefixes `k` and `ts` respectively.
# NOTE: this cell rebinds `mss_df`, and the two source columns are gone
# afterwards, so it cannot be run a second time on its own output.
mss_df = pd.get_dummies(mss_df, 
                        columns=['key','time_signature'], 
                        prefix=['k','ts'])

In [51]:
# Display the dtype of every column of `mss_df`.
mss_df.dtypes

track                  object
artist_familiarity    float64
artist_hotttnesss     float64
song_hotttnesss       float64
title                  object
artist_name            object
artist_location        object
release                object
artist_longitude      float64
artist_latitude       float64
artist_id              object
song_id                object
track_id               object
duration              float64
loudness              float64
mode                  float64
tempo                 float64
sp_0                  float64
sp_1                  float64
sp_2                  float64
sp_3                  float64
sp_4                  float64
sp_5                  float64
sp_6                  float64
sp_7                  float64
sp_8                  float64
sp_9                  float64
sp_10                 float64
sp_11                 float64
sp_12                 float64
                       ...   
st_23                 float64
st_24                 float64
st_25     

### Save the table `mss_df` in a _pickle_ file

First set the folder to save to and load from. 

In [52]:
# Folder used for both saving and loading the pickle file
# (machine-specific absolute path).
save_load_path = '/home/jovyan/work/Desktop'

Save `mss_df` to a _pickle_ file. 

In [53]:
# Serialize `mss_df` to the file `mss_df.pkl` inside `save_load_path`.
mss_df.to_pickle(save_load_path+'/mss_df.pkl')

Load `mss_df` from the _pickle_ file.

In [54]:
# Reload `mss_df` from the pickle file written to `save_load_path`.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that you created yourself or otherwise trust.
mss_df = pd.read_pickle(save_load_path+'/mss_df.pkl')

Now check that we retrieved the same number of rows and variables we expect.

In [55]:
# Print the shape and the column names of the reloaded table, then display the
# dtype of every column as the cell's output.
print('shape:',mss_df.shape)
print('columns:',mss_df.columns.values)
mss_df.dtypes

shape: (1000, 106)
columns: ['track' 'artist_familiarity' 'artist_hotttnesss' 'song_hotttnesss' 'title'
 'artist_name' 'artist_location' 'release' 'artist_longitude'
 'artist_latitude' 'artist_id' 'song_id' 'track_id' 'duration' 'loudness'
 'mode' 'tempo' 'sp_0' 'sp_1' 'sp_2' 'sp_3' 'sp_4' 'sp_5' 'sp_6' 'sp_7'
 'sp_8' 'sp_9' 'sp_10' 'sp_11' 'sp_12' 'sp_13' 'sp_14' 'sp_15' 'sp_16'
 'sp_17' 'sp_18' 'sp_19' 'sp_20' 'sp_21' 'sp_22' 'sp_23' 'sp_24' 'sp_25'
 'sp_26' 'sp_27' 'sp_28' 'sp_29' 'sp_30' 'sp_31' 'sp_32' 'sp_33' 'sp_34'
 'sp_35' 'st_0' 'st_1' 'st_2' 'st_3' 'st_4' 'st_5' 'st_6' 'st_7' 'st_8'
 'st_9' 'st_10' 'st_11' 'st_12' 'st_13' 'st_14' 'st_15' 'st_16' 'st_17'
 'st_18' 'st_19' 'st_20' 'st_21' 'st_22' 'st_23' 'st_24' 'st_25' 'st_26'
 'st_27' 'st_28' 'st_29' 'st_30' 'st_31' 'st_32' 'st_33' 'st_34' 'st_35'
 'k_0' 'k_1' 'k_2' 'k_3' 'k_4' 'k_5' 'k_6' 'k_7' 'k_8' 'k_9' 'k_10' 'k_11'
 'ts_1' 'ts_3' 'ts_4' 'ts_5' 'ts_7']


track                  object
artist_familiarity    float64
artist_hotttnesss     float64
song_hotttnesss       float64
title                  object
artist_name            object
artist_location        object
release                object
artist_longitude      float64
artist_latitude       float64
artist_id              object
song_id                object
track_id               object
duration              float64
loudness              float64
mode                  float64
tempo                 float64
sp_0                  float64
sp_1                  float64
sp_2                  float64
sp_3                  float64
sp_4                  float64
sp_5                  float64
sp_6                  float64
sp_7                  float64
sp_8                  float64
sp_9                  float64
sp_10                 float64
sp_11                 float64
sp_12                 float64
                       ...   
st_23                 float64
st_24                 float64
st_25     

# End