In [None]:
# Read parts of the MSD into tables

This notebook creates a pandas dataframe from the `/metadata/songs` and `/analysis/songs` tables in the HDF5 files. 

The `pandas` module requires code from the _PyTables_ package. To load this package into Python from a console:

> `$ conda install --name python3 PyTables`

This only needs to happen once on your computer.

### Load libraries

In [1]:
import os
import re
import itertools as it
import pandas as pd
import numpy as np
import operator 
import functools

### Define utility functions

The `get_filenames` function recursively gets the names of all files in a given directory `path` and all of its subdirectories. The function returns a multi-level list if `path` contains subdirectories. The `unlist` function flattens the list by removing one level. 

In [42]:
def get_filenames(path):
    return([get_filenames(path+"/"+entry.name)
            if entry.is_dir() 
            else path+"/"+entry.name 
            for entry 
            in os.scandir(path)
           ])

def unlist(alist):
    return(list(it.chain.from_iterable(alist)
               )
          )

def var_list(base,numof):
    return([base+str(ndx) for ndx in range(numof)]
          )

def h1d_array(in_array,n): 
    # n1d is the number of elements in `in_array`
    n1d = functools.reduce(operator.mul,
                           list(in_array.shape))
    # return a 1 row 2D array with `n` rows
    b = np.ndarray(shape=(n1d,1),
                   buffer=in_array,
                   dtype=in_array.dtype
                  )[0:n,0:1]
    return(b)

The `make_1row_df` function returns a single row dataframe and takes the following input:

- `filename`: full path file name of an MSD HDF5 file containing data for a single song
- `metadata_vars`: list of variable names from `/metadata/songs`
- `analysis_vars`: list of variable names from `/analysis/songs`
- `remove`: 
    - if `False` the variables listed in the last two parameters are retrieved from the input file
    - if `True` all variables except those listed are retrieved from the input file

See comments in the code for further details. 

### Get the list of (10,000) HDF5 (.h5) files

The `path` variable stores the root of the directory tree containing all of the song files. The function `get_filenames` returns a multi-level list, which is flattened using `unlist` and stored in variable `filenames` as a list of full-path filenames.

In [28]:
def make_2col_df(filename='', metadata_vars=[], analysis_vars=[], remove=False):
    # open `filename` as a HDF5 file
    store = pd.HDFStore(filename,"r")
    if remove==True:
        # `metadata_vars` and `analysis_vars` contain the variables to remove
        metadata_vars = list({item for item 
                                  in list(store.root.metadata.songs.read().dtype.names) 
                                  if item not in metadata_vars})
        analysis_vars = list({item for item 
                                  in list(store.root.analysis.songs.read().dtype.names) 
                                  if item not in analysis_vars})
    # else: `metadata_vars` and `analysis_vars` contain the variables to keep
    
    # retrieve the first `n` values as a horizontal array of 1 dimension
    artist_terms     = h1d_array(store.root.metadata.artist_terms.read(),
                                len(store.root.metadata.artist_terms.read()))
    artist_terms_weight     = h1d_array(store.root.metadata.artist_terms_weight.read(),
                                       len(store.root.metadata.artist_terms.read()))
    
    # store these values as variables in single dataframes
    at_df = pd.DataFrame(artist_terms    ,columns=var_list('artist_terms_',artist_terms    .shape[1]))
    at_wt = pd.DataFrame(artist_terms_weight,columns=var_list('artist_term_weight',artist_terms_weight.shape[1]))
    ai    = pd.DataFrame(store.root.metadata.songs.read(), 
                      columns=metadata_vars)
    ai    = pd.concat([ai]*(len(at_df)), ignore_index=True)
    ai    = pd.DataFrame(ai)
    at_df = pd.concat([ai, at_df, at_wt
                      ], axis=1)
 
    
    
    # merge these single dataframes into one single row dataframe
    ret = pd.concat([
            # retrieve a single row dataframe from `/metadata/songs`
            #pd.DataFrame(store.root.metadata.songs.read(), 
            #             columns=metadata_vars),
            # retrieve a single row dataframe from `/analysis/songs`
            # pd.DataFrame(store.root.analysis.songs.read(), 
            #              columns=analysis_vars),
            at_df],
            axis=0) # `axes=1` means stack the dataframes horizontally 
    # close the HDF5 file
    store.close()
    # return the merged dataframe
    return(ret)

In [47]:
path = "D:\Documents\B\Bentley\Coursework\MA755\MillionSongSubset\data"
filenames = unlist(unlist(unlist(get_filenames(path))))
filenames[0:2]
x=pd.HDFStore(filenames[120]).root.musicbrainz.songs.read()
print(x)

ValueError: The file 'D:\Documents\B\Bentley\Coursework\MA755\MillionSongSubset\data/A/A/I/TRAAIRG128F93265E8.h5' is already opened, but in read-only mode.  Please close it before reopening in append mode.

In [44]:
store = pd.HDFStore(filename,"r")

NameError: name 'filename' is not defined

In [43]:
    ai    = pd.DataFrame(store.root.metadata.songs.read(), 
                      columns=metadata_vars)
    ai    = pd.concat([ai]*(len(at_df)), ignore_index=True)
    ai    = pd.DataFrame(ai)
    at_df = pd.concat([ai, at_df, at_wt
                      ], axis=1)

NameError: name 'store' is not defined

In [7]:
#path = "C:/Users/CH162975/Documents/B/MA755/MillionSongSubset/data"
path = "D:\Documents\B\Bentley\Coursework\MA755\MillionSongSubset\data"
filenames = unlist(unlist(unlist(get_filenames(path))))
filenames[119:122]

['D:\\Documents\\B\\Bentley\\Coursework\\MA755\\MillionSongSubset\\data/A/A/I/TRAAINT128F933BBE0.h5',
 'D:\\Documents\\B\\Bentley\\Coursework\\MA755\\MillionSongSubset\\data/A/A/I/TRAAIRG128F93265E8.h5',
 'D:\\Documents\\B\\Bentley\\Coursework\\MA755\\MillionSongSubset\\data/A/A/I/TRAAIXN128F428027A.h5']

### Store in the `filenames` variable only the files with extension `.h5`

In [9]:
p = re.compile("\.h5$")
filenames = [filename for filename 
             in filenames if p.search(filename)]
filenames[119:122]

['D:\\Documents\\B\\Bentley\\Coursework\\MA755\\MillionSongSubset\\data/A/A/I/TRAAINT128F933BBE0.h5',
 'D:\\Documents\\B\\Bentley\\Coursework\\MA755\\MillionSongSubset\\data/A/A/I/TRAAIRG128F93265E8.h5',
 'D:\\Documents\\B\\Bentley\\Coursework\\MA755\\MillionSongSubset\\data/A/A/I/TRAAIXN128F428027A.h5']

In [271]:
len(filenames)

10000

### Get lists of variables from `/metadata/songs` and `/analysis/songs`

The two tables `/metadata/songs` and `/analysis/songs` provide data that is easy to load into a dataframe. Their variables are displayed below so we know which to choose or omit when creating the corresponding dataframes.

### `/metadata/songs`

In [272]:
tmp=pd.HDFStore(filenames[1])
print(tmp.root.metadata.songs.read().dtype)
tmp.close()

[('analyzer_version', 'S32'), ('artist_7digitalid', '<i4'), ('artist_familiarity', '<f8'), ('artist_hotttnesss', '<f8'), ('artist_id', 'S32'), ('artist_latitude', '<f8'), ('artist_location', 'S1024'), ('artist_longitude', '<f8'), ('artist_mbid', 'S40'), ('artist_name', 'S1024'), ('artist_playmeid', '<i4'), ('genre', 'S1024'), ('idx_artist_terms', '<i4'), ('idx_similar_artists', '<i4'), ('release', 'S1024'), ('release_7digitalid', '<i4'), ('song_hotttnesss', '<f8'), ('song_id', 'S32'), ('title', 'S1024'), ('track_7digitalid', '<i4')]


### `/analysis/songs`

In [242]:
tmp=pd.HDFStore(filenames[1])
print(tmp.root.analysis.songs.read().dtype)
tmp.close()

[('analysis_sample_rate', '<i4'), ('audio_md5', 'S32'), ('danceability', '<f8'), ('duration', '<f8'), ('end_of_fade_in', '<f8'), ('energy', '<f8'), ('idx_bars_confidence', '<i4'), ('idx_bars_start', '<i4'), ('idx_beats_confidence', '<i4'), ('idx_beats_start', '<i4'), ('idx_sections_confidence', '<i4'), ('idx_sections_start', '<i4'), ('idx_segments_confidence', '<i4'), ('idx_segments_loudness_max', '<i4'), ('idx_segments_loudness_max_time', '<i4'), ('idx_segments_loudness_start', '<i4'), ('idx_segments_pitches', '<i4'), ('idx_segments_start', '<i4'), ('idx_segments_timbre', '<i4'), ('idx_tatums_confidence', '<i4'), ('idx_tatums_start', '<i4'), ('key', '<i4'), ('key_confidence', '<f8'), ('loudness', '<f8'), ('mode', '<i4'), ('mode_confidence', '<f8'), ('start_of_fade_out', '<f8'), ('tempo', '<f8'), ('time_signature', '<i4'), ('time_signature_confidence', '<f8'), ('track_id', 'S32')]


### `/musicbrainz/songs`

In [243]:
tmp=pd.HDFStore(filenames[1])
print(tmp.root.musicbrainz.songs.read().dtype)
tmp.close()

[('idx_artist_mbtags', '<i4'), ('year', '<i4')]


### Create a list of 10,000 single row dataframes

Because `remove=False` is specified the two lists of variables are retrieved from the two `Tables` displayed above. The result of this command is a list of 10,000 single row dataframes with columns indicated. 

It may take up to twenty (20) minutes to create `mss_df_list` with the current set of variables. 

In [36]:
mss_df_list = [make_2col_df(filename=filename,
                            metadata_vars=['artist_id'],
                            # Omit: genre
                            analysis_vars=[],
                            # Omit: danceability, energy
                            remove=False
                           )
                for filename in filenames[0:10] # get data from all 10,000 files
              ]
len(mss_df_list), mss_df_list[1].shape

(10, (38, 3))

In [31]:
len(mss_df_list)

100

### Merge all dataframes of `mss_df_list` into a single dataframe stored in `mss_df`.

In [38]:
mss_df = pd.concat(mss_df_list,axis=0).reset_index(drop=True)

### Check the head of the table

In [39]:
mss_df.tail(50)

Unnamed: 0,artist_id,artist_terms_0,artist_term_weight0
222,b'AR10USD1187B99F3F1',b'emotional',0.367699
223,b'AR10USD1187B99F3F1',b'pop',0.367284
224,b'AR10USD1187B99F3F1',b'metal',0.367045
225,b'AR10USD1187B99F3F1',b'heavy',0.365861
226,b'AR10USD1187B99F3F1',b'melodic',0.362774
227,b'AR8ZCNI1187B9A069B',b'new wave',1.0
228,b'AR8ZCNI1187B9A069B',b'progressive rock',0.986962
229,b'AR8ZCNI1187B9A069B',b'space rock',0.983065
230,b'AR8ZCNI1187B9A069B',b'eurodance',0.965809
231,b'AR8ZCNI1187B9A069B',b'hard rock',0.902451


### Check its dimensions (shape) and its variables.

In [227]:
print('shape:',mss_df.shape)
print('columns:',mss_df.columns.values)

shape: (272, 3)
columns: ['artist_id' 'artist_terms_0' 'artist_term_weight0']


### Save the table `mss_df` in a _pickle_ file

First set the folder to save to and load from. 

In [228]:
save_load_path = "D:/Documents/B/Bentley/Coursework/MA755/MillionSongSubset/Graph"


Save `mss_df` to a _pickle_ file. 

In [34]:
mss_df.to_pickle(save_load_path+'/artist_terms.pkl')

Load `mss_df` from the _pickle_ file.

In [16]:
mss_df = pd.read_pickle(save_load_path+'/song_metadata.pkl')

Now check that we retrieved the same number of rows and variables we expect.

In [19]:
mss_df.shape, mss_df.columns

((10000, 16), Index(['artist_id', 'artist_name', 'artist_location', 'genre',
        'artist_familiarity', 'artist_hotttnesss', 'song_id', 'title',
        'song_hotttnesss', 'track_id', 'duration', 'key', 'loudness', 'mode',
        'danceability', 'tempo'],
       dtype='object'))

# End