# Getting Features from the Million Song Dataset (MSD)

In [1]:
# Import libraries
import os
import sys
import time
import glob
import datetime
import sqlite3
import numpy as np
import pandas as pd
import hdf5_getters as GETTERS

In [2]:
# local paths

# where data is from
msd_path = '/Volumes/Seagate Backup Plus Drive/MSD Data'
# sanity check: stop right away if the dataset root is not a directory
assert os.path.isdir(msd_path), 'wrong path'
# subfolder of the dataset root (track_metadata.db is read from here)
msd_addf_path = os.path.join(msd_path, 'AdditionalFiles')

# where data will be saved to
data_folder = "/Users/dannynightingale/Documents/Harvard 2017-2018/Harvard 2017-2018 Fall/CS 109a/Spotify Final Project/data/"

In [10]:
# the following function simply gives us a nice string for
# a time lag in seconds
def strtimedelta(starttime,stoptime):
    """
    Format the lag between two timestamps as a readable string.
    INPUT
       starttime  - start of the interval, in seconds
       stoptime   - end of the interval, in seconds
    RETURN
       str() of a datetime.timedelta spanning stoptime-starttime
       seconds, e.g. '0:01:30' for a 90 second lag
    """
    elapsed = datetime.timedelta(seconds=stoptime - starttime)
    return str(elapsed)

# we define this very useful function to iterate the files
def apply_to_all_files(basedir,func=lambda x: x,ext='.h5'):
    """
    From a base directory, go through all subdirectories,
    find all files with the given extension, apply the
    given function 'func' to all of them.
    If no 'func' is passed, we do nothing except counting.

    Directory names are matched literally: glob metacharacters
    ('*', '?', '[') appearing in 'basedir' or in any subdirectory
    name are escaped before the pattern is built, so such
    directories are not silently skipped or mismatched.
    INPUT
       basedir  - base directory of the dataset
       func     - function to apply to all filenames; it is called
                  with the path of each match (prefixed by 'basedir')
                  and its return value is ignored
       ext      - extension, .h5 by default; it is appended to '*'
                  to form the glob pattern
    RETURN
       number of files
    """
    cnt = 0
    # iterate over all subdirectories; only the directory path from
    # os.walk is used, the matching itself is delegated to glob
    for root, _dirs, _files in os.walk(basedir):
        # escape the directory part so that only '*'+ext acts as a pattern
        matches = glob.glob(os.path.join(glob.escape(root),'*'+ext))
        # count files
        cnt += len(matches)
        # apply function to all files
        for f in matches:
            func(f)
    return cnt

In [None]:
# get the metadata
conn = sqlite3.connect(os.path.join(msd_addf_path,
                                    'track_metadata.db'))
try:
    # we build the SQL query
    q = "SELECT * FROM songs"
    res = conn.execute(q)

    # get the data: every row of the 'songs' table, loaded into memory
    MSD_meta_data = res.fetchall()
finally:
    # we close the connection to the database, even when the query
    # or the fetch above raised
    conn.close()

In [28]:
# Label the fetched rows positionally (the query was SELECT *, so these
# names must stay in the same order as the table's columns).
MSD = pd.DataFrame(
    MSD_meta_data,
    columns=[
        'id1', 'song', 'id2', 'album', 'id3', 'id4',
        'artist', 'length', 'dk', 'artist_hotness', 'release_year',
    ],
)
# Discard the four id columns and 'dk'; they are not kept in the final table.
for i in ('id1', 'id2', 'id3', 'id4', 'dk'):
    MSD.pop(i)

In [29]:
# Preview the first rows of the trimmed table as a quick visual check.
MSD.head()

Unnamed: 0,song,album,artist,length,artist_hotness,release_year
0,Silent Night,Monster Ballads X-Mas,Faster Pussy cat,252.05506,0.394032,2003
1,Tanssi vaan,Karkuteillä,Karkkiautomaatti,156.55138,0.356992,1995
2,No One Could Ever,Butter,Hudson Mohawke,138.97098,0.437504,2006
3,Si Vos Querés,De Culo,Yerba Brava,145.05751,0.372349,2003
4,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,Der Mystic,514.29832,0.0,0


In [30]:
# Save the table as a pickle named "MSD_data" inside data_folder.
# os.path.join keeps the path correct whether or not data_folder ends
# with a path separator.
MSD.to_pickle(os.path.join(data_folder, "MSD_data"))

# The rest of the code took too long to run because the dataset is too large, so these extra features were not used.

In [23]:
# get additional features
# module-level accumulators, each keyed by song id and filled in by
# func_to_get_data as the files are visited
song_hotnesss = {}
danceability = {}
energys = {}

# we define the function to apply to all files
def func_to_get_data(filename):
    """
    Read one HDF5 song file and record its extra features.

    The song id, song hotness, danceability and energy are read through
    the hdf5_getters helpers and stored in the module-level dicts
    'song_hotnesss', 'danceability' and 'energys' under the song id.
    The file is closed even if one of the getters raises; in that case
    nothing is stored for the file.
    INPUT
       filename  - path of the .h5 file to read
    RETURN
       None
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        song_id = GETTERS.get_song_id(h5)
        song_hotness = GETTERS.get_song_hotttnesss(h5)
        dance = GETTERS.get_danceability(h5)
        energy = GETTERS.get_energy(h5)

        # store only after every getter succeeded, so the three dicts
        # always hold the same set of song ids
        song_hotnesss[song_id] = song_hotness
        danceability[song_id] = dance
        energys[song_id] = energy
    finally:
        # release the file handle on success and on error alike
        h5.close()

In [None]:
# run the function to get data! only the listed subfolders of msd_path
# are visited, and each one is timed separately
file_names = ['I', 'J', 'K', 'L', 'O']
for name in file_names:
    print(name)

    # every .h5 file found under this subfolder goes to func_to_get_data
    msd_data_path = os.path.join(msd_path, name)
    t1 = time.time()
    apply_to_all_files(msd_data_path, func=func_to_get_data)
    t2 = time.time()

    print('all info extracted in:', strtimedelta(t1, t2))