# Million Song Dataset
<a href='https://labrosa.ee.columbia.edu/millionsong/'>Link to MSD</a>

<a href='https://github.com/tbertinmahieux/MSongsDB/tree/master/PythonSrc'>Python source</a>

<a href='https://labrosa.ee.columbia.edu/millionsong/pages/field-list'>MSD field list</a>

The key is a track-level attribute ranging from 0 to 11 and corresponding to one of the 12 keys: C, C#, D, etc. up to B. If no key was detected, the value is -1.

The mode is 0 for “minor” or 1 for “major”, and may be -1 if no result was obtained. Note that a major key (e.g. C major) is easily confused with the minor key three semitones lower (e.g. A minor), because both keys contain the same pitches. Harmonic details are given in segments below.

<a href='https://github.com/tbertinmahieux/MSongsDB/tree/master/Tasks_Demos/Preview7digital'>Get preview of 7digital song</a>

---

# MSD summary file (hdf5)

In [None]:
# EXAMPLE on how to read the Million Song Dataset entries
import os
import numpy as np
import tables as pt

# Locate the MSD summary file below 'src' and open it read-only with PyTables.
file_name = os.path.join('src', 'msd_summary_file.h5')
h5_file = pt.open_file(file_name, 'r')

# Printing the file handle shows the object tree of the PyTables file.
print(h5_file)

In [None]:
# Look up the song table node in the open file and show its column descriptions.
songs_node_path = '/analysis/songs'
table = h5_file.get_node(songs_node_path)
print(table.coldescrs)

In [None]:
# Shape of the table: number of rows, then number of columns.
n_rows = table.nrows
n_cols = len(table.coldtypes)
print(n_rows, n_cols)

In [None]:
# Remember the column names, then read the rows of the song table.
# Both must happen before the HDF5 file is closed.
table_colnames = table.colnames
msd_songs = table.read()

In [None]:
# The table data has been read, so the HDF5 file handle can be released.
h5_file.close()

In [None]:
# Show the first record followed by the column names, one per output line.
print(msd_songs[0], table_colnames, sep='\n')

In [None]:
# make pandas df of data (just because it looks better and is more dynamic)
import pandas as pd
from IPython.display import display

# Wrap the records read from the song table in a DataFrame and preview the first rows.
msd_songs_df = pd.DataFrame(data=msd_songs, columns=table_colnames)
display(msd_songs_df.head(n=5))

<a href='https://labrosa.ee.columbia.edu/millionsong/pages/field-list'>MSD field list</a>

In [None]:
# Clean up the columns 'audio_md5' and 'track_id'.
# They hold bytes objects and need to be decoded to str; otherwise 'track_id'
# cannot be matched against the string track ids read from 'unique_tracks.txt'
# when the two tables are merged later on.
#
# The decoding used to be switched off with `if False:`, so a fresh
# "Restart & Run All" skipped it. It now always runs, and the isinstance check
# makes the cell safe to re-run: values that are already str are left alone.
def _bytes_to_str (value):
    """Return `value` decoded to str if it is a bytes object, else unchanged."""
    return value.decode () if isinstance (value, bytes) else value

for byte_col in ('audio_md5', 'track_id'):
    msd_songs_df[byte_col] = msd_songs_df[byte_col].map (_bytes_to_str)

display (msd_songs_df.head ())

In [None]:
# TODO: from every key, get 100 examples per mode
# 12 keys * 2 modes * 100 = 2400 songs
# constraint: key confidence AND mode confidence shall both be > (???) <- first check where a good cut-off is

In [None]:
# print histograms of key confidence and mode confidence
import matplotlib.pyplot as plt
%matplotlib inline

# Overlay the two confidence distributions; the second one is drawn
# semi-transparent so the first stays visible underneath.
msd_songs_df['key_confidence'].plot.hist(bins=20)
msd_songs_df['mode_confidence'].plot.hist(bins=20, alpha=0.5);

In [None]:
# Keep only the songs whose key confidence and mode confidence both exceed 0.75.
key_is_confident = msd_songs_df['key_confidence'] > 0.75
mode_is_confident = msd_songs_df['mode_confidence'] > 0.75
songs_conf = msd_songs_df.loc[key_is_confident & mode_is_confident]

In [None]:
# Preview the confidence-filtered songs and report the (rows, columns) shape.
display(songs_conf.head(n=5))
print(songs_conf.shape)

In [None]:
# Histograms of the detected key and mode of the confident songs, drawn in one figure.
songs_conf['key'].plot.hist(bins=12)
songs_conf['mode'].plot.hist(bins=12);

In [None]:
# Count the confident songs for every (key, mode) combination, print each
# count, and finally print the sum of all counts.
overall_length = 0
for key_idx in range(12):
    key_mask = songs_conf['key'] == key_idx
    for mode_idx in range(2):
        pair_mask = key_mask & (songs_conf['mode'] == mode_idx)
        pair_count = int(pair_mask.sum())
        print(key_idx, mode_idx, ':', pair_count)
        overall_length += pair_count
print(overall_length)

In [3]:
# get artist name and song title from another msd file...
# read the 'all track Echo Nest ID' file
# it is a text file, each line is formatted: track id<SEP>song id<SEP>artist name<SEP>song title

import os
import pandas as pd

# Each line of the file holds four fields separated by the literal text '<SEP>'.
# There is no header row, so the column names are supplied explicitly.
file_name = os.path.join('src', 'unique_tracks.txt')
col_names = ['track_id', 'song_id', 'artist_name', 'song_title']
unique_tracks = pd.read_csv(
    file_name,
    sep='<SEP>',
    header=None,
    names=col_names,
    engine='python',
)

In [5]:
from IPython.display import display

# Preview the track list and report how many records were read.
display(unique_tracks.head(n=5))
print('[i] Number of records:', unique_tracks.shape[0])

Unnamed: 0,track_id,song_id,artist_name,song_title
0,TRMMMYQ128F932D901,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
1,TRMMMKD128F425225D,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
2,TRMMMRX128F93187D9,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
3,TRMMMCH128F425532C,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
4,TRMMMWA128F426B589,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens


[i] Number of records: 1000000


In [None]:
# 'track_id' is the unique database key ("db-key", to avoid confusion with the
# musical key of a song). A left join keeps exactly the rows of songs_conf and
# attaches the matching track information; validate='one_to_one' makes pandas
# raise if 'track_id' is duplicated on either side.
songs_conf_tracks = pd.merge(
    songs_conf,
    unique_tracks,
    how='left',
    on='track_id',
    validate='one_to_one',
)

In [None]:
# Preview the merged table and print its number of rows.
display(songs_conf_tracks.head(n=5))
print(songs_conf_tracks.shape[0])

In [None]:
# TODO: get rid of the columns we don't need.
# It is easier to select the columns we want to keep, so list all of them first.
print(songs_conf_tracks.columns.tolist())

In [None]:
# Columns to carry over into the final table, in output order.
keep_cols = [
    'duration',
    'end_of_fade_in',
    'key',
    'key_confidence',
    'mode',
    'mode_confidence',
    'start_of_fade_out',
    'tempo',
    'track_id',
    'song_id',
    'artist_name',
    'song_title',
]

# Select the listed columns, then preview the result and print its row count.
songs_conf_tracks_filt = songs_conf_tracks.filter(items=keep_cols, axis='columns')

display(songs_conf_tracks_filt.head(n=5))
print(songs_conf_tracks_filt.shape[0])

In [None]:
# Save the filtered song table as a CSV file in the current working directory.
csv_path = 'songs_conf=75_tracks_filt.csv'
songs_conf_tracks_filt.to_csv(csv_path)