# Train a model for song similarity

In [None]:
__import__('sys').path.append('../scripts/'); __import__('notebook_utils').table_of_contents('song_similarity_training.ipynb')

Steps:
1. **Get matching songs** between FMA & MSD
2. **Get all metadata** for the matching songs
3. **Find similar metadata** for both datasets
4. **Write code for the similarity model**, deciding what input and output we would have
5. **Create X and y from the .X dataset**
6. **Train the model**

## Setup

In [69]:
# IMPORTS
import json
import os
import numpy as np
import pandas as pd
import sys

import h5py  # for reading MSD .h5 files


from notebook_utils import md, h3, h4, h5
import FMA_code.utils

In [70]:
# PATHS
class paths():
    # General
    DATA_F = 'data/'

    # MSD files
    MSD_F = DATA_F + 'MSD/'
    SIMILAR_TRACKS = MSD_F + 'tracks_with_similar.txt'
    LASTFM_SUBSET_F = MSD_F + 'lastfm_subset/'
    LASTFM_TRAIN_F = MSD_F + 'lastfm_train/'
    LASTFM_TEST_F = MSD_F + 'lastfm_test/'
    MSD_SUBSET_F = MSD_F + 'MillionSongSubset/'
    MSD_SUMMARY = MSD_F + 'msd_summary_file.h5'

    # FMA files
    FMA_F = DATA_F + 'FMA/'
    FMA_METADATA_F = FMA_F + 'fma_metadata/'
    FMA_SMALL_F = FMA_F + 'fma_small/'

In [71]:
# LOAD DATA
# FMA
FMA_tracks = FMA_code.utils.load(paths.FMA_METADATA_F + 'tracks.csv')
FMA_genres = FMA_code.utils.load(paths.FMA_METADATA_F + 'genres.csv')
FMA_features = FMA_code.utils.load(paths.FMA_METADATA_F + 'features.csv')
FMA_echonest = FMA_code.utils.load(paths.FMA_METADATA_F + 'echonest.csv')

# MSD
msd_summary_h5 = h5py.File(paths.MSD_SUMMARY, 'r')

# LAST.FM

## Show all available metadata

In [None]:
# SHOW METADATA FOR ONE EXAMPLE OF LAST.FM SUBSET
lastfm_ex_path = paths.LASTFM_SUBSET_F + 'A/A/A/TRAAAAW128F429D538.json'
with open(lastfm_ex_path) as f:
    lastfm_ex = json.load(f)

for key in lastfm_ex.keys():
    print(f'{key:<15}', lastfm_ex[key][:2] + ['...'] if isinstance(lastfm_ex[key], list) else lastfm_ex[key])

artist          Casual
timestamp       2011-08-02 20:13:25.674526
similars        [['TRABACN128F425B784', 0.871737], ['TRIAINV12903CB4943', 0.751301], '...']
tags            [['Bay Area', '100'], ['hieroglyiphics', '100'], '...']
track_id        TRAAAAW128F429D538
title           I Didn't Mean To


In [89]:
# SHOW ALL METADATA FOR ONE EXAMPLE OF MSD
msd_ex_path = paths.MSD_SUBSET_F + 'A/X/A/TRAXAAS128F146B741.h5'
h5 = h5py.File(msd_ex_path,'r')
for key in h5.keys():
    print('\n',key)
    for key2 in h5[key].keys():
        print('\t', key2, h5[key][key2].shape)

        # Check if there are extra fields
        fields = h5[key][key2].dtype.fields
        if fields:
            for field in fields:
                print('\t\t', field, h5[key][key2][field])
        else:
            print('\t\t', h5[key][key2][:7])
            # print('\t\t', h5[key][key2].dtype.fields)


 analysis
	 bars_confidence (77,)
		 [0.01  0.406 0.277 0.416 0.534 0.691 0.275]
	 bars_start (77,)
		 [ 0.78155  2.50917  4.22547  5.93918  7.6545   9.36698 11.07402]
	 beats_confidence (311,)
		 [0.928 0.632 0.64  0.407 0.562 0.78  0.665]
	 beats_start (311,)
		 [0.34974 0.78155 1.21612 1.64698 2.07729 2.50917 2.93462]
	 sections_confidence (7,)
		 [1.    0.866 0.437 0.637 0.791 0.879 0.739]
	 sections_start (7,)
		 [  0.        8.08448  37.03275  66.6059   78.64275  91.19099 102.78459]
	 segments_confidence (477,)
		 [0.    1.    1.    1.    1.    0.949 1.   ]
	 segments_loudness_max (477,)
		 [-60.    -10.943  -6.516  -9.45   -6.962  -6.652  -9.828]
	 segments_loudness_max_time (477,)
		 [0.      0.05481 0.06717 0.02053 0.0334  0.03042 0.02525]
	 segments_loudness_start (477,)
		 [-60.    -60.    -46.214 -29.701 -33.415 -20.91  -38.481]
	 segments_pitches (477, 12)
		 [[1.    0.906 0.239 0.2   0.177 0.246 0.268 0.212 0.22  0.226 0.23  0.366]
 [0.076 0.058 0.188 0.168 0.103 0.091 0

In [117]:
# SHOW ALL METADATA KEYS FOR ONE EXAMPLE OF MSD
for key in h5.keys():
    print('\n',key)
    for key2 in h5[key].keys():
        print('\t', key2, h5[key][key2].shape)

        # Check if there are extra fields
        fields = h5[key][key2].dtype.fields
        if fields:
            for field in fields:
                print('\t\t', field, h5[key][key2][field])
        # else:
        #     print('\t\t', h5[key][key2][:7])



 analysis
	 bars_confidence (83,)
	 bars_start (83,)
	 beats_confidence (344,)
	 beats_start (344,)
	 sections_confidence (10,)
	 sections_start (10,)
	 segments_confidence (971,)
	 segments_loudness_max (971,)
	 segments_loudness_max_time (971,)
	 segments_loudness_start (971,)
	 segments_pitches (971, 12)
	 segments_start (971,)
	 segments_timbre (971, 12)
	 songs (1,)
		 analysis_sample_rate [22050]
		 audio_md5 [b'a222795e07cd65b7a530f1346f520649']
		 danceability [0.]
		 duration [218.93179]
		 end_of_fade_in [0.247]
		 energy [0.]
		 idx_bars_confidence [0]
		 idx_bars_start [0]
		 idx_beats_confidence [0]
		 idx_beats_start [0]
		 idx_sections_confidence [0]
		 idx_sections_start [0]
		 idx_segments_confidence [0]
		 idx_segments_loudness_max [0]
		 idx_segments_loudness_max_time [0]
		 idx_segments_loudness_start [0]
		 idx_segments_pitches [0]
		 idx_segments_start [0]
		 idx_segments_timbre [0]
		 idx_tatums_confidence [0]
		 idx_tatums_start [0]
		 key [1]
		 key_confidence

In [None]:
# SHOW ALL METADATA FOR THE MSD SUMMARY
h5 = h5py.File(paths.MSD_SUMMARY,'r')
for key in h5.keys():
    print('\n',key)
    for key2 in h5[key].keys():
        print('\t', key2, h5[key][key2].shape)

        # Check if there are extra fields
        fields = h5[key][key2].dtype.fields
        if fields:
            for field in fields:
                print('\t\t', field, h5[key][key2][field])
        else:
            print('\t\t', h5[key][key2][:7])
            # print('\t\t', h5[key][key2].dtype.fields)


 analysis
	 songs (1000000,)
		 analysis_sample_rate [22050 22050 22050 ... 22050 22050 22050]
		 audio_md5 [b'aee9820911781c734e7694c5432990ca' b'ed222d07c83bac7689d52753610a513a'
 b'96c7104889a128fef84fa469d60e380c' ...
 b'7d065b833e183244a3c3ed023fcbb70a' b'32473a8e2d20f3efbdcb3caa57d4bf35'
 b'7c4a1f610c8f73d467a1463027a8bc40']
		 danceability [0. 0. 0. ... 0. 0. 0.]
		 duration [252.05506 156.55138 138.97098 ... 553.03791 484.51873 295.07873]
		 end_of_fade_in [2.049 0.258 0.    ... 0.223 0.595 0.   ]
		 energy [0. 0. 0. ... 0. 0. 0.]
		 idx_bars_confidence [0 0 0 ... 0 0 0]
		 idx_bars_start [0 0 0 ... 0 0 0]
		 idx_beats_confidence [0 0 0 ... 0 0 0]
		 idx_beats_start [0 0 0 ... 0 0 0]
		 idx_sections_confidence [0 0 0 ... 0 0 0]
		 idx_sections_start [0 0 0 ... 0 0 0]
		 idx_segments_confidence [0 0 0 ... 0 0 0]
		 idx_segments_loudness_max [0 0 0 ... 0 0 0]
		 idx_segments_loudness_max_time [0 0 0 ... 0 0 0]
		 idx_segments_loudness_start [0 0 0 ... 0 0 0]
		 idx_segments_pitc

In [88]:
# Check % of 0s for danceability & energy
(msd_summary_h5['analysis']['songs']['danceability'] == 0).sum()
(msd_summary_h5['analysis']['songs']['energy'] == 0).sum()

# msd_summary_h5['analysis']['songs']['energy'].sum()

np.int64(1000000)

In [75]:
# NOW GET THE FMA ONE :)
artist_name = 'Chandeliers'
title = 'The Durks'

# Get all related metadata
FMA_tracks.columns
m = (FMA_tracks['artist']['name'] == artist_name) & (FMA_tracks['track']['title'] == title)

pd.set_option('display.max_columns', None)
display(FMA_tracks[m])

track_id = FMA_tracks[m].index[0]
# display(FMA_features[FMA_features.index == track_id])
display(FMA_echonest[FMA_echonest.index == track_id])

Unnamed: 0_level_0,album,album,album,album,album,album,album,album,album,album,album,album,album,artist,artist,artist,artist,artist,artist,artist,artist,artist,artist,artist,artist,artist,artist,artist,artist,artist,set,set,track,track,track,track,track,track,track,track,track,track,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,tags,title,tracks,type,active_year_begin,active_year_end,associated_labels,bio,comments,date_created,favorites,id,latitude,location,longitude,members,name,related_projects,tags,website,wikipedia_page,split,subset,bit_rate,comments,composer,date_created,date_recorded,duration,favorites,genre_top,genres,genres_all,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2
18805,0,2009-09-16 13:14:52,2009-09-22,Chandeliers,0,4293,<p>Chicago's CHANDELIERS are totally uplifting...,3931,Chandeliers,[],Dirty Moves (Sampler),5,Single Tracks,2004-01-01,NaT,"Captcha Records (HBSP-2X), Pickled Egg (Europe)","<p><span style=""font-family:Verdana, Geneva, A...",1,2008-11-26 02:05:22,8,100,41.878114,"Chicago, IL",-87.629798,"Chris Kalis, Harry Brenner, Scott McGaughey, B...",Chandeliers,"Killer Whales, \nMichael Columbia\nMandate\nMr...",[chandeliers],thechandeliers.com,,training,medium,256000,0,,2009-09-16 13:16:39,NaT,63,2,Electronic,[15],[15],,1705,en,Attribution-Noncommercial-Share Alike 3.0 Unit...,602,,4,,[],The Durks


Unnamed: 0_level_0,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest
Unnamed: 0_level_1,audio_features,audio_features,audio_features,audio_features,audio_features,audio_features,audio_features,audio_features,metadata,metadata,metadata,metadata,metadata,metadata,metadata,ranks,ranks,ranks,ranks,ranks,social_features,social_features,social_features,social_features,social_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features
Unnamed: 0_level_2,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence,album_date,album_name,artist_latitude,artist_location,artist_longitude,artist_name,release,artist_discovery_rank,artist_familiarity_rank,artist_hotttnesss_rank,song_currency_rank,song_hotttnesss_rank,artist_discovery,artist_familiarity,artist_hotttnesss,song_currency,song_hotttnesss,000,001,002,003,004,005,006,007,008,009,010,011,012,013,014,015,016,017,018,019,020,021,022,023,024,025,026,027,028,029,030,031,032,033,034,035,036,037,038,039,040,041,042,043,044,045,046,047,048,049,050,051,052,053,054,055,056,057,058,059,060,061,062,063,064,065,066,067,068,069,070,071,072,073,074,075,076,077,078,079,080,081,082,083,084,085,086,087,088,089,090,091,092,093,094,095,096,097,098,099,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3,Unnamed: 24_level_3,Unnamed: 25_level_3,Unnamed: 26_level_3,Unnamed: 27_level_3,Unnamed: 28_level_3,Unnamed: 29_level_3,Unnamed: 30_level_3,Unnamed: 31_level_3,Unnamed: 32_level_3,Unnamed: 33_level_3,Unnamed: 34_level_3,Unnamed: 35_level_3,Unnamed: 36_level_3,Unnamed: 37_level_3,Unnamed: 38_level_3,Unnamed: 39_level_3,Unnamed: 40_level_3,Unnamed: 41_level_3,Unnamed: 42_level_3,Unnamed: 43_level_3,Unnamed: 44_level_3,Unnamed: 45_level_3,Unnamed: 46_level_3,Unnamed: 47_level_3,Unnamed: 48_level_3,Unnamed: 49_level_3,Unnamed: 50_level_3,Unnamed: 51_level_3,Unnamed: 52_level_3,Unnamed: 53_level_3,Unnamed: 54_level_3,Unnamed: 55_level_3,Unnamed: 56_level_3,Unnamed: 57_level_3,Unnamed: 58_level_3,Unnamed: 59_level_3,Unnamed: 60_level_3,Unnamed: 61_level_3,Unnamed: 62_level_3,Unnamed: 63_level_3,Unnamed: 64_level_3,Unnamed: 65_level_3,Unnamed: 66_level_3,Unnamed: 67_level_3,Unnamed: 68_level_3,Unnamed: 69_level_3,Unnamed: 70_level_3,Unnamed: 71_level_3,Unnamed: 72_level_3,Unnamed: 73_level_3,Unnamed: 74_level_3,Unnamed: 75_level_3,Unnamed: 76_level_3,Unnamed: 77_level_3,Unnamed: 78_level_3,Unnamed: 79_level_3,Unnamed: 80_level_3,Unnamed: 81_level_3,Unnamed: 82_level_3,Unnamed: 83_level_3,Unnamed: 84_level_3,Unnamed: 85_level_3,Unnamed: 86_level_3,Unnamed: 87_level_3,Unnamed: 88_level_3,Unnamed: 89_level_3,Unnamed: 90_level_3,Unnamed: 91_level_3,Unnamed: 92_level_3,Unnamed: 93_level_3,Unnamed: 94_level_3,Unnamed: 95_level_3,Unnamed: 96_level_3,Unnamed: 97_level_3,Unnamed: 98_level_3,Unnamed: 99_level_3,Unnamed: 100_level_3,Unnamed: 101_level_3,Unnamed: 102_level_3,Unnamed: 103_level_3,Unnamed: 104_level_3,Unnamed: 105_level_3,Unnamed: 106_level_3,Unnamed: 107_level_3,Unnamed: 108_level_3,Unnamed: 109_level_3,Unnamed: 110_level_3,Unnamed: 111_level_3,Unnamed: 112_level_3,Unnamed: 113_level_3,Unnamed: 114_level_3,Unnamed: 115_level_3,Unnamed: 116_level_3,Unnamed: 117_level_3,Unnamed: 118_level_3,Unnamed: 119_level_3,Unnamed: 120_level_3,Unnamed: 121_level_3,Unnamed: 122_level_3,Unnamed: 123_level_3,Unnamed: 124_level_3,Unnamed: 125_level_3,Unnamed: 126_level_3,Unnamed: 127_level_3,Unnamed: 128_level_3,Unnamed: 129_level_3,Unnamed: 130_level_3,Unnamed: 131_level_3,Unnamed: 132_level_3,Unnamed: 133_level_3,Unnamed: 134_level_3,Unnamed: 135_level_3,Unnamed: 136_level_3,Unnamed: 137_level_3,Unnamed: 138_level_3,Unnamed: 139_level_3,Unnamed: 140_level_3,Unnamed: 141_level_3,Unnamed: 142_level_3,Unnamed: 143_level_3,Unnamed: 144_level_3,Unnamed: 145_level_3,Unnamed: 146_level_3,Unnamed: 147_level_3,Unnamed: 148_level_3,Unnamed: 149_level_3,Unnamed: 150_level_3,Unnamed: 151_level_3,Unnamed: 152_level_3,Unnamed: 153_level_3,Unnamed: 154_level_3,Unnamed: 155_level_3,Unnamed: 156_level_3,Unnamed: 157_level_3,Unnamed: 158_level_3,Unnamed: 159_level_3,Unnamed: 160_level_3,Unnamed: 161_level_3,Unnamed: 162_level_3,Unnamed: 163_level_3,Unnamed: 164_level_3,Unnamed: 165_level_3,Unnamed: 166_level_3,Unnamed: 167_level_3,Unnamed: 168_level_3,Unnamed: 169_level_3,Unnamed: 170_level_3,Unnamed: 171_level_3,Unnamed: 172_level_3,Unnamed: 173_level_3,Unnamed: 174_level_3,Unnamed: 175_level_3,Unnamed: 176_level_3,Unnamed: 177_level_3,Unnamed: 178_level_3,Unnamed: 179_level_3,Unnamed: 180_level_3,Unnamed: 181_level_3,Unnamed: 182_level_3,Unnamed: 183_level_3,Unnamed: 184_level_3,Unnamed: 185_level_3,Unnamed: 186_level_3,Unnamed: 187_level_3,Unnamed: 188_level_3,Unnamed: 189_level_3,Unnamed: 190_level_3,Unnamed: 191_level_3,Unnamed: 192_level_3,Unnamed: 193_level_3,Unnamed: 194_level_3,Unnamed: 195_level_3,Unnamed: 196_level_3,Unnamed: 197_level_3,Unnamed: 198_level_3,Unnamed: 199_level_3,Unnamed: 200_level_3,Unnamed: 201_level_3,Unnamed: 202_level_3,Unnamed: 203_level_3,Unnamed: 204_level_3,Unnamed: 205_level_3,Unnamed: 206_level_3,Unnamed: 207_level_3,Unnamed: 208_level_3,Unnamed: 209_level_3,Unnamed: 210_level_3,Unnamed: 211_level_3,Unnamed: 212_level_3,Unnamed: 213_level_3,Unnamed: 214_level_3,Unnamed: 215_level_3,Unnamed: 216_level_3,Unnamed: 217_level_3,Unnamed: 218_level_3,Unnamed: 219_level_3,Unnamed: 220_level_3,Unnamed: 221_level_3,Unnamed: 222_level_3,Unnamed: 223_level_3,Unnamed: 224_level_3,Unnamed: 225_level_3,Unnamed: 226_level_3,Unnamed: 227_level_3,Unnamed: 228_level_3,Unnamed: 229_level_3,Unnamed: 230_level_3,Unnamed: 231_level_3,Unnamed: 232_level_3,Unnamed: 233_level_3,Unnamed: 234_level_3,Unnamed: 235_level_3,Unnamed: 236_level_3,Unnamed: 237_level_3,Unnamed: 238_level_3,Unnamed: 239_level_3,Unnamed: 240_level_3,Unnamed: 241_level_3,Unnamed: 242_level_3,Unnamed: 243_level_3,Unnamed: 244_level_3,Unnamed: 245_level_3,Unnamed: 246_level_3,Unnamed: 247_level_3,Unnamed: 248_level_3,Unnamed: 249_level_3
18805,0.61392,0.535189,0.53458,0.971656,0.119325,0.030479,109.976,0.063106,,,30.2676,"Austin, TX, US",-97.743,Chandeliers,Dirty Moves (Sampler),,,,,,0.364921,0.400958,0.381724,0.0,0.0,0.203455,0.572626,0.222005,0.163111,0.208889,0.273975,0.302328,0.585904,0.277035,0.158101,0.256278,0.148349,0.166,0.67,0.1725,0.084,0.077,0.1565,0.238,0.5525,0.212,0.1015,0.1825,0.0895,0.027242,0.178368,0.030296,0.029767,0.063439,0.081689,0.052736,0.154852,0.03836,0.024448,0.05641,0.023361,0.007,0.012,0.019,0.006,0.005,0.013,0.006,0.011,0.029,0.005,0.008,0.01,0.827,1.0,1.0,0.85,1.0,1.0,1.0,1.0,1.0,0.734,1.0,0.936,0.82,0.988,0.981,0.844,0.995,0.987,0.994,0.989,0.971,0.729,0.992,0.926,1.318146,-0.117618,1.364685,1.443761,1.484357,1.341475,1.005804,-0.084406,1.535132,1.472825,1.445393,2.004295,1.995753,-1.838186,2.4678,1.798367,1.371141,0.683816,0.454619,-1.751035,2.375523,1.732423,1.708803,5.020381,43.712761,-65.023422,53.680927,19.976589,-53.479389,-9.841823,33.473156,-7.439671,26.321524,-5.126926,3.370681,13.146536,45.168499,-59.052002,51.225502,17.855499,-53.399002,-12.282499,31.731499,-6.0925,28.127501,-6.011,5.2925,14.789,26.110897,2984.777832,1272.178467,1258.61792,515.033875,479.480469,319.32077,418.779083,406.178375,290.687134,222.482498,291.502655,0.0,-217.011993,-30.575001,-219.222,-118.733002,-59.769001,-37.588001,-72.334999,-47.832001,-47.616001,-71.93,-35.972,49.424,171.130005,167.235001,230.742004,57.491001,103.685997,84.244003,96.764999,69.736,103.149002,66.152,50.073002,49.424,388.141998,197.809998,449.963989,176.223999,163.455002,121.832001,169.100006,117.568001,150.764999,138.082001,86.044998,-4.124304,0.062859,0.169484,-0.353582,0.584577,1.1757,-0.172498,0.234465,-0.576732,1.221359,-0.950839,-0.414168,27.243866,1.327363,-0.115413,15.613863,3.088614,3.735168,0.817082,2.829525,0.765777,7.210759,6.351656,-0.142011,-12.991006,-11.1445,28.616385,-60.0,-7.382,52.618,-4.108032,29.148115,0.064626,0.045675,0.005626,0.0,0.75993,0.75993,5.308431,39.285461,-19.458422,-18.2185,31.489656,-60.0,-11.608,48.391998,-4.233589,26.017132,0.322664,0.27293,0.047047,0.07814,1.99161,1.91347,3.093815,17.393087


In [37]:
# Extract titles & artists for MSD tracks that match to FMA
titles = [a.decode('utf-8') for t, a in zip(h5['analysis']['songs']['track_id'], h5['metadata']['songs']['title']) if t.decode('utf-8') in MSD_track_ids]
artists = [a.decode('utf-8') for t, a in zip(h5['analysis']['songs']['track_id'], h5['metadata']['songs']['artist_name']) if t.decode('utf-8') in MSD_track_ids]

assert len(titles) == len(artists) & len(titles) == len(MSD_track_ids)

In [42]:
similars_t_a = {(v['title'], v['artist']) for v in similars_dict.values()}

In [48]:
len([(t, a, tr) for t, a, tr in zip(titles, artists, MSD_track_ids) if (t, a) in similars_t_a])

400

In [None]:
# MANUALLY SEE IF ANYTHING MATCHES :)

# 1. Get a random title, artist pair from the MSD & get all data from it


## Get matching songs between FMA & LAST.FM

In [6]:
# GET LASTFM-FMA MATCHING TRACKS

# Get title & artist tuple set for FMA
FMA_title_artist_set = set(zip(FMA_tracks[('track', 'title')], FMA_tracks[('artist', 'name')]))

# Get list of tracks with similar tracks on last.fm
with open(paths.SIMILAR_TRACKS) as f:
    tracks_with_similar = f.read().splitlines()

# Get MSD tracks that match FMA & save it in similars_dict
notfound = 0
similars_dict = {}

for track_id in tracks_with_similar:
    rel_track_path = f'{track_id[2]}/{track_id[3]}/{track_id[4]}/{track_id}.json'

    # Get the last.fm folder (train/test) where the track is
    for supfolder in [paths.LASTFM_TEST_F, paths.LASTFM_TRAIN_F]:
        track_path = supfolder + rel_track_path
        if os.path.exists(track_path):
            break
    
    # Extrac the data from the track
    with open(track_path) as f:
        track_dict = json.load(f)

        # Get the title & artist tuple
        title = track_dict['title']
        artist = track_dict['artist']

        # Check if the title & artist match
        if (title, artist) in FMA_title_artist_set:
            similars_dict[track_id] = {
                'similars': track_dict['similars'],
                'artist': track_dict['artist'],
                'title': track_dict['title']   
            }

print(f"FMA and MSD match in {len(similars_dict)} tracks where MSD has similar tracks")

FMA and MSD match in 453 tracks where MSD has similar tracks


In [10]:
# EXTRACT MSD SIMILAR TRACKS
MSD_track_ids = similars_dict.keys()
MSD_track_ids

dict_keys(['TRACCYC128F930B379', 'TRADPJD12903D031C6', 'TRAEJYY128F4218203', 'TRAGYML128F4286683', 'TRAIZBQ128F92F02A7', 'TRAJXWL12903D0537F', 'TRAKKTI128F42B4F25', 'TRALDEB128F428CE7A', 'TRALSLF128F421E8FA', 'TRAMUUZ128F933CA08', 'TRAODHR128F4280D21', 'TRAPJCE128F933EFB6', 'TRAQEZO128F9345243', 'TRAUENB128F9302207', 'TRAULSR128F933DDA6', 'TRAYXFP128F93328B1', 'TRAYYKZ128F148F79E', 'TRBBDLJ128F933B13D', 'TRBBJRH128F92E321D', 'TRBHZIR128F42B9D3F', 'TRBIXYY128F427568E', 'TRBKXPB128F92FB059', 'TRBNTZP12903CB9C0D', 'TRBNYIW128F932D641', 'TRBOHGT128F933DDA1', 'TRBRHZB128F4260D0F', 'TRBRMXW128F4287365', 'TRBRRLH128F147F9B8', 'TRBSGIV128F933EFAE', 'TRBSVSB128F93108D6', 'TRBWOJQ128F933EFAC', 'TRBXKIE128F425BA78', 'TRBYHXS128F42720D9', 'TRCBCCT128F932345A', 'TRCDHJF128F4298A1C', 'TRCDLEX128F9309413', 'TRCFKBD128F93108D0', 'TRCJWFM12903CC9324', 'TRCKKGX12903CD2D06', 'TRCLAGG12903CEE39C', 'TRCLAJZ128F931528E', 'TRCMWIQ128F93167E0', 'TRCQXCN128F933EFB1', 'TRCUAAZ128F933EBB4', 'TRCVJZX128F42553CE',

## Prepare the model's input and output

In [10]:
# PREPARE THE INPUT

In [None]:
# PREPARE THE OUTPUT