# Million Song Dataset (MSD) to TFRecords converter

In [2]:
# Imports

import os
import numpy as np
import tensorflow as tf
import h5py

In [15]:
def get_path_from_id(ID, base):
    """Build the .h5 path for a track ID and check that the file exists.

    Characters 2, 3 and 4 of ``ID`` each become one nested folder level
    below ``base``; the file itself is named ``<ID>.h5``. ``base`` is
    concatenated as-is, so it must already end with a path separator.

    Returns the full path, or None (after printing an error line) when no
    file exists at that location.
    """
    # One directory level per character of the ID slice
    folders = "".join(letter + '/' for letter in ID[2:5])
    candidate = base + folders + ID + ".h5"

    # Sanity check: only hand back paths that point at a real file
    if os.path.isfile(candidate):
        return candidate

    print('ERROR:',candidate,'does not exist.')
    return None

def numpy_to_map(items):
    """Convert a structured NumPy array into a list of dicts, one per record.

    Each dict maps a field name to that record's value for the field.

    Field names are taken from ``items.dtype.names`` rather than
    ``items.dtype.fields``: ``fields`` also contains an extra key for every
    field *title*, so zipping it against a record pairs values with the
    wrong keys (and drops real fields) whenever the dtype declares titles.
    ``names`` holds exactly one entry per field, in field order.
    """
    names = items.dtype.names
    return [dict(zip(names, record)) for record in items]

def h5_to_map(node):
    """Recursively convert a mapping-like node into nested dicts of arrays.

    A node that offers ``keys()`` is treated as a container: every key is
    converted recursively and the results are collected in a dict. Any
    other node is treated as a leaf and copied into a NumPy array; when
    that array has a structured dtype it is further expanded into a list
    of per-record dicts via ``numpy_to_map``.

    Only ``AttributeError`` is taken to mean "this node is a leaf". A bare
    ``except`` here would also swallow KeyboardInterrupt/SystemExit and
    hide genuine failures raised while listing the keys.
    """
    try:
        keys = node.keys()
    except AttributeError:
        # No keys() -> leaf node: materialise its contents as an array
        item = np.array(node)
        if item.dtype.fields is not None:
            item = numpy_to_map(item)
        return item

    return {key: h5_to_map(node[key]) for key in keys}

In [4]:
def flatten_map(node):
    """Collapse nested dicts into a single-level dict with '/'-joined keys.

    A value whose type is exactly ``dict`` is flattened recursively and its
    entries are stored under ``"<parent>/<child>"``. Every other value
    (including lists and dict subclasses) is copied through unchanged.
    """
    flat = {}
    for name in node.keys():
        child = node[name]

        # Leaves are stored under their own key as-is
        if type(child) is not dict:
            flat[name] = child
            continue

        # Sub-dicts: prefix each flattened entry with the parent key
        for sub_name, leaf in flatten_map(child).items():
            flat[name+"/"+sub_name] = leaf

    return flat
            
def numpy_to_TFFeature(item):
    """Wrap a NumPy array (or NumPy scalar) in a ``tf.train.Feature``.

    - 0-d input is promoted to a one-element vector.
    - Input with more than one dimension is serialized with
      ``tf.io.serialize_tensor`` and stored as a single bytes value.
    - 1-d floating-point input becomes a ``float_list``.
    - 1-d integer input becomes an ``int64_list``.
    - Anything else is handed to a ``bytes_list`` unchanged (expected to be
      byte strings).

    ``item`` must expose ``.shape`` and ``.dtype``.
    """
    # Scalars: promote to a vector so every feature holds a list of values
    if len(item.shape) == 0:
        item = np.array([item])

    if len(item.shape) != 1:
        # Hand the serialized bytes straight to BytesList. Routing them
        # through a fixed-width NumPy bytes array strips trailing b'\x00'
        # bytes and would corrupt the serialized tensor.
        tensor = tf.io.serialize_tensor(item)
        value = tf.train.BytesList(value = [tensor.numpy()])
        return tf.train.Feature(bytes_list = value)

    # Match on the dtype family rather than one exact width, so float32 or
    # int64 arrays are not misrouted to the bytes branch. tolist() yields
    # native Python numbers for the protobuf containers.
    if np.issubdtype(item.dtype, np.floating):
        value = tf.train.FloatList(value = item.tolist())
        return tf.train.Feature(float_list = value)
    elif np.issubdtype(item.dtype, np.integer):
        value = tf.train.Int64List(value = item.tolist())
        return tf.train.Feature(int64_list = value)
    else: # Only String type remains
        value = tf.train.BytesList(value = item)
        return tf.train.Feature(bytes_list = value)

def map_to_TFExample(obj):
    """Build a ``tf.train.Example`` from a flat ``{name: array}`` mapping.

    Every value is converted with ``numpy_to_TFFeature`` and stored under
    its original key.
    """
    converted = {name: numpy_to_TFFeature(value) for name, value in obj.items()}
    return tf.train.Example(features = tf.train.Features(feature = converted))


def song_to_tfrecord(ID, base='../msd/data/'):
    """Load one track's HDF5 file and convert it to a ``tf.train.Example``.

    Parameters:
        ID:   track identifier, resolved to a file by ``get_path_from_id``.
        base: root data directory, passed through to ``get_path_from_id``
              (which concatenates it as-is, so keep the trailing slash).

    Returns the ``tf.train.Example`` built from the flattened file contents.

    Raises FileNotFoundError when no file exists for ``ID``, instead of
    passing ``None`` on to ``h5py.File``.

    NOTE(review): compound datasets come back from ``h5_to_map`` as lists
    of dicts, which ``flatten_map`` passes through unchanged, while
    ``numpy_to_TFFeature`` reads ``.shape`` on every value. Such entries
    look like they would raise AttributeError here; confirm against a real
    file.
    """
    path = get_path_from_id(ID, base)
    if path is None:
        raise FileNotFoundError('No HDF5 file found for track ' + str(ID))

    # h5_to_map copies every leaf into memory with np.array, so the file
    # can be closed as soon as the tree has been built.
    with h5py.File(path, 'r') as file_song:
        tree = h5_to_map(file_song)

    # Named to avoid shadowing the built-in map()
    flat_features = flatten_map(tree)

    return map_to_TFExample(flat_features)

In [2]:
import json

# Read the track/genre listing into memory as parsed JSON.
with open('balanced_track_with_genre.json') as json_file:
    raw_text = json_file.read()
    track_json = json.loads(raw_text)
    


{'track_id': 'TRZWOFZ12903CEB1D8', 'genre': 'hip-hop'}

In [5]:
import os
import h5py
import numpy as np

# Resolve one sample track ID to its .h5 file under the local data directory.
# get_path_from_id returns None (after printing an error) if the file is missing.
path = get_path_from_id('TRAAAAK128F9318786', '../msd/data/')                     

# Opened read-only and not closed here: a later cell reads from `file_song`.
file_song = h5py.File(path, 'r')

In [16]:
# Show the 'songs' entry of the 'metadata' group after conversion
# (a list with one dict per record of the compound dataset).
h5_to_map(file_song)['metadata']['songs']

[{'analyzer_version': b'',
  'artist_7digitalid': 324573,
  'artist_familiarity': 0.6399025154955147,
  'artist_hotttnesss': 0.46131833754118956,
  'artist_id': b'ARJNIUY12298900C91',
  'artist_latitude': nan,
  'artist_location': b'',
  'artist_longitude': nan,
  'artist_mbid': b'6ae6a016-91d7-46cc-be7d-5e8e5d320c54',
  'artist_name': b'Adelitas Way',
  'artist_playmeid': 166043,
  'genre': b'',
  'idx_artist_terms': 0,
  'idx_similar_artists': 0,
  'release': b'Adelitas Way',
  'release_7digitalid': 497103,
  'song_hotttnesss': 0.7333716199617285,
  'song_id': b'SOBLFFE12AF72AA5BA',
  'title': b'Scream',
  'track_7digitalid': 5504670}]