In [1]:
import pandas as pd
import numpy as np
import h5py

# Show every column and every row when a frame is displayed (None = no limit).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
file_path = "msd_summary_file.h5"
dataset_name = "metadata/songs"


def print_datasets(name, obj):
    """Visitor for ``visititems``: print ``name`` only when ``obj`` is a dataset."""
    if not isinstance(obj, h5py.Dataset):
        return
    print(name)


with h5py.File(file_path, "r") as hdf:
    # Walk the whole file and list the path of every dataset found.
    hdf.visititems(print_datasets)

    # Read the selected dataset in full while the file is still open.
    data = hdf[dataset_name][:]

# Wrap the loaded records in a DataFrame.
df = pd.DataFrame(data)

analysis/songs
metadata/songs
musicbrainz/songs


In [3]:
file_path = "msd_summary_file.h5"


def _load_songs_table(h5file, dataset_key):
    """Read one dataset of an open HDF5 file into a DataFrame.

    Parameters
    ----------
    h5file
        The open file handle; it is only used for ``dataset_key in h5file``
        and ``h5file[dataset_key][:]``.
    dataset_key : str
        Path of the dataset inside the file, e.g. ``'analysis/songs'``.

    Returns
    -------
    pd.DataFrame
        The dataset's records, or an empty DataFrame when the key is absent.
    """
    if dataset_key not in h5file:
        # Keep the best-effort fallback, but make it visible: an empty frame
        # would otherwise pass silently into the later merge.
        print(f"Warning: '{dataset_key}' not found in {file_path}; using an empty DataFrame.")
        return pd.DataFrame()
    return pd.DataFrame(h5file[dataset_key][:])


# All three tables are read inside the `with` block, before the file is closed.
with h5py.File(file_path, 'r') as h5file:
    analysis_df = _load_songs_table(h5file, 'analysis/songs')
    metadata_df = _load_songs_table(h5file, 'metadata/songs')
    musicbrainz_df = _load_songs_table(h5file, 'musicbrainz/songs')

print('Dataframes loaded.')

Dataframes loaded.


In [4]:
from itertools import zip_longest

# Column names (features) of each loaded table. The lists are left unpadded so
# they keep describing exactly the columns each frame has.
analysis_features = list(analysis_df.columns)
metadata_features = list(metadata_df.columns)
musicbrainz_features = list(musicbrainz_df.columns)

# Print header
print(f'{"Analysis Features":<40}{"Metadata Features":<40}{"Musicbrainz Features"}')
print()
# Print each row side by side. zip_longest substitutes '' once a shorter list
# runs out, so the lists do not have to be padded to a common length by hand.
for a, m, mb in zip_longest(analysis_features, metadata_features, musicbrainz_features, fillvalue=''):
    print(f'{a:<40}{m:<40}{mb}')


Analysis Features                       Metadata Features                       Musicbrainz Features

analysis_sample_rate                    analyzer_version                        idx_artist_mbtags
audio_md5                               artist_7digitalid                       year
danceability                            artist_familiarity                      
duration                                artist_hotttnesss                       
end_of_fade_in                          artist_id                               
energy                                  artist_latitude                         
idx_bars_confidence                     artist_location                         
idx_bars_start                          artist_longitude                        
idx_beats_confidence                    artist_mbid                             
idx_beats_start                         artist_name                             
idx_sections_confidence                 artist_playmeid            

In [5]:
def _decode_if_bytes(value):
    """Return ``value`` decoded as UTF-8 when it is ``bytes``, otherwise unchanged."""
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value


# Place the three tables side by side, turn byte strings into text for
# readability, then reverse the column order.
df = (
    pd.concat([analysis_df, metadata_df, musicbrainz_df], axis="columns")
    .map(_decode_if_bytes)
    .iloc[:, ::-1]
)

# Preview the merged frame.
df.head()

Unnamed: 0,year,idx_artist_mbtags,track_7digitalid,title,song_id,song_hotttnesss,release_7digitalid,release,idx_similar_artists,idx_artist_terms,genre,artist_playmeid,artist_name,artist_mbid,artist_longitude,artist_location,artist_latitude,artist_id,artist_hotttnesss,artist_familiarity,artist_7digitalid,analyzer_version,track_id,time_signature_confidence,time_signature,tempo,start_of_fade_out,mode_confidence,mode,loudness,key_confidence,key,idx_tatums_start,idx_tatums_confidence,idx_segments_timbre,idx_segments_start,idx_segments_pitches,idx_segments_loudness_start,idx_segments_loudness_max_time,idx_segments_loudness_max,idx_segments_confidence,idx_sections_start,idx_sections_confidence,idx_beats_start,idx_beats_confidence,idx_bars_start,idx_bars_confidence,energy,end_of_fade_in,duration,danceability,audio_md5,analysis_sample_rate
0,2003,0,7032331,Silent Night,SOQMMHC12AB0180CB8,0.542899,633681,Monster Ballads X-Mas,0,0,,44895,Faster Pussy cat,357ff05d-848a-44cf-b608-cb34b5701ae5,,,,ARYZTJS1187B98C555,0.394032,0.649822,4069,,TRMMMYQ128F932D901,0.94,4,87.002,236.635,0.688,0,-4.829,0.777,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,2.049,252.05506,0.0,aee9820911781c734e7694c5432990ca,22050
1,1995,0,1514808,Tanssi vaan,SOVFVAK12A8C1350D9,0.299877,145266,Karkuteillä,0,0,,-1,Karkkiautomaatti,8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9,,,,ARMVN3U1187FB3A1EB,0.356992,0.439604,113480,,TRMMMKD128F425225D,0.0,1,150.778,148.66,0.355,1,-10.555,0.808,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.258,156.55138,0.0,ed222d07c83bac7689d52753610a513a,22050
2,2006,0,6945353,No One Could Ever,SOGTUKN12AB017F4F1,0.617871,625706,Butter,0,0,,-1,Hudson Mohawke,3d403d44-36ce-465c-ad43-ae877e65adc4,-4.24251,"Glasgow, Scotland",55.8578,ARGEKB01187FB50750,0.437504,0.643681,63531,,TRMMMRX128F93187D9,0.446,4,177.768,138.971,0.566,1,-2.06,0.418,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,138.97098,0.0,96c7104889a128fef84fa469d60e380c,22050
3,2003,0,2168257,Si Vos Querés,SOBNYVR12A8C13558C,,199368,De Culo,0,0,,34000,Yerba Brava,12be7648-7094-495f-90e6-df4189d68615,,,,ARNWYLR1187B9B2F9C,0.372349,0.448501,65051,,TRMMMCH128F425532C,0.0,4,87.433,138.687,0.451,1,-4.654,0.125,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,145.05751,0.0,0f7da84b6b583e3846c7e022fb3a92a2,22050
4,0,0,2264873,Tangle Of Aspens,SOHSBXH12A8C13B0DF,,209038,Rene Ablaze Presents Winter Sessions,0,0,,-1,Der Mystic,,,,,AREQDTE1269FB37231,0.0,0.0,158279,,TRMMMWA128F426B589,0.315,4,140.035,506.717,0.29,0,-7.806,0.097,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,514.29832,0.0,228dd6392ad8001b0281f533f34c72fd,22050


In [6]:
# Columns that survive the trim; everything else is dropped.
columns_to_keep = [
    'track_id',
    'title',
    'artist_name',
    'artist_mbid',
    'artist_id',
    'release',
    'genre',
    'year',
    'duration',
]

# Drop every column whose label is not listed above; the remaining columns
# keep their current order.
unwanted_columns = df.columns.difference(columns_to_keep, sort=False)
df = df.drop(columns=unwanted_columns)
df.head()

Unnamed: 0,year,title,release,genre,artist_name,artist_mbid,artist_id,track_id,duration
0,2003,Silent Night,Monster Ballads X-Mas,,Faster Pussy cat,357ff05d-848a-44cf-b608-cb34b5701ae5,ARYZTJS1187B98C555,TRMMMYQ128F932D901,252.05506
1,1995,Tanssi vaan,Karkuteillä,,Karkkiautomaatti,8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9,ARMVN3U1187FB3A1EB,TRMMMKD128F425225D,156.55138
2,2006,No One Could Ever,Butter,,Hudson Mohawke,3d403d44-36ce-465c-ad43-ae877e65adc4,ARGEKB01187FB50750,TRMMMRX128F93187D9,138.97098
3,2003,Si Vos Querés,De Culo,,Yerba Brava,12be7648-7094-495f-90e6-df4189d68615,ARNWYLR1187B9B2F9C,TRMMMCH128F425532C,145.05751
4,0,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,,Der Mystic,,AREQDTE1269FB37231,TRMMMWA128F426B589,514.29832
