In [2]:
from pathlib import Path
import tempfile
import tarfile
import zstandard  

import os
from glob import iglob

import json
from collections import defaultdict
import numpy as np
import pandas as pd

In [4]:
#extract zst files function
# from https://gist.github.com/scivision/ad241e9cf0474e267240e196d7545eca
def extract_zst(archive: Path, out_path: Path):
    """extract .zst file
    works on Windows, Linux, MacOS, etc.
    
    Parameters
    ----------
    archive: pathlib.Path or str
      .zst file to extract
    out_path: pathlib.Path or str
      directory to extract files and directories to
    """

    archive = Path(archive).expanduser()
    out_path = Path(out_path).expanduser().resolve()
    # need .resolve() in case intermediate relative dir doesn't exist

    dctx = zstandard.ZstdDecompressor()

    with tempfile.TemporaryFile(suffix=".tar") as ofh:
        with archive.open("rb") as ifh:
            dctx.copy_stream(ifh, ofh)
        ofh.seek(0)
        with tarfile.open(fileobj=ofh) as z:
            z.extractall(out_path)

In [None]:
#local variables to file path, set as needed
path_to_zst = "/Users/ryanlnewbury/Downloads/acousticbrainz-highlevel-json-20220623-0.tar.zst"
path_to_zst_extract = "/Users/ryanlnewbury/Downloads/highlevel"

In [5]:
extract_zst(path_to_zst,
            path_to_zst_extract)


KeyboardInterrupt



In [7]:
#creates directory route for every file in database
local_path = "/Users/ryanlnewbury/Downloads/highlevel/acousticbrainz-highlevel-json-20220623/highlevel"
rootdir_glob = local_path + "/**/*"
# This will return absolute paths
file_list = [f for f in iglob(rootdir_glob, recursive=True) if os.path.isfile(f)]

In [8]:
#list of directories
file_list[:20]

['/Users/ryanlnewbury/Downloads/highlevel/acousticbrainz-highlevel-json-20220623/highlevel/61/9/619aa0ff-588d-4764-a7b8-25d5ec3f287f-1.json',
 '/Users/ryanlnewbury/Downloads/highlevel/acousticbrainz-highlevel-json-20220623/highlevel/61/9/61959535-0a81-4aac-ba6a-6c1837a1abc0-0.json',
 '/Users/ryanlnewbury/Downloads/highlevel/acousticbrainz-highlevel-json-20220623/highlevel/61/9/619c913b-975c-404c-bcbb-dfde093f6cd8-1.json',
 '/Users/ryanlnewbury/Downloads/highlevel/acousticbrainz-highlevel-json-20220623/highlevel/61/9/619e8920-070f-403f-a382-207693e8ad8b-0.json',
 '/Users/ryanlnewbury/Downloads/highlevel/acousticbrainz-highlevel-json-20220623/highlevel/61/9/61957c67-1a86-4d32-99b8-ef234ff88e68-4.json',
 '/Users/ryanlnewbury/Downloads/highlevel/acousticbrainz-highlevel-json-20220623/highlevel/61/9/61957c67-1a86-4d32-99b8-ef234ff88e68-5.json',
 '/Users/ryanlnewbury/Downloads/highlevel/acousticbrainz-highlevel-json-20220623/highlevel/61/9/619ee6cc-e9b6-4bd5-a6de-afd5cdebd96e-0.json',
 '/Use

In [25]:
#example json
f = open(file_list[0])
data = json.load(f)
data

{'highlevel': {'danceability': {'all': {'danceable': 0.040728867054,
    'not_danceable': 0.959271132946},
   'probability': 0.959271132946,
   'value': 'not_danceable',
   'version': {'essentia': '2.1-beta1',
    'essentia_build_sha': '8e24b98b71ad84f3024c7541412f02124a26d327',
    'essentia_git_sha': 'v2.1_beta1-228-g260734a',
    'extractor': 'music 1.0',
    'gaia': '2.4-dev',
    'gaia_git_sha': '857329b',
    'models_essentia_git_sha': 'v2.1_beta1'}},
  'gender': {'all': {'female': 0.256680905819, 'male': 0.743319094181},
   'probability': 0.743319094181,
   'value': 'male',
   'version': {'essentia': '2.1-beta1',
    'essentia_build_sha': '8e24b98b71ad84f3024c7541412f02124a26d327',
    'essentia_git_sha': 'v2.1_beta1-228-g260734a',
    'extractor': 'music 1.0',
    'gaia': '2.4-dev',
    'gaia_git_sha': '857329b',
    'models_essentia_git_sha': 'v2.1_beta1'}},
  'genre_dortmund': {'all': {'alternative': 4.77929384957e-09,
    'blues': 5.2068469536e-09,
    'electronic': 0.999974

In [18]:
data['metadata']['version'].keys()

dict_keys(['highlevel', 'lowlevel'])

In [None]:
#extraxts data for every file in directories
#cuation this will take a while
dic = defaultdict(list)
for d in file_list:
        f = open(d)
        data = json.load(f)
        #doesn't collect data if no metadata
        if data.get('metadata') == None:
            continue
        #gets mbid from file title
        temp = d.split('\\')[-1].split('-')
        temp.pop(-1);
        id1 = '-'.join(temp)
        #deduplicates mbids by only allowing one id into dictionary
        if id1 in dic['id']:
            continue
        else:
            dic['id'].append(id1)
        #appends new information to dicts
        dic['danceability'].append(data.get('highlevel').get('danceability').get('all').get('danceable'))
        dic['gender_male'].append(data.get('highlevel').get('gender').get('all').get('male'))
        dic['alternative'].append(data.get('highlevel').get('genre_dortmund').get('all').get('alternative'))
        dic['blues'].append(data.get('highlevel').get('genre_dortmund').get('all').get('blues'))
        dic['electronic'].append(data.get('highlevel').get('genre_dortmund').get('all').get('electronic'))
        dic['folkcountry'].append(data.get('highlevel').get('genre_dortmund').get('all').get('folkcountry'))
        dic['funksoulrnb'].append(data.get('highlevel').get('genre_dortmund').get('all').get('funksoulrnb'))
        dic['jazz'].append(data.get('highlevel').get('genre_dortmund').get('all').get('jazz'))
        dic['pop'].append(data.get('highlevel').get('genre_dortmund').get('all').get('pop'))
        dic['raphiphop'].append(data.get('highlevel').get('genre_dortmund').get('all').get('raphiphop'))
        dic['rock'].append(data.get('highlevel').get('genre_dortmund').get('all').get('rock'))
        dic['genre'].append(data.get('highlevel').get('genre_dortmund').get('value'))
        dic['acoustic'].append(data.get('highlevel').get('mood_acoustic').get('all').get('acoustic'))
        dic['aggressive'].append(data.get('highlevel').get('mood_aggressive').get('all').get('aggressive'))
        dic['mood_electronic'].append(data.get('highlevel').get('mood_electronic').get('all').get('electronic'))
        dic['happy'].append(data.get('highlevel').get('mood_happy').get('all').get('happy'))
        dic['party'].append(data.get('highlevel').get('mood_party').get('all').get('party'))
        dic['relaxed'].append(data.get('highlevel').get('mood_relaxed').get('all').get('relaxed'))
        dic['sad'].append(data.get('highlevel').get('mood_sad').get('all').get('sad'))
        dic['mood_mirex_1'].append(data.get('highlevel').get('moods_mirex').get('all').get('Cluster1'))
        dic['mood_mirex_2'].append(data.get('highlevel').get('moods_mirex').get('all').get('Cluster2'))
        dic['mood_mirex_3'].append(data.get('highlevel').get('moods_mirex').get('all').get('Cluster3'))
        dic['mood_mirex_4'].append(data.get('highlevel').get('moods_mirex').get('all').get('Cluster4'))
        dic['mood_mirex_5'].append(data.get('highlevel').get('moods_mirex').get('all').get('Cluster5'))
        dic['timbre_bright'].append(data.get('highlevel').get('timbre').get('all').get('bright'))
        dic['tonal'].append(data.get('highlevel').get('tonal_atonal').get('all').get('tonal'))
        dic['instrumental'].append(data.get('highlevel').get('voice_instrumental').get('all').get('instrumental'))
        dic['bit_rate'].append(data.get('metadata').get('audio_properties').get('bit_rate'))
        dic['codec'].append(data.get('metadata').get('audio_properties').get('codec'))
        dic['length'].append(data.get('metadata').get('audio_properties').get('length'))
        dic['lossless'].append(data.get('metadata').get('audio_properties').get('lossless'))
        dic['replay_gain'].append(data.get('metadata').get('audio_properties').get('replay_gain'))
        dic['true_genre'].append(data.get('metadata').get('tags').get('genre'))

        #same with metadata, but metadata in lists so needs an except if the list is empty
        try:
            dic['artist'].append(data.get('metadata').get('tags').get('artist')[0])
        except:
            dic['artist'].append(data.get('metadata').get('tags').get('artist'))       
        try:
            dic['album'].append(data.get('metadata').get('tags').get('album')[0])
        except:
            dic['album'].append(data.get('metadata').get('tags').get('album'))
        try:
            dic['bpm'].append(data.get('metadata').get('tags').get('bpm')[0])
        except:
            dic['bpm'].append(data.get('metadata').get('tags').get('bpm'))
        try:
            dic['year'].append(int(data.get('metadata').get('tags').get('date')[0].split('-')[0]))
        except:
            dic['year'].append(data.get('metadata').get('tags').get('date'))
        try:
            dic['date'].append(data.get('metadata').get('tags').get('date')[0])     
        except:
            dic['date'].append(data.get('metadata').get('tags').get('date'))
        try:
            dic['label'].append(data.get('metadata').get('tags').get('label')[0])
        except:
            dic['label'].append(data.get('metadata').get('tags').get('label'))
        try:
            dic['song'].append(data.get('metadata').get('tags').get('title')[0])
        except:
            dic['song'].append(data.get('metadata').get('tags').get('title'))
        try:
            dic['artistsort'].append(data.get('metadata').get('tags').get('artistsort')[0])
        except:
            dic['artistsort'].append(data.get('metadata').get('tags').get('artistsort'))

In [131]:
#creates dataframe from dict created
data = pd.DataFrame(dic)

In [137]:
len(data)

754286

In [141]:
#number of rows with a value in year
data['year'].count()

730178

In [139]:
#remove null years
data = data[data['year'].isnull() == False]

In [140]:
len(data)

730178

In [142]:
data.head(10)

Unnamed: 0,id,danceability,gender_male,alternative,blues,electronic,folkcountry,funksoulrnb,jazz,pop,...,replay_gain,true_genre,artist,album,bpm,year,date,label,song,artistsort
0,00000baf-9215-483a-8900-93756eaf1cfc,0.9999094,0.5,0.1232231,0.02935208,0.678169,0.01836665,0.004645366,0.02795881,0.007834436,...,-6.764685,,In Extremo,Weckt die Toten!,,1998,1998-05-01,EFA,Como Poden,In Extremo
2,00002fc9-7283-44dd-bf6f-94c9492d0998,0.9128905,0.177181,0.06570297,0.06069383,0.538495,0.1011928,0.0230902,0.03234591,0.03749089,...,-11.074022,[Hip-Hop],Danny Brown,Old,,2013,2013-10-08,Fool's Gold Records,Lonely,"Brown, Danny"
3,00005a44-2152-4971-80c1-c217563845eb,0.2611271,0.013684,0.008683714,0.0004818535,0.988906,0.0009305365,3.203383e-05,0.0002341192,7.360673e-05,...,-5.080051,[Rock],Warlock,Burning the Witches,128.1吵,1984,1984,,Without You,
4,000078f4-308b-44a1-8601-6bc37d9e1623,4.28796e-07,0.49097,0.1094945,0.002266773,0.8623,0.01392666,0.0002019748,0.0004161819,0.002697128,...,-11.506065,[Space Rock],Spiritualized,"The Complete Works, Volume 1",119.6,2003,2003-04-15,Arista,Lay Back in the Sun (‘Electric Mainline’ EP ve...,Spiritualized
5,00007960-9d81-4192-b548-ad33d6b0ca54,0.4546444,0.922832,0.4180764,0.04206907,0.384046,0.05705131,0.004202647,0.006691809,0.01481296,...,-5.130407,[Indie Rock],Dandy Warhols,...The Dandy Warhols Come Down,116.03,1997,1997,Capitol Records,Not If You Were the Last Junkie on Earth,
6,00008af6-1070-45d5-936d-fd22a5394dd7,0.9482443,0.149076,0.1588343,0.08892059,0.345241,0.08827031,0.02571717,0.0678313,0.04839006,...,-5.078226,[Oldies],Petula Clark,The Atomic Café: French Cuts 2,86.94,2003,2003,,A Well Respected Man (Franz.),
7,0000975b-110a-428a-b329-5207d4eb675f,0.1068272,0.985882,0.07420029,0.1136003,0.371166,0.2652968,0.01488504,0.0276267,0.01748554,...,-8.468992,,Doug Kershaw,The Crazy Cajun Recordings,,1998,1998,Edsel Records,You Done Me Wrong,"Kershaw, Doug"
8,0000bbe1-12f7-48de-8737-a5ac362e76c8,0.00773344,0.875382,4.348425e-14,5.55813e-14,1.0,2.061889e-08,1.918473e-08,1.619071e-07,1.467535e-08,...,-4.47872,[Classical],Ludwig van Beethoven,Piano Sonatas (Complete),,2005,2005,Brilliant Classics,"Piano Sonata no. 3 in C major, op. 2 no. 3: I....","Beethoven, Ludwig van"
9,0000d8a7-8a9b-4b9d-a95c-038c6cb66547,0.961273,0.939157,0.2280277,0.02217325,0.62148,0.06854345,0.00239364,0.004151657,0.009586353,...,-5.745031,[Alternative],Kaiser Chiefs,"Education, Education, Education & War",91.35,2014,2014,Fiction Records,Coming Home,
10,0001093f-3ca1-42c8-bcc7-8fbbcd2f7c72,0.2814359,0.983833,0.1495392,0.03137565,0.669721,0.04712693,0.003812399,0.006459226,0.007076079,...,-8.899014,[Alternative],Motion City Soundtrack,Even If It Kills Me,156.01,2007,2007,Epitaph Europe,Point of Extinction,Motion City Soundtrack


In [144]:
#loads features extracted from acousticbrainz lowlevel features, from 3 different csvs on their website
lowlevel = pd.read_csv("C:\\Users\\ryannewbury\\Downloads\\acousticbrainz-lowlevel-features-20220623\\acousticbrainz-lowlevel-features-20220623-lowlevel.csv")
rhythm = pd.read_csv("C:\\Users\\ryannewbury\\Downloads\\acousticbrainz-lowlevel-features-20220623\\acousticbrainz-lowlevel-features-20220623-rhythm.csv")
tonal = pd.read_csv("C:\\Users\\ryannewbury\\Downloads\\acousticbrainz-lowlevel-features-20220623\\acousticbrainz-lowlevel-features-20220623-tonal.csv")

In [147]:
lowlevel.head(10)

Unnamed: 0,mbid,submission_offset,average_loudness,dynamic_complexity,mfcc_zero_mean
0,0e11c0fd-a1da-4b88-a438-7ef55c5809ec,0,0.70328,5.612967,-722.370972
1,7fef22bd-76aa-4803-b56b-93a5d6e70662,0,0.657434,5.046805,-690.498535
2,71c0e054-b700-4fd2-a35b-95c7afc566cb,0,0.228847,5.314451,-706.800476
3,2d1201cf-59bb-4ffa-9f52-f5b3afa13346,0,0.328406,4.47598,-721.950439
4,96685213-a25c-4678-9a13-abd9ec81cf35,0,0.57007,9.305593,-784.208496
5,73b01cea-2dad-4fc2-9e61-02a31477c1b1,0,0.168545,9.074841,-792.64502
6,7c278a16-ae04-460c-88ea-39155cadcd09,0,0.854816,2.146111,-652.554626
7,19084069-642f-465f-9127-f71bcd800a05,0,0.030876,3.55784,-749.831482
8,70fda1f4-c0cf-4bb5-b25e-79b5e921b198,0,0.925762,2.719668,-646.503784
9,da39a905-7b84-4e2a-bbcf-74de3d6ebd03,0,0.475052,5.929364,-682.161255


In [148]:
rhythm.head(10)

Unnamed: 0,mbid,submission_offset,bpm,bpm_histogram_first_peak_bpm_mean,bpm_histogram_first_peak_bpm_median,bpm_histogram_second_peak_bpm_mean,bpm_histogram_second_peak_bpm_median,danceability,onset_rate
0,0e11c0fd-a1da-4b88-a438-7ef55c5809ec,0,120.763885,120.0,120.0,133.0,133.0,0.996203,2.867577
1,7fef22bd-76aa-4803-b56b-93a5d6e70662,0,125.956993,126.0,126.0,136.0,136.0,1.131311,3.568778
2,71c0e054-b700-4fd2-a35b-95c7afc566cb,0,132.617203,133.0,133.0,140.0,140.0,0.915099,2.858371
3,2d1201cf-59bb-4ffa-9f52-f5b3afa13346,0,144.318924,144.0,144.0,152.0,152.0,0.972823,2.395773
4,96685213-a25c-4678-9a13-abd9ec81cf35,0,128.347702,129.0,129.0,120.0,120.0,1.102882,2.419718
5,73b01cea-2dad-4fc2-9e61-02a31477c1b1,0,120.360603,120.0,120.0,115.0,115.0,0.974217,1.876432
6,7c278a16-ae04-460c-88ea-39155cadcd09,0,151.575623,152.0,152.0,157.0,157.0,1.139013,3.394924
7,19084069-642f-465f-9127-f71bcd800a05,0,82.642754,82.0,82.0,94.0,94.0,0.872103,5.799162
8,70fda1f4-c0cf-4bb5-b25e-79b5e921b198,0,129.358032,129.0,129.0,123.0,123.0,1.155036,3.361048
9,da39a905-7b84-4e2a-bbcf-74de3d6ebd03,0,91.101822,91.0,91.0,172.0,172.0,1.073015,2.519788


In [158]:
tonal.head(10)

Unnamed: 0,mbid,submission_offset,key_key,key_scale,tuning_frequency,tuning_equal_tempered_deviation
0,0e11c0fd-a1da-4b88-a438-7ef55c5809ec,0,A,major,434.193115,0.141634
1,7fef22bd-76aa-4803-b56b-93a5d6e70662,0,A,major,434.193115,0.177662
2,71c0e054-b700-4fd2-a35b-95c7afc566cb,0,G,major,434.193115,0.234276
3,2d1201cf-59bb-4ffa-9f52-f5b3afa13346,0,D,major,434.193115,0.219335
4,96685213-a25c-4678-9a13-abd9ec81cf35,0,A,minor,434.193115,0.164615
5,73b01cea-2dad-4fc2-9e61-02a31477c1b1,0,G,minor,442.54892,0.0
6,7c278a16-ae04-460c-88ea-39155cadcd09,0,G,major,434.193115,0.224572
7,19084069-642f-465f-9127-f71bcd800a05,0,D,minor,434.193115,0.195417
8,70fda1f4-c0cf-4bb5-b25e-79b5e921b198,0,E,major,445.112549,0.095341
9,da39a905-7b84-4e2a-bbcf-74de3d6ebd03,0,A,major,444.598633,0.032828


In [175]:
#length of the 3 csv files
len(rhythm)

29460584

In [177]:
len(tonal)

29460584

In [179]:
len(lowlevel)

29460584

In [174]:
#number of unique ids in csvs
rhythm['mbid'].nunique()

7564215

In [176]:
tonal['mbid'].nunique()

7564215

In [178]:
lowlevel['mbid'].nunique()

7564215

In [180]:
#dropping duplicates from csvs
rhythm = rhythm.drop_duplicates(subset = ['mbid'], ignore_index = True)

In [181]:
len(rhythm)

7564215

In [182]:
tonal = tonal.drop_duplicates(subset = ['mbid'], ignore_index = True)

In [183]:
len(tonal)

7564215

In [184]:
lowlevel = lowlevel.drop_duplicates(subset = ['mbid'], ignore_index = True)

In [185]:
len(lowlevel)

7564215

In [186]:
#joining all data into one dataframe
data1 = data.set_index('id').join(rhythm.set_index('mbid'),how = 'left',rsuffix = '_2')

In [189]:
data2 = data1.join(tonal.set_index('mbid'),how = 'left', rsuffix = '_2')

In [190]:
data3 = data2.join(lowlevel.set_index('mbid'),how = 'left', rsuffix = '_2')

In [5]:
len(data3)

730178

In [6]:
data3.head(10)

Unnamed: 0,id,danceability,gender_male,alternative,blues,electronic,folkcountry,funksoulrnb,jazz,pop,...,onset_rate,submission_offset_2,key_key,key_scale,tuning_frequency,tuning_equal_tempered_deviation,submission_offset_2.1,average_loudness,dynamic_complexity,mfcc_zero_mean
0,00000baf-9215-483a-8900-93756eaf1cfc,0.9999094,0.5,0.1232231,0.02935208,0.678169,0.01836665,0.004645366,0.02795881,0.007834436,...,3.725091,0,A,minor,434.193115,0.238814,0,0.955147,2.210088,-612.303894
1,00002fc9-7283-44dd-bf6f-94c9492d0998,0.9128905,0.177181,0.06570297,0.06069383,0.538495,0.1011928,0.0230902,0.03234591,0.03749089,...,3.908956,0,A#,major,434.193115,0.189041,0,0.935944,3.92467,-687.685669
2,00005a44-2152-4971-80c1-c217563845eb,0.2611271,0.013684,0.008683714,0.0004818535,0.988906,0.0009305365,3.203383e-05,0.0002341192,7.360673e-05,...,2.669715,0,D,minor,442.037933,0.025623,0,0.902541,3.104318,-653.310913
3,000078f4-308b-44a1-8601-6bc37d9e1623,4.28796e-07,0.49097,0.1094945,0.002266773,0.8623,0.01392666,0.0002019748,0.0004161819,0.002697128,...,3.020281,0,G,major,438.984558,0.056967,0,0.962707,2.667426,-589.02179
4,00007960-9d81-4192-b548-ad33d6b0ca54,0.4546444,0.922832,0.4180764,0.04206907,0.384046,0.05705131,0.004202647,0.006691809,0.01481296,...,2.074923,0,A,minor,434.193115,0.253977,0,0.962148,2.34846,-642.020752
5,00008af6-1070-45d5-936d-fd22a5394dd7,0.9482443,0.149076,0.1588343,0.08892059,0.345241,0.08827031,0.02571717,0.0678313,0.04839006,...,3.493876,0,F,minor,453.940613,0.347294,0,0.959342,3.295431,-645.288147
6,0000975b-110a-428a-b329-5207d4eb675f,0.1068272,0.985882,0.07420029,0.1136003,0.371166,0.2652968,0.01488504,0.0276267,0.01748554,...,3.073224,0,D#,major,456.043091,0.176944,0,0.731738,4.467504,-714.238098
7,0000bbe1-12f7-48de-8737-a5ac362e76c8,0.00773344,0.875382,4.348425e-14,5.55813e-14,1.0,2.061889e-08,1.918473e-08,1.619071e-07,1.467535e-08,...,3.330849,0,C,major,448.46756,0.0,0,0.160667,7.483846,-848.82959
8,0000d8a7-8a9b-4b9d-a95c-038c6cb66547,0.961273,0.939157,0.2280277,0.02217325,0.62148,0.06854345,0.00239364,0.004151657,0.009586353,...,2.896211,0,G,major,441.017792,0.014545,0,0.946732,2.265507,-612.595093
9,0001093f-3ca1-42c8-bcc7-8fbbcd2f7c72,0.2814359,0.983833,0.1495392,0.03137565,0.669721,0.04712693,0.003812399,0.006459226,0.007076079,...,3.42356,0,D#,major,440.508636,0.019257,0,0.948849,2.752212,-612.61145


In [7]:
data3.isnull().sum()

id                                           0
danceability                                 0
gender_male                                  0
alternative                                  0
blues                                        0
electronic                                   0
folkcountry                                  0
funksoulrnb                                  0
jazz                                         0
pop                                          0
raphiphop                                    0
rock                                         0
genre                                        0
acoustic                                     0
aggressive                                   0
mood_electronic                              0
happy                                        0
party                                        0
relaxed                                      0
sad                                          0
mood_mirex_1                                 0
mood_mirex_2 

In [8]:
data3.to_csv('data.csv')