## Import Required Libraries 

In [1]:
import json
import numpy as np
import time
import os
import pandas as pd
import csv

## Read CSV containing all artists, tracks and albums 

In [2]:
# Load the three ID files; header=None because the first row is data, not a
# header, so the column name is supplied explicitly
df_artists = pd.read_csv(
    './data/artists.csv', header=None, names=['Artists']
)
df_albums = pd.read_csv(
    './data/albums.csv', header=None, names=['Albums']
)
df_tracks = pd.read_csv(
    './data/tracks.csv', header=None, names=['Tracks']
)

# Pull the named column of each dataframe out as a plain Python list
artists = df_artists['Artists'].to_list()
albums = df_albums['Albums'].to_list()
tracks = df_tracks['Tracks'].to_list()

# Build value -> position lookups (position = index in the list above).
# If a value occurs more than once, its last position is the one kept.
artist_dict = {artist: position for position, artist in enumerate(artists)}

album_dict = {album: position for position, album in enumerate(albums)}

track_dict = {track: position for position, track in enumerate(tracks)}

## Read Spotify Data and store using IDs from dictionaries

In [3]:
# List of all input data files (every *.json entry in the dataset directory).
# Forward slashes keep the path portable and avoid the invalid "\d" / "\s"
# escape sequences a backslash-separated literal would contain; this also
# matches the path used when the files are opened later in the notebook.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the processing order (and thus the sequential playlist numbering that
# depends on it) reproducible across runs and platforms.
files = sorted(
    file
    for file in os.listdir("./data/spotify_million_playlist_dataset/data")
    if file.endswith(".json")
)

In [27]:
# Initialize dictionaries keyed by playlist number; each value is the array of
# unique artist / track / album IDs that occur in that playlist
artist_playlists = {}
track_playlists = {}
album_playlists = {}

# Loop through each file and store data
count = 0         # number of files fully processed so far
playlist_num = 0  # sequential number assigned to playlists in processing order
t0 = time.time()
for file in files:

    # Read JSON data. The context manager closes the file even when parsing
    # raises, and the explicit encoding removes the dependence on the
    # platform's default text encoding.
    with open("./data/spotify_million_playlist_dataset/data/" + file,
              encoding="utf-8") as f:
        data = json.load(f)

    # Loop through each playlist in file
    for playlist in data['playlists']:

        # Initialize list of artists, tracks and albums for playlist
        artist_list = []
        track_list = []
        album_list = []

        # Loop through each track in playlist
        for song in playlist['tracks']:

            # Each URI is split on ':' and its third field is used as the key
            # into the lookup dictionaries; a key missing from a dictionary
            # raises KeyError rather than being skipped silently
            artist_id = song['artist_uri'].split(':')[2]
            track_id = song['track_uri'].split(':')[2]
            album_id = song['album_uri'].split(':')[2]

            # Store all artists, tracks and albums using id from dictionary
            artist_list.append(artist_dict[artist_id])
            track_list.append(track_dict[track_id])
            album_list.append(album_dict[album_id])

        # Store the playlist's IDs; np.unique drops duplicates and sorts them
        artist_playlists[playlist_num] = np.unique(artist_list)
        track_playlists[playlist_num] = np.unique(track_list)
        album_playlists[playlist_num] = np.unique(album_list)

        # Increase playlist number
        playlist_num += 1

    # Display progress: file name, files completed before this one (count and
    # fraction of the total), and seconds elapsed since the loop started
    print('Filename:', file)
    print('File ', count, ": ", count/len(files), 'Time: ', time.time()-t0)
    count += 1

Filename: mpd.slice.0-999.json
File  0 :  0.0 Time:  0.4082756042480469
Filename: mpd.slice.1000-1999.json
File  1 :  0.001 Time:  0.8032705783843994
Filename: mpd.slice.10000-10999.json
File  2 :  0.002 Time:  1.172372817993164
Filename: mpd.slice.100000-100999.json
File  3 :  0.003 Time:  1.556295394897461
Filename: mpd.slice.101000-101999.json
File  4 :  0.004 Time:  1.914099931716919
Filename: mpd.slice.102000-102999.json
File  5 :  0.005 Time:  2.2756130695343018
Filename: mpd.slice.103000-103999.json
File  6 :  0.006 Time:  2.636929988861084
Filename: mpd.slice.104000-104999.json
File  7 :  0.007 Time:  3.0099973678588867
Filename: mpd.slice.105000-105999.json
File  8 :  0.008 Time:  3.369356155395508
Filename: mpd.slice.106000-106999.json
File  9 :  0.009 Time:  3.7377026081085205
Filename: mpd.slice.107000-107999.json
File  10 :  0.01 Time:  4.101115703582764
Filename: mpd.slice.108000-108999.json
File  11 :  0.011 Time:  4.460625410079956
Filename: mpd.slice.109000-109999.json

Filename: mpd.slice.189000-189999.json
File  100 :  0.1 Time:  37.72091031074524
Filename: mpd.slice.19000-19999.json
File  101 :  0.101 Time:  38.112603425979614
Filename: mpd.slice.190000-190999.json
File  102 :  0.102 Time:  38.47762370109558
Filename: mpd.slice.191000-191999.json
File  103 :  0.103 Time:  38.86973524093628
Filename: mpd.slice.192000-192999.json
File  104 :  0.104 Time:  39.23475790023804
Filename: mpd.slice.193000-193999.json
File  105 :  0.105 Time:  39.60742545127869
Filename: mpd.slice.194000-194999.json
File  106 :  0.106 Time:  39.97746825218201
Filename: mpd.slice.195000-195999.json
File  107 :  0.107 Time:  40.37230896949768
Filename: mpd.slice.196000-196999.json
File  108 :  0.108 Time:  40.73289513587952
Filename: mpd.slice.197000-197999.json
File  109 :  0.109 Time:  41.09201645851135
Filename: mpd.slice.198000-198999.json
File  110 :  0.11 Time:  41.54803514480591
Filename: mpd.slice.199000-199999.json
File  111 :  0.111 Time:  41.91617822647095
Filename

Filename: mpd.slice.278000-278999.json
File  199 :  0.199 Time:  75.7101411819458
Filename: mpd.slice.279000-279999.json
File  200 :  0.2 Time:  76.09596705436707
Filename: mpd.slice.28000-28999.json
File  201 :  0.201 Time:  76.50911736488342
Filename: mpd.slice.280000-280999.json
File  202 :  0.202 Time:  76.9083993434906
Filename: mpd.slice.281000-281999.json
File  203 :  0.203 Time:  77.28006505966187
Filename: mpd.slice.282000-282999.json
File  204 :  0.204 Time:  77.65006875991821
Filename: mpd.slice.283000-283999.json
File  205 :  0.205 Time:  78.0348379611969
Filename: mpd.slice.284000-284999.json
File  206 :  0.206 Time:  78.43127584457397
Filename: mpd.slice.285000-285999.json
File  207 :  0.207 Time:  78.87289595603943
Filename: mpd.slice.286000-286999.json
File  208 :  0.208 Time:  79.27652859687805
Filename: mpd.slice.287000-287999.json
File  209 :  0.209 Time:  79.65531730651855
Filename: mpd.slice.288000-288999.json
File  210 :  0.21 Time:  80.02471137046814
Filename: mp

Filename: mpd.slice.367000-367999.json
File  298 :  0.298 Time:  113.82914566993713
Filename: mpd.slice.368000-368999.json
File  299 :  0.299 Time:  114.20043158531189
Filename: mpd.slice.369000-369999.json
File  300 :  0.3 Time:  114.57362532615662
Filename: mpd.slice.37000-37999.json
File  301 :  0.301 Time:  114.9631597995758
Filename: mpd.slice.370000-370999.json
File  302 :  0.302 Time:  115.35559821128845
Filename: mpd.slice.371000-371999.json
File  303 :  0.303 Time:  115.71919322013855
Filename: mpd.slice.372000-372999.json
File  304 :  0.304 Time:  116.09357285499573
Filename: mpd.slice.373000-373999.json
File  305 :  0.305 Time:  116.52232122421265
Filename: mpd.slice.374000-374999.json
File  306 :  0.306 Time:  116.89093470573425
Filename: mpd.slice.375000-375999.json
File  307 :  0.307 Time:  117.25619959831238
Filename: mpd.slice.376000-376999.json
File  308 :  0.308 Time:  117.61466932296753
Filename: mpd.slice.377000-377999.json
File  309 :  0.309 Time:  117.990817308425

Filename: mpd.slice.456000-456999.json
File  397 :  0.397 Time:  151.2724587917328
Filename: mpd.slice.457000-457999.json
File  398 :  0.398 Time:  151.6387529373169
Filename: mpd.slice.458000-458999.json
File  399 :  0.399 Time:  152.0334484577179
Filename: mpd.slice.459000-459999.json
File  400 :  0.4 Time:  152.41304397583008
Filename: mpd.slice.46000-46999.json
File  401 :  0.401 Time:  152.7833342552185
Filename: mpd.slice.460000-460999.json
File  402 :  0.402 Time:  153.22241973876953
Filename: mpd.slice.461000-461999.json
File  403 :  0.403 Time:  153.62947702407837
Filename: mpd.slice.462000-462999.json
File  404 :  0.404 Time:  154.00393795967102
Filename: mpd.slice.463000-463999.json
File  405 :  0.405 Time:  154.39868593215942
Filename: mpd.slice.464000-464999.json
File  406 :  0.406 Time:  154.7946424484253
Filename: mpd.slice.465000-465999.json
File  407 :  0.407 Time:  155.16540813446045
Filename: mpd.slice.466000-466999.json
File  408 :  0.408 Time:  155.57082891464233
F

Filename: mpd.slice.545000-545999.json
File  496 :  0.496 Time:  189.51685166358948
Filename: mpd.slice.546000-546999.json
File  497 :  0.497 Time:  189.90452027320862
Filename: mpd.slice.547000-547999.json
File  498 :  0.498 Time:  190.2987310886383
Filename: mpd.slice.548000-548999.json
File  499 :  0.499 Time:  190.67290139198303
Filename: mpd.slice.549000-549999.json
File  500 :  0.5 Time:  191.13581490516663
Filename: mpd.slice.55000-55999.json
File  501 :  0.501 Time:  191.50359106063843
Filename: mpd.slice.550000-550999.json
File  502 :  0.502 Time:  191.89599347114563
Filename: mpd.slice.551000-551999.json
File  503 :  0.503 Time:  192.27042412757874
Filename: mpd.slice.552000-552999.json
File  504 :  0.504 Time:  192.65271615982056
Filename: mpd.slice.553000-553999.json
File  505 :  0.505 Time:  193.03642320632935
Filename: mpd.slice.554000-554999.json
File  506 :  0.506 Time:  193.40063095092773
Filename: mpd.slice.555000-555999.json
File  507 :  0.507 Time:  193.791691303253

Filename: mpd.slice.634000-634999.json
File  595 :  0.595 Time:  228.0335657596588
Filename: mpd.slice.635000-635999.json
File  596 :  0.596 Time:  228.3981373310089
Filename: mpd.slice.636000-636999.json
File  597 :  0.597 Time:  228.8496024608612
Filename: mpd.slice.637000-637999.json
File  598 :  0.598 Time:  229.19917464256287
Filename: mpd.slice.638000-638999.json
File  599 :  0.599 Time:  229.58288383483887
Filename: mpd.slice.639000-639999.json
File  600 :  0.6 Time:  229.95970034599304
Filename: mpd.slice.64000-64999.json
File  601 :  0.601 Time:  230.33838987350464
Filename: mpd.slice.640000-640999.json
File  602 :  0.602 Time:  230.71050834655762
Filename: mpd.slice.641000-641999.json
File  603 :  0.603 Time:  231.0826654434204
Filename: mpd.slice.642000-642999.json
File  604 :  0.604 Time:  231.4540660381317
Filename: mpd.slice.643000-643999.json
File  605 :  0.605 Time:  231.84242153167725
Filename: mpd.slice.644000-644999.json
File  606 :  0.606 Time:  232.20784306526184
F

Filename: mpd.slice.723000-723999.json
File  694 :  0.694 Time:  266.73988604545593
Filename: mpd.slice.724000-724999.json
File  695 :  0.695 Time:  267.12075662612915
Filename: mpd.slice.725000-725999.json
File  696 :  0.696 Time:  267.5465145111084
Filename: mpd.slice.726000-726999.json
File  697 :  0.697 Time:  267.942857503891
Filename: mpd.slice.727000-727999.json
File  698 :  0.698 Time:  268.32933282852173
Filename: mpd.slice.728000-728999.json
File  699 :  0.699 Time:  268.7841956615448
Filename: mpd.slice.729000-729999.json
File  700 :  0.7 Time:  269.1842300891876
Filename: mpd.slice.73000-73999.json
File  701 :  0.701 Time:  269.5636639595032
Filename: mpd.slice.730000-730999.json
File  702 :  0.702 Time:  269.9761815071106
Filename: mpd.slice.731000-731999.json
File  703 :  0.703 Time:  270.3589746952057
Filename: mpd.slice.732000-732999.json
File  704 :  0.704 Time:  270.77345538139343
Filename: mpd.slice.733000-733999.json
File  705 :  0.705 Time:  271.1723575592041
Filen

Filename: mpd.slice.812000-812999.json
File  793 :  0.793 Time:  305.7073428630829
Filename: mpd.slice.813000-813999.json
File  794 :  0.794 Time:  306.09816241264343
Filename: mpd.slice.814000-814999.json
File  795 :  0.795 Time:  306.4981515407562
Filename: mpd.slice.815000-815999.json
File  796 :  0.796 Time:  306.86043643951416
Filename: mpd.slice.816000-816999.json
File  797 :  0.797 Time:  307.2603952884674
Filename: mpd.slice.817000-817999.json
File  798 :  0.798 Time:  307.63028478622437
Filename: mpd.slice.818000-818999.json
File  799 :  0.799 Time:  307.99437856674194
Filename: mpd.slice.819000-819999.json
File  800 :  0.8 Time:  308.3829231262207
Filename: mpd.slice.82000-82999.json
File  801 :  0.801 Time:  308.77277660369873
Filename: mpd.slice.820000-820999.json
File  802 :  0.802 Time:  309.1672441959381
Filename: mpd.slice.821000-821999.json
File  803 :  0.803 Time:  309.54640889167786
Filename: mpd.slice.822000-822999.json
File  804 :  0.804 Time:  309.962895154953
Fil

Filename: mpd.slice.901000-901999.json
File  892 :  0.892 Time:  345.72859477996826
Filename: mpd.slice.902000-902999.json
File  893 :  0.893 Time:  346.15255761146545
Filename: mpd.slice.903000-903999.json
File  894 :  0.894 Time:  346.5475986003876
Filename: mpd.slice.904000-904999.json
File  895 :  0.895 Time:  346.91574478149414
Filename: mpd.slice.905000-905999.json
File  896 :  0.896 Time:  347.2967348098755
Filename: mpd.slice.906000-906999.json
File  897 :  0.897 Time:  347.69674134254456
Filename: mpd.slice.907000-907999.json
File  898 :  0.898 Time:  348.0977370738983
Filename: mpd.slice.908000-908999.json
File  899 :  0.899 Time:  348.5077004432678
Filename: mpd.slice.909000-909999.json
File  900 :  0.9 Time:  348.9287350177765
Filename: mpd.slice.91000-91999.json
File  901 :  0.901 Time:  349.3387403488159
Filename: mpd.slice.910000-910999.json
File  902 :  0.902 Time:  349.7517018318176
Filename: mpd.slice.911000-911999.json
File  903 :  0.903 Time:  350.1337425708771
File

Filename: mpd.slice.991000-991999.json
File  991 :  0.991 Time:  384.8977725505829
Filename: mpd.slice.992000-992999.json
File  992 :  0.992 Time:  385.280042886734
Filename: mpd.slice.993000-993999.json
File  993 :  0.993 Time:  385.65371108055115
Filename: mpd.slice.994000-994999.json
File  994 :  0.994 Time:  386.02645921707153
Filename: mpd.slice.995000-995999.json
File  995 :  0.995 Time:  386.3985266685486
Filename: mpd.slice.996000-996999.json
File  996 :  0.996 Time:  386.7699496746063
Filename: mpd.slice.997000-997999.json
File  997 :  0.997 Time:  387.1579267978668
Filename: mpd.slice.998000-998999.json
File  998 :  0.998 Time:  387.5336503982544
Filename: mpd.slice.999000-999999.json
File  999 :  0.999 Time:  387.9036748409271


## Generate CSV files containing playlist id vs category

In [34]:
# Dump the playlist/artist relation to CSV: a header line, then one row per
# (playlist number, artist ID) pair
with open('./data/artist_w_id.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Playlist ID', 'Artist ID'])
    count = 0  # running total of data rows written
    for playlist_id, artist_ids in artist_playlists.items():
        for artist_id in artist_ids:
            writer.writerow([playlist_id, artist_id])
            count += 1

In [35]:
# Dump the playlist/track relation to CSV: a header line, then one row per
# (playlist number, track ID) pair
with open('./data/track_w_id.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Playlist ID', 'Track ID'])
    count = 0  # running total of data rows written
    for playlist_id, track_ids in track_playlists.items():
        for track_id in track_ids:
            writer.writerow([playlist_id, track_id])
            count += 1

In [36]:
# Dump the playlist/album relation to CSV: a header line, then one row per
# (playlist number, album ID) pair
with open('./data/album_w_id.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Playlist ID', 'Album ID'])
    count = 0  # running total of data rows written
    for playlist_id, album_ids in album_playlists.items():
        for album_id in album_ids:
            writer.writerow([playlist_id, album_id])
            count += 1

In [40]:
# Load the playlist/artist ID pairs from ./data/artist_w_id.csv into a dataframe
df_art = pd.read_csv('./data/artist_w_id.csv')

In [41]:
# Bare expression: the notebook renders the dataframe as this cell's output
df_art

Unnamed: 0,Playlist ID,Artist ID
0,0,4087
1,0,13618
2,0,30143
3,0,54181
4,0,61336
...,...,...
38088206,999999,215891
38088207,999999,234008
38088208,999999,237810
38088209,999999,241865
