In [38]:
import numpy as np
import pandas as pd
import json
import copy


In [2]:
# n is the number of json files we wish to read (max is 1000)
n = 5

# playlist-level fields that json_normalize copies onto every track row
meta_columns = ["name", "collaborative", "pid", "num_followers", "num_edits"]

# Collect one flattened frame per slice file and concatenate once at the end;
# calling pd.concat inside the loop re-copies all previously loaded rows on
# every iteration.
slice_frames = []
for i in range(n):
    # create the file_name string in order to read in file
    # (slice i covers playlists i*1000 .. i*1000+999)
    file_name = '../spotify_million_playlist_dataset/data/mpd.slice.' \
        + str(i*1000) \
        + '-' + str(i*1000+999) + '.json'

    # Uncomment the following line to show progress
    # print(file_name)

    # we only care about the "playlists" part of this dictionary
    # save the list of playlists in playlist_list
    # (read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding)
    with open(file_name, encoding="utf-8") as user_file:
        playlist_list = json.load(user_file)["playlists"]

    # the record_path argument tells the json_normalize function how to flatten the data
    # (one output row per entry of each playlist's "tracks" list)
    # the meta argument tells the json_normalize function what meta data to keep
    slice_frames.append(
        pd.json_normalize(
            playlist_list,
            record_path="tracks",
            meta=meta_columns
        )
    )

# one row per (playlist, track) pair, re-indexed 0..N-1 across all slices
data = pd.concat(slice_frames, ignore_index=True)

# 'pos' is not needed by the rest of the notebook, so drop it.
# NOTE(review): it is presumably a per-track field in the JSON (the track's
# position within its playlist) rather than something pandas adds - confirm.
data = data.drop(columns=['pos'])

In [31]:
# lookup table from track uri to a dense integer index
def make_track_dict(df):
    """Map each distinct value of ``df['track_uri']`` to an integer index.

    Indices are assigned 0, 1, 2, ... in the order the uris are returned by
    ``pd.unique`` (order of first appearance in the column).

    Parameters
    ----------
    df : DataFrame with a 'track_uri' column.

    Returns
    -------
    dict
        ``{track_uri: index}`` with one entry per distinct uri.
    """
    distinct_uris = pd.unique(df['track_uri'])
    return {uri: position for position, uri in enumerate(distinct_uris)}

# create the big array R consisting of 1's and 0's. I didn't end up using this
def make_R(df):
    """Build the dense playlist-by-track indicator matrix.

    ``R[p, t] == 1`` when some row of ``df`` has ``pid == p`` and a
    ``track_uri`` whose index (from ``make_track_dict``) is ``t``; every other
    entry is 0.

    Parameters
    ----------
    df : DataFrame with 'pid' and 'track_uri' columns.

    Returns
    -------
    numpy.ndarray
        Float array of shape (number of distinct pids, number of distinct
        track uris).

    Notes
    -----
    ``pid`` is used directly as the row index, so the pids must lie in
    ``0 .. (number of distinct pids) - 1``; anything larger raises IndexError.
    """
    m = len(pd.unique(df['pid']))
    n = len(pd.unique(df['track_uri']))
    track_dict = make_track_dict(df)

    result = np.zeros([m, n])

    # Read the two columns directly instead of df.iterrows(): iterrows builds
    # a Series per row (slow) and does not preserve column dtypes.
    row_idx = df['pid'].to_numpy().astype(np.intp)
    col_idx = np.fromiter(
        (track_dict[uri] for uri in df['track_uri']),
        dtype=np.intp,
        count=len(df),
    )

    # every assigned value is 1, so duplicate (pid, track) pairs are harmless
    result[row_idx, col_idx] = 1
    return result

# create list of tuples where recommendation array should have a 1
# i.e. if (2,18) is in this list, then playlist 2 contains track 18
def make_R_list(df):
    """Return the (pid, track index) pair for every row of ``df``.

    Track indices come from ``make_track_dict(df)``, so they agree with any
    other lookup made through that function on the same frame. Pairs are
    emitted in row order and duplicates are kept.

    Parameters
    ----------
    df : DataFrame with 'pid' and 'track_uri' columns.

    Returns
    -------
    list of tuple
        One ``(pid, track_index)`` tuple per row of ``df``.
    """
    track_dict = make_track_dict(df)
    # Zip the two columns instead of df.iterrows(): iterrows builds a Series
    # per row (slow) and does not preserve column dtypes, which could turn an
    # integer pid into a float that is no longer usable as an array index.
    return [
        (pid, track_dict[uri])
        for pid, uri in zip(df['pid'], df['track_uri'])
    ]

# use gradient descent to minimize MSE (with l2 regularization)
def update_params_loop(R_list, P, Q, alpha, llambda):
    """Take one full-batch gradient-descent step on the factor matrices.

    The step uses the gradient of

        sum over (u, i) in R_list of (P[u] . Q[i] - 1)**2
        + llambda * (sum(P**2) + sum(Q**2))

    evaluated at the incoming ``P`` and ``Q``; neither input is modified.

    Parameters
    ----------
    R_list : iterable of (u, i) integer pairs
        Row index into ``P`` and row index into ``Q`` for each observed entry.
        May be empty, in which case only the regularization term is applied.
    P : numpy.ndarray, shape (m, f)
    Q : numpy.ndarray, shape (n, f)
    alpha : float
        Step size.
    llambda : float
        L2 regularization weight.

    Returns
    -------
    (newP, newQ) : tuple of numpy.ndarray
        Updated copies of ``P`` and ``Q``.
    """
    # (num_pairs, 2) index array; the reshape keeps an empty R_list valid
    pairs = np.asarray(list(R_list), dtype=np.intp).reshape(-1, 2)
    users = pairs[:, 0]
    items = pairs[:, 1]

    P_rows = P[users]
    Q_rows = Q[items]

    # prediction error P[u] . Q[i] - 1 for every pair, computed at once
    errors = np.sum(P_rows * Q_rows, axis=1) - 1
    step = (alpha * 2 * errors)[:, np.newaxis]

    newP = P.copy()
    newQ = Q.copy()

    # The same u (or i) occurs in many pairs. ufunc.at applies the
    # subtraction once per occurrence, whereas `newP[users] -= ...` would
    # keep only one update per repeated index.
    np.subtract.at(newP, users, step * Q_rows)
    np.subtract.at(newQ, items, step * P_rows)

    # l2 regularization applies to every row, observed or not
    newP -= alpha * 2 * llambda * P
    newQ -= alpha * 2 * llambda * Q
    return (newP, newQ)

In [17]:
R_list = make_R_list(data)

f = 3 # number of latent features

# P and Q are indexed directly by pid and by track index, so size them from
# `data` itself. (This cell used to read the shape of an array `R` that is not
# defined anywhere in the notebook, so it failed on a fresh kernel.)
num_playlists = int(data['pid'].max()) + 1      # rows of P: pids 0 .. max pid
num_tracks = len(pd.unique(data['track_uri']))  # rows of Q: one per distinct track

# fix the seed so the initial (and therefore the trained) factors are reproducible
np.random.seed(0)

# initialize random values for matrices P and Q
P = np.random.rand(num_playlists, f)
Q = np.random.rand(num_tracks, f)

In [60]:
# Train the factors: 100 gradient-descent steps starting from copies of the
# initial P and Q (step size 0.1, regularization weight 0.25), printing the
# step number on every 10th step as a progress indicator.
P_current, Q_current = P.copy(), Q.copy()
for i in range(100):
    if not i % 10:
        print(i)
    P_current, Q_current = update_params_loop(
        R_list,
        P_current,
        Q_current,
        alpha=0.1,
        llambda=0.25,
    )

0
10
20
30
40
50
60
70
80
90


In [None]:
# Get indices for Yesterday and Help! by the Beatles
track_dict = make_track_dict(data)
yesterday_index = track_dict['spotify:track:1e0hllQ23AG0QGFgezgLOq']
# (a stray closing parenthesis on the next line used to make this cell a SyntaxError)
help_index = track_dict['spotify:track:7DD7eSuYSC5xk2ArU62esN']

# They have very different vectors before training
print(Q[yesterday_index,:])
print(Q[help_index,:])

# They have very similar vectors after training, but Track 10000 is very different
# (index 10000 is an arbitrary comparison track; it requires more than 10000
# distinct tracks in `data`)
print(Q_current[yesterday_index,:])
print(Q_current[help_index,:])
print(Q_current[10000,:])