In [3]:
import numpy as np
import pandas as pd
import json
import copy
import random

In [4]:
# n is the number of json files we wish to read (max is 1000)
n = 5

# Playlist-level fields that json_normalize copies onto every track row.
meta_fields = ["name", "collaborative", "pid", "num_followers", "num_edits"]

# Collect one flattened frame per slice and concatenate once at the end;
# concatenating inside the loop re-copies all previously loaded rows each time.
slice_frames = []
for i in range(n):
    # create the file_name string in order to read in file
    file_name = '../spotify_million_playlist_dataset/data/mpd.slice.' \
        + str(i*1000) \
        + '-' + str(i*1000+999) + '.json'

    # Uncomment the following line to show progress
    # print(file_name)

    # JSON text is UTF-8; be explicit so the platform default encoding
    # cannot garble non-ASCII playlist or track names.
    with open(file_name, encoding='utf-8') as user_file:
        file_contents = user_file.read()

    # we only care about the "playlists" part of this dictionary
    playlist_list = json.loads(file_contents)["playlists"]

    # record_path tells json_normalize to emit one row per entry of "tracks";
    # meta lists the playlist-level fields to keep alongside each track row.
    slice_frames.append(
        pd.json_normalize(playlist_list, record_path="tracks", meta=meta_fields)
    )

# `pos` is one of the keys of each flattened track record (presumably the
# track's position inside its playlist - TODO confirm); it is not used below.
data = pd.concat(slice_frames, ignore_index=True).drop(columns=['pos'])

In [None]:
# dictionary to translate track uri's to integer indices
def make_track_dict(df):
    """Return a dict mapping each distinct value of df['track_uri'] to an integer.

    Indices run 0, 1, 2, ... following the order in which pd.unique
    returns the URIs.
    """
    return {uri: idx for idx, uri in enumerate(pd.unique(df['track_uri']))}

# create the big array R consisting of 1's and 0's
def make_R(df):
    """Build the dense playlist-by-track indicator matrix.

    The result has one row per distinct df['pid'] and one column per distinct
    df['track_uri']; entry [pid, track_index] is 1 when that playlist contains
    that track and 0 otherwise.

    The pid value itself is used as the row index, so pids are assumed to be
    the integers 0 .. (number of distinct pids - 1).

    The array is dense (rows x columns floats), which gets large quickly.
    """
    m = len(pd.unique(df['pid']))
    track_dict = make_track_dict(df)
    n = len(track_dict)  # one column per distinct track uri

    result = np.zeros([m, n])
    # Walk the two needed columns directly; df.iterrows() would build a
    # full Series object for every row just to read two fields.
    for pid, uri in zip(df['pid'], df['track_uri']):
        result[pid, track_dict[uri]] = 1
    return result

# create list of tuples where recommendation array should have a 1
# i.e. if (2,18) is in this list, then playlist 2 contains track 18
def make_R_list(df):
    """Return one (pid, track_index) tuple per row of df, in row order.

    track_index is the integer that make_track_dict assigns to the row's
    track_uri. Rows that repeat the same playlist/track pair produce
    repeated tuples.
    """
    track_dict = make_track_dict(df)
    # Iterate the two columns directly instead of df.iterrows(), which
    # materialises a Series per row.
    return [(pid, track_dict[uri]) for pid, uri in zip(df['pid'], df['track_uri'])]

# use gradient descent to minimize MSE (with l2 regularization)
def update_params_loop(R_list, P, Q, alpha, llambda):
    """Run one pass of gradient updates over R_list and return new (P, Q).

    Parameters
    ----------
    R_list : iterable of (u, i) pairs; each pair is treated as an observed
        entry whose target value is 1.
    P : 2-D array, row u is the latent vector used for index u.
    Q : 2-D array, row i is the latent vector used for index i.
    alpha : step size.
    llambda : L2 regularization strength.

    Returns
    -------
    (newP, newQ) : updated copies; P and Q themselves are left untouched.

    Updates are applied sequentially: each pair first steps P[u] on the
    squared error (P[u] @ Q[i] - 1)**2, then steps Q[i] using the freshly
    updated P[u]. After all pairs, every row of both matrices is shrunk once
    by the L2 term.
    """
    # Copy first: binding newP/newQ straight to P/Q would overwrite the
    # caller's arrays behind their back.
    newP = P.copy()
    newQ = Q.copy()

    for u, i in R_list:
        newP[u,:] -= alpha * 2 * (newP[u,:] @ newQ[i,:] - 1) * newQ[i,:]
        newQ[i,:] -= alpha * 2 * (newP[u,:] @ newQ[i,:] - 1) * newP[u,:]

    # L2 shrinkage of every row, done as one whole-array operation.
    newP -= alpha * 2 * llambda * newP
    newQ -= alpha * 2 * llambda * newQ
    return (newP, newQ)

#runs the gradient descent loop in batches
def gd_batch(R_list, P, Q, alpha, llambda, batch_num, iterations, R = None, verbose=False):
    """Train P and Q by running update_params_loop over shuffled batches.

    Parameters
    ----------
    R_list : list of (row, col) pairs to train on; it is not modified.
    P, Q : starting factor matrices; copies are trained and returned.
    alpha, llambda : forwarded unchanged to update_params_loop.
    batch_num : number of batches to split R_list into (must be >= 1).
    iterations : number of passes over the full set of batches.
    R : forwarded to error_function; only used when verbose is True.
    verbose : when True, print the error before each pass. The printed step
        label is the pass index times batch_num.

    Returns
    -------
    (P_current, Q_current) : the trained copies.

    Raises
    ------
    ValueError : if batch_num is less than 1.

    The pairs are shuffled and split once, before the first pass; every pass
    then visits the same batches in the same order.
    """
    if batch_num < 1:
        raise ValueError("batch_num must be at least 1")

    #make copies of P and Q
    P_current = P.copy()
    Q_current = Q.copy()

    # Shuffle a private copy so the caller's R_list keeps its order.
    shuffled = list(R_list)
    random.shuffle(shuffled)

    #divide the shuffled pairs into batch_num subsets
    # max(1, ...) keeps the slicing step valid when R_list is empty.
    batch_size = max(1, int(np.ceil(len(shuffled)/batch_num)))
    R_batch = [shuffled[k:k+batch_size] for k in range(0, len(shuffled), batch_size)]

    #loop over total iterations
    for i in range(iterations):
        #if verbose == true print out error function
        if verbose:
            print(f'Step {i*batch_num}: Error function={error_function(R_list,R , P_current, Q_current)}')
        #loop over the batches
        for batch in R_batch:
            #run update_params_loop on batch
            P_current , Q_current = update_params_loop(batch, P_current, Q_current, alpha, llambda)

    return (P_current , Q_current)

#error function without l2 normalization factor
def error_function( R_list,R, P , Q ):
    """Sum of squared errors over the (row, col) pairs in R_list.

    For each pair the prediction is P[row] @ Q[col]. The target is
    R[row, col], or 1 when R is None (the pairs in R_list are the entries
    where the indicator matrix holds a 1, so the dense matrix is optional).

    Returns 0 for an empty R_list. No regularization term is included.
    """
    result = 0
    if R is None:
        # Implicit all-ones target: avoids needing the dense matrix at all.
        for row, col in R_list:
            result = result + (1 - P[row,:]@Q[col,:])**2
    else:
        for row, col in R_list:
            result = result + (R[row,col] - P[row,:]@Q[col,:])**2

    return result


# Initial exploration

In [None]:
R_list = make_R_list(data)

f = 3 # number of latent features

# Size the factor matrices from the data itself. The dense matrix R is only
# built in a later cell, so reading np.shape(R) here fails on a fresh kernel.
num_playlists = len(pd.unique(data['pid']))
num_tracks = len(pd.unique(data['track_uri']))

# Fixed seed so the random starting point is the same on every run.
np.random.seed(0)

# initialize random values for matrices P and Q
P = np.random.rand(num_playlists, f)
Q = np.random.rand(num_tracks, f)

In [None]:
# Run the gradient descent algorithm: 100 passes over R_list with
# alpha = 0.1 and llambda = 0.25, printing the pass number every tenth pass.
# Training starts from copies so the initial P and Q remain available.
P_current, Q_current = P.copy(), Q.copy()
for i in range(100):
    if not i % 10:
        print(i)
    P_current, Q_current = update_params_loop(
        R_list, P_current, Q_current, 0.1, 0.25
    )

In [None]:
# Get indices for Yesterday and Help! by the Beatles
track_dict = make_track_dict(data)
yesterday_index = track_dict['spotify:track:1e0hllQ23AG0QGFgezgLOq']
# (a stray closing parenthesis on the next line used to make this cell a SyntaxError)
help_index = track_dict['spotify:track:7DD7eSuYSC5xk2ArU62esN']

# They have very different vectors before training
print(Q[yesterday_index,:])
print(Q[help_index,:])

# They have very similar vectors after training, but Track 10000 is very different
print(Q_current[yesterday_index,:])
print(Q_current[help_index,:])
print(Q_current[10000,:])

# Comparing plain gradient descent with batched gradient descent

In [6]:
import time

In [20]:
# Settings for the batched run. Total steps = batch_num * iterations.
# batch_num: how many batches R_list is subdivided into
# iterations: how many times each batch gets run
batch_num, iterations = 5, 20


In [21]:
# Observed (playlist, track) pairs, plus the dense 0/1 matrix built from the
# same frame. The two calls are independent of each other.
R_list = make_R_list(data)
R = make_R(data)

In [22]:
f = 3 # number of latent features

# Fixed seed so both training runs below start from the same, repeatable
# random matrices every time the notebook is executed.
np.random.seed(0)

# One row of P per row of R, one row of Q per column of R.
P = np.random.rand(np.shape(R)[0], f)
Q = np.random.rand(np.shape(R)[1], f)

In [23]:
# Time plain gradient descent: 100 passes over all of R_list
# (alpha = 0.1, llambda = 0.25), printing the error before every tenth pass.
t0 = time.time()

# train on copies so P and Q are still the untouched starting point afterwards
P_gd, Q_gd = P.copy(), Q.copy()
for i in range(100):
    if not i % 10:
        print(f'Step {i}: Error Function={error_function(R_list,R,P_gd,Q_gd)}')
    P_gd, Q_gd = update_params_loop(R_list, P_gd, Q_gd, 0.1, 0.25)

t1 = time.time()

print(f'Gradient descent took {t1-t0} seconds to run')
print()


Step 0: Error Function=69729.45827299704
Step 10: Error Function=2633.8024770489474
Step 20: Error Function=2323.6476452364022
Step 30: Error Function=2184.148127061852
Step 40: Error Function=2104.2149873031876
Step 50: Error Function=2049.629981392706
Step 60: Error Function=2038.2251146629299
Step 70: Error Function=1998.3455449263297
Step 80: Error Function=1994.5357177718254
Step 90: Error Function=1972.954489078316
Gradient descent took 131.72564816474915 seconds to run



In [24]:
# Time the batched variant, starting from the same P and Q.
# NOTE(review): llambda is .01 here but 0.25 in the plain gradient descent
# run, so the two error curves are not a like-for-like comparison - confirm
# whether that difference is intended.
t0 = time.time()
P_batch, Q_batch = gd_batch(
    R_list, P, Q, .1, .01, batch_num, iterations, R, verbose=True
)
t1 = time.time()

print(f'Batch gradient descent took {t1-t0} seconds to run')
print()

Step 0: Error function=69729.45827299508
Step 5: Error function=8355.514746055096
Step 10: Error function=4567.872230635358
Step 15: Error function=2738.973398451681
Step 20: Error function=1735.3831738618026
Step 25: Error function=1118.2781580204976
Step 30: Error function=735.0277343368789
Step 35: Error function=472.1509129252549
Step 40: Error function=314.7716689939038
Step 45: Error function=209.7706008732556
Step 50: Error function=144.83689177673233
Step 55: Error function=101.9548847372596
Step 60: Error function=74.55999513682674
Step 65: Error function=55.88294842639376
Step 70: Error function=44.474014794833685
Step 75: Error function=36.337518435694754
Step 80: Error function=31.277600474226908
Step 85: Error function=27.510811093487085
Step 90: Error function=25.17093593121125
Step 95: Error function=23.142003592726446
Batch gradient descent took 43.44975686073303 seconds to run



In [25]:
# Final squared error (no regularization term) of each trained model over
# the pairs in R_list. Both values are computed first, then reported.
err_gd = error_function(R_list, R, P_gd, Q_gd)
err_batch = error_function(R_list, R , P_batch, Q_batch)

print(f'The error for gradient descent is {err_gd}')
print()
print(f'The error for batch gradient descent is {err_batch}')

The error for gradient descent is 1965.448499815883

The error for batch gradient descent is 22.04733570033638


0.0