In [38]:
import numpy as np
import pandas as pd
import json
import copy


In [2]:
# n is the number of json files we wish to read (max is 1000)
n = 5

# Playlist-level fields that json_normalize copies onto every track row.
meta_fields = ["name", "collaborative", "pid", "num_followers", "num_edits"]

# One flattened frame per slice file. They are concatenated once after the
# loop, so the accumulated frame is not re-copied on every iteration.
slice_frames = []

for i in range(n):
    # create the file_name string in order to read in file
    # (slice files are named by the range i*1000 .. i*1000+999)
    file_name = '../spotify_million_playlist_dataset/data/mpd.slice.' \
        + str(i*1000) \
        + '-' + str(i*1000+999) + '.json'

    # Uncomment the following line to show progress
    # print(file_name)

    # parse the JSON straight from the file handle; JSON text is UTF-8, so do
    # not depend on the platform's default encoding
    with open(file_name, encoding="utf-8") as user_file:
        parsed_json = json.load(user_file)

    # we only care about the "playlists" part of this dictionary
    playlist_list = parsed_json["playlists"]

    # the record_path argument tells the json_normalize function how to flatten the data
    # (one output row per entry of each playlist's "tracks" list)
    # the meta argument tells the json_normalize function what meta data to keep
    slice_frames.append(
        pd.json_normalize(
            playlist_list,
            record_path="tracks",
            meta=meta_fields
        )
    )

# stack all slices into a single frame with a fresh 0..N-1 index
data = pd.concat(slice_frames, ignore_index=True)

# `pos` is a field of the track records themselves (presumably the track's
# position within its playlist -- see the dataset README); it is not used
# below, so drop it.
data = data.drop(columns=['pos'])

In [31]:
def make_track_dict(df):
    """Map each distinct track URI in ``df`` to a 0-based integer index.

    Parameters
    ----------
    df : DataFrame with a ``track_uri`` column.

    Returns
    -------
    dict
        ``{track_uri: index}``, where indices follow the order in which
        ``pd.unique`` returns the URIs (order of first appearance).
    """
    distinct_uris = pd.unique(df['track_uri'])
    return {uri: position for position, uri in enumerate(distinct_uris)}

def make_R(df):
    """Create the dense binary recommendation (playlist x track) array.

    Parameters
    ----------
    df : DataFrame with ``pid`` and ``track_uri`` columns, one row per
        (playlist, track) occurrence.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(m, n)`` where ``m`` is the number of distinct
        ``pid`` values and ``n`` the number of distinct ``track_uri`` values.
        Entry ``[pid, track_index]`` is 1 when that pair occurs in ``df`` and
        0 otherwise; ``track_index`` comes from ``make_track_dict``.

    Notes
    -----
    ``pid`` is used directly as the row index, so the pids must be integers
    in ``[0, m)``. The array is dense: it allocates ``m * n`` floats.
    """
    m = len(pd.unique(df['pid']))
    n = len(pd.unique(df['track_uri']))
    track_dict = make_track_dict(df)

    result = np.zeros([m, n])

    # Read the two columns directly instead of going through iterrows(), which
    # builds a Series per row and may upcast integer pids to float.
    rows = np.asarray(df['pid'].tolist(), dtype=np.intp)
    cols = np.asarray([track_dict[uri] for uri in df['track_uri']], dtype=np.intp)

    # One vectorised assignment marks every observed (playlist, track) pair.
    result[rows, cols] = 1
    return result

def make_R_list(df):
    """Create the list of (pid, track index) pairs where the recommendation array has a 1.

    Parameters
    ----------
    df : DataFrame with ``pid`` and ``track_uri`` columns.

    Returns
    -------
    list of tuple
        One ``(pid, track_index)`` tuple per row of ``df``, in row order.
        ``track_index`` is the index assigned by ``make_track_dict``.
        Repeated rows produce repeated tuples.
    """
    track_dict = make_track_dict(df)
    # Zip the two columns directly; iterrows() would build a Series per row and
    # does not preserve column dtypes.
    return [(pid, track_dict[uri]) for pid, uri in zip(df['pid'], df['track_uri'])]

def update_params_loop(R_list, P, Q, alpha, llambda):
    """Run one pass of gradient updates on the factor matrices (squared error, L2 penalty).

    Parameters
    ----------
    R_list : iterable of (u, i) pairs
        Positions whose target value is 1; ``u`` indexes rows of ``P`` and
        ``i`` indexes rows of ``Q``.
    P, Q : numpy.ndarray
        Factor matrices with the same number of columns. They are NOT
        modified; the updates are applied to copies.
    alpha : float
        Step size.
    llambda : float
        L2 regularisation strength.

    Returns
    -------
    (newP, newQ) : tuple of numpy.ndarray
        Updated copies of ``P`` and ``Q``.

    Notes
    -----
    Updates are applied sequentially: each pair sees the effect of all earlier
    pairs in ``R_list``, and the ``Q`` update for a pair uses the row of ``P``
    that was just updated for that same pair. After all pairs are processed,
    every row of both matrices is shrunk by ``alpha * 2 * llambda`` times
    itself.
    """
    # Work on copies so the caller's arrays are left untouched.
    newP = P.copy()
    newQ = Q.copy()

    for u, i in R_list:
        # the error term is re-evaluated for the Q step, using the fresh newP[u]
        newP[u, :] -= alpha * 2 * (newP[u, :] @ newQ[i, :] - 1) * newQ[i, :]
        newQ[i, :] -= alpha * 2 * (newP[u, :] @ newQ[i, :] - 1) * newP[u, :]

    # L2 shrinkage of every row, applied once per pass
    newP -= alpha * 2 * llambda * newP
    newQ -= alpha * 2 * llambda * newQ
    return (newP, newQ)



    



In [17]:
# Sparse form of the interaction data: one (pid, track index) pair per row of `data`.
R_list = make_R_list(data)
# The same pairs as a two-column frame; 'tid' holds the integer track index
# produced inside make_R_list.
ones_df = pd.DataFrame(R_list, columns = ['pid', 'tid'])

In [11]:
# Rebuilds R_list from `data`; the identical call also appears in the preceding cell.
R_list = make_R_list(data)

In [39]:
f = 3 # number of latent features

# `R` is not defined anywhere in the visible notebook, so size the factor
# matrices from `data` directly. These are the dimensions make_R(data) would
# produce (assuming R was built that way -- TODO confirm), without
# materialising the dense array.
num_playlists = len(pd.unique(data['pid']))
num_tracks = len(pd.unique(data['track_uri']))

# Seeded generator so the random starting point is reproducible on re-run.
rng = np.random.default_rng(0)
P = rng.random((num_playlists, f))  # one row of latent features per playlist
Q = rng.random((num_tracks, f))     # one row of latent features per track

In [32]:
# One trial update pass with a small step size; the returned (P, Q) pair is
# this cell's displayed output.
update_params_loop(R_list, P, Q, alpha=0.01, llambda=0.25)

(array([[0.73858128, 0.2997585 , 0.7634779 ],
        [0.59943615, 0.38234632, 0.72303317],
        [0.54843629, 0.33340238, 0.88512886],
        ...,
        [0.57436532, 0.23138165, 0.95009341],
        [0.54650902, 0.76882325, 0.64106865],
        [0.29835871, 0.44194522, 0.83308253]]),
 array([[0.38626966, 0.3296744 , 0.67962062],
        [0.1373481 , 0.77791887, 0.62463076],
        [0.62392472, 0.60427956, 0.33523491],
        ...,
        [0.57984391, 0.14167416, 0.51159917],
        [0.44200693, 0.1058984 , 0.64163648],
        [0.09829083, 0.45720571, 0.50995845]]))

In [60]:
# Train on private copies so P and Q remain available for comparison afterwards.
P_current, Q_current = P.copy(), Q.copy()

for epoch in range(100):
    # progress marker on every 10th pass
    if not epoch % 10:
        print(epoch)
    P_current, Q_current = update_params_loop(
        R_list, P_current, Q_current, alpha=0.1, llambda=0.25
    )

0
10
20
30
40
50
60
70
80
90


In [18]:
# Dot product of row 0 of P with row 0 of Q (the model's score for that pair).
P[0] @ Q[0]

0.7748264353267021

In [54]:
# Show the trained track factors followed by Q, one array after the other.
print(Q_current, Q, sep='\n')

[[0.36319391 0.24552166 0.23132487]
 [0.40897814 0.19635149 0.22422859]
 [0.42764512 0.15334543 0.25504751]
 ...
 [0.22835163 0.27942994 0.26384813]
 [0.25322913 0.26721552 0.25121552]
 [0.2336221  0.28665723 0.24644682]]
[[0.18330633 0.13432415 0.55469912]
 [0.2794049  0.48886117 0.97864137]
 [0.95450681 0.08142545 0.74100955]
 ...
 [0.19187137 0.41115092 0.54018852]
 [0.86053959 0.73915226 0.78939881]
 [0.28055962 0.54103101 0.34080658]]


In [53]:
# Rows of `data` whose pid is 8; the trailing ';' suppresses the cell output.
in_playlist_8 = data['pid'] == 8
data.loc[in_playlist_8];

In [52]:
# Look up the integer indices assigned to two specific track URIs.
track_dict = make_track_dict(data)
uris_to_check = (
    'spotify:track:1e0hllQ23AG0QGFgezgLOq',
    'spotify:track:7DD7eSuYSC5xk2ArU62esN',
)
print(*(track_dict[uri] for uri in uris_to_check))

472 475


In [61]:
# Print selected track-factor rows from Q and then from Q_current:
# rows 472 and 475 of each, followed by row 10000 of Q_current.
rows_to_show = [
    (Q, 472),
    (Q, 475),
    (Q_current, 472),
    (Q_current, 475),
    (Q_current, 10000),
]
for factors, row in rows_to_show:
    print(factors[row, :])

[0.73113458 0.06293369 0.26661692]
[0.11346046 0.70710329 0.53108864]
[0.32460992 0.30696397 0.1781691 ]
[0.35227247 0.36706276 0.13203964]
[0.33693136 0.14928818 0.33439918]
