<a href="https://colab.research.google.com/github/dymiyata/erdos2023_million_playlist_challenge/blob/master/matrix_factorization/matrix_factorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Smoke-test cell: prints a fixed message (presumably used to confirm that the
# Colab <-> GitHub round trip works -- TODO confirm it is still needed).
print("Testing Colab and Github integration")

In [2]:
import numpy as np
import pandas as pd
import json
import copy
import random

# Define functions

First, we define a function that reads the first `n` JSON files of the dataset into a single pandas DataFrame.

In [4]:

# n is the number of json files we wish to read (max is 1000)
def read_data(n):
    """Read the first ``n`` slice files into one DataFrame.

    Each row of the result is one track entry of one playlist. The track
    fields come from the playlist's "tracks" list, and the playlist-level
    fields listed in ``playlist_meta`` are repeated on every row.

    Parameters
    ----------
    n : int
        Number of slice files to read, starting from slice 0.

    Returns
    -------
    pandas.DataFrame
        All track rows of the first ``n`` files with a fresh 0..N-1 index.
        An empty DataFrame when ``n`` is 0.
    """
    # the meta argument tells the json_normalize function what meta data to keep
    playlist_meta = ["name", "collaborative", "pid", "num_followers", "num_edits"]

    # Collect one frame per file and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop re-copies all earlier rows
    # on every iteration.
    frames = []
    for i in range(n):
        # create the file_name string in order to read in file
        file_name = '../spotify_million_playlist_dataset/data/mpd.slice.' \
            + str(i*1000) \
            + '-' + str(i*1000+999) + '.json'

        # Uncomment the following line to show progress
        # print(file_name)

        # JSON is UTF-8; be explicit so the platform default encoding
        # (e.g. cp1252 on Windows) cannot garble non-ASCII names.
        with open(file_name, encoding='utf-8') as user_file:
            parsed_json = json.load(user_file)

        # we only care about the "playlists" part of this dictionary
        # the record_path argument tells the json_normalize function how to flatten the data
        frames.append(
            pd.json_normalize(
                parsed_json["playlists"],
                record_path = "tracks",
                meta = playlist_meta
            )
        )

    if not frames:
        # pd.concat refuses an empty list, so handle n == 0 explicitly
        return pd.DataFrame()

    return pd.concat(frames, ignore_index = True)

Next, we define functions for creating relevant dictionaries and lists from our data

In [26]:

# dictionary to translate track uri's to integer indices
def make_track_dict(df):
    """Map every distinct value of ``df['track_uri']`` to an integer index.

    Indices are assigned 0, 1, 2, ... in the order ``pd.unique`` yields the
    uri's.
    """
    unique_uris = pd.unique(df['track_uri'])
    return {track_uri: idx for idx, track_uri in enumerate(unique_uris)}

# dictionary to translate integer indices back to track uri's
def make_reverse_track_dict(df):
    """Map integer indices 0, 1, 2, ... to the distinct values of ``df['track_uri']``.

    The uri's are numbered in the order ``pd.unique`` yields them.
    """
    return dict(enumerate(pd.unique(df['track_uri'])))

# dictionary to translate from tid to track name
def make_index_to_track_dict(df):
    """Map integer track indices to track names.

    Index ``k`` belongs to the k-th distinct ``track_uri`` (in order of first
    appearance), and its value is the ``track_name`` on the first row that
    carries that uri.
    """
    # One pass over the frame: keep only the first row of every uri. The old
    # implementation filtered the whole frame once per unique uri, which is
    # quadratic in the number of rows.
    first_rows = df.drop_duplicates(subset='track_uri', keep='first')
    return dict(enumerate(first_rows['track_name'].tolist()))

# dictionary to translate pid to integer indices
def make_pid_dict(df):
    """Map every distinct value of ``df['pid']`` to an integer index.

    Indices are assigned 0, 1, 2, ... in the order ``pd.unique`` yields the
    pid's.
    """
    distinct_pids = pd.unique(df['pid'])
    return {playlist_id: idx for idx, playlist_id in enumerate(distinct_pids)}

# get list of tid's sorted in order of how many times song appears (highest to lowest)
def make_sorted_tid_list(R_list):
    """Return the tid's that occur in ``R_list``, most frequent first.

    Parameters
    ----------
    R_list : list of (pid, tid) pairs
        One pair per (playlist, track) entry. The list is not modified.

    Returns
    -------
    list
        Each tid that appears in ``R_list`` exactly once, ordered by how many
        pairs mention it (highest to lowest). Ties are broken by tid, larger
        tid first. An empty ``R_list`` gives an empty list.
    """
    # Count occurrences directly; no sorted copy of the input is needed and
    # no placeholder entry for tid 0 is created, so only tid's that really
    # occur end up in the result.
    song_count_dict = {}
    for _, tid in R_list:
        song_count_dict[tid] = song_count_dict.get(tid, 0) + 1

    return sorted(
        song_count_dict,
        key = lambda tid: (song_count_dict[tid], tid),
        reverse = True
    )



Next, we define the functions that convert the data stored in our DataFrame into the forms used by our model.

In [8]:

# create the big array R consisting of 1's and 0's. I didn't end up using this
def make_R(df):
    """Build the dense playlist-by-track indicator matrix.

    Row indices come from ``make_pid_dict(df)`` and column indices from
    ``make_track_dict(df)``.

    Returns
    -------
    numpy.ndarray
        Float array of shape (number of distinct pid's, number of distinct
        track uri's) holding 1 where the playlist contains the track and 0
        elsewhere.
    """
    track_dict = make_track_dict(df)
    pid_dict = make_pid_dict(df)

    result = np.zeros([len(pid_dict), len(track_dict)])

    # Translate both columns to index arrays and set every 1 in a single
    # assignment instead of materialising a Series per row with iterrows.
    rows = np.fromiter((pid_dict[pid] for pid in df['pid']), dtype=np.intp, count=len(df))
    cols = np.fromiter((track_dict[uri] for uri in df['track_uri']), dtype=np.intp, count=len(df))
    result[rows, cols] = 1
    return result


# create list of tuples where recommendation array should have a 1
# i.e. if (2,18) is in this list, then playlist 2 contains track 18
def make_R_list(df):
    """List the (playlist index, track index) pair of every row of ``df``.

    Playlist indices come from ``make_pid_dict(df)`` and track indices from
    ``make_track_dict(df)``. Pairs are emitted in row order, one per row, so a
    track repeated inside a playlist yields a repeated pair.
    """
    track_dict = make_track_dict(df)
    pid_dict = make_pid_dict(df)

    # Walk the two needed columns directly; iterrows would build a full
    # Series object for every row just to read two fields from it.
    return [
        (pid_dict[pid], track_dict[uri])
        for pid, uri in zip(df['pid'], df['track_uri'])
    ]

# create the R_list for grandma's hypothesis method.  As coded top 20% gets 2 instead of 1.
def make_R_list_grandma(df):
    """List (playlist index, track index, score) triples for every row of ``df``.

    A row scores 2 when its ``pos`` lies in the first 20% of its playlist
    (``pos < 0.2 * playlist length``) and 1 otherwise.

    The playlist index is taken from ``make_pid_dict(df)`` so that it can be
    used directly as a row index, exactly like the pairs produced by
    ``make_R_list``; the raw ``pid`` is only a valid row index when pid's
    happen to be 0..m-1 in order of appearance.

    The playlist length is read from a ``num_tracks`` column when ``df`` has
    one; otherwise it falls back to the number of rows ``df`` holds for that
    pid.
    """
    track_dict = make_track_dict(df)
    pid_dict = make_pid_dict(df)

    if 'num_tracks' in df.columns:
        playlist_lengths = df['num_tracks']
    else:
        # No metadata column available: count the rows of each playlist.
        playlist_lengths = df['pid'].map(df['pid'].value_counts())

    return [
        (pid_dict[pid], track_dict[uri], 2 if pos < 0.2 * length else 1)
        for pid, uri, pos, length
        in zip(df['pid'], df['track_uri'], df['pos'], playlist_lengths)
    ]

The following functions run gradient descent and compute the error function.

In [58]:

# use gradient descent to minimize MSE (with l2 regularization)
def update_params_loop(R_list, P, Q, alpha, llambda):
    """Run one pass of gradient updates over ``R_list`` and return new factors.

    Every pair (u, i) in ``R_list`` is treated as a target value of 1 for
    ``P[u] @ Q[i]``. Updates are applied one pair at a time, in list order,
    so each step sees the result of the previous ones; within a step the row
    of Q is updated using the already-updated row of P. After the pass, every
    row of both factors is shrunk by the l2 term.

    Parameters
    ----------
    R_list : iterable of (u, i) pairs
        Row index into P and row index into Q.
    P, Q : numpy.ndarray
        Factor matrices with the same number of columns. Neither is modified;
        the function works on copies.
    alpha : float
        Step size.
    llambda : float
        l2 regularization weight.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The updated copies of P and Q.
    """
    # Work on copies: plain assignment would alias the caller's arrays and
    # silently overwrite them.
    newP = P.copy()
    newQ = Q.copy()

    for u,i in R_list:
        newP[u,:] -= alpha * 2 * (newP[u,:] @ newQ[i,:] - 1) * newQ[i,:]
        newQ[i,:] -= alpha * 2 * (newP[u,:] @ newQ[i,:] - 1) * newP[u,:]

    # l2 shrinkage of every row, done as one array operation per matrix
    newP -= alpha * 2 * llambda * newP
    newQ -= alpha * 2 * llambda * newQ
    return (newP, newQ)

# use gradient descent where R_list has triples (u,i,score)
def update_params_loop_score(R_list, P, Q, alpha, llambda):
    """Run one pass of gradient updates over scored entries and return new factors.

    Every triple (u, i, score) in ``R_list`` is treated as the target value
    ``score`` for ``P[u] @ Q[i]``. Updates are applied one triple at a time,
    in list order; within a step the row of Q is updated using the
    already-updated row of P. After the pass, every row of both factors is
    shrunk by the l2 term.

    Parameters
    ----------
    R_list : iterable of (u, i, score) triples
        Row index into P, row index into Q, and the target value.
    P, Q : numpy.ndarray
        Factor matrices with the same number of columns. Neither is modified;
        the function works on copies.
    alpha : float
        Step size.
    llambda : float
        l2 regularization weight.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The updated copies of P and Q.
    """
    # Work on copies: plain assignment would alias the caller's arrays and
    # silently overwrite them.
    newP = P.copy()
    newQ = Q.copy()

    for u,i, score in R_list:
        newP[u,:] -= alpha * 2 * (newP[u,:] @ newQ[i,:] - score) * newQ[i,:]
        newQ[i,:] -= alpha * 2 * (newP[u,:] @ newQ[i,:] - score) * newP[u,:]

    # l2 shrinkage of every row, done as one array operation per matrix
    newP -= alpha * 2 * llambda * newP
    newQ -= alpha * 2 * llambda * newQ
    return (newP, newQ)

# gradient descent pass that also includes z random 0's per playlist
# (treating songs missing from the top 1000 as 0's is not implemented)
def update_params_loop_zeros(R_list, P, Q, alpha, llambda, z):
    """Run one pass of gradient updates with randomly sampled negatives.

    Every pair (u, i) in ``R_list`` is treated as a target of 1 for
    ``P[u] @ Q[i]``. In addition, once all consecutive entries of a playlist
    have been processed, ``z`` song indices are drawn uniformly at random
    (with ``random.randint``) and each drawn song that is NOT in that
    playlist is treated as a target of 0 for that playlist. Finally every
    row of both factors is shrunk by the l2 term.

    ``R_list`` must keep the entries of a playlist next to each other: a
    playlist is considered finished as soon as the playlist index changes.

    Parameters
    ----------
    R_list : iterable of (u, i) pairs
        Row index into P and row index into Q, grouped by u.
    P, Q : numpy.ndarray
        Factor matrices with the same number of columns. Neither is modified;
        the function works on copies.
    alpha : float
        Step size.
    llambda : float
        l2 regularization weight.
    z : int
        Number of random songs drawn per playlist.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The updated copies of P and Q.
    """
    # Work on copies: plain assignment would alias the caller's arrays and
    # silently overwrite them.
    newP = P.copy()
    newQ = Q.copy()
    n = np.shape(newQ)[0]

    def apply_random_zeros(playlist, playlist_songs):
        # Draw z random songs and push the prediction towards 0 for those
        # that the given playlist does not contain.
        rand_song_list = [random.randint(0,n-1) for a in range(z)]
        for song in rand_song_list:
            if song not in playlist_songs:
                newP[playlist,:] -= alpha * 2 * (newP[playlist,:] @ newQ[song,:]) * newQ[song,:]
                newQ[song,:] -= alpha * 2 * (newP[playlist,:] @ newQ[song,:]) * newP[playlist,:]

    u_current = None   # playlist whose entries are currently being processed
    u_set = set()      # songs seen so far in that playlist

    for u,i in R_list:
        if u != u_current:
            # The previous playlist is complete, so its song set is now
            # known: sample its negatives against that set, for that playlist.
            if u_current is not None:
                apply_random_zeros(u_current, u_set)
            u_current = u
            u_set = set()

        # Do the usual gd update
        newP[u,:] -= alpha * 2 * (newP[u,:] @ newQ[i,:] - 1) * newQ[i,:]
        newQ[i,:] -= alpha * 2 * (newP[u,:] @ newQ[i,:] - 1) * newP[u,:]
        u_set.add(i) # keep track of songs in the playlist

    # The last playlist has no following entry to trigger its negatives.
    if u_current is not None:
        apply_random_zeros(u_current, u_set)

    # Regularization
    newP -= alpha * 2 * llambda * newP
    newQ -= alpha * 2 * llambda * newQ
    return (newP, newQ)


#runs the gradient descent loop in batches
def gd_batch(R_list, P, Q, alpha, llambda, batch_num, iterations, R = None, verbose=False):
    """Run ``iterations`` passes of ``update_params_loop`` over shuffled batches.

    The entries of ``R_list`` are shuffled once, split into ``batch_num``
    batches of (nearly) equal size, and every iteration feeds the batches to
    ``update_params_loop`` in the same order.

    NOTE(review): ``update_params_loop`` applies its l2 shrinkage to every row
    on each call, so the regularization is applied once per batch, i.e.
    ``batch_num`` times per iteration -- confirm this is intended.

    Parameters
    ----------
    R_list : list of (u, i) pairs
        Entries to train on. The caller's list is left in its original order.
    P, Q : numpy.ndarray
        Initial factor matrices; copies are made and the inputs are not modified.
    alpha, llambda : float
        Step size and l2 weight, passed through to ``update_params_loop``.
    batch_num : int
        Number of batches to split the entries into.
    iterations : int
        Number of passes over all batches.
    R : optional
        Passed to ``error_function`` when ``verbose`` is true; not used otherwise.
    verbose : bool
        When true, print the error function before each iteration.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The trained copies of P and Q.
    """
    #make copies of P and Q
    P_current = P.copy()
    Q_current = Q.copy()

    #shuffle a copy of R_list so the caller's list keeps its order
    #divide it into batch_num subsets
    shuffled = list(R_list)
    random.shuffle(shuffled)
    # at least 1, so that an empty R_list does not give range() a zero step
    batch_size = max(1, int(np.ceil(len(shuffled)/batch_num)))
    R_batch = [shuffled[i:i+batch_size] for i in range(0,len(shuffled), batch_size) ]

    #loop over total iterations
    for i in range(iterations):
        #if verbose == true print out error function
        if verbose:
            print(f'Step {i*batch_num}: Error function={error_function(shuffled,R , P_current, Q_current)}')
        #loop over batch_num
        for batch in R_batch:
            #run update_param_loop on batch
            P_current , Q_current = update_params_loop(batch, P_current, Q_current, alpha, llambda)

    return (P_current , Q_current)

#error function without l2 normalization factor
def error_function( R_list,R, P , Q ):
    """Sum of squared errors of ``P @ Q.T`` over the entries listed in ``R_list``.

    Parameters
    ----------
    R_list : sequence of (row, col) pairs
        The entries to evaluate.
    R : numpy.ndarray or None
        Target matrix, read at ``R[row, col]``. When ``None``, every listed
        entry is given a target of 1.
    P, Q : numpy.ndarray
        Factor matrices; the prediction for (row, col) is ``P[row] @ Q[col]``.

    Returns
    -------
    The sum over ``R_list`` of ``(target - prediction) ** 2``; 0 for an empty
    ``R_list``.
    """
    if len(R_list) == 0:
        return 0

    rows, cols = zip(*R_list)
    rows = np.asarray(rows, dtype=np.intp)
    cols = np.asarray(cols, dtype=np.intp)

    # row-wise dot products P[row] @ Q[col] for all listed entries at once
    predictions = (P[rows, :] * Q[cols, :]).sum(axis=1)
    targets = 1 if R is None else R[rows, cols]

    return np.sum((targets - predictions) ** 2)

The following function computes the playlist vector that minimizes the cost function for a fixed $Q$, given a list of track ids. Note: to use this, the song list must have at least $f$ songs, where $f$ is the number of latent features.

In [62]:
def new_user_vec(tid_list, Q, llambda):
    """Solve for the playlist vector that best fits a list of tracks, with Q fixed.

    With Y the rows of Q selected by ``tid_list`` and every selected track
    given a target of 1, this solves the regularized normal equations
    ``(Y^T Y + llambda * I_f) x = Y^T 1``.

    Parameters
    ----------
    tid_list : list of int
        Row indices into Q of the tracks in the playlist.
    Q : numpy.ndarray
        Track factor matrix; its second dimension is the number of latent
        features f.
    llambda : float
        l2 regularization weight.

    Returns
    -------
    numpy.ndarray
        The solution as a row vector of shape (1, f).
    """
    Y = Q[tid_list,:]
    f = np.shape(Q)[1]
    d = len(tid_list)

    # The penalty belongs on the diagonal only, so the identity must be f x f.
    # A 1 x 1 identity would broadcast and add llambda to every entry.
    gram = np.transpose(Y) @ Y + llambda * np.identity(f)

    # Solve the linear system directly instead of forming the inverse.
    vec = np.linalg.solve(gram, np.transpose(Y) @ np.ones((d,1)))
    return np.transpose(vec)

# Running the Model

In [13]:
# Load the first num_jsons slice files; read_data returns one row per
# playlist track entry, with the selected playlist metadata on every row.
num_jsons = 20 # number of files to read
data = read_data(num_jsons)


In [27]:
# Derived lookup structures used by the model cells below.
R_list = make_R_list(data)  # (playlist index, track index) pair for every row of data
track_dict = make_track_dict(data)  # track uri -> integer track index
reverse_track_dict = make_reverse_track_dict(data)  # integer track index -> track uri
sorted_tid_list = make_sorted_tid_list(R_list)  # track indices, most frequent first

In [37]:
f = 7 # number of latent features

# Seed both random number generators so that the initial matrices (numpy) and
# the random negatives drawn during training (random) are reproducible.
SEED = 0
np.random.seed(SEED)
random.seed(SEED)

# Size the factor matrices from the data itself rather than assuming that
# every file contributes exactly 1000 playlists.
num_songs = len(reverse_track_dict)          # one row of Q per distinct track
num_playlists = len(pd.unique(data['pid']))  # one row of P per distinct playlist


# initialize random values for matrices P and Q. Entries are between -1 and 1
P = np.random.rand(num_playlists, f) * 2 - 1
Q = np.random.rand(num_songs, f) * 2 - 1

In [59]:
# Train the model: 100 passes of update_params_loop_zeros over R_list,
# starting from copies of the initial P and Q so those stay untouched.
# Arguments after the matrices: alpha = 0.1, llambda = 0.25, z = 1000.
P_current, Q_current = P.copy(), Q.copy()
for i in range(100):
    # progress indicator: print the index of every pass
    print(i, end = ', ')
    P_current , Q_current = update_params_loop_zeros(R_list, P_current, Q_current, 0.1, 0.25, 1000)

0, 1, 2, 3, Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\Dane\AppData\Roaming\Python\Python311\site-packages\IPython\core\interactiveshell.py", line 3548, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Dane\AppData\Local\Temp\ipykernel_24436\2788256200.py", line 7, in <module>
    P_current , Q_current = update_params_loop_zeros(R_list, P_current, Q_current, 0.1, 0.25, 1000)
                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Dane\AppData\Local\Temp\ipykernel_24436\1334580116.py", line -1, in update_params_loop_zeros
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Dane\AppData\Roaming\Python\Python311\site-packages\IPython\core\interactiveshell.py", line 2142, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Dane\Ap