<a href="https://colab.research.google.com/github/dymiyata/erdos2023_million_playlist_challenge/blob/master/matrix_factorization/matrix_factorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
print("Testing Colab and Github integration")

In [1]:
import numpy as np
import pandas as pd
import json
import copy
import random
from numba import njit

from sklearn.model_selection import train_test_split

# Define functions

First we define a function to read in the first n json files of data into a pandas dataframe

In [2]:

# n is the number of json files we wish to read (max is 1000)
def read_data(n):
    for i in range(n):
        # create the file_name string in order to read in file
        file_name = '../spotify_million_playlist_dataset/data/mpd.slice.' \
            + str(i*1000) \
            + '-' + str(i*1000+999) + '.json'

        # Uncomment the following line to show progress
        # print(file_name)

        # open the file and store its contents in file_contents
        with open(file_name) as user_file:
            file_contents = user_file.read()

        # we only care about the "playlists" part of this dictionary
        # save the list of playlists in playlist_list
        parsed_json = json.loads(file_contents)
        playlist_list = parsed_json["playlists"]


        # create dataframe if it's first playlist, otherwise append info to existing dataframe
        # the record_path argument tells the json_normalize function how to flatten the data
        # the meta argument tells the json_nomralize function what meta data to keep
        if i == 0:
            data = pd.json_normalize(
                playlist_list,
                record_path = "tracks",
                meta = ["name", "collaborative", "pid", "num_followers", "num_edits"]
            )
        else:
            data = pd.concat(
                [
                    data,
                    pd.json_normalize(
                        playlist_list,
                        record_path = "tracks",
                        meta = ["name", "collaborative", "pid", "num_followers", "num_edits"]
                    )
                ],
                ignore_index = True
            )

    return data

Next, we define functions for creating relevant dictionaries and lists from our data

In [3]:

# dictionary to translate track uri's to integer indices
def make_track_dict(df):
    result = {}
    i = 0
    for uri in pd.unique(df['track_uri']):
        result[uri] = i
        i += 1
    return result

# dictionary to translate integer indices to track uri's
def make_reverse_track_dict(df):
    result = {}
    i = 0
    for uri in pd.unique(df['track_uri']):
        result[i] = uri
        i += 1
    return result

# dictionary to translate from tid to track name
def make_index_to_track_dict(df):
    result = {}
    i = 0
    for uri in pd.unique(df['track_uri']):
        name = df.query("track_uri == @uri").iloc[0]["track_name"]
        result[i] = name
        i += 1
    return result

# dictionary to translate pid to integer indices
def make_pid_dict(df):
    result = {}
    i = 0
    for pid in pd.unique(df['pid']):
        result[pid] = i
        i += 1
    return result

# get list of tid's sorted in order of how many times song appears (highest to lowest)
def make_sorted_tid_list(R_list):
    # sort R_list by second entry
    R_list_sorted = copy.deepcopy(R_list)
    R_list_sorted.sort(key = lambda x : x[1])

    # create song_count_dict whose keys are tid's and whose values are the number of playlists the given key appears in
    song_count_dict = {}
    tid_current = 0
    song_count_dict[0] = 0
    for pid, tid in R_list_sorted:
        if tid == tid_current:
            song_count_dict[tid] += 1
        else:
            tid_current = tid
            song_count_dict[tid] = 1

    # result is a list of keys of song_count_dict sorted by their value (highest to lowest)
    result = sorted(song_count_dict, key = song_count_dict.get)[::-1]
    return result



The functions that convert the data stored in our dataframe to forms that will be used by our model

In [4]:

# create the big array R consisting of 1's and 0's. I didn't end up using this
def make_R(df):
    m = len(pd.unique(df['pid']))
    n = len(pd.unique(df['track_uri']))
    track_dict = make_track_dict(df)
    pid_dict = make_pid_dict(df)

    result = np.zeros([m,n])
    for index , row in df.iterrows():
        result[pid_dict[row['pid']], track_dict[row['track_uri']]] = 1
    return result


# create list of tuples where recommendation array should have a 1
# i.e. if (2,18) is in this list, then playlist 2 contains track 18
def make_R_list(df):
    result = []
    track_dict = make_track_dict(df)
    pid_dict = make_pid_dict(df)
    for index , row in df.iterrows():
        result.append((pid_dict[row['pid']], track_dict[row['track_uri']]))
    return np.array(result)

# create the R_list for grandma's hypothesis method.  As coded top 20% gets 2 instead of 1.
def make_R_list_grandma(df):
    result = []
    track_dict = make_track_dict(df)
    for index , row in df.iterrows():
        if row['pos'] < 0.2 * row['num_tracks']:
            result.append((row['pid'], track_dict[row['track_uri']], 2))
        else: 
            result.append((row['pid'], track_dict[row['track_uri']], 1))
    return result

The functions for running gradient descent and computing the error function

In [5]:

# use gradient descent to minimize MSE (with l2 regularization)
def update_params_loop(R_list, P, Q, alpha, llambda):
    newP = P
    newQ = Q
    m , f = np.shape(P)
    n = np.shape(Q)[0]

    for u,i in R_list:
        newP[u,:] -= alpha * 2 * (P[u,:] @ Q[i,:] - 1) * Q[i,:]
        newQ[i,:] -= alpha * 2 * (P[u,:] @ Q[i,:] - 1) * P[u,:]
    for u in range(m):
        newP[u,:] -= alpha * 2 * llambda * P[u,:]
    for i in range(n):
        newQ[i,:] -= alpha * 2 * llambda * Q[i,:]
    return (newP, newQ)

# use gradient descent with where R_list has triples (u,i,score)
def update_params_loop_score(R_list, P, Q, alpha, llambda):
    newP = P
    newQ = Q
    m , f = np.shape(P)
    n = np.shape(Q)[0]

    for u,i, score in R_list:
        newP[u,:] -= alpha * 2 * (P[u,:] @ Q[i,:] - score) * Q[i,:]
        newQ[i,:] -= alpha * 2 * (P[u,:] @ Q[i,:] - score) * P[u,:]
    for u in range(m):
        newP[u,:] -= alpha * 2 * llambda * P[u,:]
    for i in range(n):
        newQ[i,:] -= alpha * 2 * llambda * Q[i,:]
    return (newP, newQ)

# Run an epoch of gradient descent where the iterations parameter is the number of iterations.
@njit
def run_epoch(R_array, P, Q, alpha, llambda, iterations):
    oldP = P.copy()
    oldQ = Q.copy()
    f = np.shape(P)[1]

    for iter in range(iterations):
        newP = oldP.copy()
        newQ = oldQ.copy()
        for u,i in R_array:
            dotprod = 0
            for feature in range(f):
                dotprod += oldP[u,feature] * oldQ[i,feature]
            error = dotprod - 1

            for feature in range(f):
                pf = oldP[u,feature]
                qf = oldQ[i,feature]
                newP[u,feature] -= alpha * (error * qf + llambda * pf) 
                newQ[i,feature] -= alpha * (error * pf + llambda * qf)
        oldP = newP
        oldQ = newQ
    return newP, newQ


#runs the gradient descent loop in batches
def gd_batch(R_list, P, Q, alpha, llambda, batch_num, iterations, R = None, verbose=False):
    #make copies of P and Q
    P_current = P.copy()
    Q_current = Q.copy()

    #shuffle R_list
    #divide R_list into batch_num subsets
    random.shuffle(R_list)
    batch_size = int(np.ceil(len(R_list)/batch_num))
    R_batch = [R_list[i:i+batch_size] for i in range(0,len(R_list), batch_size) ]

    #loop over total iterations
    for i in range(iterations):
        #if verbose == true print out error function
        if verbose:
            print(f'Step {i*batch_num}: Error function={error_function(R_list,R , P_current, Q_current)}')
        #loop over batch_num
        for batch in R_batch:
            #run update_param_loop on batch
            P_current , Q_current = update_params_loop(batch, P_current, Q_current, alpha, llambda)

    return (P_current , Q_current)

#error function without l2 normalization factor
def error_function( R_list,R, P , Q ):
    result = 0
    #sum over R_list
    for row, col in R_list:
        result = result + (R[row,col] - P[row,:]@Q[col,:])**2

    return result

@njit
def error_function_l2( R_list, P , Q, llambda):
    result = 0
    #sum over R_list
    for row, col in R_list:
        result = result + (1 - P[row,:]@Q[col,:])**2
    result += llambda * (np.linalg.norm(P)**2 + np.linalg.norm(Q)**2)
    return result

The function for obtaining the playlist vector that minimizes the cost function for a fixed $Q$ from a list of track id's.

In [6]:
def new_user_vec(tid_list, Q, llambda):
    Y = Q[tid_list,:]
    f = np.shape(Q)[1]
    d = len(tid_list)
    vec = np.linalg.inv(np.transpose(Y) @ Y + llambda * np.identity(f)) @ np.transpose(Y) @ np.ones((d,1))
    return np.transpose(vec)

# Example of training the Model

The next two cells read in data and creates the list of data points.  Ideally this will be done by querying the SQL database.

In [21]:
num_jsons = 6 # number of files to read
num_playlists = num_jsons*1000
data = read_data(num_jsons)

In [22]:
R_list = make_R_list(data)
track_dict = make_track_dict(data)
reverse_track_dict = make_reverse_track_dict(data)

Create train-test split

In [23]:
I_train, I_test = train_test_split(range(100), test_size=0.25, shuffle=False)
print(I_train)
print(I_test)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74]
[75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


In [79]:
# Percentage of the total database to reserve for validation and testing
val_size_abs = 0.25
test_size    = 0.25
shuffle = False

# Note: the first pid_train contains (1-test_size) percent of the data.
# We need to use val_size so that val_size*(1-test_size) = val_size_abs.
val_size = val_size_abs/(1-test_size)
pid_train, pid_test = train_test_split(range(num_playlists), test_size=test_size, shuffle=shuffle)
pid_train, pid_val  = train_test_split(pid_train, test_size=val_size, shuffle=shuffle)

R_list_train = R_list[ np.isin(R_list[:,0], pid_train), :]
R_list_val   = R_list[ np.isin(R_list[:,0], pid_val),   :]
R_list_test  = R_list[ np.isin(R_list[:,0], pid_test),  :]

# Store the track id of songs in the train/val/test sets
tid_train = list(np.unique( R_list_train[:,1] ))
tid_val   = list(np.unique( R_list_val[:,1]   ))
tid_test  = list(np.unique( R_list_test[:,1]  ))

Next, we specify the number of features and create matrices $P$ and $Q$ whose entries are randomly taken from a normal distribution with $\mu = 0$ and $\sigma = 0.1$.

In [30]:
f = 10 # number of latent features
# num_songs = max(reverse_track_dict.keys()) + 1
num_songs = len(tid_train)
num_playlists = len(pid_train)

# initialize random values for matrices P and Q. Entries are between -1 and 1
P = np.random.normal(0, 0.1, (num_playlists, f))
Q = np.random.normal(0, 0.1, (num_songs, f))

In the following cell, we run the gradient descent algorithm and store the resulting matrices in P_trained and Q_trained.

In [31]:
# Run gradient descent algorithm with alpha = 0.001, llambda = 0.005 for 100 iterations
P_trained, Q_trained = run_epoch(R_list_train, P, Q, 0.001, 0.005, 100)

# Example of computing error on non-training data

Now, to demonstrate how to compute error for a new collection of playlists, let's first read in an extra json file's worth of data (I could've just read in the single extra file instead of all original files again but I was too lazy to change the read_data function). Again this step should utilize the SQL database.

**Note:** I changed R_list_new into R_list_test during the train-val-test split.

In [None]:
# The list of data points corresponding to our extra json file
# R_list_new = np.array([(u,i) for u,i in make_R_list(data1) if u >= num_jsons*1000])

For our fixed Q, given a list of (new) playlist ids, we can compute the P matrix that minimizes the 

In [72]:
# THIS FUNCTION, (as well as new_user_vec) CAN AND SHOULD BE OPTIMIZED TO USE numba.  
# For the sake of time, I'll leave it as is for now
# If we can avoid using lists (like tid_list), then things will be numba compatible. 
def make_Pval(new_pids, R_list_new, Q, llambda, tid_known=tid_train):
    num_songs , f = np.shape(Q)
    P_val = np.zeros((len(new_pids), f))
    count = 0 # keeps track of where in pid_list we are
    
    for pid in new_pids:
        # create list of tracks in the playlist
        # Remember: R_list_new has two columns: 0 is pid, 1 is tid
        tid_list = R_list_new[ R_list_new[:,0]==pid, 1]
        
        # With repetition
        tid_list = list(tid_list)
        tid_list = [tid for tid in tid_list if tid in tid_known]
        
        # Without repetition
        # tid_list = list( set(tid_list).intersection(tid_known) )

        # x is the row of Pval corresponding to this pid
        x = new_user_vec(tid_list, Q_trained, llambda)
        for feature in range(f):
            P_val[count, feature] = x[0,feature]
        count += 1
        
    return P_val

Pval2 = make_Pval(pid_test, R_list_test, Q_trained, 0.005)

Next, we compute the error function on this set. Note, we can't use the function defined at the top of this document because now the pid does not correspond to the index of the column of Pval.

In [17]:
def val_error(R_list_new, pid_list, Pval, Q, llambda):
    num_songs, f = np.shape(Q)
    result = 0
    for pid, tid in R_list_new:
        # As in the make_Pval function, the following condition needs to be changed for a randomized val set.
        if tid < num_songs:
            result += (1 - Pval[pid_list.index(pid), :] @ Q[tid, :])
    result += llambda * (np.linalg.norm(Pval)**2)
    return result

val_error(R_list_test, pid_test, Pval, Q_trained, 0.005)

56951.27274909439

In [84]:
import sqlalchemy as db
db.__version__

'1.4.39'