In [1]:
import os
import sys
import numpy as np
import implicit
import scipy.sparse as sp
from tqdm import tqdm_notebook
from matplotlib import pyplot as plt
%matplotlib inline
import json
import re
import random
import collections
import datetime
import pickle
import pandas as pd
import numpy as np
from scipy.sparse.linalg import spsolve
from sklearn import metrics
from sklearn.preprocessing import normalize as sknormalize
import random
import concurrent.futures
import multiprocessing as mp
import mkl

In [2]:
def avg(lst):
    """Return the arithmetic mean of ``lst``.

    Parameters
    ----------
    lst : sequence of numbers
        Values to average; must support ``len`` and ``sum``.

    Returns
    -------
    float
        ``sum(lst) / len(lst)``; ``nan`` when ``lst`` is empty, so that a
        summary printed after a long run reports "no data" for an empty
        bucket instead of dying with ``ZeroDivisionError``.
    """
    if len(lst) == 0:
        return float("nan")
    return sum(lst) / len(lst)

In [3]:
def printResults():
    """Print the run's hyper-parameters followed by the mean AUC per bucket.

    Reads the module-level settings (``regularization``, ``alpha``,
    ``normali``) and the eight module-level AUC lists (``als_*_auc`` and
    ``pop_*_auc``), averaging each list with ``avg``. Returns ``None``.
    """
    settings = (
        ("reg:", regularization),
        ("alpha:", alpha),
        ("normalized:", normali),
    )
    for caption, value in settings:
        print(caption, value)

    bucket_captions = ("avg total:", "avg sh:", "avg mb:", "avg lt:")

    print("ALS")
    als_buckets = (als_tot_auc, als_sh_auc, als_mb_auc, als_lt_auc)
    for caption, scores in zip(bucket_captions, als_buckets):
        print(caption, avg(scores))

    print("POP")
    pop_buckets = (pop_tot_auc, pop_sh_auc, pop_mb_auc, pop_lt_auc)
    for caption, scores in zip(bucket_captions, pop_buckets):
        print(caption, avg(scores))

In [4]:
def loadData():
    """Load the playlist/track bookkeeping arrays and the interaction matrix.

    Populates the module-level ``playlist_names``, ``playlist_followers``,
    ``s_track_names``, ``s_track_frequencies`` (NumPy ``.npy`` files) and
    ``mat`` (a SciPy sparse ``.npz`` file), all read from the current working
    directory. Returns ``None``.
    """
    global playlist_names, playlist_followers, s_track_names, s_track_frequencies, mat

    def read_array(filename):
        # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
        # objects, so these files must come from a trusted source.
        return np.load(filename, allow_pickle = True)

    playlist_names = read_array("playlist_names.npy")
    playlist_followers = read_array("playlist_followers.npy")
    s_track_names = read_array("s_track_names.npy")
    s_track_frequencies = read_array("s_track_frequencies.npy")

    mat = sp.load_npz("test_set.npz")

In [5]:
def cutData(cutoff):
    """Keep only the first ``cutoff`` track columns.

    Truncates the module-level ``mat`` (columns), ``s_track_frequencies`` and
    ``s_track_names`` to their first ``cutoff`` entries. The function applies
    no popularity rule of its own; which tracks survive depends entirely on
    the column order and on the ``cutoff`` chosen by the caller.

    Parameters
    ----------
    cutoff : int
        Number of leading columns/entries to keep.
    """
    global playlist_names, playlist_followers, s_track_names, s_track_frequencies, mat

    leading = slice(None, cutoff)
    mat = mat[:, leading]
    s_track_frequencies, s_track_names = (
        s_track_frequencies[leading],
        s_track_names[leading],
    )

In [6]:
def shuffleData(seed=None):
    """Randomly permute the playlists (rows), keeping the bookkeeping aligned.

    Applies one permutation to the rows of the module-level ``mat`` and to
    ``playlist_names`` / ``playlist_followers``. ``mat`` is converted to CSR
    so that it can be row-indexed with an index array.

    Parameters
    ----------
    seed : int or None, optional
        When given, the permutation is drawn from a private
        ``np.random.RandomState(seed)`` so the split is reproducible.
        The default (``None``) keeps the previous behaviour of drawing from
        NumPy's global random state.
    """
    global playlist_names, playlist_followers, s_track_names, s_track_frequencies, mat

    csr = sp.csr_matrix(mat)
    order = np.arange(csr.shape[0])
    if seed is None:
        np.random.shuffle(order)
    else:
        np.random.RandomState(seed).shuffle(order)

    # Reorder everything into locals first so a failure (e.g. a bookkeeping
    # array shorter than the matrix) cannot leave the globals half-shuffled.
    shuffled_names = playlist_names[order]
    shuffled_followers = playlist_followers[order]
    shuffled_mat = csr[order, :]

    playlist_names = shuffled_names
    playlist_followers = shuffled_followers
    mat = shuffled_mat

In [7]:
def sortData():
    """Reorder the track columns from largest to smallest column sum.

    The column sums of the module-level ``mat`` define the ordering; the
    same ordering is applied to ``s_track_names`` and
    ``s_track_frequencies`` so they stay aligned with the columns.
    """
    global playlist_names, playlist_followers, s_track_names, s_track_frequencies, mat

    column_totals = np.asarray(np.sum(mat, axis=0)).ravel()
    ascending = np.argsort(column_totals)
    descending = ascending[::-1]  # largest column sum first

    mat = mat[:, descending]
    s_track_names, s_track_frequencies = (
        s_track_names[descending],
        s_track_frequencies[descending],
    )

In [8]:
def buildData(pct_mask, test_size):
    """Split ``mat`` into train/test rows and hide part of each test row.

    The last ``test_size`` rows of the module-level ``mat`` become the test
    set; the rows before them become ``train``. For every test row a random
    subset of its non-zero columns is moved out of ``test`` and recorded in
    ``masked`` instead, so ``test`` holds the visible entries and ``masked``
    the hidden ones.

    Parameters
    ----------
    pct_mask : float
        Fraction applied to each test row's sum to decide how many entries
        to hide: ``ceil(row_sum * pct_mask)``, clamped to the number of
        non-zero entries actually present in the row.
    test_size : int
        Number of trailing rows of ``mat`` to hold out.

    Raises
    ------
    ValueError
        If ``test_size`` is negative or larger than the number of rows.

    Sets the globals ``train``, ``test`` (CSR, without explicitly stored
    zeros) and ``masked`` (CSR, value 1 at every hidden entry).
    """
    global playlist_names, playlist_followers, s_track_names, s_track_frequencies, mat
    global train, test, masked

    m = mat.shape[0]
    if not 0 <= test_size <= m:
        raise ValueError(
            "test_size must be between 0 and %d, got %r" % (m, test_size))
    split = m - test_size
    train = mat[:split, :]

    # Private, canonical CSR copy of the held-out rows: no stored zeros and
    # sorted column indices, so each row's candidates can be read straight
    # from ``indices`` without densifying a full-width row.
    held_out = sp.csr_matrix(mat[split:, :], copy=True)
    held_out.eliminate_zeros()
    held_out.sort_indices()

    num_mask = np.ceil(np.ravel(held_out.sum(axis=1)) * pct_mask)

    # LIL is used for both matrices while they are edited: assigning into a
    # CSR matrix element-by-element is slow, and assigning 0 to an existing
    # CSR entry keeps it as an explicitly stored zero.
    hidden = sp.lil_matrix(held_out.shape)
    visible = held_out.tolil()

    for i in range(held_out.shape[0]):
        start, stop = held_out.indptr[i], held_out.indptr[i + 1]
        candidates = held_out.indices[start:stop].tolist()
        how_many = min(int(num_mask[i]), len(candidates))
        if how_many <= 0:
            continue
        inds = random.sample(candidates, how_many)
        hidden[i, inds] = 1
        visible[i, inds] = 0

    masked = sp.csr_matrix(hidden)
    test = sp.csr_matrix(visible)
    # Guarantee that hidden entries are really gone from the stored structure,
    # not merely set to zero.
    test.eliminate_zeros()

In [9]:
def trainModel(fctrs, reg, alpha):
    """Create an implicit-feedback ALS model and fit it on the training set.

    Parameters
    ----------
    fctrs : int
        Passed to ``AlternatingLeastSquares`` as ``factors``.
    reg : float
        Passed as ``regularization``.
    alpha : number
        Scalar the transposed training matrix is multiplied by before
        fitting (this parameter shadows the module-level ``alpha``).

    The new model is stored in the module-level ``model`` before fitting.
    """
    global model, train

    als_options = dict(
        factors=fctrs,
        regularization=reg,
        calculate_training_loss=True,
    )
    model = implicit.als.AlternatingLeastSquares(**als_options)

    # The transpose is fitted, not ``train`` itself — presumably because the
    # installed implicit version expects an item-by-user matrix; TODO confirm.
    scaled_interactions = train.T * alpha
    model.fit(scaled_interactions, show_progress=True)

In [10]:
def testData(iters):
    """Score test playlists ``0 .. iters-1`` and print the summary.

    Calls ``testPlaylist`` once per index behind a notebook progress bar,
    then ``printResults``.

    Parameters
    ----------
    iters : int
        Number of test playlists to evaluate, starting at row 0.
    """
    progress = tqdm_notebook(range(iters))
    for playlist_idx in progress:
        testPlaylist(playlist_idx)

    printResults()

In [11]:
def testPlaylist(i):
    """Score one test playlist with the ALS model and the popularity baseline.

    Parameters
    ----------
    i : int
        Row index of the playlist in the module-level ``test`` / ``masked``.

    Only the columns where the visible test row is 0 are evaluated. The
    ground truth for those columns comes from ``masked``; the ALS scores and
    the popularity values for the same columns are handed to ``aurocs``.
    """
    # Dense copy of the visible row, used to find known / unknown columns.
    visible_row = test[i,:].todense()
    unknown_cols = np.where(visible_row == 0)[1]
    known_cols = np.where(visible_row == 1)[1]

    # 1 where an unknown column was hidden by the masking step, else 0.
    truth = np.ravel(masked[i,unknown_cols].todense())

    # Request one recommendation per unknown column, excluding the known ones.
    ranked = model.recommend(
        i,
        test,
        np.size(truth),
        filter_items = known_cols.tolist(),
        filter_already_liked_items = False,
        recalculate_user = True,
    )
    item_ids, item_scores = zip(*ranked)
    item_ids = np.array(item_ids)
    item_scores = np.array(item_scores)

    # Put the scores back into ascending item-id order.
    als_scores = item_scores[np.argsort(item_ids)]
    baseline_scores = pops[unknown_cols]

    aurocs(truth, als_scores, baseline_scores)

In [12]:
def _scoreBoth(labels, recs, pl_pops):
    """Return ``(als_auc, pop_auc)`` for ``labels``; raises ValueError if either AUC cannot be computed."""
    als_auc = metrics.roc_auc_score(labels, recs)
    pop_auc = metrics.roc_auc_score(labels, pl_pops)
    return als_auc, pop_auc


def aurocs(pl, recs, pl_pops):
    """Append the ALS and popularity AUCs for one playlist to the result lists.

    Four label vectors are scored: ``pl`` itself and three copies in which
    only positions ``[:sh]``, ``[sh:mb]`` or ``[mb:]`` keep their labels
    (everything else is zeroed). ``sh`` and ``mb`` are module-level
    thresholds.

    Parameters
    ----------
    pl : 1-D array
        Ground-truth labels.
    recs : 1-D array
        ALS scores aligned with ``pl``.
    pl_pops : 1-D array
        Popularity scores aligned with ``pl``.

    For each label vector both AUCs are computed first and appended together,
    so the ``als_*_auc`` and ``pop_*_auc`` lists always stay the same length
    and index-aligned. If either computation raises ``ValueError`` nothing is
    appended for that bucket: the total bucket prints ``"total failure"``,
    the other buckets increment ``sh_valerrs`` / ``mb_valerrs`` /
    ``lt_valerrs``.

    NOTE(review): the ``sh`` / ``mb`` slices are positional within ``pl``.
    ``testPlaylist`` passes a vector restricted to the playlist's unknown
    columns, so positions are offsets into that vector rather than original
    column indices — confirm this approximation is intended.
    """
    global sh_valerrs, mb_valerrs, lt_valerrs

    sh_pl = np.copy(pl)
    sh_pl[sh:] = 0

    mb_pl = np.copy(pl)
    mb_pl[:sh] = 0
    mb_pl[mb:] = 0

    lt_pl = np.copy(pl)
    lt_pl[:mb] = 0

    try:
        als_auc, pop_auc = _scoreBoth(pl, recs, pl_pops)
    except ValueError:
        print("total failure")
    else:
        als_tot_auc.append(als_auc)
        pop_tot_auc.append(pop_auc)

    try:
        als_auc, pop_auc = _scoreBoth(sh_pl, recs, pl_pops)
    except ValueError:
        sh_valerrs += 1
    else:
        als_sh_auc.append(als_auc)
        pop_sh_auc.append(pop_auc)

    try:
        als_auc, pop_auc = _scoreBoth(mb_pl, recs, pl_pops)
    except ValueError:
        mb_valerrs += 1
    else:
        als_mb_auc.append(als_auc)
        pop_mb_auc.append(pop_auc)

    try:
        als_auc, pop_auc = _scoreBoth(lt_pl, recs, pl_pops)
    except ValueError:
        lt_valerrs += 1
    else:
        als_lt_auc.append(als_auc)
        pop_lt_auc.append(pop_auc)

    return

In [13]:
def main():
    """Run one complete experiment using the module-level settings.

    Steps: load the data, shuffle the playlists, sort the tracks, split into
    train/test with 20% of each test playlist hidden, compute per-track
    popularity from the training rows, optionally normalise the training
    matrix, train the ALS model, and evaluate ``num_test`` playlists.

    Reads ``num_test``, ``normali``, ``factors``, ``regularization`` and
    ``alpha`` from module scope; writes ``train`` and ``pops``.
    """
    global train, pops

    # Data preparation. cutData(cutoff) is currently disabled, so no track
    # columns are dropped.
    for prepare in (loadData, shuffleData, sortData):
        prepare()

    # Hold out the last num_test playlists and hide 20% of each one.
    buildData(.2, num_test)

    # Popularity baseline: column sums of the (un-normalised) training rows.
    train_column_totals = np.sum(train, axis=0)
    pops = np.array(np.ravel(train_column_totals))

    if normali:
        train = sknormalize(train, axis = 0)

    trainModel(factors, regularization, alpha)
    testData(num_test)

In [14]:
# Settings shared by every run in this cell.
sh = 2641          # first popularity threshold (see the "uncut dataset" note below)
mb = 22530         # second popularity threshold
#cutoff = 369199
factors = 192
alpha = 512
num_test = 1000


def resetRunState():
    """Reset all accumulators and data globals so each run starts clean."""
    global sh_valerrs, mb_valerrs, lt_valerrs
    global als_tot_auc, als_sh_auc, als_mb_auc, als_lt_auc
    global pop_tot_auc, pop_sh_auc, pop_mb_auc, pop_lt_auc
    global model, train, test, masked, pops
    global playlist_names, playlist_followers, s_track_names, s_track_frequencies, mat

    sh_valerrs = 0
    mb_valerrs = 0
    lt_valerrs = 0
    als_tot_auc, als_sh_auc, als_mb_auc, als_lt_auc = [], [], [], []
    pop_tot_auc, pop_sh_auc, pop_mb_auc, pop_lt_auc = [], [], [], []
    # Placeholders; main() and the functions it calls assign the real objects.
    model = 0
    train = 0
    test = 0
    masked = 0
    pops = 0
    playlist_names = 0
    playlist_followers = 0
    s_track_names = 0
    s_track_frequencies = 0
    mat = 0


# Each run differs only in whether the training matrix is normalised and in
# the regularisation strength; everything else is shared.
for normali, regularization in ((False, .0625), (True, .5)):
    mkl.set_num_threads(1)
    resetRunState()
    main()

100%|██████████████████████████████████████████████████████████████████| 15.0/15 [12:01<00:00, 52.97s/it, loss=0.00223]


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


reg: 0.0625
alpha: 512
normalized: False
ALS
avg total: 0.9739791048735005
avg sh: 0.9927075164423007
avg mb: 0.990166235703118
avg lt: 0.9331768856357165
POP
avg total: 0.9503153584957311
avg sh: 0.9995637401301841
avg mb: 0.9961091578891565
avg lt: 0.891015494673396


100%|█████████████████████████████████████████████████████████████████| 15.0/15 [12:06<00:00, 52.91s/it, loss=0.000802]


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


reg: 0.5
alpha: 512
normalized: True
ALS
avg total: 0.9831526146954662
avg sh: 0.9942900182803885
avg mb: 0.9927869359933881
avg lt: 0.9613112283365279
POP
avg total: 0.9557925182120979
avg sh: 0.9995597102979362
avg mb: 0.9960399627812364
avg lt: 0.9047575347728561


In [15]:
'''for uncut dataset
first third of density is at: 2641
second third of density is at: 22530

for cut dataset:
first third of density is at: 2303
second third of density is at: 16841
'''

'for uncut dataset\nfirst third of density is at: 2641\nsecond third of density is at: 22530\n\nfor cut dataset:\nfirst third of density is at: 2303\nsecond third of density is at: 16841\n'

In [16]:
'''sh = 2302
mb = 19141
cutoff = 369199
factors = 192
normali = True
regularization = .0625
alpha = 64
num_test = 500
for i in range(4):
    for j in range(4):

        sh_valerrs = 0
        mb_valerrs = 0
        lt_valerrs = 0
        tot_auc = []
        sh_auc = []
        mb_auc = []
        lt_auc = []
        model = 0
        train = 0
        test = 0
        masked = 0
        pops = 0
        playlist_names = 0
        playlist_followers = 0
        s_track_names = 0
        s_track_frequencies = 0
        mat = 0
        main()
        alpha *= 2
    regularization *= 4
    alpha = 64
    
regularization = .0625
alpha = 64

normali = False

for i in range(4):
    for j in range(4):

        sh_valerrs = 0
        mb_valerrs = 0
        lt_valerrs = 0
        tot_auc = []
        sh_auc = []
        mb_auc = []
        lt_auc = []
        model = 0
        train = 0
        test = 0
        masked = 0
        pops = 0
        playlist_names = 0
        playlist_followers = 0
        s_track_names = 0
        s_track_frequencies = 0
        mat = 0
        main()
        alpha *= 2
    regularization *= 4
    alpha = 64
    '''

'sh = 2302\nmb = 19141\ncutoff = 369199\nfactors = 192\nnormali = True\nregularization = .0625\nalpha = 64\nnum_test = 500\nfor i in range(4):\n    for j in range(4):\n\n        sh_valerrs = 0\n        mb_valerrs = 0\n        lt_valerrs = 0\n        tot_auc = []\n        sh_auc = []\n        mb_auc = []\n        lt_auc = []\n        model = 0\n        train = 0\n        test = 0\n        masked = 0\n        pops = 0\n        playlist_names = 0\n        playlist_followers = 0\n        s_track_names = 0\n        s_track_frequencies = 0\n        mat = 0\n        main()\n        alpha *= 2\n    regularization *= 4\n    alpha = 64\n    \nregularization = .0625\nalpha = 64\n\nnormali = False\n\nfor i in range(4):\n    for j in range(4):\n\n        sh_valerrs = 0\n        mb_valerrs = 0\n        lt_valerrs = 0\n        tot_auc = []\n        sh_auc = []\n        mb_auc = []\n        lt_auc = []\n        model = 0\n        train = 0\n        test = 0\n        masked = 0\n        pops = 0\n     

In [17]:
'''
reg: 0.0625
alpha: 1000
normalized: True
ALS normalized
avg total: 0.9805580801601111
avg sh: 0.991411154656214
avg mb: 0.9889732088598824
avg lt: 0.9505102546825114


ALS standard
avg total: 0.9820323031181131
avg sh: 0.9962993745213498
avg mb: 0.9879645241354366
avg lt: 0.9483032883499841


POP
avg total: 0.8937503409847241
avg sh: 0.9975566624494981
avg mb: 0.9783287241832815
avg lt: 0.7621075699444444

quick hack for lt recs, doesn't hurt short head recommendation
 
broke into 4 sections
reg: 0.0625
alpha: 1000
normalized: False
ALS
avg total: 0.9820323031181131
avg sh: 0.9962993745213498
avg mb: 0.9879645241354366
avg lt: 0.9483032883499841
POP
avg total: 0.8938976857958685
avg sh: 0.9976060576921425
avg mb: 0.9792021772836429
avg lt: 0.7550828705262271
'''

"\nreg: 0.0625\nalpha: 1000\nnormalized: True\nALS normalized\navg total: 0.9805580801601111\navg sh: 0.991411154656214\navg mb: 0.9889732088598824\navg lt: 0.9505102546825114\n\n\nALS standard\navg total: 0.9820323031181131\navg sh: 0.9962993745213498\navg mb: 0.9879645241354366\navg lt: 0.9483032883499841\n\n\nPOP\navg total: 0.8937503409847241\navg sh: 0.9975566624494981\navg mb: 0.9783287241832815\navg lt: 0.7621075699444444\n\nquick hack for lt recs, doesn't hurt short head recommendation\n\nbroke into 4 sections\nreg: 0.0625\nalpha: 1000\nnormalized: False\nALS\navg total: 0.9820323031181131\navg sh: 0.9962993745213498\navg mb: 0.9879645241354366\navg lt: 0.9483032883499841\nPOP\navg total: 0.8938976857958685\navg sh: 0.9976060576921425\navg mb: 0.9792021772836429\navg lt: 0.7550828705262271\n"