In [None]:
import os
import sys
import numpy as np
import implicit
import scipy.sparse as sp
from tqdm import tqdm_notebook
from matplotlib import pyplot as plt
%matplotlib inline
import json
import re
import random
import collections
import datetime
import pickle
import pandas as pd
import numpy as np
from scipy.sparse.linalg import spsolve
from sklearn import metrics
from sklearn.preprocessing import normalize as sknormalize
import random
import concurrent.futures
import multiprocessing as mp
import mkl

In [None]:
def avg(lst):
    """Return the arithmetic mean of ``lst``.

    Parameters
    ----------
    lst : sized iterable of numbers
        Must support ``len()`` and ``sum()``.

    Returns
    -------
    float
        ``sum(lst) / len(lst)``.  For an empty input ``nan`` is returned
        instead of raising ``ZeroDivisionError``, so a report built from
        several lists can still be printed when one of them is empty.
    """
    if len(lst) == 0:
        return float("nan")
    return sum(lst) / len(lst)

In [None]:
def printResults():
    """Print the run settings followed by the mean AUC of every score list.

    Reads the module-level settings ``regularization``, ``alpha`` and
    ``normali`` and the eight module-level lists ``als_*_auc`` /
    ``pop_*_auc``; each list is reduced with ``avg``.  Nothing is returned.
    """
    # Settings first, so the averages below can be attributed to a configuration.
    for caption, setting in (("reg:", regularization),
                             ("alpha:", alpha),
                             ("normalized:", normali)):
        print(caption, setting)

    # The same four row captions are used for both scorers.
    captions = ("avg total:", "avg sh:", "avg mb:", "avg lt:")

    print("ALS")
    for caption, scores in zip(captions, (als_tot_auc, als_sh_auc, als_mb_auc, als_lt_auc)):
        print(caption, avg(scores))

    print("POP")
    for caption, scores in zip(captions, (pop_tot_auc, pop_sh_auc, pop_mb_auc, pop_lt_auc)):
        print(caption, avg(scores))

In [None]:
def loadData():
    """Load the bookkeeping arrays and the sparse matrix into module globals.

    Reads four ``.npy`` files and one ``.npz`` file from the current working
    directory and stores them in ``playlist_names``, ``playlist_followers``,
    ``s_track_names``, ``s_track_frequencies`` and ``mat``.
    """
    global playlist_names, playlist_followers, s_track_names, s_track_frequencies, mat

    def read_array(filename):
        # allow_pickle=True lets numpy unpickle object arrays; unpickling can run
        # arbitrary code, so these files must come from a trusted source.
        return np.load(filename, allow_pickle=True)

    playlist_names = read_array("playlist_names.npy")
    playlist_followers = read_array("playlist_followers.npy")
    s_track_names = read_array("s_track_names.npy")
    s_track_frequencies = read_array("s_track_frequencies.npy")

    # The matrix itself is stored in scipy's sparse .npz format.
    mat = sp.load_npz("test_set.npz")

In [None]:
def cutData(cutoff):
    """Keep only the first ``cutoff`` track columns and their bookkeeping.

    Truncates the global ``mat`` to its first ``cutoff`` columns and the
    global ``s_track_names`` / ``s_track_frequencies`` to their first
    ``cutoff`` entries.  The playlist-level globals are left untouched.

    NOTE(review): the original note describes this as dropping tracks with
    fewer than 10 playlist adds.  That only holds if the columns are already
    ordered by popularity and ``cutoff`` was chosen to match; neither is
    checked here.
    """
    global playlist_names, playlist_followers, s_track_names, s_track_frequencies, mat

    leading = slice(None, cutoff)

    # Same truncation applied to the matrix columns and both per-track arrays.
    mat = mat[:, leading]
    s_track_frequencies = s_track_frequencies[leading]
    s_track_names = s_track_names[leading]

In [None]:
def shuffleData():
    """Randomly permute the playlist rows, keeping the bookkeeping aligned.

    One permutation drawn with ``np.random.shuffle`` is applied to the global
    ``playlist_names`` and ``playlist_followers`` arrays and to the rows of
    the global ``mat``, which is converted to CSR format in the process.
    The per-track globals are not modified.
    """
    global playlist_names, playlist_followers, s_track_names, s_track_frequencies, mat

    # Draw a single row permutation from numpy's global random state.
    row_order = np.arange(np.shape(mat)[0])
    np.random.shuffle(row_order)

    # Apply it to the per-playlist arrays ...
    playlist_names = playlist_names[row_order]
    playlist_followers = playlist_followers[row_order]

    # ... and to the matrix rows, after converting to CSR.
    csr = sp.csr_matrix(mat)
    mat = csr[row_order, :]

In [None]:
def sortData():
    """Reorder track columns by descending column sum of ``mat``.

    The ordering is computed from the column totals of the global ``mat`` and
    then applied to ``mat``'s columns and to the global ``s_track_names`` and
    ``s_track_frequencies`` arrays so all three stay aligned.
    """
    global playlist_names, playlist_followers, s_track_names, s_track_frequencies, mat

    # Column totals flattened to 1-D; argsort is ascending, so reverse it.
    column_totals = np.ravel(np.sum(mat, axis=0))
    descending = np.argsort(column_totals)[::-1]

    mat = mat[:, descending]
    s_track_names = s_track_names[descending]
    s_track_frequencies = s_track_frequencies[descending]

In [None]:
#Splits the global matrix into a training part and a held-out test part, then
#hides a fraction of every test row's entries so they can serve as targets.
def buildData(pct_mask, test_size):
    """Build the global ``train``, ``test`` and ``masked`` matrices from ``mat``.

    Parameters
    ----------
    pct_mask : float
        Fraction of each test row to hide.  The number hidden in a row is
        ``ceil(row_sum * pct_mask)``, capped at the number of non-zero
        entries the row actually has.
    test_size : int
        Number of trailing rows of ``mat`` to hold out as the test set.

    Raises
    ------
    ValueError
        If ``test_size`` is negative or larger than the number of rows.

    Side effects
    ------------
    ``train``  - the first ``m - test_size`` rows of ``mat``.
    ``test``   - a CSR copy of the last ``test_size`` rows with the hidden
                 entries removed (no explicitly stored zeros remain).
    ``masked`` - a CSR matrix of the same shape as ``test`` holding 1 at
                 every hidden position.

    Hidden positions are drawn with ``random.sample``; call ``random.seed``
    beforehand for a reproducible mask.
    """
    global playlist_names, playlist_followers, s_track_names, s_track_frequencies, mat, proj
    global train, test, masked

    m, n = np.shape(mat)
    if not 0 <= test_size <= m:
        # A negative split point would silently select the wrong rows.
        raise ValueError(
            "test_size must be between 0 and the number of rows (%d), got %r" % (m, test_size)
        )

    split = m - test_size
    train = mat[:split, :]
    # Explicit CSR copy so the in-place edits below can never touch ``mat``.
    test = sp.csr_matrix(mat[split:, :], copy=True)

    num_mask = np.ravel(np.ceil(np.sum(test, axis=1) * pct_mask))
    masked = sp.lil_matrix(np.shape(test))

    for i in range(test_size):
        # Columns holding a non-zero value in this row, in ascending order,
        # read from the sparse structure instead of densifying the row.
        candidates = np.sort(test.getrow(i).nonzero()[1]).tolist()

        # Never ask random.sample for more items than are available.
        k = min(int(num_mask[i]), len(candidates))
        if k <= 0:
            continue

        inds = random.sample(candidates, k)
        masked[i, inds] = 1
        test[i, inds] = 0

    # Assigning 0 into a CSR matrix keeps the entry as a stored zero; drop
    # those so code that walks the stored entries no longer sees hidden items.
    test.eliminate_zeros()

    masked = sp.csr_matrix(masked)
    return

In [None]:
def trainModel(fctrs, reg, alpha):
    """Create an ALS model and fit it on the scaled, transposed ``train`` matrix.

    Parameters
    ----------
    fctrs : passed to ``AlternatingLeastSquares`` as ``factors``.
    reg   : passed to ``AlternatingLeastSquares`` as ``regularization``.
    alpha : scalar the transposed training matrix is multiplied by before
            fitting.

    The fitted estimator is stored in the global ``model``.

    NOTE(review): the transpose implies the installed ``implicit`` release
    expects the matrix in the opposite orientation to ``train`` - confirm
    against the version in use.
    """
    global model, train

    als_options = dict(factors=fctrs,
                       regularization=reg,
                       calculate_training_loss=True)
    model = implicit.als.AlternatingLeastSquares(**als_options)

    # Scale every stored value by alpha after flipping rows and columns.
    weighted = train.T * alpha
    model.fit(weighted, show_progress=True)

In [None]:
def testData(iters):
    for i in tqdm_notebook(range(iters)):
        testPlaylist(i)
    printResults()
    return

In [None]:
def testPlaylist(i):
    #get sparse true vector for playlist
    playlist = test[i,:].todense()
    
    #indices where we have 1s and 0s in the playlist
    zero_inds = np.where(playlist == 0)[1]
    one_inds = np.where(playlist == 1)[1]

    #make a vector of true values to identify
    masked_plst = np.ravel(masked[i,zero_inds].todense())
    
    #get als recs, zip to two parallel lists, return scores to index order
    inds, scores = zip(*model.recommend(i, test, np.size(masked_plst), filter_items = one_inds.tolist(), filter_already_liked_items = False, recalculate_user = True))
    inds, scores = np.array(inds), np.array(scores)
    recs = scores[np.argsort(inds)]
    pl_pops = pops[zero_inds]
    #test with the masked values, the popularities excluding the places known to be 
    aurocs(masked_plst, recs, pl_pops)

In [None]:
def aurocs(pl, recs, pl_pops):
    """Append ALS and popularity AUC scores for one playlist to the result lists.

    Four label vectors are scored against the same two score vectors:
    ``pl`` itself, and three copies of ``pl`` in which every position
    outside ``[:sh]``, ``[sh:mb]`` or ``[mb:]`` respectively is set to 0
    (``sh`` and ``mb`` are module-level settings).

    Parameters
    ----------
    pl : 1-D array
        Labels to recover (non-zero marks a positive).
    recs : 1-D array
        ALS scores aligned with ``pl``.
    pl_pops : 1-D array
        Popularity scores aligned with ``pl``.

    Side effects
    ------------
    Appends to the module-level ``als_*_auc`` / ``pop_*_auc`` lists.  When
    ``roc_auc_score`` raises ``ValueError`` for a segment (for instance
    because its label vector contains a single class) nothing is appended
    for that segment and ``sh_valerrs`` / ``mb_valerrs`` / ``lt_valerrs`` is
    incremented; a failure on the full vector prints ``"total failure"``.

    NOTE(review): ``sh`` and ``mb`` are applied as positions within ``pl``.
    If ``pl`` was built from a subset of columns, those positions are not
    the same as original column indices - confirm this is intended.
    """
    global sh_valerrs, mb_valerrs, lt_valerrs

    def _append_pair(labels, als_scores, pop_scores):
        # Compute both AUCs before touching either list: if the second call
        # raises, the first list must not already hold an unpaired entry.
        als_auc = metrics.roc_auc_score(labels, recs)
        pop_auc = metrics.roc_auc_score(labels, pl_pops)
        als_scores.append(als_auc)
        pop_scores.append(pop_auc)

    # Positives restricted to the first ``sh`` positions.
    sh_pl = np.copy(pl)
    sh_pl[sh:] = 0

    # Positives restricted to positions ``sh`` .. ``mb - 1``.
    mb_pl = np.copy(pl)
    mb_pl[:sh] = 0
    mb_pl[mb:] = 0

    # Positives restricted to positions ``mb`` onwards.
    lt_pl = np.copy(pl)
    lt_pl[:mb] = 0

    try:
        _append_pair(pl, als_tot_auc, pop_tot_auc)
    except ValueError:
        print("total failure")

    try:
        _append_pair(sh_pl, als_sh_auc, pop_sh_auc)
    except ValueError:
        sh_valerrs += 1

    try:
        _append_pair(mb_pl, als_mb_auc, pop_mb_auc)
    except ValueError:
        mb_valerrs += 1

    try:
        _append_pair(lt_pl, als_lt_auc, pop_lt_auc)
    except ValueError:
        lt_valerrs += 1

    return

In [None]:
def main():
    """Run one full experiment: load, prepare, split, train and evaluate.

    All settings are read from module-level names (``num_test``,
    ``normali``, ``factors``, ``regularization``, ``alpha``).  Sets the
    global ``pops`` and, when ``normali`` is true, replaces the global
    ``train`` with its column-normalised version before fitting.
    """
    global train, pops

    # --- data preparation -------------------------------------------------
    loadData()
    shuffleData()
    sortData()
    # Column truncation is currently disabled:
    #cutData(cutoff)

    # Hold out ``num_test`` rows and hide 20% of each one.
    buildData(.2, num_test)

    # Popularity baseline: per-column totals of the un-normalised training
    # matrix, taken before ``train`` is possibly replaced below.
    column_totals = np.sum(train, axis=0)
    pops = np.array(np.ravel(column_totals))

    if normali:
        train = sknormalize(train, axis = 0)

    # --- model fitting and evaluation --------------------------------------
    trainModel(factors, regularization, alpha)
    testData(num_test)

In [None]:
# Seed applied before every run.  shuffleData() draws from numpy's global
# random state and buildData() from the stdlib ``random`` module, so seeding
# both makes the row shuffle and the hidden-entry selection repeatable and
# gives every configuration below the same split and the same masks.
RANDOM_SEED = 0


def runExperiment(use_normalization, reg_strength, seed=RANDOM_SEED):
    """Reset all module-level state, apply one configuration and call ``main``.

    Parameters
    ----------
    use_normalization : bool
        Stored in the global ``normali`` (read by ``main`` and
        ``printResults``).
    reg_strength : float
        Stored in the global ``regularization``.
    seed : int or None
        Passed to ``random.seed`` and ``np.random.seed`` before the run.

    Every name assigned here is a module-level global, because the pipeline
    functions communicate through globals.  After the call they hold the
    results of this run.
    """
    global sh_valerrs, mb_valerrs, lt_valerrs
    global als_tot_auc, als_sh_auc, als_mb_auc, als_lt_auc
    global pop_tot_auc, pop_sh_auc, pop_mb_auc, pop_lt_auc
    global model, train, test, masked, pops
    global playlist_names, playlist_followers, s_track_names, s_track_frequencies, mat
    global sh, mb, factors, normali, regularization, alpha, num_test

    mkl.set_num_threads(1)

    random.seed(seed)
    np.random.seed(seed)

    # Failure counters and score lists filled by aurocs().
    sh_valerrs = 0
    mb_valerrs = 0
    lt_valerrs = 0
    als_tot_auc = []
    als_sh_auc = []
    als_mb_auc = []
    als_lt_auc = []
    pop_tot_auc = []
    pop_sh_auc = []
    pop_mb_auc = []
    pop_lt_auc = []

    # Placeholders; the pipeline functions overwrite these.
    model = 0
    train = 0
    test = 0
    masked = 0
    pops = 0
    playlist_names = 0
    playlist_followers = 0
    s_track_names = 0
    s_track_frequencies = 0
    mat = 0

    # Segment boundaries used by aurocs(): positions [:sh], [sh:mb] and [mb:].
    sh = 2641
    mb = 22530
    #cutoff = 369199

    # Model / evaluation settings shared by every run.
    factors = 192
    alpha = 512
    num_test = 1000

    # Settings that differ between runs.
    normali = use_normalization
    regularization = reg_strength

    main()


# Run 1: raw training matrix.
runExperiment(use_normalization=False, reg_strength=.0625)

# Run 2: column-normalised training matrix with stronger regularisation.
runExperiment(use_normalization=True, reg_strength=.5)