In [4]:
# used for manipulating directory paths
import os

# Scientific and vector computation for python
import numpy as np

# Plotting library
from matplotlib import pyplot
import matplotlib as mpl

# Optimization module in scipy
import scipy
# will be used to load MATLAB mat datafile format
from scipy.io import loadmat

# library written for this exercise providing additional functions for assignment submission, and others
import utils
import time
#import progressbar as pb
import concurrent


# define the submission/grader object for this exercise

# tells matplotlib to embed plots within the notebook
%matplotlib inline
import pickle
import csv
import implicit
import implicit.cuda

import itertools

In [6]:
def popularityScale(X, zeta):
    #frequencies of each track
    freqs = np.sum(X, axis = 0)
    #popularity scaling term is the nth root over the original
    z = (1-zeta)/zeta
    #scalar to make it so the most popular song is populated with 1s 
    scalar = np.reciprocal(np.amax(freqs)**z)
    #applying frequency scaling to the matrix
    return X * (scalar * (freqs**z))

def runTests():
    users=100
    n=5
    model = implicit.als.AlternatingLeastSquares(factors=224, use_gpu=True)
    raw = scipy.sparse.load_npz("Data\data10.npz")
    songlist = np.zeros(500)
    rawsonglist = np.zeros(500)
    scorelist = np.zeros(500)
    rawscorelist = np.zeros(500)
    for matrixes in range(10,21):
        string = "Data\data"+str(matrixes)
        string+=".npz"
        mat = scipy.sparse.load_npz(string)
        model.fit(mat.T)

        for row in range(users):
            recs = model.recommend(row, mat, N=n, recalculate_user=True)
            rawrecs = model.recommend(row, raw, N=n, recalculate_user=True)
            for i in range(n):
                rawsonglist[n*row+i]=rawrecs[i][0]
                rawscorelist[n*row+i]=rawrecs[i][1]
                songlist[n*row+i]=recs[i][0]
                scorelist[n*row+i]=recs[i][1]
        print("Zeta parameter:",matrixes/10)
        print("number of unique songs recommended:",np.size(np.unique(songlist)))
        if(matrixes==10):
            baseline = np.average(scorelist)
            print("Baseline score:",baseline)
        print("number of unique songs recommended:",np.size(np.unique(songlist)))
        print("baseline - raw avg:",baseline-np.average(rawscorelist))
        print("error:",100*((baseline-np.average(rawscorelist))/baseline),"%")
runTests()

ValueError: No CUDA extension has been built, can't train on GPU.

In [4]:
mat = scipy.sparse.load_npz("Data/data.npz")
regmat = scipy.sparse.csr_matrix(popularityScale(mat,1))
scipy.sparse.save_npz("Data/data10.npz",regmat)

KeyboardInterrupt: 

100%|████████████████████████████████████████████████████████████████████████████████| 15.0/15 [00:07<00:00,  1.94it/s]


Data\data10.npz
number of unique songs recommended: 348
average score: 0.276832785377


 70%|████████████████████████████████████████████████████████                        | 10.5/15 [00:06<00:02,  1.77it/s]


KeyboardInterrupt: 

In [None]:
path = "Data\data_big"
files = os.listdir(path)
master_playlists = list()
master_songs_set = set()
master_songs_records = list()

def process_file(file):
    start_time = time.time()
    file_row = files.index(file)
    file_contents_json = open(path + file)
    file_contents = json.load(file_contents_json)
    playlists_in_file = file_contents['playlists']

    for playlist in playlists_in_file:
        playlist_name = dict(playlist)['name']
        songs = list()
       
        for song in dict(playlist)['tracks']:
            song_name = dict(song)['track_name']
            songs.append(song_name)
            master_songs_set.add(song_name)
            master_songs_records.append((song_name, playlist_name))
           
        master_playlists.append(playlist_name)

    del playlists_in_file

def process_all_files(file_list):
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        executor.map(process_file, file_list)
matrix

In [3]:
users=100
n=5
model = implicit.als.AlternatingLeastSquares(factors=224, use_gpu=True)
raw = scipy.sparse.load_npz("Data\data10.npz")
songlist = np.zeros(500)
rawsonglist = np.zeros(500)
scorelist = np.zeros(500)
rawscorelist = np.zeros(500)
for matrixes in range(10,21):
    string = "Data\data"+str(matrixes)
    string+=".npz"
    mat = scipy.sparse.load_npz(string)
    model.fit(mat.T)

    for row in range(users):
        recs = model.recommend(row, mat, N=n, recalculate_user=True)
        rawrecs = model.recommend(row, raw, N=n, recalculate_user=True)
        for i in range(n):
            rawsonglist[n*row+i]=rawrecs[i][0]
            rawscorelist[n*row+i]=rawrecs[i][1]
            songlist[n*row+i]=recs[i][0]
            scorelist[n*row+i]=recs[i][1]
    print("Zeta parameter:",matrixes/10)
    print("number of unique songs recommended:",np.size(np.unique(songlist)))
    if(matrixes==10):
        baseline = np.average(scorelist)
        print("Baseline score:",baseline)
    print("number of unique songs recommended:",np.size(np.unique(songlist)))
    print("baseline - raw avg:",baseline-np.average(rawscorelist))
    print("error:",100*((baseline-np.average(rawscorelist))/baseline),"%")




ValueError: No CUDA extension has been built, can't train on GPU.