In [None]:
# Standard library
import concurrent
# a bare `import concurrent` does not load the futures submodule
import concurrent.futures
import csv
import itertools
# parses the playlist JSON files
import json
# used for manipulating directory paths
import os
import pickle
import time

# Third-party
import implicit
import matplotlib as mpl
# Scientific and vector computation for python
import numpy as np
import progressbar as pb
# Optimization module in scipy
import scipy
# Plotting library
from matplotlib import pyplot
# will be used to load MATLAB mat datafile format
from scipy.io import loadmat

# library written for this exercise providing additional functions for assignment submission, and others
import utils

# define the submission/grader object for this exercise

# tells matplotlib to embed plots within the notebook
%matplotlib inline

In [39]:
def popularityScale(X, zeta):
    """Rescale every column of ``X`` by a power of that column's frequency.

    With ``f_j`` the sum of column ``j`` and ``z = (1 - zeta) / zeta``, column
    ``j`` is multiplied by ``(f_j ** z) / (f_max ** z)``, so the column with
    the largest sum always gets weight 1. ``zeta = 1`` gives ``z = 0`` and
    therefore leaves the values unchanged.

    Parameters
    ----------
    X : array_like
        Dense 2-D interaction matrix; columns are summed to get frequencies.
        (presumably rows are playlists and columns are tracks -- confirm
        against the caller)
    zeta : float
        Scaling exponent control. Must be non-zero (``zeta == 0`` raises
        ``ZeroDivisionError`` for Python numbers).

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``X``.
    """
    # frequencies of each track (column sums)
    freqs = np.sum(X, axis=0)
    # popularity scaling term is the nth root over the original:
    # f ** (1 / zeta) / f == f ** ((1 - zeta) / zeta)
    z = (1 - zeta) / zeta
    peak = np.amax(freqs)
    if peak == 0:
        # No interactions to scale against. Without this guard
        # reciprocal(0) is inf and inf * 0 turns the whole result into NaN.
        return X * 1.0
    # scalar to make it so the most popular song has weight exactly 1
    scalar = np.reciprocal(peak ** z)
    # applying frequency scaling to the matrix (broadcast across rows)
    return X * (scalar * (freqs ** z))

In [44]:
# Load the stored sparse interaction matrix, apply popularity scaling and
# write the result back to disk as CSR.
mat = scipy.sparse.load_npz("Data/data.npz")
# popularityScale is applied to a dense array. Call toarray() on the loaded
# object itself so it works for whichever sparse format load_npz returns,
# instead of going through the unbound dok_matrix method.
densemat = mat.toarray()
# zeta = 1 makes the scaling exponent 0, i.e. every column weight is 1.
regmat = scipy.sparse.csr_matrix(popularityScale(densemat, 1))
scipy.sparse.save_npz("Data/data10.npz", regmat)

In [48]:
# Fit an ALS model with 200 latent factors on the CPU, using the scaled
# matrix saved to Data/data10.npz.
mat = scipy.sparse.load_npz("Data/data10.npz")
model = implicit.als.AlternatingLeastSquares(use_gpu=False, factors=200)
# fit() is given the transpose of the stored matrix
# NOTE(review): which orientation fit() expects depends on the installed
# implicit version -- confirm.
model.fit(mat.transpose())
# Sketch of the later recommendation call (not executed here):
#recs = model.recommend(row, interaction_matrix, N=num_tracks, recalculate_user=METHOD=="ALS")

100%|████████████████████████████████████████████████████████████████████████████████| 15.0/15 [00:05<00:00,  2.46it/s]


In [None]:
# Directory holding the raw playlist JSON files. os.path.join picks the right
# separator for the current OS (the previous literal hard-coded a Windows
# backslash). The empty last component keeps a trailing separator, so code
# that builds file names as `path + filename` keeps working.
path = os.path.join("Data", "data_big", "")
files = os.listdir(path)
# Shared accumulators filled while the files are processed:
# one entry per playlist, in the order playlists finish processing
master_playlists = list()
# every distinct track name seen
master_songs_set = set()
# one (track_name, playlist_name) tuple per track occurrence
master_songs_records = list()

def process_file(file):
    """Parse one playlist JSON file and add its contents to the shared accumulators.

    The file is read from the module-level ``path`` directory. Its top-level
    object must have a ``'playlists'`` list; every playlist needs a ``'name'``
    and a ``'tracks'`` list whose items carry a ``'track_name'``.

    Side effects (module-level state):
      * ``master_songs_set``     -- gains every track name in the file
      * ``master_songs_records`` -- gains one ``(track_name, playlist_name)``
        tuple per track occurrence
      * ``master_playlists``     -- gains each playlist name, appended after
        that playlist's tracks have been recorded

    Prints ``"<index of file in files>,<elapsed HH:MM:SS>"`` when done.

    Parameters
    ----------
    file : str
        File name (not a full path); must be present in ``files``.
    """
    start_time = time.time()
    file_row = files.index(file)

    # Binary mode lets json detect the UTF-8/16/32 encoding itself instead of
    # relying on the platform default; the context manager closes the handle.
    with open(os.path.join(path, file), "rb") as file_contents_json:
        playlists_in_file = json.load(file_contents_json)['playlists']

    for playlist in playlists_in_file:
        playlist_name = playlist['name']

        for song in playlist['tracks']:
            song_name = song['track_name']
            master_songs_set.add(song_name)
            master_songs_records.append((song_name, playlist_name))

        master_playlists.append(playlist_name)

    print(str(file_row) + "," + time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)) + "\t")

def process_all_files(file_list, max_workers=5):
    """Run ``process_file`` over every entry of ``file_list`` on a thread pool.

    Blocks until all files are done, then prints the elapsed time together
    with the number of playlists and distinct songs collected so far.

    Parameters
    ----------
    file_list : iterable of str
        File names handed one by one to ``process_file``.
    max_workers : int, optional
        Size of the thread pool (default 5).

    Raises
    ------
    Exception
        Whatever ``process_file`` raised for the first failing file.
    """
    start_time = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map only re-raises a worker's exception when its result is
        # retrieved; draining the iterator keeps failures from being dropped.
        list(executor.map(process_file, file_list))
    print("all files processed in " + time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)) + "\t(" + str(len(master_playlists)) + " playlists, " + str(len(master_songs_set)) + " songs)")

# Fill the shared accumulators from every file in the data directory.
process_all_files(files)

# Freeze the set of track names into a list so each track has a fixed column.
master_songs_set_list = list(master_songs_set)
master_songs_vector = np.asarray(master_songs_set_list)
m = len(master_playlists)
n = len(master_songs_set_list)
# m playlists x n tracks; an entry is 1 when the track occurs in the playlist.
matrix = scipy.sparse.dok_matrix((m,n), dtype=int)

# Name -> index tables built once, replacing two linear list.index() scans
# per record (which made the loop below quadratic in the data size).
song_column = {name: col for col, name in enumerate(master_songs_set_list)}
playlist_row = {}
for row, name in enumerate(master_playlists):
    # keep the first occurrence, which is what list.index() returned
    playlist_row.setdefault(name, row)
# NOTE(review): records identify a playlist by name only, so playlists that
# share a name all land in the first such row and the other rows stay empty.
# Fixing this needs a unique playlist key in master_songs_records.

start_time_total = time.time()
counter = 0
progress = pb.ProgressBar(widgets=[pb.Percentage(),"\t", pb.Bar(),"\t", pb.Timer(), "\tTotal Completed: ", pb.Counter()], maxval=len(master_songs_records)).start()
for record in master_songs_records:
    # record is (track_name, playlist_name)
    matrix[playlist_row[record[1]], song_column[record[0]]] = 1
    counter  = counter + 1
    progress.update(counter)
print("all playlists processed in " + time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time_total)))
matrix