# VL6 Python Implementation
### Danny Akimchuk

In [1]:
import json
import os
import numpy as np
import multiprocessing
from collections import defaultdict
from copy import copy
import nltk
nltk.download('punkt')
from joblib import Parallel, delayed
import multiprocessing
import string
from threading import Thread
import tqdm
import implicit #use conda install -c conda-forge implicit 
from scipy.sparse import csr_matrix, find, lil_matrix, dok_matrix
import time
from colorama import Fore, Back, Style


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akimchukdaniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Object Structures

In [2]:
class Playlist:
    def __init__(self, name, collaborative, pid, modified_at, num_tracks, num_albums, num_followers, tracks):
        self.name = name
        self.collaborative = collaborative
        self.pid = pid
        self.modified_at = modified_at
        self.num_tracks = num_tracks
        self.num_albums = num_albums
        self.num_followers = num_followers
        self.tracks = tracks
        self.features = {}

class Track:
    def __init__(self, artist_name, track_uri, artist_uri, track_name, album_uri, duration_ms, album_name):
        self.artist_name = artist_name
        self.track_uri = track_uri
        self.artist_uri = artist_uri
        self.track_name = track_name
        self.album_uri = album_uri
        self.duration_ms = duration_ms
        self.album_name = album_name
        self.features = {}

class PlaylistTrack:
    def __init__(self, track, pos):
        self.track = track
        self.pos = pos

class SparseMatrix:
    def __init__(self, num_rows, num_cols, entries=None):
        #NOTE: entries are (row, col, value)
        self.num_rows = num_rows
        self.num_cols = num_cols
        
        self.rows = {}
        
        if entries is not None:
            for row, col, value in entries:
                if row not in self.rows:
                    self.rows[row] = defaultdict(lambda: 0)
                self.rows[row][col] = value
    
    def setValue(self, row, col, value):
        if row not in self.rows:
            self.rows[row] = defaultdict(lambda: 0)
        self.rows[row][col] = value
    
    def getValue(self, row, col):
        if row in self.rows:
            return self.rows[row][col]
        return 0
    
    def getRow(self, row):
        if row in self.rows:
            to_return = np.zeros((self.num_cols,))
            for col, value in self.rows[row].items():
                to_return[col] = value
            return to_return
        return None
    
    def getCol(self, col):
        colVec = np.zeros(self.num_rows)
        for ind, ints in self.rows.items():
            colVec[ind] = ints[col]
        return colVec
    
    def printMatrix(self):
        for row in range(self.num_rows):
            print(self.getRow(row))
            
    def getInteractions(self):
        interactions = []
        for row, vals in self.rows.items():
            for col, val in vals.items():
                interactions.append((row, col, val))
        return interactions
    
    def getTransposeInteractions(self):
        interactions = []
        for row, vals in self.rows.items():
            for col, val in vals.items():
                interactions.append((col, row, val))
        return interactions
    
    def getRowInteractions(self, row):
        interactions = []
        for interaction in self.rows[row].items():
            interactions.append(interaction)
        return interactions
    
    def getTranspose(self):
        newMatr = SparseMatrix(self.num_cols, self.num_rows, self.getTransposeInteractions())
        return newMatr

class Artist:
    def __init__(self, name, uri, popularity):
        self.name = name
        self.uri = uri
        self.popularity = popularity
    def __str__(self):
        return "Name: " + self.name + ", URI: " + self.uri + ", Popularity: " + str(self.popularity)
    

## Import Data

In [3]:
local_data_path = "/Users/akimchukdaniel/Google Drive/locals.json"
local_artists = {}
local_data_file = open(local_data_path)
file_contents= local_data_file.read()
local_json = json.loads(file_contents)
for city in local_json:
    local_artists[city] = {}
    for artist in local_json[city]:
        local_artists[city][artist["artist_uri"]] = Artist(artist["artist_name"], artist["artist_uri"], artist["artist_popularity"])
print("Done importing local artists.")

Done importing local artists.


In [4]:
data_path = "/Users/akimchukdaniel/mpd_data/mpd.v1/data_big/"
test_data_path = "/Users/akimchukdaniel/mpd_data/challenge.v1/challenge_set.json"
city_to_test = "Los Angeles"
tracks = {}
track_ids = []
track_id_len = 0
track_id_to_index = {}
playlists = {}
pids = []
test_pids = []
local_pids = []
potential_eval_pids = []
interactions = {}
file_count = 0
filenames = list(os.listdir(data_path))
for i in range(len(filenames)):
    filenames[i] = data_path + filenames[i]
filenames.append(test_data_path)

for filename in tqdm.tqdm(filenames):
    if filename == ".DS_Store":
        continue
    if filename == test_data_path:
        num_playlists_train = num_playlists
        isTest = True
    else:
        isTest = False
    data_file = open(filename)
    file_count += 1
    file_contents = data_file.read()
    jsonArray = json.loads(file_contents)
    for playlist_data in jsonArray["playlists"]:
        is_local = False
        num_local = 0
        try:
            pid = int(playlist_data["pid"])
            
            try:
                name = playlist_data["name"]
            except:
                name = None
                
            try:
                collab = playlist_data["collaborative"] == 'true'
            except:
                collab = None
            
            try:
                modified_at = int(playlist_data["modified_at"])
            except:
                modified_at = None
            
            try:
                num_tracks = int(playlist_data["num_tracks"])
            except:
                num_tracks = None
            
            try:
                num_albums = int(playlist_data["num_albums"])
            except:
                num_albums = None
                
            try:
                num_followers = int(playlist_data["num_followers"])
            except:
                num_followers = None
                
            try:
                tracks_data = playlist_data["tracks"]
                playlist_interactions = defaultdict(lambda: 0)
                playlist_tracks = []
                for track_data in tracks_data:
                    track_uri = track_data["track_uri"]
                    if track_uri in tracks:
                        track = tracks[track_uri]
                    else:
                        artist_name = track_data["artist_name"]
                        artist_uri = track_data["artist_uri"]
                        if artist_uri in local_artists[city_to_test]:
                            num_local += 1
                            is_local = True
                            #print(artist_name, "is local")
                        track_name = track_data["track_name"]
                        album_uri = track_data["album_uri"]
                        duration_ms = track_data["duration_ms"]
                        album_name = track_data["album_name"]
                        track = Track(artist_name, track_uri, artist_uri, track_name, album_uri, duration_ms, album_name)
                        tracks[track_uri] = track
                        track_ids.append(track_uri)
                        track_id_to_index[track_uri] = track_id_len
                        track_id_len += 1
                    try:
                        pos = int(track_data["pos"])
                    except:
                        pos = None
                    playlist_tracks.append(PlaylistTrack(track, pos))
                    playlist_interactions[track_uri] = playlist_interactions[track_uri] + 1
            except:
                playlist_tracks = []
                playlist_interactions = None
            
            playlist = Playlist(name, collab, pid, modified_at, num_tracks, num_albums, num_followers, playlist_tracks)
            playlists[pid] = playlist
            interactions[pid] = playlist_interactions
            pids.append(pid)
            if isTest:
                test_pids.append(pid)
            if is_local:
                #print(pid,"is a local playlist")
                local_pids.append(pid)
                if num_local > 10:
                    print("ADDING",pid,"TO POTENTIAL EVAL")
                    potential_eval_pids.append(pid)
        except Exception as e:
            print(str(e))
            pass
    num_playlists = len(playlists)
    num_tracks = len(tracks)

print("Imported " + str(num_playlists) + " playlists containing " + str(num_tracks) + " unique tracks from " + str(file_count) + " files.")
print("Local Playlists: " + str(len(local_pids)))
del filenames

    

  0%|          | 2/1001 [00:01<09:08,  1.82it/s]

ADDING 613393 TO POTENTIAL EVAL


  0%|          | 3/1001 [00:01<09:33,  1.74it/s]

ADDING 115768 TO POTENTIAL EVAL


100%|██████████| 1001/1001 [14:25<00:00,  1.21it/s] 

Imported 1010000 playlists containing 2262292 unique tracks from 1001 files.
Local Playlists: 375





## Get Playlist Features

In [5]:
# punctRemover = str.maketrans('', '', string.punctuation)
# punctToSpace = str.maketrans(string.punctuation, ' '*len(string.punctuation)) #map punctuation to space

# playlist_features = ["name_tokenized", "name_regexed", "name_original", "n_tracks"]

# num_unique = {}

# name_tokenized_map = {}
# name_regexed_map = {}
# name_original_map = {}

# for feature in playlist_features:
#     num_unique[feature] = 0

# def getIndexForFeature(feature, feature_map, feature_name):
#     if feature in feature_map:
#         return feature_map[feature]
#     else:
#         index = num_unique[feature_name]
#         num_unique[feature_name] += 1
#         feature_map[feature] = index
#         return index

# playlist_count = 0
# for pid in tqdm.tqdm(pids):
#     playlist_count += 1
#     playlist = playlists[pid]
#     for feature in playlist_features:
#         if feature == "name_tokenized":
#             if playlist.name is not None:
#                 name = playlist.name
#                 # vl6 uses a lucene text tokenizer, mine is a bit different.
#                 # this treats plural as different than the original
#                 # it also allows emoji/unicode characters that I think they remove.
#                 # may want to tweak
#                 tokens_full = nltk.word_tokenize(name)
#                 token_indices = []
#                 for token in tokens_full:
#                     simp = token.lower()
#                     simp = simp.translate(punctRemover)
#                     index = getIndexForFeature(simp, name_tokenized_map, feature)
#                     if index not in token_indices:
#                         token_indices.append(index)
                
#                 if len(token_indices) > 0:
#                     playlist.features[feature] = token_indices
#                 else:
#                     playlist.features[feature] = None
#         elif feature == "name_regexed":
#             if playlist.name is not None:
#                 name = playlist.name
#                 simp = name.lower()
#                 simp = simp.translate(punctToSpace)
#                 simp = ' '.join(simp.split())
#                 index = getIndexForFeature(simp, name_regexed_map, feature)
#                 playlist.features[feature] = index
#         elif feature == "name_original":
#             if playlist.name is not None:
#                 index = getIndexForFeature(playlist.name, name_original_map, feature)
#                 playlist.features[feature] = index
#         elif feature == "n_tracks":
#             playlist.features[feature] = playlist.num_tracks
# print("Built features for", playlist_count, "playlists...")

# del name_tokenized_map
# del name_regexed_map
# del name_original_map

# num_playlists = len(pids)

# #build name_tokenized feature matrix
# #name_tokenized_matrix = SparseMatrix(num_playlists, num_unique["name_tokenized"])
# name_tokenized_matrix_rows = []
# name_tokenized_matrix_cols = []
# name_tokenized_matrix_vals = []
# name_tokenized_matrix = None
# def buildTokenized():
#     global name_tokenized_matrix
#     print("Building tokenized matrix...")
#     for i in range(len(pids)):
#         playlist = playlists[pids[i]]
#         if "name_tokenized" in playlist.features:
#             for token_index in playlist.features["name_tokenized"]:
#                 #name_tokenized_matrix.setValue(i, token_index, 1)
#                 name_tokenized_matrix_rows.append(i)
#                 name_tokenized_matrix_cols.append(token_index)
#                 name_tokenized_matrix_vals.append(1)
#     name_tokenized_matrix = csr_matrix((name_tokenized_matrix_vals, (name_tokenized_matrix_rows, name_tokenized_matrix_cols)), shape=(num_playlists,num_unique["name_tokenized"]))
#     print("Done building tokenized matrix.")
            
# #build name_regexed feature matrix
# #name_regexed_matrix = SparseMatrix(num_playlists, num_unique["name_regexed"])
# name_regexed_matrix_rows = []
# name_regexed_matrix_cols = []
# name_regexed_matrix_vals = []
# name_regexed_matrix = None
# def buildRegexed():
#     global name_regexed_matrix
#     print("Building regexed matrix...")
#     for i in range(len(pids)):
#         playlist = playlists[pids[i]]
#         if "name_regexed" in playlist.features:
#             #name_regexed_matrix.setValue(i, playlist.features["name_regexed"], 1)
#             name_regexed_matrix_rows.append(i)
#             name_regexed_matrix_cols.append(playlist.features["name_regexed"])
#             name_regexed_matrix_vals.append(1)
#     name_regexed_matrix = csr_matrix((name_regexed_matrix_vals, (name_regexed_matrix_rows, name_regexed_matrix_cols)), shape=(num_playlists,num_unique["name_regexed"]))
#     print("Done building regexed matrix.")

# #build name_original feature matrix
# #name_original_matrix = SparseMatrix(num_playlists,num_unique["name_original"])
# name_original_matrix_rows = []
# name_original_matrix_cols = []
# name_original_matrix_vals = []
# name_original_matrix = None
# def buildOriginal():
#     global name_original_matrix
#     print("Building original matrix...")
#     for i in range(len(pids)):
#         playlist = playlists[pids[i]]
#         if "name_original" in playlist.features:
#             #name_original_matrix.setValue(i,playlist.features["name_original"], 1)
#             name_original_matrix_rows.append(i)
#             name_original_matrix_cols.append(playlist.features["name_original"])
#             name_original_matrix_vals.append(1)
#     name_original_matrix = csr_matrix((name_original_matrix_vals, (name_original_matrix_rows, name_original_matrix_cols)), shape=(num_playlists,num_unique["name_original"]))
#     print("Done building original matrix.")

# #n_tracks_matrix = SparseMatrix(1, num_playlists) # just a vector
# n_tracks_matrix_rows = []
# n_tracks_matrix_cols = []
# n_tracks_matrix_vals = []
# n_tracks_matrix = None
# def buildNTracks():
#     global n_tracks_matrix
#     print("Building nTracks matrix...")
#     for i in range(len(pids)):
#         playlist = playlists[pids[i]]
#         if "n_tracks" in playlist.features:
#             #n_tracks_matrix.setValue(0,i, playlist.features["n_tracks"])
#             n_tracks_matrix_rows.append(i)
#             n_tracks_matrix_cols.append(0)
#             n_tracks_matrix_vals.append(playlist.features["n_tracks"])
#     n_tracks_matrix = csr_matrix((n_tracks_matrix_vals, (n_tracks_matrix_rows, n_tracks_matrix_cols)), shape=(num_playlists,1))
#     print("Done building nTracks matrix.")

# tokenizedThread = Thread(target=buildTokenized)
# regexedThread = Thread(target=buildRegexed)
# originalThread = Thread(target=buildOriginal)
# nTracksThread = Thread(target=buildNTracks)

# tokenizedThread.start()
# regexedThread.start()
# originalThread.start()
# nTracksThread.start()

# tokenizedThread.join()
# regexedThread.join()
# originalThread.join()
# nTracksThread.join()

# del playlists


# print("Constructed sparse feature matrix")
# print("Features for Name Tokens:", num_unique["name_tokenized"])
# print("Features for Regexed Name:", num_unique["name_regexed"])
# print("Features for Original Name:", num_unique["name_original"])
# print("Features for Number of Tracks: 1")

## Get Song Features

In [6]:
# song_features = ['track_name', 'artist_id', 'album_id', 'duration']

# for feature in song_features:
#     num_unique[feature] = 0

# track_name_map = {}
# artist_id_map = {}
# album_id_map = {}

# track_count = 0
    
# for track_id in tqdm.tqdm(track_ids):
#     track_count += 1
#     track = tracks[track_id]
#     for feature in song_features:
#         if feature == "track_name":
#             name = track.track_name
#             # vl6 uses a lucene text tokenizer, mine is a bit different.
#             # this treats plural as different than the original
#             # it also allows emoji/unicode characters that I think they remove.
#             # may want to tweak
#             tokens_full = nltk.word_tokenize(name)
#             token_indices = []
#             for token in tokens_full:
#                 simp = token.lower()
#                 simp = simp.translate(punctRemover)
#                 index = getIndexForFeature(simp, track_name_map, feature)
#                 if index not in token_indices:
#                     token_indices.append(index)
#             if len(token_indices) > 0:
#                 track.features[feature] = token_indices
#             else:
#                 track.features[feature] = None
#         elif feature == "artist_id":
#             artist_id = track.artist_uri
#             index = getIndexForFeature(artist_id, artist_id_map, feature)
#             track.features[feature] = index
#         elif feature == "album_id":
#             album_id = track.album_uri
#             index = getIndexForFeature(album_id, album_id_map, feature)
#             track.features[feature] = index
#         elif feature == "duration":
#             duration = track.duration_ms # vl6 converts this to seconds?
#             track.features[feature] = duration 

# del track_name_map
# del artist_id_map
# del album_id_map

# num_tracks = len(track_ids)

# #build track_name feature matrix
# #track_name_matrix = SparseMatrix(num_tracks, num_unique["track_name"])
# track_name_matrix_rows = []
# track_name_matrix_cols = []
# track_name_matrix_vals = []
# track_name_matrix = None
# def buildTrackName():
#     global track_name_matrix
#     print("Building track name matrix...")
#     for i in range(len(track_ids)):
#         track = tracks[track_ids[i]]
#         if "track_name" in track.features:
#             for token_index in track.features["track_name"]:
#                 #track_name_matrix.setValue(i, token_index, 1)
#                 track_name_matrix_rows.append(i)
#                 track_name_matrix_cols.append(token_index)
#                 track_name_matrix_vals.append(1)
#     track_name_matrix = csr_matrix((track_name_matrix_vals, (track_name_matrix_rows, track_name_matrix_cols)), shape=(num_tracks,num_unique["track_name"]))
#     print("Done building track name matrix.")

# #build artist_id feature matrix
# #artist_id_matrix = SparseMatrix(num_tracks, num_unique["artist_id"])
# artist_id_matrix_rows = []
# artist_id_matrix_cols = []
# artist_id_matrix_vals = []
# artist_id_matrix = None
# def buildArtistID():
#     global artist_id_matrix
#     print("Building artist ID matrix...")
#     for i in range(len(track_ids)):
#         track = tracks[track_ids[i]]
#         if "artist_id" in track.features:
#             #artist_id_matrix.setValue(i,track.features['artist_id'], 1)
#             artist_id_matrix_rows.append(i)
#             artist_id_matrix_cols.append(track.features['artist_id'])
#             artist_id_matrix_vals.append(1)
#     artist_id_matrix = csr_matrix((artist_id_matrix_vals, (artist_id_matrix_rows, artist_id_matrix_cols)), shape=(num_tracks,num_unique["artist_id"]))
#     print("Done building artist ID matrix.")

# #build album_id feature matrix
# #album_id_matrix = SparseMatrix(num_tracks, num_unique["album_id"])
# album_id_matrix_rows = []
# album_id_matrix_cols = []
# album_id_matrix_vals = []
# album_id_matrix = None
# def buildAlbumID():
#     global album_id_matrix
#     print("Building album ID matrix...")
#     for i in range(len(track_ids)):
#         track = tracks[track_ids[i]]
#         if "album_id" in track.features:
#             #album_id_matrix.setValue(i,track.features["album_id"], 1)
#             album_id_matrix_rows.append(i)
#             album_id_matrix_cols.append(track.features["album_id"])
#             album_id_matrix_vals.append(1)
#     album_id_matrix = csr_matrix((album_id_matrix_vals, (album_id_matrix_rows, album_id_matrix_cols)), shape=(num_tracks,num_unique["album_id"]))
#     print("Done building album ID matrix.")

# #build duration feature matrix
# #duration_matrix = SparseMatrix(1, num_tracks)
# duration_matrix_rows = []
# duration_matrix_cols = []
# duration_matrix_vals = []
# duration_matrix = None
# def buildDuration():
#     global duration_matrix
#     print("Building duration matrix...")
#     for i in range(len(track_ids)):
#         track = tracks[track_ids[i]]
#         if "duration" in track.features:
#             #duration_matrix.setValue(0,i,track.features["duration"])
#             duration_matrix_rows.append(i)
#             duration_matrix_cols.append(0)
#             duration_matrix_vals.append(track.features["duration"])
#     duration_matrix = csr_matrix((duration_matrix_vals, (duration_matrix_rows, duration_matrix_cols)), shape=(num_tracks,1))
#     print("Done building duration matrix.")
            
# trackThread = Thread(target=buildTrackName)
# artistThread = Thread(target=buildArtistID)
# albumThread = Thread(target=buildAlbumID)
# durationThread = Thread(target=buildDuration)

# trackThread.start()
# artistThread.start()
# albumThread.start()
# durationThread.start()

# trackThread.join()
# artistThread.join()
# albumThread.join()
# durationThread.join()  



# print("Constructed sparse feature matrix")
# print("Features for Name Tokens:", num_unique["track_name"])
# print("Features for Artist ID:", num_unique["artist_id"])
# print("Features for Album ID:", num_unique["album_id"])
# print("Features for Duration: 1")

## Build Interaction Matrix

In [7]:
#interaction_matrix = SparseMatrix(num_playlists, num_tracks)
interaction_matrix_rows = []
interaction_matrix_cols = []
interaction_matrix_vals = []
#interaction_matrix = dok_matrix((num_playlists,num_tracks))

row_count = 0
test_indexes = []
r_train = dok_matrix((num_playlists,num_tracks))
eval_pids = np.random.choice(potential_eval_pids, 10)
correct_ids = {}
for row in tqdm.tqdm(range(len(pids))):
    row_count += 1
    is_eval = pids[row] in eval_pids
    if is_eval:
        correct_ids[pids[row]] = []
    ints = interactions[pids[row]]
    if pids[row] in test_pids:
        test_indexes.append(row)
    for (track_id, count) in ints.items():
        index = track_id_to_index[track_id]
        #interaction_matrix.setValue(row, index, count)
        if not is_eval or tracks[track_id].artist_uri not in local_artists[city_to_test]:
            interaction_matrix_rows.append(row)
            interaction_matrix_cols.append(index)
            interaction_matrix_vals.append(count)
        if not is_eval:
            r_train[row,index] = count
        elif tracks[track_id].artist_uri in local_artists[city_to_test]:
            correct_ids[pids[row]].append(tracks[track_id].artist_uri)
        #interaction_matrix[row,index] = count
    #for playlist_track in playlist.tracks:
    #    track_uri = playlist_track.track.track_uri
    #    col = track_id_to_index[track_uri]
    #    interaction_matrix[row,col] = 1

del interactions
interaction_matrix = csr_matrix((interaction_matrix_vals, (interaction_matrix_rows, interaction_matrix_cols)), shape=(num_playlists,num_tracks))
print("Built interaction matrix for", row_count, "playlists.")

100%|██████████| 1010000/1010000 [16:52<00:00, 997.17it/s] 


Built interaction matrix for 1010000 playlists.


In [8]:
eval_pids = list(dict.fromkeys(eval_pids).keys())

## Generate Split
##### (Want to go over)

In [8]:
#r_train = SparseMatrix(num_playlists, num_tracks, interaction_matrix.getInteractions())
#r_valid = SparseMatrix(num_playlists, num_tracks)
# print("copying interaction matrix to dok")
# r_train = dok_matrix(interaction_matrix)
# print("done copy")
    


# valid_indexes = []
# added_indexes = set()
# n_exact = 0
# n_at_least = 0

# test_length = len(test_indexes)
# test_indexes = np.array(test_indexes)
# test_indexes.sort()
# rows=find(interaction_matrix)[0]

# the_range = range(num_playlists)
# is_test = np.isin(np.array(the_range), test_indexes)
# num_tracks_arr = n_tracks_matrix.toarray().T[0]

# for count in tqdm.tqdm(range(len(test_indexes))):
#     index = test_indexes[count]
#     #print("checking has items", index)
#     if not np.any(rows==index):
#         #print("NO ITEMS")
#         continue
#     #print("done checking items")
#     n_tracks_total = n_tracks_matrix[index,0]
#     n_tracks_train = interaction_matrix[index].sum()
#     #print(time.time(),"good playlist")

#     #find training playlists with n_tracks
#     exact = []
#     at_least = []
    
#     #print("starting calc exact, at_least")
    
    
    
#     for i in np.extract(np.all([is_test==False,num_tracks_arr>=n_tracks_total], axis=0), the_range):
#         #skip test playlists
        
#         # NOTE: they did unique songs here, I'm just doing straight number of songs
#         n_tracks = num_tracks_arr[i]
#         if n_tracks == n_tracks_total:
#             exact.append(i)
#         elif n_tracks > n_tracks_total:
#             at_least.append(i)
#     #print("ending calc exact, at_least")
#     #print(time.time(),"got exact and at least")

#     np.random.shuffle(exact)
#     np.random.shuffle(at_least)
    
#     #print("staring valid")
#     repeat = 0
#     while repeat < 10:
#         #print(repeat)
#         valid_index = None
#         if valid_index is None:
#             while len(exact) > 0:
#                 candidate = exact[0]
#                 del exact[0]

#                 if candidate not in added_indexes:
#                     valid_index = candidate
#                     n_exact += 1
#                     break
#         if valid_index is None:
#             while len(at_least) > 0:
#                 candidate = at_least[0]
#                 del at_least[0]
                
#                 if candidate not in added_indexes:
#                     valid_index = candidate
#                     n_at_least += 1
#                     break
#         if valid_index is None:
#             break
#         added_indexes.add(valid_index)
#         if repeat == 0:
#             valid_indexes.append(valid_index)
#         repeat += 1
        
#         #split row at valid_index
#         row_interactions = find(interaction_matrix[valid_index])
            
#         row_indexes = np.array(range(len(row_interactions[0])))
#         np.random.shuffle(row_indexes)
        
#         train_indexes = set()
#         for i in range(n_tracks_train):
#             train_indexes.add(row_interactions[1][row_indexes[i]])
        
#         for index in row_indexes:
#             col = row_interactions[1][index]
#             val = row_interactions[2][index]
#             if col in train_indexes:
#                 #the randomly sampled tracks from this playlist
#                 #r_train.setValue(valid_index, col, val) #note, this UPDATES the value, as train was a copy of interactions
#                 r_train[valid_index,col] = val
#             else:
#                 #r_valid.setValue(valid_index, col, val)  
#                 r_valid[valid_index,col] = val
#     #print("ending valid")
#     #print(time.time(),"done")
        
# valid_indexes = np.array(valid_indexes)
# valid_indexes.sort()

# valid_cols = np.array(range(num_tracks))
        
# print("Generated Split for",count + 1,"playlists.")
# print("n_exact:", n_exact)
# print("n_at_least:", n_at_least)
# print("valid_indexes:", len(valid_indexes))

## Weighted Regularized Matrix Factorization (WRMF)

In [None]:
model = implicit.als.AlternatingLeastSquares(factors=224, use_gpu=False)
model.fit(r_train.T)

playlist_id=eval_pids[0]
for playlist_id in eval_pids:
    remaining_artists = list(local_artists[city_to_test].keys()).copy()
    print("pid:",playlist_id)
    row = pids.index(playlist_id)
    #print(row)
    #print(local_artists["Nashville"])
    #recs = model.recommend(row, interaction_matrix, N=num_tracks, recalculate_user=True)
    recs = model.recommend(row, interaction_matrix, N=num_tracks, recalculate_user=False)

    interactions = interaction_matrix[row]
    print("IN PLAYLIST")
    for interaction in interactions.nonzero()[1]:
        track = tracks[track_ids[interaction]]
        print(track.track_name, "by",track.artist_name)
        if track.artist_uri in local_artists[city_to_test]:
            print("^^LOCAL")
    print("\nRECOMMENDS")
    count=1
    for rec,score in recs:
        track = tracks[track_ids[rec]]
        if track.artist_uri in remaining_artists:
            if track.artist_uri in correct_ids[playlist_id]:
                escape=Back.GREEN
            else:
                escape=Back.RED
            print(escape,track.track_name, "by",track.artist_name, "score:",score,"AT POSITION:",count)
            remaining_artists.remove(track.artist_uri)
        count+=1
    print(Style.RESET_ALL + "\n\n\n")

 63%|██████▎   | 9.5/15 [14:11<08:08, 88.88s/it]

In [12]:
model = implicit.als.AlternatingLeastSquares(factors=200)
model.fit(interaction_matrix.T)

U = model.user_factors
V = model.item_factors
U.dump("U")
V.dump("V")


  0%|          | 0/15 [00:00<?, ?it/s][A
  3%|▎         | 0.5/15 [00:32<15:42, 65.00s/it][A
  7%|▋         | 1.0/15 [01:24<17:54, 76.77s/it][A
 10%|█         | 1.5/15 [01:55<16:12, 72.05s/it][A
 13%|█▎        | 2.0/15 [02:49<17:58, 82.99s/it][A
 17%|█▋        | 2.5/15 [03:20<15:59, 76.78s/it][A
 20%|██        | 3.0/15 [04:13<17:04, 85.38s/it][A
 23%|██▎       | 3.5/15 [04:44<14:59, 78.23s/it][A
 27%|██▋       | 4.0/15 [05:37<15:54, 86.78s/it][A
 30%|███       | 4.5/15 [06:10<14:03, 80.38s/it][A
 33%|███▎      | 5.0/15 [07:04<14:46, 88.68s/it][A
 37%|███▋      | 5.5/15 [07:35<12:46, 80.73s/it][A
 40%|████      | 6.0/15 [08:26<13:03, 87.05s/it][A
 43%|████▎     | 6.5/15 [08:55<11:06, 78.40s/it][A
 47%|████▋     | 7.0/15 [09:46<11:24, 85.57s/it][A
 50%|█████     | 7.5/15 [10:15<09:38, 77.19s/it][A
 53%|█████▎    | 8.0/15 [11:06<09:52, 84.70s/it][A
 57%|█████▋    | 8.5/15 [11:35<08:17, 76.57s/it][A
 60%|██████    | 9.0/15 [12:26<08:27, 84.51s/it][A
 63%|██████▎   | 9.5/

In [13]:
print(correct_ids)
print(local_artists[city_to_test])

{613393: ['spotify:artist:3AmgGrYHXqgbmZ2yKoIVzO', 'spotify:artist:3AmgGrYHXqgbmZ2yKoIVzO', 'spotify:artist:3AmgGrYHXqgbmZ2yKoIVzO', 'spotify:artist:3AmgGrYHXqgbmZ2yKoIVzO', 'spotify:artist:3AmgGrYHXqgbmZ2yKoIVzO', 'spotify:artist:3AmgGrYHXqgbmZ2yKoIVzO', 'spotify:artist:3AmgGrYHXqgbmZ2yKoIVzO', 'spotify:artist:3AmgGrYHXqgbmZ2yKoIVzO', 'spotify:artist:3AmgGrYHXqgbmZ2yKoIVzO', 'spotify:artist:3AmgGrYHXqgbmZ2yKoIVzO', 'spotify:artist:3AmgGrYHXqgbmZ2yKoIVzO', 'spotify:artist:3AmgGrYHXqgbmZ2yKoIVzO'], 115768: ['spotify:artist:68tKVjVvcqUfKFFLr2j0Ek', 'spotify:artist:68tKVjVvcqUfKFFLr2j0Ek', 'spotify:artist:68tKVjVvcqUfKFFLr2j0Ek', 'spotify:artist:68tKVjVvcqUfKFFLr2j0Ek', 'spotify:artist:68tKVjVvcqUfKFFLr2j0Ek', 'spotify:artist:68tKVjVvcqUfKFFLr2j0Ek', 'spotify:artist:68tKVjVvcqUfKFFLr2j0Ek', 'spotify:artist:68tKVjVvcqUfKFFLr2j0Ek', 'spotify:artist:68tKVjVvcqUfKFFLr2j0Ek', 'spotify:artist:68tKVjVvcqUfKFFLr2j0Ek', 'spotify:artist:68tKVjVvcqUfKFFLr2j0Ek', 'spotify:artist:68tKVjVvcqUfKFFLr2j0E

In [14]:
playlist_id=eval_pids[0]
for playlist_id in eval_pids:
    remaining_artists = list(local_artists[city_to_test].keys()).copy()
    print("pid:",playlist_id)
    row = pids.index(playlist_id)
    #print(row)
    #print(local_artists["Nashville"])
    recs = model.recommend(row, interaction_matrix, N=num_tracks)
    interactions = interaction_matrix[row]
    print("IN PLAYLIST")
    for interaction in interactions.nonzero()[1]:
        track = tracks[track_ids[interaction]]
        print(track.track_name, "by",track.artist_name)
        if track.artist_uri in local_artists[city_to_test]:
            print("^^LOCAL")
    print("\nRECOMMENDS")
    count=1
    for rec,score in recs:
        track = tracks[track_ids[rec]]
        if track.artist_uri in remaining_artists:
            if track.artist_uri in correct_ids[playlist_id]:
                escape=Back.GREEN
            else:
                escape=Back.RED
            print(escape,track.track_name, "by",track.artist_name, "score:",score,"AT POSITION:",count)
            remaining_artists.remove(track.artist_uri)
        count+=1
    print(Style.RESET_ALL + "\n\n\n")


pid: 115768
IN PLAYLIST
V. 3005 by Childish Gambino
I'll Make a Man Out of You - From "Mulan"/Soundtrack by Donny Osmond
Me and Your Mama by Childish Gambino
iSpy (feat. Lil Yachty) by KYLE
Bad and Boujee (feat. Lil Uzi Vert) by Migos
IV. sweatpants by Childish Gambino
Sober by Childish Gambino
All Time Low by Jon Bellion
Redbone by Childish Gambino
Nine In The Afternoon - Radio Mix by Panic! At The Disco
Sweater Weather by The Neighbourhood
Car Radio by Twenty One Pilots
1-800-273-8255 by Logic
Straightjacket by Quinn XCII
Hymn For The Weekend - Seeb Remix by Coldplay
1985 by Bowling For Soup
The Great Escape by Boys Like Girls
Sun Models by ODESZA
Say My Name by ODESZA
Too Good At Goodbyes by Sam Smith
All I Wanted by Paramore
Feeling Sorry by Paramore
Circles by Pierce The Veil
Time (feat. G-Eazy & Olivver the Kid) by Skizzy Mars
F.C.P.R.E.M.I.X. by The Fall of Troy
All Fucked Up by The Amity Affliction
Chasing Ghosts by The Amity Affliction
Say It by The Geek x Vrv
Trouble by Never

[41m Weight by Mikal Cronin score: 0.00128123 AT POSITION: 5113
[41m The Past Should Stay Dead by Emarosa score: 0.000440789 AT POSITION: 13373
[41m Destroy Everything You Touch by Ladytron score: 0.000345173 AT POSITION: 16372
[41m Like I Need You by Kan Wakan score: 0.000320757 AT POSITION: 17353
[41m Amorous by Jesse Boykins III score: 0.000170174 AT POSITION: 28542
[41m I Want To See You Dance by Avid Dancer score: 0.000143259 AT POSITION: 32573
[41m Wasting Time by Orchin score: 4.72896e-05 AT POSITION: 74509
[41m Sleeping In by Phil Good score: 3.98965e-05 AT POSITION: 85096
[41m Dillalude #2 by Robert Glasper score: 3.95154e-05 AT POSITION: 85700
[41m Wicket Youth by Sego score: 3.10785e-05 AT POSITION: 102926
[41m Midnight Whispers by Raquel Rodriguez score: 2.05661e-05 AT POSITION: 140035
[41m Our Love Is Gonna Live Forever by Spain score: 2.03632e-05 AT POSITION: 141004
[41m Emergency House Party by American Steel score: 1.36921e-05 AT POSITION: 185630
[41m Blac

## Get Playlist-to-Playlist and Song-to-Song Rankings

In [None]:
# playlist_to_playlist = np.zeros((num_playlists,num_playlists))
# for i in tqdm.tqdm(range(num_playlists)):
#     #train_cols = r_train[i].nonzero()[1]
#     sim = model.similar_users(i,num_playlists)
#     for col, val in sim:
#         #if not np.any(train_cols==col):
#         #if col not in train_cols:
#         #if np.any(train_cols==col):
#         #if col in train_cols:
#         #    continue
#         if r_train[i,col] == 0:
#             playlist_to_playlist[i,col] = val
# playlist_to_playlist

In [None]:
#song_to_song = np.zeros((num_tracks,num_tracks))
#for i in tqdm.tqdm(range(num_tracks)):
#    train_cols = r_train[i].nonzero()[1]
#    sim = model.similar_items(i,num_tracks)
#    for col, val in sim:
#        if not np.any(train_cols==col):
#            song_to_song[i,col] = val
#song_to_song

In [None]:
#r_train.nonzero()

## Evalutation

## 