In [1]:
from implicit.nearest_neighbours import bm25_weight
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix
import pandas as pd
import spotipy
import sys
from spotipy.oauth2 import SpotifyClientCredentials
import numpy as np
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

Contains all functionality related to building and running the recommender system (RS) model.

# Model Creation

In [2]:
def calculate_indptr(users):
    """
    Creates the indptr (CSR row pointer) array for the set of users.

    Every unique user, taken in order of first appearance, contributes one
    entry: the running total of rows counted up to and including that user.
    The result is only a valid row pointer for the data set if all rows of a
    user are stored contiguously and the users appear in that same order.

    Input:
        users; Pandas Series, the user associated to each row in the main data set
               (assumed to contain no missing values).
    Output:
        indptr; list of int, starts with 0 and has one further entry per unique user.
    """
    # Count the rows of every user in a single pass, keeping first-appearance
    # order, instead of rescanning the whole series once per unique user.
    rows_per_user = users.groupby(users.values, sort=False).size()

    # The running total of those counts gives the end position of each user's rows.
    return [0] + rows_per_user.cumsum().tolist()
#indptr = calculate_indptr(users)

In [3]:
def create_model(data, factors = 64, regularization = 0.0, bm25 = True, iterations = 15, alpha = 1.0):
    """
    Builds the user-item play-count matrix from the data and fits an ALS model to it.

    Input:
        data; Pandas dataframe with the columns 'userID', 'trackID' and 'count'.
        factors; int default = 64, passed to AlternatingLeastSquares.
        regularization; float default = 0.0, passed to AlternatingLeastSquares.
        bm25; bool default = True, whether the matrix is BM25 weighted before fitting.
        iterations; int default = 15, passed to AlternatingLeastSquares.
        alpha; float default = 1.0, factor the matrix is multiplied by before fitting.
    Output:
        model; the fitted AlternatingLeastSquares model.
        user_plays; CSR matrix, the (optionally weighted) user-item matrix before scaling by alpha.
    """
    # One row per unique user (in order of first appearance), columns indexed by trackID.
    # NOTE(review): a row number only equals the userID if the data is grouped by user and
    # the IDs run 0..n-1 in that order - confirm against how the data set is built.
    indptr = calculate_indptr(data['userID'])
    user_items = csr_matrix((data['count'], data['trackID'], indptr))

    # Weight the matrix, both to reduce the impact of users that have played the same item
    # thousands of times and to reduce the weight given to popular items.
    weighted_plays = bm25_weight(user_items, K1=100, B=0.8) if bm25 else user_items

    # The matrix is built as (user, item) already, so it is not transposed;
    # it is only converted to CSR format.
    user_plays = weighted_plays.tocsr()

    model = AlternatingLeastSquares(factors = factors,
                                    regularization = regularization,
                                    iterations = iterations)

    # alpha is applied by scaling the matrix instead of being handed to the model.
    model.fit(alpha * user_plays)
    return(model, user_plays)

# Model Testing

In [4]:
def track_exploratoryness(user_history):
    """
    Calculates the track exploratoryness from a user's history.

    The play counts are sorted from most to least played, each one is divided
    by its rank (1, 2, 3, ...), and the sum of those values is taken as a
    share of the total number of plays. The result is 1 minus that share.

    Input:
        user_history: dataframe, contains all of the user's history ('count' column is used)
    Output:
        track exploratoryness; float. The number of unique tracks can be used
        instead by switching to the commented return line below.
    """
    play_counts = user_history['count']
    ranks = range(1, len(play_counts) + 1)

    # Most played track first, each count discounted by its rank.
    rank_discounted = play_counts.sort_values(ascending = False) / ranks
    discounted_share = (1 / play_counts.sum()) * np.sum(rank_discounted)

    return 1 - discounted_share
    # Alternative measure: return(len(user_history['trackID'].unique()))
    
    
def artist_exploratoryness(user_history):
    """
    Calculates the artist exploratoryness from a user's history.

    As written this is the number of distinct values in the 'artist-MBID'
    column (a missing value counts as a value of its own).

    Input:
        user_history: dataframe, contains all of the user's history
    Output:
        int, the number of unique artists. A rank-discounted measure can be
        used instead by switching to the commented return line below.
    """
    # Alternative measure:
    # return(1- ((1/user_history['count'].sum()) * (np.sum(user_history.groupby('artist-MBID').sum('count')['count'].sort_values(ascending = False) / range(1, len(user_history['artist-MBID'].unique())+1)))))
    artist_column = user_history['artist-MBID']
    return artist_column.nunique(dropna = False)

def calculate_exploratoryness(user_track_exploration, user_artist_exploration, sample_ids, user_histories):
    """
    Appends the current track and artist exploratoryness of every sample user
    to that user's list in the two result dictionaries.

    Input
        user_track_exploration; dictionary {userID: list of exploratoryness}, the track exploratoryness of each user
        user_artist_exploration; dictionary {userID: list of exploratoryness}, the artist exploratoryness of each user
        sample_ids; list of int, the list of sample users for the test
        user_histories; dictionary {userID: dataframe of user history}, the user history for each user
    Output
        (user_track_exploration, user_artist_exploration); the same two
        dictionaries, updated in place.
    """
    for uid in sample_ids:
        listening = user_histories[uid]

        track_log = user_track_exploration[uid]
        track_log.append(track_exploratoryness(listening))

        artist_log = user_artist_exploration[uid]
        artist_log.append(artist_exploratoryness(listening))

    return (user_track_exploration, user_artist_exploration)

In [5]:
def get_decay_predictions(model, userid, user_history, user_decay, n_decay, filter_already_liked_items, n_recommendations = 10):
    """
    Obtains predictions when the decay model is active.

    Tracks that currently have a decay value are skipped; every track that is
    recommended gets a fresh decay value of n_decay.

    Input
        model; the RS model implemented (its recommend method is called)
        userid; int, the user's id
        user_history; Dataframe, the user's history (handed to model.recommend unchanged)
        user_decay; Dictionary{trackID: decay_value}, the decay values for each of the recommended tracks for that user
        n_decay; int, the length of decay set
        filter_already_liked_items; bool, accepted for symmetry with the non-decay path but
            not forwarded: model.recommend is always called with filter_already_liked_items=False.
        n_recommendations; int default = 10, the number of tracks to return

    Output
        user_decay; Dictionary{trackID: decay_value}, the same dictionary, updated in place
        valid_track_ids; list, the tracks recommended to the user. Holds fewer than
            n_recommendations entries if the model returns too few candidates.
    """
    # Ask for enough candidates that n_recommendations remain even if every
    # track that currently has a decay value is among them.
    track_ids, _ = model.recommend(userid, user_history,
                                   N = len(user_decay) + n_recommendations,
                                   filter_already_liked_items = False)

    valid_track_ids = []
    # Walk the candidates in ranked order; iterating over them (rather than indexing
    # until enough are found) cannot run past the end of a short candidate list.
    for track_id in track_ids:
        if len(valid_track_ids) >= n_recommendations:
            break
        if track_id not in user_decay:
            valid_track_ids.append(track_id)
            # Start the decay for the newly recommended track.
            user_decay[track_id] = n_decay

    return(user_decay, valid_track_ids)

In [6]:
def update_decay(user_decay):
    """
    Updates the decay by subtracting 1 from each value and removing tracks with no decay left.

    Input
        user_decay; dict{track_name: decay value}, each track's decay
    Output
        user_decay; dict{track_name: decay value}, the same dictionary, updated in place
    """
    # Iterate over a snapshot of the keys so entries can be deleted inside the loop.
    for track in list(user_decay):
        user_decay[track] -= 1
        # "<= 0" rather than "== 0": an entry that started at zero or below
        # (e.g. n_decay = 0) would otherwise never be removed.
        if user_decay[track] <= 0:
            del user_decay[track]
    return(user_decay)

In [7]:
def add_sim_songs(user_history, n_sim_songs):
    """
    Increments the play count of n_sim_songs randomly chosen songs in the user's history.

    Input:
        user_history; Dataframe, the user's listening history ('trackID' and 'count' columns are used)
        n_sim_songs; int, the number of songs to pick. If the history has fewer
                     rows than this, every row is picked.

    Output:
        user_history; Dataframe, the same dataframe, updated in place
    """
    # sample() without replacement raises if asked for more rows than exist.
    n_sampled = min(n_sim_songs, len(user_history))

    # Take a random sample of tracks from the user's listening history.
    sim_listening = user_history['trackID'].sample(n_sampled)
    mask = user_history['trackID'].isin(sim_listening)

    # Single .loc assignment instead of chained indexing, so the update is
    # guaranteed to reach user_history itself.
    user_history.loc[mask, 'count'] += 1
    return(user_history)

In [8]:
def add_predictions(track_ids, unique_tracks, user_history, userid):
    """
    Using the track ids, extracts the relevant information about each recommendation
    and updates the user's history.

    A recommendation whose recordingMBID is already in the history has the count of
    the matching trackID rows incremented; any other recommendation is appended as
    a new row with a count of 1.

    Inputs:
        track_ids; list of int
        unique_tracks; Dataframe indexed by trackID, contains all metadata about each unique track in the total dataset.
        user_history; Dataframe, the user's listening history. New rows are written
                      positionally, so it must have exactly eight columns in the order
                      artist-MBID, count, <third column>, recordingMBID, trackName, userID, trackID, artistName.
        userid; int
    Output;
        user_history; dataframe, the same dataframe, updated in place
        prediction_data; dataframe, metadata about each prediction plus an 'already_liked' column
    """
    prediction_data = pd.DataFrame(unique_tracks.loc[track_ids])
    # Decided once, against the history as it was before any of these predictions were added.
    prediction_data['already_liked'] = prediction_data['recordingMBID'].isin(user_history['recordingMBID'])

    # Update their user history
    for track, row in prediction_data.iterrows():
        if row['already_liked']:
            # Single .loc assignment instead of chained indexing.
            user_history.loc[user_history['trackID'] == track, 'count'] += 1
        else:
            # The history may carry index labels from a larger frame, so the label
            # "number of rows" can already be taken; writing to it would overwrite
            # that row instead of appending. Move on to the next free label.
            new_label = user_history.shape[0]
            while new_label in user_history.index:
                new_label += 1
            user_history.loc[new_label] = [row['artist-MBID'], 1, "unkown", row['recordingMBID'], row['trackName'], userid, track, row['artistName']]
    return (user_history, prediction_data)

In [9]:
def simulate_model(data, n_simulations, sample_ids,
                   n_recommendations = 10,
                   decay = False, 
                   n_decay = 5,
                   sim_rand_listen = False,
                   n_sim_songs = 10, 
                   filter_already_liked_items = False):
    
    """
    Simulates multiple users using the recommendation system for a given number of iterations.

    In every iteration each sample user receives a set of recommendations, the
    recommendations are added to that user's history, and the model is updated
    for that user only.
    
    Input:
        data; Pandas dataframe, the whole dataset including MBID's, Names and the test data.
        n_simulations; int, the number of sets of recommendations to generate
        sample_ids; list of int, the user ids tested.
        n_recommendations; int default = 10, the number of recommendations within each iteration
                           (only used when decay is off; the decay path does not receive it).
        decay; Bool default = False, Whether to implement a recommendation decay
        n_decay; Int default = 5, the length of decay if implemented. (set to n_simulations for inf decay)
        sim_rand_listen; Bool default = False, whether to add random listens to a user's history
        n_sim_songs; int default = 10, the number of random songs to add to a user's history
        filter_already_liked_items; Bool default = False, whether the recommendation system should only produce new tracks.
    
    Output:
        user_track_exploration; Dictionary {userid:[exploration]}, each user's list of track exploratoryness
        user_artist_exploration; Dictionary {userid:[unique artist]}, each user's number of unique artists
        user_predictions; Dictionary {userid:[prediction]}, each user's predictions
        model; ALS Model, the modified model.

    The exploratoryness values are recorded in the iterations n = 0, 5, 10, ...

    NOTE(review): user_history (a dataframe) is handed to model.recommend as its second
    argument - confirm the installed implicit version accepts that when
    filter_already_liked_items is True.
    """
    
    # Take a copy of the data to prevent changing the test dataset.
    test_data = data.copy()
    # Extract the data required for the model.
    model_data = test_data[['userID','trackID','count']]
    
    
    user_track_exploration = {idd: [] for idd in sample_ids}
    user_artist_exploration = {idd: [] for idd in sample_ids}
    user_predictions = {idd: [] for idd in sample_ids}
    user_prediction_decay = {idd: {} for idd in sample_ids}
    
    # Creates the model; the fitted matrix it also returns is not needed here.
    model, _ = create_model(model_data, bm25 = True)
    
    # Extracts each of the user's histories into a dictionary. The explicit copy makes
    # every history an independent frame, since the helpers below modify it in place.
    user_histories = {user_id : test_data[test_data['userID'] == user_id].copy() for user_id in sample_ids}

    # Obtains each of the unique tracks available to users within the simulation.
    unique_tracks = test_data.drop_duplicates(subset = ["trackID"])
    unique_tracks = unique_tracks[['trackID','recordingMBID', 'trackName', 'artist-MBID', 'artistName']].set_index('trackID')
    
    
    for n in tqdm(range(n_simulations)):

        for userid in sample_ids:
            
            # Obtain just that user's history
            user_history = user_histories[userid]
            
            if decay:
                
                user_decay, track_ids = get_decay_predictions(model, userid, user_history, user_prediction_decay[userid], n_decay, filter_already_liked_items)
                
            else:
                
                track_ids, _ = model.recommend(userid, user_history, n_recommendations, filter_already_liked_items)
            
            if sim_rand_listen:
                user_history = add_sim_songs(user_history, n_sim_songs)
              
            # Obtains the predictions for that user and adds to the user's history
            user_history, prediction_data = add_predictions(track_ids, unique_tracks, user_history, userid)
            
            # Updates the model: the whole history becomes a single CSR row
            # (columns = trackID, values = count).
            predictions = user_history['trackID'].values
            indptr = [0,len(predictions)]

            user_items = csr_matrix((user_history['count'].values, predictions, indptr))
            
            # Only updates the model for that user.
            model.partial_fit_users([userid], user_items)
            
            user_predictions[userid].append(prediction_data)
            
            user_histories[userid] = user_history
            if decay:
                user_prediction_decay[userid] = update_decay(user_decay)
                    
        # Records the data in every 5th round, starting with the first one (n = 0).
        if n%5 == 0:
            user_track_exploration, user_artist_exploration = calculate_exploratoryness(user_track_exploration, user_artist_exploration, sample_ids, user_histories)
            

    return(user_track_exploration, user_artist_exploration, user_predictions, model)    

In [10]:
#uk_data = pd.read_pickle('../../cleaned_data/uk_data.pkl')

In [11]:
#sample_ids = np.random.choice(uk_data['userID'].unique(),20)
#uk_user_track_exploration, uk_user_artist_exploration, uk_user_predictions, uk_model = simulate_model(uk_data, 100, sample_ids, sim_rand_listen = True, decay = False)