<h1>Recommender system challenge PoliMi 2018</h1>

<b>Import dependencies</b>

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import recommender as recommender

<b>See if we upload it correctly</b>

In [23]:
train = pd.read_csv("all/train.csv")
train.head()

Unnamed: 0,playlist_id,track_id
0,0,14301
1,0,8360
2,0,12844
3,0,18397
4,0,1220


In [24]:
playlist_list = list(np.asarray(train['playlist_id']))
track_list = list(np.asarray(train['track_id']))
playlist_list[0:10]

[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

In [25]:
tracks = pd.read_csv("all/tracks.csv")
tracks.head()

Unnamed: 0,track_id,album_id,artist_id,duration_sec
0,0,6306,449,167
1,1,12085,4903,185
2,2,1885,6358,201
3,3,3989,1150,263
4,4,11633,4447,96


In [26]:
test = pd.read_csv("all/target_playlists.csv")
test.head()

Unnamed: 0,playlist_id
0,7
1,25
2,29
3,34
4,50


In [27]:
test_playlist_list = list(np.asarray(test['playlist_id']))
all_track_list = list(np.asarray(train['track_id']))
album_list = list(np.asarray(tracks['album_id']))
artist_list = list(np.asarray(tracks['artist_id']))
duration_list = list(np.asarray(tracks['duration_sec']))



In [28]:
#number of different playlist, tracks, albums, artists
playlist_unique = list(set(playlist_list))
track_unique = list(set(track_list))
album_unique = list(set(album_list))
artist_unique = list(set(artist_list))

num_playlists = len(playlist_list)
num_tracks = len(track_list)
num_albums = len(album_unique)
num_artists = len(artist_unique)

num_tracks

1211791

In [29]:
data = np.ones((num_playlists), dtype=int)

In [30]:
import scipy.sparse as sps
                         
URM_train = sps.coo_matrix((data, (playlist_list, track_list)))

URM_train

<50446x20635 sparse matrix of type '<class 'numpy.int64'>'
	with 1211791 stored elements in COOrdinate format>

In [31]:
URM_train.tocsr()

<50446x20635 sparse matrix of type '<class 'numpy.int64'>'
	with 1211791 stored elements in Compressed Sparse Row format>

In [32]:
train_test_split = 0.80

numInteractions = URM_train.getnnz()
numInteractions

1211791

In [33]:
train_mask = np.random.choice([True,False], numInteractions, p=[train_test_split, 1-train_test_split])
train_mask

array([ True,  True,  True, ...,  True,  True, False])

In [34]:
track_list = np.array(track_list)
playlist_list = np.array(playlist_list)
all_track_list = np.array(all_track_list)
#train_dummies = np.array(train_dummies)

#len(train_dummies)
#len(train_mask)

URM_train = sps.coo_matrix((data[train_mask], (playlist_list[train_mask], track_list[train_mask])))
URM_train = URM_train.tocsr()
URM_train

<50446x20635 sparse matrix of type '<class 'numpy.int64'>'
	with 968777 stored elements in Compressed Sparse Row format>

In [35]:
test_mask = np.logical_not(train_mask)

URM_test = sps.coo_matrix((data[test_mask], (playlist_list[test_mask], track_list[test_mask])))
URM_test = URM_test.tocsr()
URM_test

<50446x20634 sparse matrix of type '<class 'numpy.int64'>'
	with 243014 stored elements in Compressed Sparse Row format>

<h3>Mean Average Precision</h3>

In [36]:
def MAP(recommended_items, relevant_items):
   
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    
    # Cumulative sum: precision at 1, at 2, at 3 ...
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))
    
    map_score = np.sum(p_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return map_score

In [37]:
def evaluate_algorithm_different(URM_test, recommender_object, at=10):
    
    cumulative_MAP = 0.0
    
    num_eval = 0

    for playlist_id in playlist_unique:

        relevant_items = URM_test[playlist_id].indices
        
        if len(relevant_items)>0:
            #recommended_items = recommender_object.recommend(playlist_id, at=at)
            recommended_items = recommender_object.recommend(playlist_id)
            num_eval+=1
            cumulative_MAP += MAP(recommended_items, relevant_items)


    cumulative_MAP /= num_eval
    
    print("Recommender performance is: MAP = {:.4f}".format(cumulative_MAP)) 

In [38]:
def evaluate_algorithm(URM_test, recommender_object, at=10):
    
    cumulative_MAP = 0.0
    
    num_eval = 0

    for playlist_id in playlist_unique:

        relevant_items = URM_test[playlist_id].indices
        
        if len(relevant_items)>0:
            recommended_items = recommender_object.recommend(playlist_id, at=at)
            num_eval+=1
            cumulative_MAP += MAP(recommended_items, relevant_items)


    cumulative_MAP /= num_eval
    
    print("Recommender performance is: MAP = {:.4f}".format(cumulative_MAP)) 

<h2>TopPop Recommender</h2>

In [39]:
class TopPopRecommender(object):

    def fit(self, URM_train):
    
        self.URM_train = URM_train
    
        itemPopularity = (URM_train>0).sum(axis=0)
        itemPopularity = np.array(itemPopularity).squeeze()

        # We are not interested in sorting the popularity value,
        # but to order the items according to it
        self.popularItems = np.argsort(itemPopularity)
        self.popularItems = np.flip(self.popularItems, axis = 0)
    
    
    def recommend(self, playlist_id, at=10, remove_seen=True):
        
        if remove_seen:
            unseen_items_mask = np.in1d(self.popularItems, 
                                        self.URM_train[playlist_id].indices, 
                                        assume_unique = True, 
                                        invert = True)
            
            unseen_items = self.popularItems[unseen_items_mask]

            recommended_items = unseen_items[0:at]
        
        else:
            recommended_items = self.popularItems[0:at]

        return recommended_items

<h3>Fit and test the model</h3>

In [70]:
topPopRecommender = TopPopRecommender()
topPopRecommender.fit(URM_train)

In [41]:
for playlist_id in playlist_unique[0:10]:
    print(topPopRecommender.recommend(playlist_id, at=10))

[ 8956 10848  5606 15578 10496  2674 13980 17239 18266  2272]
[ 8956 10848  5606 15578 10496  2674 13980 17239 18266  2272]
[ 8956 10848  5606 15578 10496  2674 13980 17239 18266  2272]
[ 8956 10848  5606 15578 10496  2674 13980 17239 18266  2272]
[ 8956 10848  5606 15578 10496  2674 13980 17239 18266  2272]
[ 8956 10848  5606 15578 10496  2674 13980 17239 18266  2272]
[ 8956 10848  5606 15578 10496  2674 13980 17239 18266  2272]
[ 8956 10848  5606 15578 10496  2674 13980 17239 18266  2272]
[ 8956 10848  5606 15578 10496  2674 13980 17239 18266  2272]
[ 8956 10848  5606 15578 10496  2674 13980 17239 18266  2272]


In [21]:
evaluate_algorithm(URM_test, topPopRecommender, at=10)

Recommender performance is: MAP = 0.0043


<h2>Works!</h2>

<h3>Code for creation of the submission csv</h3>

In [131]:
test_playlist_list.sort()

In [132]:
submission = pd.DataFrame(columns=["playlist_id","track_ids"])

for playlist_id in test_playlist_list: 
    recommendation = ' '.join(map(str, topPopRecommender.recommend(playlist_id, at=10)))
    row = pd.DataFrame([[playlist_id,recommendation]], columns=["playlist_id","track_ids"])
    submission = submission.append(row)

submission.to_csv("all/sub.csv", index = False)

<h2>Content based similarity</h2>

Build ICM

In [19]:
ones = np.ones((num_tracks_in_tracks), dtype=int)

ICM_all_artist = sps.coo_matrix((ones, (tracks_in_tracks, artist_list)))
ICM_all_artist = ICM_all_artist.tocsr()

ICM_all_artist

<20635x6668 sparse matrix of type '<class 'numpy.int64'>'
	with 20635 stored elements in Compressed Sparse Row format>

In [20]:
ones = np.ones((num_tracks_in_tracks), dtype=int)

ICM_all_album = sps.coo_matrix((ones, (tracks_in_tracks, album_list)))
ICM_all_album = ICM_all_album.tocsr()

ICM_all_album

<20635x12744 sparse matrix of type '<class 'numpy.int64'>'
	with 20635 stored elements in Compressed Sparse Row format>

In [21]:
ones = np.ones((num_tracks_in_tracks), dtype=int)

ICM_all_duration = sps.coo_matrix((ones, (tracks_in_tracks, duration_list)))
ICM_all_duration = ICM_all_duration.tocsr()

ICM_all_duration

<20635x2115 sparse matrix of type '<class 'numpy.int64'>'
	with 20635 stored elements in Compressed Sparse Row format>

In [22]:
class BasicItemKNNRecommender(object):
    """ ItemKNN recommender with cosine similarity and no shrinkage"""

    def __init__(self, URM, k=50, shrinkage=100, similarity='cosine'):
        self.dataset = URM
        self.k = k
        self.shrinkage = shrinkage
        self.similarity_name = similarity
        if similarity == 'cosine':
            self.distance = Cosine(shrinkage=self.shrinkage)
        elif similarity == 'pearson':
            self.distance = Pearson(shrinkage=self.shrinkage)
        elif similarity == 'adj-cosine':
            self.distance = AdjustedCosine(shrinkage=self.shrinkage)
        else:
            raise NotImplementedError('Distance {} not implemented'.format(similarity))

    def __str__(self):
        return "ItemKNN(similarity={},k={},shrinkage={})".format(
            self.similarity_name, self.k, self.shrinkage)

    def fit(self, X):
        item_weights = self.distance.compute(X)
        
        item_weights = check_matrix(item_weights, 'csr') # nearly 10 times faster
        print("Converted to csr")
        
        # for each column, keep only the top-k scored items
        # THIS IS THE SLOW PART, FIND A BETTER SOLUTION        
        values, rows, cols = [], [], []
        nitems = self.dataset.shape[1]
        for i in range(nitems):
            if (i % 10000 == 0):
                print("Item %d of %d" % (i, nitems))
                
            this_item_weights = item_weights[i,:].toarray()[0]
            top_k_idx = np.argsort(this_item_weights) [-self.k:]
                        
            values.extend(this_item_weights[top_k_idx])
            rows.extend(np.arange(nitems)[top_k_idx])
            cols.extend(np.ones(self.k) * i)
        self.W_sparse = sps.csc_matrix((values, (rows, cols)), shape=(nitems, nitems), dtype=np.float32)

    def recommend(self, playlist_id, at=None, exclude_seen=True):
        # compute the scores using the dot product
        user_profile = self.dataset[playlist_id]
        scores = user_profile.dot(self.W_sparse).toarray().ravel()

        # rank items
        ranking = scores.argsort()[::-1]
        if exclude_seen:
            ranking = self._filter_seen(playlist_id, ranking)
            
        return ranking[:at]
    
    def _filter_seen(self, playlist_id, ranking):
        user_profile = self.dataset[playlist_id]
        seen = user_profile.indices
        unseen_mask = np.in1d(ranking, seen, assume_unique=True, invert=True)
        return ranking[unseen_mask]

In [23]:
def check_matrix(X, format='csc', dtype=np.float32):
    if format == 'csc' and not isinstance(X, sps.csc_matrix):
        return X.tocsc().astype(dtype)
    elif format == 'csr' and not isinstance(X, sps.csr_matrix):
        return X.tocsr().astype(dtype)
    elif format == 'coo' and not isinstance(X, sps.coo_matrix):
        return X.tocoo().astype(dtype)
    elif format == 'dok' and not isinstance(X, sps.dok_matrix):
        return X.todok().astype(dtype)
    elif format == 'bsr' and not isinstance(X, sps.bsr_matrix):
        return X.tobsr().astype(dtype)
    elif format == 'dia' and not isinstance(X, sps.dia_matrix):
        return X.todia().astype(dtype)
    elif format == 'lil' and not isinstance(X, sps.lil_matrix):
        return X.tolil().astype(dtype)
    else:
        return X.astype(dtype)

In [24]:
import scipy
class ISimilarity(object):
    """Abstract interface for the similarity metrics"""

    def __init__(self, shrinkage=10):
        self.shrinkage = shrinkage

    def compute(self, X):
        pass


class Cosine(ISimilarity):
    def compute(self, X):
        # convert to csc matrix for faster column-wise operations
        X = check_matrix(X, 'csc', dtype=np.float32)

        # 1) normalize the columns in X
        # compute the column-wise norm
        # NOTE: this is slightly inefficient. We must copy X to compute the column norms.
        # A faster solution is to  normalize the matrix inplace with a Cython function.
        Xsq = X.copy()
        Xsq.data **= 2
        norm = np.sqrt(Xsq.sum(axis=0))
        norm = np.asarray(norm).ravel()
        norm += 1e-6
        # compute the number of non-zeros in each column
        # NOTE: this works only if X is instance of sparse.csc_matrix
        col_nnz = np.diff(X.indptr)
        # then normalize the values in each column
        X.data /= np.repeat(norm, col_nnz)
        print("Normalized")

        # 2) compute the cosine similarity using the dot-product
        dist = X * X.T
        print("Computed")
        
        # zero out diagonal values
        dist = dist - sps.dia_matrix((dist.diagonal()[scipy.newaxis, :], [0]), shape=dist.shape)
        print("Removed diagonal")
        
        # and apply the shrinkage
        if self.shrinkage > 0:
            dist = self.apply_shrinkage(X, dist)
            print("Applied shrinkage")    
        
        return dist

    def apply_shrinkage(self, X, dist):
        # create an "indicator" version of X (i.e. replace values in X with ones)
        X_ind = X.copy()
        X_ind.data = np.ones_like(X_ind.data)
        # compute the co-rated counts
        co_counts = X_ind * X_ind.T
        # remove the diagonal
        co_counts = co_counts - sps.dia_matrix((co_counts.diagonal()[scipy.newaxis, :], [0]), shape=co_counts.shape)
        # compute the shrinkage factor as co_counts_ij / (co_counts_ij + shrinkage)
        # then multiply dist with it
        co_counts_shrink = co_counts.copy()
        co_counts_shrink.data += self.shrinkage
        co_counts.data /= co_counts_shrink.data
        dist.data *= co_counts.data
        return dist


<h3>Test it</h3>

<h3>Artists</h3>

In [26]:
rec = BasicItemKNNRecommender(URM=URM_train, shrinkage=0.0, k=50)
rec.fit(ICM_all_artist)

Normalized
Computed
Removed diagonal
Converted to csr
Item 0 of 20635
Item 10000 of 20635
Item 20000 of 20635


In [27]:
for playlist_id in playlist_unique[0:10]:
    print(rec.recommend(playlist_id, at=10))

[ 6095 19582 10912 19034 18427  2355 14796  5957  1568  2506]
[11605  8907 16515 17519 14741 11861 18466  3896  8727 12388]
[15130 15679 11228 10472 15506 13364  3151  7112 12460  8796]
[ 5218  9348 13503  4379 11327  9740  3292 17348 19475  3321]
[ 9576  3493 10927 13052  2528 13812  3747 12601 10023 11958]
[  243 19620  8804  6879  6874  6875  6876  6877  6878  6886]
[ 4103  7906 13979   102  9550  2385  6954 10945  6725  7803]
[ 4542  9570  4032  3052  1651 11776  1790 15908  4492 15124]
[ 2672 14049 18630 15614  5179  6460 10585  1025 11251 17148]
[ 3523  2230 18769  7031 11277  7646 14730  3200 20513  3659]


In [28]:
evaluate_algorithm(URM_test, rec)

Recommender performance is: MAP = 0.0317


In [29]:
rec_s_artist = BasicItemKNNRecommender(URM=URM_train, shrinkage=10.0, k=50)
rec_s_artist.fit(ICM_all_artist)
evaluate_algorithm(URM_test, rec_s_artist)

Normalized
Computed
Removed diagonal
Applied shrinkage
Converted to csr
Item 0 of 20635
Item 10000 of 20635
Item 20000 of 20635
Recommender performance is: MAP = 0.0317


<h3>Albums</h3>

In [31]:
rec_s_album = BasicItemKNNRecommender(URM=URM_train, shrinkage=10.0, k=50)
rec_s_album.fit(ICM_all_album)
evaluate_algorithm(URM_test, rec_s_album)

Normalized
Computed
Removed diagonal
Applied shrinkage
Converted to csr
Item 0 of 20635
Item 10000 of 20635
Item 20000 of 20635
Recommender performance is: MAP = 0.0499


<h3>Durations</h3>

In [32]:
rec_s_duration = BasicItemKNNRecommender(URM=URM_train, shrinkage=10.0, k=50)
rec_s_duration.fit(ICM_all_duration)
evaluate_algorithm(URM_test, rec_s_duration)

Normalized
Computed
Removed diagonal
Applied shrinkage
Converted to csr
Item 0 of 20635
Item 10000 of 20635
Item 20000 of 20635
Recommender performance is: MAP = 0.0001
