# Baselines - new song recommendation

In [1]:
%matplotlib inline

import os, sys, time, gzip
import pickle as pkl
import numpy as np
from scipy.sparse import lil_matrix, issparse

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# from tools import calc_RPrecision_HitRate
from tools import calc_metrics

In [3]:
TOPs = [5, 10, 20, 30, 50, 100, 200, 300, 500, 1000]

In [4]:
datasets = ['aotm2011', '30music']

In [5]:
dix = 1
dataset_name = datasets[dix]
dataset_name

'30music'

In [6]:
data_dir = 'data/%s/setting1' % dataset_name
Y_trndev = pkl.load(gzip.open(os.path.join(data_dir, 'Y_trndev.pkl.gz'), 'rb'))
X_test = pkl.load(gzip.open(os.path.join(data_dir, 'X_test.pkl.gz'), 'rb'))
Y_test = pkl.load(gzip.open(os.path.join(data_dir, 'Y_test.pkl.gz'), 'rb'))

In [7]:
songs1 = pkl.load(gzip.open(os.path.join(data_dir, 'songs_train_dev_test_s1.pkl.gz'), 'rb'))
train_songs = songs1['train_song_set']
dev_songs = songs1['dev_song_set']
test_songs = songs1['test_song_set']

In [8]:
song2index_trndev = {sid: ix for ix, (sid, _) in enumerate(train_songs + dev_songs)}
song2index_test = {sid: ix for ix, (sid, _) in enumerate(test_songs)}
index2song_test = {ix: sid for ix, (sid, _) in enumerate(test_songs)}

In [9]:
_song2artist = pkl.load(gzip.open('data/msd/song2artist.pkl.gz', 'rb'))
song2artist = {sid: _song2artist[sid] for sid, _ in train_songs + dev_songs + test_songs if sid in _song2artist}

In [10]:
all_playlists = pkl.load(gzip.open(os.path.join(data_dir, 'playlists_s1.pkl.gz'), 'rb'))

In [11]:
artist2pop = dict()
test_songset = set(test_songs)

for pl, _ in all_playlists:
    for sid in [sid for sid in pl if sid not in test_songset]:
        if sid in song2artist:
            aid = song2artist[sid]
            try:
                artist2pop[aid] += 1
            except KeyError:
                artist2pop[aid] = 1

### Popularity (of artist) based recommendation

In [12]:
rps_pop = []
hitrates_pop = {top: [] for top in TOPs}
aucs_pop = []

y_pred = np.zeros(len(test_songs))
for ix in range(len(test_songs)):
    sid = index2song_test[ix]
    if sid in song2artist:
        aid = song2artist[sid]
        if aid in artist2pop:
            y_pred[ix] = artist2pop[aid]

npos = Y_test.sum(axis=0).A.reshape(-1)
assert Y_test.shape[0] == len(test_songs)
for j in range(Y_test.shape[1]):
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()

    if npos[j] < 1:
        continue
        
    y_true = Y_test[:, j].A.reshape(-1)

    # rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_pop.append(rp)
    for top in TOPs:
        hitrates_pop[top].append(hr_dict[top])
    aucs_pop.append(auc)
    
print('\n%d / %d' % (len(rps_pop), Y_test.shape[1]))

17300 / 17342
8215 / 17342


In [13]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_pop, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_pop, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [14]:
pop_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_pop), 
                                    'Hit-Rate': {top: np.mean(hitrates_pop[top]) for top in TOPs},
                                    'AUC': np.mean(aucs_pop),},
                           'Test_All': {'R-Precision': rps_pop,
                                        'Hit-Rate': {top: hitrates_pop[top] for top in TOPs},
                                        'AUC': aucs_pop,},
                          }}
pop_perf[dataset_name]['Test']

{'R-Precision': 0.005203867316768658,
 'Hit-Rate': {5: 0.013328379539578565,
  10: 0.01745343146743025,
  20: 0.04239344314298478,
  30: 0.046325904135701396,
  50: 0.06587742420171959,
  100: 0.12189305939747051,
  200: 0.15982659365799431,
  300: 0.23000178915292693,
  500: 0.3464156702960598,
  1000: 0.508522684424406},
 'AUC': 0.7093575433337442}

In [15]:
fperf_pop = os.path.join(data_dir, 'perf-pop.pkl')
print(fperf_pop)
pkl.dump(pop_perf, open(fperf_pop, 'wb'))
pkl.load(open(fperf_pop, 'rb'))[dataset_name]['Test']

data/30music/setting1/perf-pop.pkl


{'R-Precision': 0.005203867316768658,
 'Hit-Rate': {5: 0.013328379539578565,
  10: 0.01745343146743025,
  20: 0.04239344314298478,
  30: 0.046325904135701396,
  50: 0.06587742420171959,
  100: 0.12189305939747051,
  200: 0.15982659365799431,
  300: 0.23000178915292693,
  500: 0.3464156702960598,
  1000: 0.508522684424406},
 'AUC': 0.7093575433337442}

### Same Artists - Greatest Hits (SAGH)

Recommend according to the popularity of artists in listening history.

In [16]:
rps_sagh = []
hitrates_sagh = {top: [] for top in TOPs}
aucs_sagh = []

npos = Y_test.sum(axis=0).A.reshape(-1)
assert Y_test.shape[0] == len(test_songs)
for j in range(Y_test.shape[1]):
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    if npos[j] < 1:
        continue

    y_true = Y_test[:, j].A.reshape(-1)
    y_pred = np.zeros(y_true.shape)
    
    pl = all_playlists[j][0]
    artists = set([song2artist[sid] for sid in pl if (sid not in test_songset) and (sid in song2artist)])
    assert len(artists) > 0
    
    for ix in range(Y_test.shape[0]):
        sid = index2song_test[ix]
        if sid in song2artist:
            aid = song2artist[sid]
            if aid in artists and aid in artist2pop:
                y_pred[ix] = artist2pop[aid]
    
    # rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_sagh.append(rp)
    for top in TOPs:
        hitrates_sagh[top].append(hr_dict[top])
    aucs_sagh.append(auc)
    
print('\n%d / %d' % (len(rps_sagh), Y_test.shape[1]))

17300 / 17342
8215 / 17342


In [17]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_sagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_sagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [18]:
sagh_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_sagh), 
                                     'Hit-Rate': {top: np.mean(hitrates_sagh[top]) for top in hitrates_sagh},
                                     'AUC': np.mean(aucs_sagh),},
                            'Test_All': {'R-Precision': rps_sagh,
                                        'Hit-Rate': {top: hitrates_sagh[top] for top in TOPs},
                                        'AUC': aucs_sagh,},
                           }}
sagh_perf[dataset_name]['Test']

{'R-Precision': 0.0058906181071328645,
 'Hit-Rate': {5: 0.010393018642633766,
  10: 0.017147285016408928,
  20: 0.026372488740090107,
  30: 0.031067024033064407,
  50: 0.037101747964390676,
  100: 0.04784096744035933,
  200: 0.07480452458588552,
  300: 0.10500147506396343,
  500: 0.14913781225708528,
  1000: 0.2488882325582078},
 'AUC': 0.5158256192835081}

In [19]:
fperf_sagh = os.path.join(data_dir, 'perf-sagh.pkl')
print(fperf_sagh)
pkl.dump(sagh_perf, open(fperf_sagh, 'wb'))
pkl.load(open(fperf_sagh, 'rb'))[dataset_name]['Test']

data/30music/setting1/perf-sagh.pkl


{'R-Precision': 0.0058906181071328645,
 'Hit-Rate': {5: 0.010393018642633766,
  10: 0.017147285016408928,
  20: 0.026372488740090107,
  30: 0.031067024033064407,
  50: 0.037101747964390676,
  100: 0.04784096744035933,
  200: 0.07480452458588552,
  300: 0.10500147506396343,
  500: 0.14913781225708528,
  1000: 0.2488882325582078},
 'AUC': 0.5158256192835081}

### Collocated Artists - Greatest Hits (CAGH)

Compute the similarity of two artist $a_1$ and $a_2$ given a set of playlist $P$:   
$$
\text{sim}(a_1, a_2) 
= \frac{\sum_{p \in P} \delta(a_1, p) \times \delta(a_2, p)}
       {\sqrt{\sum_{p \in P} \delta(a_1, p) \times \sum_{p \in P} \delta(a_2, p)}}
$$
where
$$
\delta(a, p) 
= \begin{cases}
1, \ \text{at least one song in playlist $p$ is from artist $a$}, \\
0, \ \text{otherwise}.
\end{cases}
$$

Recommend according to the popularity of songs, but weighted by similarity of (`artist in user's listening history`, `artist of song`).

In [20]:
all_artist_trndev = sorted(set([song2artist[sid] for pl, _ in all_playlists for sid in pl \
                                if (sid not in test_songset) and (sid in song2artist)]))

In [21]:
artist2index = {aid: ix for ix, aid in enumerate(all_artist_trndev)}

In [22]:
Na = len(all_artist_trndev)
Np = len(all_playlists)
Delta = lil_matrix((Na, Np), dtype=np.float)
for j in range(Np):
    pl_artist = sorted(set([song2artist[sid] for sid in all_playlists[j][0] \
                            if (sid not in test_songset) and (sid in song2artist)]))
    ix = [artist2index[aid] for aid in pl_artist]
    Delta[ix, j] = 1

In [23]:
Delta = Delta.tocsr()
Dsum = Delta.sum(axis=1).A.reshape(-1)
ColloMat = Delta.dot(Delta.T).A

assert np.all(np.isclose(ColloMat.diagonal(), Dsum))

In [24]:
print(len(Dsum), len(all_artist_trndev))

9981 9981


In [25]:
#type(ColloMat)

In [26]:
T1 = 1. / np.sqrt(Dsum)
NormMat = np.dot(T1.reshape(Na, 1), T1.reshape(1, Na))

WeightMat = np.multiply(ColloMat, NormMat)

In [27]:
rps_cagh = []
hitrates_cagh = {top: [] for top in TOPs}
aucs_cagh = []

npos = Y_test.sum(axis=0).A.reshape(-1)
assert Y_test.shape[0] == len(test_songs)
for j in range(Y_test.shape[1]):
    if (j+1) % 10 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    
    if npos[j] < 1:
        continue
    
    y_true = Y_test[:, j].A.reshape(-1)
    y_pred = np.zeros(y_true.shape)
    
    pl = all_playlists[j][0]
    artists = set([song2artist[sid] for sid in pl if (sid not in test_songset) and (sid in song2artist)])
    assert len(artists) > 0
    artists_ix = [artist2index[aid] for aid in artists]
    
    for ix in range(Y_test.shape[0]):
        sid = index2song_test[ix]
        if sid in song2artist:
            aid = song2artist[sid]
            if aid in artist2pop:
                aix = artist2index[aid]
                y_pred[ix] = artist2pop[aid] * WeightMat[aix, artists_ix].sum()

    # rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_cagh.append(rp)
    for top in TOPs:
        hitrates_cagh[top].append(hr_dict[top])
    aucs_cagh.append(auc)

print('\n%d / %d' % (len(rps_cagh), Y_test.shape[1]))

17340 / 17342
8215 / 17342


In [28]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_cagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_cagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [29]:
cagh_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_cagh), 
                                     'Hit-Rate': {top: np.mean(hitrates_cagh[top]) for top in hitrates_cagh},
                                     'AUC': np.mean(aucs_cagh),},
                            'Test_All': {'R-Precision': rps_cagh,
                                        'Hit-Rate': {top: hitrates_cagh[top] for top in TOPs},
                                        'AUC': aucs_cagh,},
                           }}
cagh_perf[dataset_name]['Test']

{'R-Precision': 0.003592450613215128,
 'Hit-Rate': {5: 0.009558269939656213,
  10: 0.018799413692811708,
  20: 0.03456087534321208,
  30: 0.04787400312338364,
  50: 0.06889382854555288,
  100: 0.10867505070973285,
  200: 0.17632961379743992,
  300: 0.23260910790089728,
  500: 0.3311485681208424,
  1000: 0.49273638472759407},
 'AUC': 0.6915890641053042}

In [30]:
fperf_cagh = os.path.join(data_dir, 'perf-cagh.pkl')
print(fperf_cagh)
pkl.dump(cagh_perf, open(fperf_cagh, 'wb'))
pkl.load(open(fperf_cagh, 'rb'))[dataset_name]['Test']

data/30music/setting1/perf-cagh.pkl


{'R-Precision': 0.003592450613215128,
 'Hit-Rate': {5: 0.009558269939656213,
  10: 0.018799413692811708,
  20: 0.03456087534321208,
  30: 0.04787400312338364,
  50: 0.06889382854555288,
  100: 0.10867505070973285,
  200: 0.17632961379743992,
  300: 0.23260910790089728,
  500: 0.3311485681208424,
  1000: 0.49273638472759407},
 'AUC': 0.6915890641053042}