# Baselines - new song recommendation

In [1]:
%matplotlib inline

import os, sys, time, gzip
import pickle as pkl
import numpy as np
from scipy.sparse import lil_matrix, issparse
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# from tools import calc_RPrecision_HitRate
from tools import calc_metrics, pairwise_distance_hamming

In [3]:
TOPs = [5, 10, 20, 30, 50, 100, 200, 300, 500, 1000]

In [4]:
datasets = ['aotm2011', '30music']

In [5]:
dix = 0
dataset_name = datasets[dix]
dataset_name

'aotm2011'

In [6]:
data_dir = 'data/%s/coldstart/setting1' % dataset_name
Y_trndev = pkl.load(gzip.open(os.path.join(data_dir, 'Y_trndev.pkl.gz'), 'rb'))
X_test = pkl.load(gzip.open(os.path.join(data_dir, 'X_test.pkl.gz'), 'rb'))
Y_test = pkl.load(gzip.open(os.path.join(data_dir, 'Y_test.pkl.gz'), 'rb'))

In [7]:
songs1 = pkl.load(gzip.open(os.path.join(data_dir, 'songs_train_dev_test_s1.pkl.gz'), 'rb'))
train_songs = songs1['train_song_set']
dev_songs = songs1['dev_song_set']
test_songs = songs1['test_song_set']

In [8]:
song2index_trndev = {sid: ix for ix, (sid, _) in enumerate(train_songs + dev_songs)}
song2index_test = {sid: ix for ix, (sid, _) in enumerate(test_songs)}
index2song_test = {ix: sid for ix, (sid, _) in enumerate(test_songs)}

In [9]:
_song2artist = pkl.load(gzip.open('data/msd/song2artist.pkl.gz', 'rb'))
song2artist = {sid: _song2artist[sid] for sid, _ in train_songs + dev_songs + test_songs if sid in _song2artist}

In [10]:
all_playlists = pkl.load(gzip.open(os.path.join(data_dir, 'playlists_s1.pkl.gz'), 'rb'))

In [11]:
artist2pop = dict()
test_songset = set(test_songs)

for pl, _ in all_playlists:
    for sid in [sid for sid in pl if sid not in test_songset]:
        if sid in song2artist:
            aid = song2artist[sid]
            try:
                artist2pop[aid] += 1
            except KeyError:
                artist2pop[aid] = 1

In [12]:
cliques_all = pkl.load(gzip.open(os.path.join(data_dir, 'cliques_trndev.pkl.gz'), 'rb'))

In [13]:
U = len(cliques_all)
pl2u = np.zeros(Y_test.shape[1], dtype=np.int32)
for u in range(U):
    clq = cliques_all[u]
    pl2u[clq] = u

In [14]:
song2pop = pkl.load(gzip.open(os.path.join(data_dir, 'song2pop.pkl.gz'), 'rb'))

In [15]:
X_test.shape

(10000, 63)

In [16]:
Y_test.shape

(10000, 84646)

In [17]:
type(Y_test)

scipy.sparse.csc.csc_matrix

In [18]:
Y_test_csr = Y_test.tocsr()

Note that `p XOR q = ( p AND NOT q )  OR  ( NOT p AND q )` from [here](https://math.stackexchange.com/questions/38473/is-xor-a-combination-of-and-and-not-operators),
let $\mathbf{p}, \mathbf{q} \in \{0, 1\}^{n}$, then
  
$
\begin{aligned}
& \text{Hamming_distance}(\mathbf{p}, \mathbf{q})  \\
& = \frac{1}{n} \sum_{i=1}^n p_i \ \text{XOR} \ q_i \\
& = \frac{1}{n} \sum_{i=1}^n \left( p_i (1 - q_i) + (1 - p_i) q_i \right) \\
& = \frac{1}{n} \left( \sum_{i=1}^n p_i (1 - q_i) + \sum_{i=1}^n (1 - p_i) q_i \right) \\
& = \frac{1}{n} \left( \mathbf{p}^\top (\mathbf{1} - \mathbf{q}) + (\mathbf{1} - \mathbf{p})^\top \mathbf{q} \right) \\
& = \frac{1}{n} \left( \text{sum}(\mathbf{p}) + \text{sum}(\mathbf{q}) - 2 \mathbf{p}^\top \mathbf{q} \right)
\end{aligned}
$

In [19]:
N, D = 1000, 200
aa = np.zeros(N * D, dtype=np.int)
idx = np.random.permutation(N * D)[:int(N * D * .3)]
aa[idx] = 1
aa = aa.reshape(N, D)
d1 = pairwise_distances(aa, metric='hamming', n_jobs=2)
d2 = (np.dot(aa, 1-aa.T) + np.dot(1-aa, aa.T)) / D
sum_vec = aa.sum(axis=1, keepdims=True)
d3 = (sum_vec + sum_vec.T - 2 * np.dot(aa, aa.T)) / D
diff = (d1 - d2).ravel()
print(np.dot(diff, diff))
diff2 = (d1 - d3).ravel()
print(np.dot(diff2, diff2))

4.990901080203748e-28
4.990901080203748e-28


In [21]:
aa = Y_test_csr[500, :]
d1 = pairwise_distances(aa.A, metric='hamming', n_jobs=2)
d2 = pairwise_distance_hamming(aa)
diff = (d1 - d2).ravel()
print(np.sqrt(np.dot(diff, diff)))

[[0.]]


### Popularity (of artist) based recommendation

In [23]:
rps_pop = []
hitrates_pop = {top: [] for top in TOPs}
aucs_pop = []
novelty_pop = dict()
diversities_pop = []

y_pred = np.zeros(len(test_songs))
for ix in range(len(test_songs)):
    sid = index2song_test[ix]
    if sid in song2artist:
        aid = song2artist[sid]
        if aid in artist2pop:
            y_pred[ix] = artist2pop[aid]

npos = Y_test.sum(axis=0).A.reshape(-1)
assert Y_test.shape[0] == len(test_songs)
for j in range(Y_test.shape[1]):
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()

    if npos[j] < 1:
        continue
        
    y_true = Y_test[:, j].A.reshape(-1)

    # rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_pop.append(rp)
    for top in TOPs:
        hitrates_pop[top].append(hr_dict[top])
    aucs_pop.append(auc)
    
    # compute novelty@100
    u = pl2u[j]
    sortix = np.argsort(-y_pred)
    nov = np.mean([-np.log2(song2pop[index2song_test[ix]]) for ix in sortix[:100]])
    try:
        novelty_pop[u].append(nov)
    except KeyError:
        novelty_pop[u] = [nov]

    # compute diversity@100
    # csd = 1. / cosine_similarity(X_test[sortix[:100], :])
    # dist = pairwise_distances(Y_test_csr[sortix[:100], :].A, metric='hamming', n_jobs=4)
    dist = pairwise_distance_hamming(Y_test_csr[sortix[:100], :])
    diversities_pop.append((dist.sum() - np.trace(dist)) / (100 * 99))
    
print('\n%d / %d' % (len(rps_pop), Y_test.shape[1]))

84600 / 84646
19504 / 84646


In [24]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_pop, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_pop, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [25]:
pop_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_pop), 
                                    'Hit-Rate': {top: np.mean(hitrates_pop[top]) for top in TOPs},
                                    'AUC': np.mean(aucs_pop),
                                    'Novelty': np.mean([np.mean(novelty_pop[u]) for u in novelty_pop]),
                                    'Diveristy': np.mean(diversities_pop)},
                           'Test_All': {'R-Precision': rps_pop,
                                        'Hit-Rate': {top: hitrates_pop[top] for top in TOPs},
                                        'AUC': aucs_pop,
                                        'Novelty': novelty_pop,
                                        'Diversity': diversities_pop},
                          }}
pop_perf[dataset_name]['Test']

{'R-Precision': 0.00045846185932834824,
 'Hit-Rate': {5: 0.0019463027743380492,
  10: 0.008978570066680198,
  20: 0.026329491607767857,
  30: 0.034485467064040695,
  50: 0.041645387039762755,
  100: 0.0869623720184838,
  200: 0.16780507689895133,
  300: 0.21846529423276878,
  500: 0.2880161152209812,
  1000: 0.44232044414107435},
 'AUC': 0.7651276089946246,
 'Novelty': -2.3080483444860573,
 'Diveristy': 0.0005110290581547344}

In [29]:
fperf_pop = os.path.join(data_dir, 'perf-pop.pkl')
print(fperf_pop)
pkl.dump(pop_perf, open(fperf_pop, 'wb'))
pkl.load(open(fperf_pop, 'rb'))[dataset_name]['Test']

data/aotm2011/coldstart/setting1/perf-pop.pkl


{'R-Precision': 0.00045846185932834824,
 'Hit-Rate': {5: 0.0019463027743380492,
  10: 0.008978570066680198,
  20: 0.026329491607767857,
  30: 0.034485467064040695,
  50: 0.041645387039762755,
  100: 0.0869623720184838,
  200: 0.16780507689895133,
  300: 0.21846529423276878,
  500: 0.2880161152209812,
  1000: 0.44232044414107435},
 'AUC': 0.7651276089946246,
 'Novelty': -2.3080483444860573,
 'Diveristy': 0.0005110290581547344}

### Same Artists - Greatest Hits (SAGH)

Recommend according to the popularity of artists in listening history.

In [None]:
rps_sagh = []
hitrates_sagh = {top: [] for top in TOPs}
aucs_sagh = []
novelty_sagh = dict()
diversities_sagh = []

npos = Y_test.sum(axis=0).A.reshape(-1)
assert Y_test.shape[0] == len(test_songs)
for j in range(Y_test.shape[1]):
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    if npos[j] < 1:
        continue

    y_true = Y_test[:, j].A.reshape(-1)
    y_pred = np.zeros(y_true.shape)
    
    pl = all_playlists[j][0]
    artists = set([song2artist[sid] for sid in pl if (sid not in test_songset) and (sid in song2artist)])
    assert len(artists) > 0
    
    for ix in range(Y_test.shape[0]):
        sid = index2song_test[ix]
        if sid in song2artist:
            aid = song2artist[sid]
            if aid in artists and aid in artist2pop:
                y_pred[ix] = artist2pop[aid]
    
    # rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_sagh.append(rp)
    for top in TOPs:
        hitrates_sagh[top].append(hr_dict[top])
    aucs_sagh.append(auc)
    
    # compute novelty@100
    u = pl2u[j]
    sortix = np.argsort(-y_pred)
    nov = np.mean([-np.log2(song2pop[index2song_test[ix]]) for ix in sortix[:100]])
    try:
        novelty_sagh[u].append(nov)
    except KeyError:
        novelty_sagh[u] = [nov]

    # compute diversity@100
    csd = 1. / cosine_similarity(X_test[sortix[:100], :])
    diversities_sagh.append((csd.sum() - np.trace(csd)) / (100 * 99))
    
print('\n%d / %d' % (len(rps_sagh), Y_test.shape[1]))

In [None]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_sagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_sagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [None]:
sagh_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_sagh), 
                                     'Hit-Rate': {top: np.mean(hitrates_sagh[top]) for top in hitrates_sagh},
                                     'AUC': np.mean(aucs_sagh),
                                     'Novelty': np.mean([np.mean(novelty_sagh[u]) for u in novelty_sagh]),
                                     'Diveristy': np.mean(diversities_sagh)},
                            'Test_All': {'R-Precision': rps_sagh,
                                        'Hit-Rate': {top: hitrates_sagh[top] for top in TOPs},
                                        'AUC': aucs_sagh,
                                        'Novelty': novelty_sagh,
                                        'Diversity': diversities_sagh},
                           }}
sagh_perf[dataset_name]['Test']

In [None]:
fperf_sagh = os.path.join(data_dir, 'perf-sagh.pkl')
print(fperf_sagh)
pkl.dump(sagh_perf, open(fperf_sagh, 'wb'))
pkl.load(open(fperf_sagh, 'rb'))[dataset_name]['Test']

### Collocated Artists - Greatest Hits (CAGH)

Compute the similarity of two artist $a_1$ and $a_2$ given a set of playlist $P$:   
$$
\text{sim}(a_1, a_2) 
= \frac{\sum_{p \in P} \delta(a_1, p) \times \delta(a_2, p)}
       {\sqrt{\sum_{p \in P} \delta(a_1, p) \times \sum_{p \in P} \delta(a_2, p)}}
$$
where
$$
\delta(a, p) 
= \begin{cases}
1, \ \text{at least one song in playlist $p$ is from artist $a$}, \\
0, \ \text{otherwise}.
\end{cases}
$$

Recommend according to the popularity of songs, but weighted by similarity of (`artist in user's listening history`, `artist of song`).

In [None]:
all_artist_trndev = sorted(set([song2artist[sid] for pl, _ in all_playlists for sid in pl \
                                if (sid not in test_songset) and (sid in song2artist)]))

In [None]:
artist2index = {aid: ix for ix, aid in enumerate(all_artist_trndev)}

In [None]:
Na = len(all_artist_trndev)
Np = len(all_playlists)
Delta = lil_matrix((Na, Np), dtype=np.float)
for j in range(Np):
    pl_artist = sorted(set([song2artist[sid] for sid in all_playlists[j][0] \
                            if (sid not in test_songset) and (sid in song2artist)]))
    ix = [artist2index[aid] for aid in pl_artist]
    Delta[ix, j] = 1

In [None]:
Delta = Delta.tocsr()
Dsum = Delta.sum(axis=1).A.reshape(-1)
ColloMat = Delta.dot(Delta.T).A

assert np.all(np.isclose(ColloMat.diagonal(), Dsum))

In [None]:
print(len(Dsum), len(all_artist_trndev))

In [None]:
#type(ColloMat)

In [None]:
T1 = 1. / np.sqrt(Dsum)
NormMat = np.dot(T1.reshape(Na, 1), T1.reshape(1, Na))

WeightMat = np.multiply(ColloMat, NormMat)

In [None]:
rps_cagh = []
hitrates_cagh = {top: [] for top in TOPs}
aucs_cagh = []
novelty_cagh = dict()
diversities_cagh = []

npos = Y_test.sum(axis=0).A.reshape(-1)
assert Y_test.shape[0] == len(test_songs)
for j in range(Y_test.shape[1]):
    if (j+1) % 10 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    
    if npos[j] < 1:
        continue
    
    y_true = Y_test[:, j].A.reshape(-1)
    y_pred = np.zeros(y_true.shape)
    
    pl = all_playlists[j][0]
    artists = set([song2artist[sid] for sid in pl if (sid not in test_songset) and (sid in song2artist)])
    assert len(artists) > 0
    artists_ix = [artist2index[aid] for aid in artists]
    
    for ix in range(Y_test.shape[0]):
        sid = index2song_test[ix]
        if sid in song2artist:
            aid = song2artist[sid]
            if aid in artist2pop:
                aix = artist2index[aid]
                y_pred[ix] = artist2pop[aid] * WeightMat[aix, artists_ix].sum()

    # rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_cagh.append(rp)
    for top in TOPs:
        hitrates_cagh[top].append(hr_dict[top])
    aucs_cagh.append(auc)
    
    # compute novelty@100
    u = pl2u[j]
    sortix = np.argsort(-y_pred)
    nov = np.mean([-np.log2(song2pop[index2song_test[ix]]) for ix in sortix[:100]])
    try:
        novelty_cagh[u].append(nov)
    except KeyError:
        novelty_cagh[u] = [nov]

    # compute diversity@100
    csd = 1. / cosine_similarity(X_test[sortix[:100], :])
    diversities_cagh.append((csd.sum() - np.trace(csd)) / (100 * 99))

print('\n%d / %d' % (len(rps_cagh), Y_test.shape[1]))

In [None]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_cagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_cagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [None]:
cagh_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_cagh), 
                                     'Hit-Rate': {top: np.mean(hitrates_cagh[top]) for top in hitrates_cagh},
                                     'AUC': np.mean(aucs_cagh),
                                     'Novelty': np.mean([np.mean(novelty_cagh[u]) for u in novelty_cagh]),
                                     'Diveristy': np.mean(diversities_cagh)},
                            'Test_All': {'R-Precision': rps_cagh,
                                        'Hit-Rate': {top: hitrates_cagh[top] for top in TOPs},
                                        'AUC': aucs_cagh,
                                        'Novelty': novelty_cagh,
                                        'Diversity': diversities_cagh},
                           }}
cagh_perf[dataset_name]['Test']

In [None]:
fperf_cagh = os.path.join(data_dir, 'perf-cagh.pkl')
print(fperf_cagh)
pkl.dump(cagh_perf, open(fperf_cagh, 'wb'))
pkl.load(open(fperf_cagh, 'rb'))[dataset_name]['Test']