# Baselines - playlist generation for known users

In [None]:
%matplotlib inline

import os, sys, time, gzip
import pickle as pkl
import numpy as np
from scipy.sparse import lil_matrix, issparse

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# from tools import calc_RPrecision_HitRate
from tools import calc_metrics

In [None]:
# Cut-off values for which hit rates are collected; passed as `tops` to
# calc_metrics and used as the keys of the per-baseline hit-rate dicts.
TOPs = [5, 10, 20, 30, 50, 100, 200, 300, 500, 1000]

In [None]:
# Names of the available datasets; the chosen name is substituted into the
# data directory path ('data/<name>/setting4').
datasets = ['aotm2011', '30music']

In [None]:
# Index into `datasets` selecting the dataset evaluated by this run.
dix = 0
dataset_name = datasets[dix]
dataset_name  # bare expression: shown as the cell output

In [None]:
# Load the pre-computed data of the selected dataset.
# Each gzip handle is opened in a `with` block so it is closed deterministically
# instead of being left to the garbage collector.
# NOTE(review): pickle can execute arbitrary code on load -- only read trusted files.
data_dir = 'data/%s/setting4' % dataset_name

# X is loaded here but not used by the baselines in this notebook.
with gzip.open(os.path.join(data_dir, 'X.pkl.gz'), 'rb') as fd:
    X = pkl.load(fd)

with gzip.open(os.path.join(data_dir, 'Y_train.pkl.gz'), 'rb') as fd:
    Y_train = pkl.load(fd)

# Y_test: one row per song, one column per test playlist (see the evaluation loops).
with gzip.open(os.path.join(data_dir, 'Y_test.pkl.gz'), 'rb') as fd:
    Y_test = pkl.load(fd)

# song2pop_train: song id -> popularity score used by all three baselines.
with gzip.open(os.path.join(data_dir, 'song2pop_train.pkl.gz'), 'rb') as fd:
    song2pop_train = pkl.load(fd)

In [None]:
# Load the train/test playlist split. The file handle is closed as soon as
# unpickling finishes.
# NOTE(review): pickle can execute arbitrary code on load -- only read trusted files.
with gzip.open(os.path.join(data_dir, 'playlists_train_test_s4.pkl.gz'), 'rb') as fd:
    playlists3 = pkl.load(fd)

# Each entry of train_playlists is unpacked below as a (song id sequence, <other>) pair.
train_playlists = playlists3['train_playlists']
test_playlists = playlists3['test_playlists']

In [None]:
# Load the song list; every entry is a pair whose first element is the song id.
# The file handle is closed as soon as unpickling finishes.
# NOTE(review): pickle can execute arbitrary code on load -- only read trusted files.
with gzip.open(os.path.join(data_dir, 'all_songs.pkl.gz'), 'rb') as fd:
    all_songs = pkl.load(fd)

# Position in all_songs -> song id.
index2song = {ix: sid for ix, (sid, _) in enumerate(all_songs)}

In [None]:
# Inverse of index2song: song id -> position of the song in all_songs.
# This position is used to index the prediction vectors (y_pred) below.
song2index = {sid: ix for ix, (sid, _) in enumerate(all_songs)}

In [None]:
# Load the global song -> artist mapping and restrict it to the songs of this
# dataset. Songs without an entry in the global mapping are simply left out.
# NOTE(review): pickle can execute arbitrary code on load -- only read trusted files.
with gzip.open('data/msd/song2artist.pkl.gz', 'rb') as fd:
    _song2artist = pkl.load(fd)

song2artist = {sid: _song2artist[sid] for sid, _ in all_songs if sid in _song2artist}

In [None]:
# Group song ids by artist. Song ids are visited in sorted order, so every
# per-artist list ends up sorted as well.
artist2songs = {}

for sid in sorted(song2artist):
    artist = song2artist[sid]
    # setdefault creates the list on first sight of an artist, then appends.
    artist2songs.setdefault(artist, []).append(sid)

In [None]:
# Report: number of songs with a known artist | number of distinct artists.
print('{:,} | {:,}'.format(len(song2artist), len(artist2songs)))

In [None]:
# Artist popularity: how many song occurrences in the training playlists
# belong to each artist. Songs without a known artist are skipped.
artist2pop = {}

for pl, _ in train_playlists:
    for sid in pl:
        if sid not in song2artist:
            continue
        aid = song2artist[sid]
        artist2pop[aid] = artist2pop.get(aid, 0) + 1

In [None]:
# Number of artists with at least one song occurrence in the training playlists.
print(len(artist2pop))

### Collocated Artists - Greatest Hits (CAGH), Top 10 Artists

Compute the similarity of two artists $a_1$ and $a_2$ given a set of playlists $P$:   
$$
\text{sim}(a_1, a_2) 
= \frac{\sum_{p \in P} \delta(a_1, p) \times \delta(a_2, p)}
       {\sqrt{\sum_{p \in P} \delta(a_1, p) \times \sum_{p \in P} \delta(a_2, p)}}
$$
where
$$
\delta(a, p) 
= \begin{cases}
1, \ \text{at least one song in playlist $p$ is from artist $a$}, \\
0, \ \text{otherwise}.
\end{cases}
$$

Recommend songs according to their popularity, weighted by the summed similarity between the artist of the song and each of the 10 most popular artists in the training playlists.

In [None]:
# Sorted list of the distinct artists that occur in the training playlists
# (songs without a known artist are ignored).
all_artist = sorted({
    song2artist[sid]
    for pl, _ in train_playlists
    for sid in pl
    if sid in song2artist
})

In [None]:
# Artist id -> position in all_artist; used as the row index of the
# artist-by-playlist indicator matrix and of the artist similarity matrix.
artist2index = {aid: ix for ix, aid in enumerate(all_artist)}

In [None]:
# Build the artist-by-playlist indicator matrix:
# Delta[a, p] = 1 iff training playlist p contains at least one song of artist a.
Na = len(all_artist)
Np = len(train_playlists)

# np.float64 replaces the `np.float` alias, which was deprecated in NumPy 1.20
# and removed in NumPy 1.24 (it was an alias of the builtin float, i.e. float64).
Delta = lil_matrix((Na, Np), dtype=np.float64)
for j, playlist in enumerate(train_playlists):
    # First element of a playlist entry is its sequence of song ids.
    pl_artist = sorted({song2artist[sid] for sid in playlist[0] if sid in song2artist})
    ix = [artist2index[aid] for aid in pl_artist]
    if ix:  # a playlist may have no song with a known artist
        Delta[ix, j] = 1

In [None]:
# Convert to CSR for fast arithmetic, then derive
#   Dsum[a]        = number of training playlists containing artist a
#   ColloMat[a, b] = number of training playlists containing both a and b
Delta = Delta.tocsr()
Dsum = np.asarray(Delta.sum(axis=1)).reshape(-1)
ColloMat = Delta.dot(Delta.T).toarray()

# Sanity check: an artist co-occurs with itself in exactly its own playlists.
assert np.allclose(ColloMat.diagonal(), Dsum)

In [None]:
# Dsum has one entry per row of Delta, i.e. one per artist in all_artist.
print(len(Dsum), len(all_artist))

In [None]:
#type(ColloMat)

In [None]:
# Normalise the co-occurrence counts into the artist similarity defined above:
# WeightMat[a, b] = ColloMat[a, b] / sqrt(Dsum[a] * Dsum[b]).
T1 = 1.0 / np.sqrt(Dsum)

# Outer product gives NormMat[a, b] = T1[a] * T1[b].
NormMat = np.outer(T1, T1)

WeightMat = ColloMat * NormMat

In [None]:
# Evaluate the CAGH baseline on every test playlist.
# The score of a song is its training popularity multiplied by the summed
# similarity between the song's artist and the 10 most popular artists.
# The same score vector is used for every test playlist.
rps_cagh = []
hitrates_cagh = {top: [] for top in TOPs}
aucs_cagh = []

assert Y_test.shape[1] == len(test_playlists)

# Only songs whose artist is known AND occurs in the training playlists can be
# scored: artist2index (and hence WeightMat) covers training artists only, so
# looking up any other artist would raise a KeyError. All remaining songs keep
# a score of 0.
sid_legal = [sid for sid, _ in all_songs
             if sid in song2artist and song2artist[sid] in artist2index]
aix_legal = [artist2index[song2artist[sid]] for sid in sid_legal]
pop_legal = np.asarray([song2pop_train[sid] for sid in sid_legal])
ix_legal = [song2index[sid] for sid in sid_legal]

# Ascending sort by popularity; the last 10 entries are the most popular artists.
top10_artists = sorted(artist2pop, key=lambda aid: artist2pop[aid])[-10:]
top10_artists_ix = [artist2index[aid] for aid in top10_artists]

# The weight depends on the artist only, so compute it once per distinct
# artist instead of once per song.
artist_weight = {aix: WeightMat[aix, top10_artists_ix].sum() for aix in set(aix_legal)}

y_pred = np.zeros(Y_test.shape[0])
y_pred[ix_legal] = pop_legal * np.asarray([artist_weight[aix] for aix in aix_legal])

for j in range(Y_test.shape[1]):
    # Progress indicator, rewritten in place every 10 playlists.
    if (j + 1) % 10 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    # Dense 1-D ground-truth vector of test playlist j.
    y_true = Y_test[:, j].A.reshape(-1)

    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_cagh.append(rp)
    for top in TOPs:
        hitrates_cagh[top].append(hr_dict[top])
    aucs_cagh.append(auc)

print('\n%d / %d' % (len(rps_cagh), Y_test.shape[1]))

In [None]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_cagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_cagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [None]:
# Average the per-playlist CAGH metrics into one summary record.
test_perf_cagh = {
    'R-Precision': np.mean(rps_cagh),
    'Hit-Rate': {top: np.mean(scores) for top, scores in hitrates_cagh.items()},
    'AUC': np.mean(aucs_cagh),
}
cagh = {dataset_name: {'Test': test_perf_cagh}}
cagh

In [None]:
# Persist the CAGH summary next to the data, then read it back as a check.
fperf_cagh = os.path.join(data_dir, 'perf-cagh.pkl')
print(fperf_cagh)

# Closing the write handle before re-opening guarantees the data is flushed.
with open(fperf_cagh, 'wb') as fd:
    pkl.dump(cagh, fd)

with open(fperf_cagh, 'rb') as fd:
    cagh_saved = pkl.load(fd)
cagh_saved  # bare expression: shown as the cell output

### Same Artists - Greatest Hits (SAGH), Top 10 Artists

Recommend songs according to their popularity, restricted to the songs of the 10 most popular artists in the training playlists.

In [None]:
# Evaluate the SAGH baseline on every test playlist.
# Only songs of the 10 most popular artists get a non-zero score (their
# training popularity); the same score vector is used for every test playlist.
rps_sagh = []
hitrates_sagh = {top: [] for top in TOPs}
aucs_sagh = []

# Ascending sort by popularity; the last 10 entries are the most popular artists.
top10_artists = sorted(artist2pop, key=lambda aid: artist2pop[aid])[-10:]

# Distinct songs of those artists, in sorted order.
candidates = sorted({sid for aid in top10_artists for sid in artist2songs[aid]})

assert len(candidates) > 0
y_pred = np.zeros(Y_test.shape[0])
for sid in candidates:
    y_pred[song2index[sid]] = song2pop_train[sid]

assert Y_test.shape[1] == len(test_playlists)
n_test = Y_test.shape[1]
for j in range(n_test):
    # Progress indicator, rewritten in place every 100 playlists.
    if (j + 1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, n_test))
        sys.stdout.flush()
    # Dense 1-D ground-truth vector of test playlist j.
    y_true = Y_test[:, j].A.reshape(-1)

    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_sagh.append(rp)
    aucs_sagh.append(auc)
    for top in TOPs:
        hitrates_sagh[top].append(hr_dict[top])

print('\n%d / %d' % (len(rps_sagh), n_test))

In [None]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_sagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_sagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [None]:
# Average the per-playlist SAGH metrics into one summary record.
test_perf_sagh = {
    'R-Precision': np.mean(rps_sagh),
    'Hit-Rate': {top: np.mean(scores) for top, scores in hitrates_sagh.items()},
    'AUC': np.mean(aucs_sagh),
}
sagh = {dataset_name: {'Test': test_perf_sagh}}
sagh

In [None]:
# Persist the SAGH summary next to the data, then read it back as a check.
fperf_sagh = os.path.join(data_dir, 'perf-sagh.pkl')
print(fperf_sagh)

# Closing the write handle before re-opening guarantees the data is flushed.
with open(fperf_sagh, 'wb') as fd:
    pkl.dump(sagh, fd)

with open(fperf_sagh, 'rb') as fd:
    sagh_saved = pkl.load(fd)
sagh_saved  # bare expression: shown as the cell output

### Popularity based recommendation

In [None]:
# Evaluate the popularity baseline on every test playlist.
# Every song is scored by its training popularity; the same score vector is
# used for every test playlist.
rps_pop = []
hitrates_pop = {top: [] for top in TOPs}
aucs_pop = []

# Entry ix of y_pred is the popularity of the song stored at index ix.
n_songs = len(all_songs)
y_pred = np.array([song2pop_train[index2song[ix]] for ix in range(n_songs)])

assert Y_test.shape[1] == len(test_playlists)
n_test = Y_test.shape[1]
for j in range(n_test):
    # Progress indicator, rewritten in place every 100 playlists.
    if (j + 1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, n_test))
        sys.stdout.flush()
    # Dense 1-D ground-truth vector of test playlist j.
    y_true = Y_test[:, j].A.reshape(-1)

    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_pop.append(rp)
    aucs_pop.append(auc)
    for top in TOPs:
        hitrates_pop[top].append(hr_dict[top])

print('\n%d / %d' % (len(rps_pop), n_test))

In [None]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_pop, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_pop, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [None]:
# Average the per-playlist popularity-baseline metrics into one summary record.
test_perf_pop = {
    'R-Precision': np.mean(rps_pop),
    'Hit-Rate': {top: np.mean(scores) for top, scores in hitrates_pop.items()},
    'AUC': np.mean(aucs_pop),
}
pop_perf = {dataset_name: {'Test': test_perf_pop}}
pop_perf

In [None]:
# Persist the popularity-baseline summary next to the data, then read it back
# as a check.
fperf_pop = os.path.join(data_dir, 'perf-pop.pkl')
print(fperf_pop)

# Closing the write handle before re-opening guarantees the data is flushed.
with open(fperf_pop, 'wb') as fd:
    pkl.dump(pop_perf, fd)

with open(fperf_pop, 'rb') as fd:
    pop_perf_saved = pkl.load(fd)
pop_perf_saved  # bare expression: shown as the cell output