# Baselines - playlist generation for known users

In [1]:
%matplotlib inline

import os, sys, time, gzip
import pickle as pkl
import numpy as np
from scipy.sparse import lil_matrix, issparse

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# from tools import calc_RPrecision_HitRate
from tools import calc_metrics
# from tools import calc_Precision_Recall

In [3]:
# Cut-off ranks handed to calc_metrics via `tops`; a hit rate is recorded per cut-off.
TOPs = [
    5, 10, 20, 30, 50,
    100, 200, 300, 500, 1000,
]

In [4]:
# Dataset names; the selected one is substituted into the data directory path.
datasets = [
    'aotm2011',
    '30music',
]

In [5]:
# Pick the dataset for this run by its position in `datasets`.
dix = 1
dataset_name = datasets[dix]
# Bare expression: the notebook echoes the selected name as the cell output.
dataset_name

'30music'

In [6]:
data_dir = 'data/%s/setting3' % dataset_name


def _load_pkl_gz(path):
    """Unpickle the gzip-compressed file at `path` and return the stored object.

    The file is opened in a `with` block so the handle is closed as soon as
    the object has been read, instead of being left open for the kernel's lifetime.
    """
    # NOTE(review): pickle can run code embedded in the file -- only load trusted data.
    with gzip.open(path, 'rb') as fd:
        return pkl.load(fd)


# X and Y_train are loaded as stored; presumably features and training labels -- TODO confirm.
X = _load_pkl_gz(os.path.join(data_dir, 'X.pkl.gz'))
Y_train = _load_pkl_gz(os.path.join(data_dir, 'Y_train.pkl.gz'))
# Y_test has one column per test playlist (asserted before each evaluation loop).
Y_test = _load_pkl_gz(os.path.join(data_dir, 'Y_test.pkl.gz'))
# song id -> popularity value, used as the base score by all three baselines.
song2pop_train = _load_pkl_gz(os.path.join(data_dir, 'song2pop_train.pkl.gz'))

In [7]:
# Load the train/test playlist split; the `with` block closes the gzip handle after reading.
with gzip.open(os.path.join(data_dir, 'playlists_train_test_s3.pkl.gz'), 'rb') as fd:
    playlists3 = pkl.load(fd)
train_playlists = playlists3['train_playlists']
test_playlists = playlists3['test_playlists']

# user -> set of every song appearing in that user's training playlists.
# Each training entry unpacks as (playlist songs, user).
user2songs = dict()
for pl, u in train_playlists:
    user2songs.setdefault(u, set()).update(pl)

In [8]:
# Load the song list; the `with` block closes the gzip handle after reading.
with gzip.open(os.path.join(data_dir, 'all_songs.pkl.gz'), 'rb') as fd:
    all_songs = pkl.load(fd)
# Each entry of all_songs is a pair whose first element is the song id;
# map list position -> song id.
index2song = {ix: sid for ix, (sid, _) in enumerate(all_songs)}

In [9]:
# Inverse lookup: song id -> position of that song in `all_songs`.
song2index = {sid: pos for pos, sid in enumerate(sid for sid, _ in all_songs)}

In [10]:
# Load the song -> artist table; the `with` block closes the gzip handle after reading.
with gzip.open('data/msd/song2artist.pkl.gz', 'rb') as fd:
    _song2artist = pkl.load(fd)
# Keep only songs of this dataset; songs missing from the table are simply left out.
song2artist = {sid: _song2artist[sid] for sid, _ in all_songs if sid in _song2artist}

In [11]:
# Group song ids by artist. Iterating over the sorted song ids keeps
# every artist's list in ascending song-id order.
artist2songs = dict()

for sid in sorted(song2artist):
    artist2songs.setdefault(song2artist[sid], []).append(sid)

In [12]:
# Number of songs with a known artist | number of distinct artists among them.
n_songs_with_artist, n_artists = len(song2artist), len(artist2songs)
print('{:,} | {:,}'.format(n_songs_with_artist, n_artists))

45,468 | 9,981


### Collocated Artists - Greatest Hits (CAGH)

Compute the similarity of two artists $a_1$ and $a_2$ given a set of playlists $P$:   
$$
\text{sim}(a_1, a_2) 
= \frac{\sum_{p \in P} \delta(a_1, p) \times \delta(a_2, p)}
       {\sqrt{\sum_{p \in P} \delta(a_1, p) \times \sum_{p \in P} \delta(a_2, p)}}
$$
where
$$
\delta(a, p) 
= \begin{cases}
1, \ \text{at least one song in playlist $p$ is from artist $a$}, \\
0, \ \text{otherwise}.
\end{cases}
$$

Recommend songs according to their popularity, weighted by the summed similarity between the artist of the song and each artist in the user's listening history.

In [13]:
# Sorted, de-duplicated artists of every training-playlist song that has a known artist.
all_artist = sorted({
    song2artist[sid]
    for pl, _ in train_playlists
    for sid in pl
    if sid in song2artist
})

In [14]:
# artist id -> its row position in the sorted `all_artist` list.
artist2index = dict(zip(all_artist, range(len(all_artist))))

In [15]:
# Build the artist-by-playlist indicator matrix:
# Delta[a, p] = 1 when training playlist p contains at least one song by artist a.
Na = len(all_artist)
Np = len(train_playlists)
# np.float64 replaces the `np.float` alias, which NumPy deprecated in 1.20 and
# removed in 1.24 (it was only ever an alias of the builtin float, i.e. float64).
Delta = lil_matrix((Na, Np), dtype=np.float64)
for j in range(Np):
    # Artists of the playlist's songs; songs without a known artist are ignored.
    pl_artist = sorted(set([song2artist[sid] for sid in train_playlists[j][0] if sid in song2artist]))
    ix = [artist2index[aid] for aid in pl_artist]
    if ix:
        # Nothing to mark when no song of the playlist has a known artist.
        Delta[ix, j] = 1

In [16]:
# Switch to CSR before the row sums and the sparse product.
Delta = Delta.tocsr()
# Dsum[a]: number of training playlists that contain artist a.
Dsum = np.asarray(Delta.sum(axis=1)).reshape(-1)
# ColloMat[a1, a2]: number of training playlists containing both artists (dense array).
ColloMat = Delta.dot(Delta.T).toarray()

# An artist co-occurs with itself in exactly the playlists it appears in.
assert np.allclose(ColloMat.diagonal(), Dsum)

In [17]:
# One row sum per artist, so both lengths are expected to agree.
print(*map(len, (Dsum, all_artist)))

9981 9981


In [18]:
#type(ColloMat)

In [19]:
# T1[a] = 1 / sqrt(number of playlists containing artist a).
T1 = 1. / np.sqrt(Dsum)
# Outer product of T1 with itself: NormMat[a1, a2] = T1[a1] * T1[a2],
# i.e. the reciprocal of the denominator in the similarity formula.
NormMat = np.dot(T1[:, np.newaxis], T1[np.newaxis, :])

# Element-wise product: co-occurrence counts scaled into artist similarities.
WeightMat = ColloMat * NormMat

In [20]:
# Per-playlist results of the CAGH baseline.
rps_cagh = []
hitrates_cagh = {top: [] for top in TOPs}
aucs_cagh = []

assert Y_test.shape[1] == len(test_playlists)

# "Legal" songs are those with a known artist. The four sequences below are
# aligned element-wise: song id, artist row in WeightMat, popularity, song index.
sid_legal = [sid for sid, _ in all_songs if sid in song2artist]
aix_legal = [artist2index[song2artist[sid]] for sid in sid_legal]
pop_legal = np.asarray([song2pop_train[sid] for sid in sid_legal])
ix_legal = [song2index[sid] for sid in sid_legal]

# Scores depend only on the user, so they are reused while consecutive
# test playlists belong to the same user.
prev_u = None
prev_y = None

n_test = Y_test.shape[1]
for j in range(n_test):
    sys.stdout.write('\r%d / %d' % (j+1, n_test))
    sys.stdout.flush()
    y_true = Y_test[:, j].A.ravel()

    u = test_playlists[j][1]
    if prev_u is not None and prev_u == u:
        # Same user as the previous playlist: reuse the cached scores.
        y_pred = prev_y
    else:
        # Artists appearing in the user's training songs.
        artists = sorted({song2artist[sid] for sid in user2songs[u] if sid in song2artist})
        artists_ix = [artist2index[aid] for aid in artists]
        # For every legal song: summed similarity of its artist to the user's artists.
        weights = np.asarray([WeightMat[aix, artists_ix].sum() for aix in aix_legal])
        # Songs without a known artist keep a score of zero.
        y_pred = np.zeros(y_true.shape)
        y_pred[ix_legal] = pop_legal * weights

        prev_u, prev_y = u, y_pred

    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_cagh.append(rp)
    for top in TOPs:
        hitrates_cagh[top].append(hr_dict[top])
    aucs_cagh.append(auc)

print('\n%d / %d' % (len(rps_cagh), Y_test.shape[1]))

2195 / 2195
2195 / 2195


In [21]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_cagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_cagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [22]:
# CAGH results for the selected dataset:
#   'Test'     -> each metric averaged over all test playlists
#   'Test_All' -> the per-playlist values behind those averages
cagh_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_cagh),
                                     'Hit-Rate': {top: np.mean(hitrates_cagh[top]) for top in TOPs},
                                     'AUC': np.mean(aucs_cagh),},
                            'Test_All': {'R-Precision': rps_cagh,
                                         'Hit-Rate': {top: hitrates_cagh[top] for top in TOPs},
                                         # Store the per-playlist AUC list here (previously the mean was
                                         # stored), in line with the other 'Test_All' entries and with
                                         # sagh_perf / pop_perf.
                                         'AUC': aucs_cagh,}}}
# Display the averaged metrics.
cagh_perf[dataset_name]['Test']

{'R-Precision': 0.044309023611024186,
 'Hit-Rate': {5: 0.024511173083654143,
  10: 0.04246964987286078,
  20: 0.0683950430514176,
  30: 0.08988681067269817,
  50: 0.12598437957395134,
  100: 0.19260644292704998,
  200: 0.2865075334854971,
  300: 0.352321652803193,
  500: 0.44802022679788867,
  1000: 0.5938181770453134},
 'AUC': 0.9517195299023042}

In [23]:
# Persist the CAGH results next to the data, then read them back as a sanity check.
fperf_cagh = os.path.join(data_dir, 'perf-cagh.pkl')
print(fperf_cagh)
# The `with` block closes (and thus flushes) the file before it is re-opened for reading.
with open(fperf_cagh, 'wb') as fd:
    pkl.dump(cagh_perf, fd)
with open(fperf_cagh, 'rb') as fd:
    cagh_perf_saved = pkl.load(fd)
# Display the averaged metrics as they were stored on disk.
cagh_perf_saved[dataset_name]['Test']

data/30music/setting3/perf-cagh.pkl


{'R-Precision': 0.044309023611024186,
 'Hit-Rate': {5: 0.024511173083654143,
  10: 0.04246964987286078,
  20: 0.0683950430514176,
  30: 0.08988681067269817,
  50: 0.12598437957395134,
  100: 0.19260644292704998,
  200: 0.2865075334854971,
  300: 0.352321652803193,
  500: 0.44802022679788867,
  1000: 0.5938181770453134},
 'AUC': 0.9517195299023042}

### Same Artists - Greatest Hits (SAGH)

Recommend songs by the artists in the user's listening history, ranked by song popularity.

In [24]:
# Per-playlist results of the SAGH baseline.
rps_sagh = []
hitrates_sagh = {top: [] for top in TOPs}
aucs_sagh = []

assert Y_test.shape[1] == len(test_playlists)
n_test = Y_test.shape[1]
for j in range(n_test):
    # Progress is refreshed every 100 playlists.
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, n_test))
        sys.stdout.flush()
    y_true = Y_test[:, j].A.ravel()
    y_pred = np.zeros(y_true.shape)

    u = test_playlists[j][1]
    # Artists appearing in the user's training songs ...
    artists = sorted({song2artist[sid] for sid in user2songs[u] if sid in song2artist})
    # ... and every song by those artists; all other songs keep a score of zero.
    candidates = sorted({sid for a in artists for sid in artist2songs[a]})
    for sid in candidates:
        y_pred[song2index[sid]] = song2pop_train[sid]

    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_sagh.append(rp)
    for top in TOPs:
        hitrates_sagh[top].append(hr_dict[top])
    aucs_sagh.append(auc)
print('\n%d / %d' % (len(rps_sagh), Y_test.shape[1]))

2100 / 2195
2195 / 2195


In [25]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_sagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_sagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [26]:
# Metrics averaged over all test playlists.
sagh_test_mean = {
    'R-Precision': np.mean(rps_sagh),
    'Hit-Rate': {top: np.mean(hitrates_sagh[top]) for top in TOPs},
    'AUC': np.mean(aucs_sagh),
}
# The per-playlist values behind those averages.
sagh_test_all = {
    'R-Precision': rps_sagh,
    'Hit-Rate': {top: hitrates_sagh[top] for top in TOPs},
    'AUC': aucs_sagh,
}
sagh_perf = {dataset_name: {'Test': sagh_test_mean, 'Test_All': sagh_test_all}}
# Display the averaged metrics.
sagh_perf[dataset_name]['Test']

{'R-Precision': 0.04508938511047115,
 'Hit-Rate': {5: 0.026424145514482755,
  10: 0.04537646141173367,
  20: 0.07616462132710679,
  30: 0.09878911742609467,
  50: 0.13232042872060362,
  100: 0.18139792682074818,
  200: 0.22747646979902908,
  300: 0.25139306924133015,
  500: 0.2750661840702982,
  1000: 0.2994516089970001},
 'AUC': 0.6444120943290897}

In [27]:
# Persist the SAGH results next to the data, then read them back as a sanity check.
fperf_sagh = os.path.join(data_dir, 'perf-sagh.pkl')
print(fperf_sagh)
# The `with` block closes (and thus flushes) the file before it is re-opened for reading.
with open(fperf_sagh, 'wb') as fd:
    pkl.dump(sagh_perf, fd)
with open(fperf_sagh, 'rb') as fd:
    sagh_perf_saved = pkl.load(fd)
# Display the averaged metrics as they were stored on disk.
sagh_perf_saved[dataset_name]['Test']

data/30music/setting3/perf-sagh.pkl


{'R-Precision': 0.04508938511047115,
 'Hit-Rate': {5: 0.026424145514482755,
  10: 0.04537646141173367,
  20: 0.07616462132710679,
  30: 0.09878911742609467,
  50: 0.13232042872060362,
  100: 0.18139792682074818,
  200: 0.22747646979902908,
  300: 0.25139306924133015,
  500: 0.2750661840702982,
  1000: 0.2994516089970001},
 'AUC': 0.6444120943290897}

### Popularity based recommendation

In [28]:
# Per-playlist results of the popularity baseline.
rps_pop = []
hitrates_pop = {top: [] for top in TOPs}
aucs_pop = []

# One popularity score per song index; the same scores are used for every test playlist.
y_pred = np.array([song2pop_train[index2song[pos]] for pos, _ in enumerate(all_songs)])

assert Y_test.shape[1] == len(test_playlists)
n_test = Y_test.shape[1]
for j in range(n_test):
    # Progress is refreshed every 10 playlists.
    if (j+1) % 10 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, n_test))
        sys.stdout.flush()
    y_true = Y_test[:, j].A.ravel()

    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_pop.append(rp)
    for top in TOPs:
        hitrates_pop[top].append(hr_dict[top])
    aucs_pop.append(auc)

print('\n%d / %d' % (len(rps_pop), Y_test.shape[1]))

2190 / 2195
2195 / 2195


In [29]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_pop, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_pop, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [30]:
# Metrics averaged over all test playlists.
pop_test_mean = {
    'R-Precision': np.mean(rps_pop),
    'Hit-Rate': {top: np.mean(hitrates_pop[top]) for top in TOPs},
    'AUC': np.mean(aucs_pop),
}
# The per-playlist values behind those averages.
pop_test_all = {
    'R-Precision': rps_pop,
    'Hit-Rate': {top: hitrates_pop[top] for top in TOPs},
    'AUC': aucs_pop,
}
pop_perf = {dataset_name: {'Test': pop_test_mean, 'Test_All': pop_test_all}}
# Display the averaged metrics.
pop_perf[dataset_name]['Test']

{'R-Precision': 0.021789486717889057,
 'Hit-Rate': {5: 0.010145439453664305,
  10: 0.0190906029791239,
  20: 0.035023385784414565,
  30: 0.04652897408134314,
  50: 0.06902868603298959,
  100: 0.11953569058765297,
  200: 0.1921951014338093,
  300: 0.24737848161229312,
  500: 0.32612280951457073,
  1000: 0.462820533654615},
 'AUC': 0.9402710379314311}

In [31]:
# Persist the popularity-baseline results next to the data, then read them back as a sanity check.
fperf_pop = os.path.join(data_dir, 'perf-pop.pkl')
print(fperf_pop)
# The `with` block closes (and thus flushes) the file before it is re-opened for reading.
with open(fperf_pop, 'wb') as fd:
    pkl.dump(pop_perf, fd)
with open(fperf_pop, 'rb') as fd:
    pop_perf_saved = pkl.load(fd)
# Display the averaged metrics as they were stored on disk.
pop_perf_saved[dataset_name]['Test']

data/30music/setting3/perf-pop.pkl


{'R-Precision': 0.021789486717889057,
 'Hit-Rate': {5: 0.010145439453664305,
  10: 0.0190906029791239,
  20: 0.035023385784414565,
  30: 0.04652897408134314,
  50: 0.06902868603298959,
  100: 0.11953569058765297,
  200: 0.1921951014338093,
  300: 0.24737848161229312,
  500: 0.32612280951457073,
  1000: 0.462820533654615},
 'AUC': 0.9402710379314311}