# A representative subset of AotM-2011 Playlists with MSD Audio Features

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os, sys
import gzip
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, average_precision_score
from scipy.sparse import lil_matrix, issparse
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
sys.path.append('src')
from BinaryRelevance import BinaryRelevance
#from PClassificationMLC import PClassificationMLC
from PCMLC import PCMLC as PClassificationMLC
from evaluate import calc_F1, calc_precisionK, calc_rank, f1_score_nowarn

In [None]:
data_dir = 'data/aotm-2011'
#faotm = os.path.join(data_dir, 'aotm2011-subset.pkl')
faotm = os.path.join(data_dir, 'aotm2011-user-playlist.pkl')
ffeature = 'data/msd/songID2Features.pkl.gz'
fgenre = 'data/msd/song2genre.pkl'

## Load playlists

Load playlists.

In [None]:
# Load the user -> playlists mapping; the context manager closes the file handle.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open(faotm, 'rb') as fd:
    user_playlists = pkl.load(fd)

In [None]:
# Number of users and total number of playlists summed over all users.
n_playlists_total = np.sum([len(pls) for pls in user_playlists.values()])
print('#user    :', len(user_playlists))
print('#playlist:', n_playlists_total)

In [None]:
# Length (number of songs) of every playlist, over all users.
pl_lengths = [len(pl) for pls in user_playlists.values() for pl in pls]
#plt.hist(pl_lengths, bins=100)
print('Average playlist length: %.1f' % np.mean(pl_lengths))

In [None]:
users = sorted(user_playlists.keys())

In [None]:
songs_user = {u: {sid for pl in user_playlists[u] for sid in pl} for u in users}  # user: a set of songs

Compute the number of playlists per user, and the number of songs covered by the user's playlists.

In [None]:
udf = pd.DataFrame(index=users, columns=['#playlist', '#song'])

In [None]:
udf['#playlist'] = [len(user_playlists[u]) for u in users]

In [None]:
udf['#song'] = [len(songs_user[u]) for u in users]

In [None]:
ax = plt.subplot(111)
udf['#playlist'].hist(bins=200, ax=ax)
ax.set_yscale('log')

In [None]:
u_npl = sorted([(u, len(user_playlists[u])) for u in users], key=lambda x: x[1])

In [None]:
#u_npl

In [None]:
# u_npl is sorted by playlist count, so taking every `step`-th entry samples
# users across the whole range of playlist counts.
step = 1000  # sample 0.1%
subset = u_npl[::step]

In [None]:
subset

In [None]:
uid_subset = [t[0] for t in subset]

In [None]:
#udf_subset = udf[ udf['#playlist'] == 10]
#udf_subset.head()

In [None]:
#uid_subset += udf_subset.index[[3,4]].tolist()

In [None]:
#udf_subset = udf[ udf['#playlist'] == 30]
#udf_subset.head()

In [None]:
#uid_subset += udf_subset.index[[1,2]].tolist()

In [None]:
#udf_subset = udf[ udf['#playlist'].isin(np.arange(95, 105))]
#udf_subset.sort_values(by='#playlist')

In [None]:
#uid_subset += udf_subset.index[[2,5]].tolist()

In [None]:
#udf.sort_values(by=['#playlist'], ascending=False).iloc[100:200]

In [None]:
#uid_subset

In [None]:
#udf[uid_subset]  # tuple are used as multiindex in pandas
#udf[[uid_subset]]

## Subset of data

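Use the playlists of the sampled users: users are sorted by their number of playlists and every `step`-th user is taken, so the subset spans the whole range of playlist counts.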

In [None]:
playlists_subset = [pl for u in uid_subset for pl in user_playlists[u]]

In [None]:
len(playlists_subset)

In [None]:
song_set = sorted({sid for u in uid_subset for sid in songs_user[u]})

In [None]:
len(song_set)

### Split songs for setting I

Split songs (90/10 split) such that the distributions of song popularity (the number of occurrences in playlists) in the training and dev sets are similar.

In [None]:
# Binary incidence matrix: entry (i, j) is 1 iff song i occurs in playlist j.
songind = {sid: ix for ix, sid in enumerate(song_set)}
song_pl_mat = np.zeros((len(song_set), len(playlists_subset)))
for j, pl in enumerate(playlists_subset):
    ind = [songind[sid] for sid in pl]
    song_pl_mat[ind, j] = 1

In [None]:
song_pop = np.sum(song_pl_mat, axis=1)

In [None]:
#plt.hist(song_pop, bins=20)
#print()

In [None]:
#np.arange(0, 100, 10)

In [None]:
# Order songs by popularity and send every `step`-th one to the dev set, so that
# both sets cover the whole popularity range.
ratio = 0.1  # 90/10 split
step = int(1./ratio)
sortix = np.argsort(song_pop)
split_ix = np.arange(0, len(song_pop), step)
dev_ix = sortix[split_ix].tolist()
dev_song_set = [song_set[ix] for ix in dev_ix]
train_song_set = sorted(set(song_set).difference(dev_song_set))

Histogram of song popularity in training set.

In [None]:
train_song_pop = [song_pop[songind[sid]] for sid in train_song_set]
ax = plt.subplot(111)
ax.hist(train_song_pop, bins=30)
ax.set_yscale('log')
ax.set_xlim(0, song_pop.max()+1)
print(len(train_song_set))

Histogram of song popularity in dev set.

In [None]:
dev_song_pop = [song_pop[songind[sid]] for sid in dev_song_set]
ax = plt.subplot(111)
ax.hist(dev_song_pop, bins=30)
ax.set_yscale('log')
ax.set_xlim(0, song_pop.max()+1)
print(len(dev_song_set))

### Split playlists

Split playlists (80/20 split) such that the distributions of playlist length (the number of songs in playlists) for each user in the training and dev sets are similar.

In [None]:
train_playlists = []
dev_playlists = []

In [None]:
ratio = 0.2
step = 1./ratio
#np.arange(0, 10, step)

In [None]:
# Split each user's playlists into training / dev: playlists are sorted by length
# and (roughly) every `step`-th one goes to the dev set, so both sets see the whole
# range of playlist lengths for that user.
np.random.seed(123456789)

# Start from empty lists so that re-running this cell does not append duplicates.
train_playlists = []
dev_playlists = []

rounding_prob = step - int(step)     # fractional part of the sampling step
integer_step = (step == int(step))   # decided once; the global `step` is left untouched

for u in uid_subset:
    u_playlists = user_playlists[u]
    if len(u_playlists) < 3:
        # Too few playlists to split: keep ALL of them for training
        # (previously only the first one was kept and the rest silently dropped).
        train_playlists += [(u, pl) for pl in u_playlists]
        continue
    sorted_pl = sorted(u_playlists, key=len)
    if integer_step:
        dev_ix = np.arange(0, len(sorted_pl), int(step))
    else:
        split_ix = np.arange(0, len(sorted_pl), step)
        # Randomly round each fractional position down or up; the last index is
        # never rounded up, to avoid an index out of bounds.
        dev_ix = [int(x) if np.random.rand() < rounding_prob or int(x) == len(sorted_pl)-1
                  else int(x)+1 for x in split_ix]
    # De-duplicate (random rounding can hit the same index twice) and use a set
    # for O(1) membership tests.
    dev_ix_set = {int(ix) for ix in dev_ix}
    dev_playlists += [(u, sorted_pl[ix]) for ix in sorted(dev_ix_set)]
    train_playlists += [(u, sorted_pl[ix]) for ix in range(len(sorted_pl)) if ix not in dev_ix_set]

In [None]:
xmax = np.max([len(pl) for pl in playlists_subset]) + 1

Histogram of playlist length in training set.

In [None]:
ax = plt.subplot(111)
ax.hist([len(t[1]) for t in train_playlists], bins=50)
ax.set_yscale('log')
ax.set_xlim(0, xmax)
print(len(train_playlists))

Histogram of playlist length in dev set.

In [None]:
ax = plt.subplot(111)
ax.hist([len(t[1]) for t in dev_playlists], bins=50)
ax.set_yscale('log')
ax.set_xlim(0, xmax)
print(len(dev_playlists))

### Hold 10% of songs in the dev set of playlists

Hold 10% of songs uniformly at random for each playlist in dev set.

In [None]:
dev_known_songs = []

In [None]:
# For every dev playlist, hold out ~`ratio` of its songs uniformly at random and
# record the remaining (observed) songs in `dev_known_songs`.
# NOTE(review): the observed songs of all dev playlists are pooled into one list, so
# which playlist a song was observed in is not recorded; code that later intersects a
# playlist with this pool treats a song held out of that playlist as known if it was
# observed in another dev playlist. Confirm this is intended.
ratio = 0.1
np.random.seed(987654321)
dev_known_songs = []  # reset so that re-running this cell does not accumulate songs
num_held = 0
for u, pl in dev_playlists:
    sample_size = ratio * len(pl)
    rounding_prob = sample_size - int(sample_size)  # fractional part
    # Stochastic rounding: round UP with probability equal to the fractional part,
    # so that the expected number of held songs is exactly ratio * len(pl).
    sample_size = int(sample_size) + 1 if np.random.rand() < rounding_prob else int(sample_size)
    # The first `sample_size` positions of the permutation are held out; keep the rest.
    sample_ix = np.random.permutation(np.arange(len(pl)))[sample_size:]
    dev_known_songs += [pl[ix] for ix in sample_ix]
    num_held += sample_size
print('#song being held:', num_held)

In [None]:
dev_known_songs = sorted(set(dev_known_songs))

In [None]:
len(dev_known_songs)

## Load song features

Load `song_id` --> `feature array` mapping: map a song to the audio features of one of its corresponding tracks in MSD.

In [None]:
# Load the song -> feature mapping; the context manager closes the gzip handle.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with gzip.open(ffeature, 'rb') as fd:
    song2Features = pkl.load(fd)

## Load genres

In [None]:
# Load the song -> genre mapping; the context manager closes the file handle.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open(fgenre, 'rb') as fd:
    song2genre = pkl.load(fd)

Check if all songs have genre info.

In [None]:
np.all([sid in song2genre for sid in song_set])

## Create song-playlist matrix

Songs as rows, playlists as columns.

In [None]:
def gen_dataset_subset(playlists, song_set, features_MSD, song2genre):
    """
    Create labelled dataset: rows are songs, columns are playlists.
    
    Input:
        - playlists: a sequence of playlists (each an iterable of songIDs);
          playlist k becomes column k of Y
        - song_set: a sequence of songIDs; song i becomes row i of X and Y
        - features_MSD: dictionary that maps songIDs to features from MSD
          (must contain every song in song_set)
        - song2genre: dictionary that maps songIDs to genre
    Output:
        - (Feature, Label) pair (X, Y)
          X: #songs by #features (MSD features followed by a one-hot genre encoding)
          Y: #songs by #playlists, boolean; Y[i, k] is True iff song i is in playlist k
    Songs in a playlist that are not in song_set are ignored.
    """ 
    song_indices = {sid: ix for ix, sid in enumerate(song_set)}
    N = len(song_set)
    K = len(playlists)
    
    genre_set = sorted(set(song2genre.values()))
    genre_indices = {genre: ix for ix, genre in enumerate(genre_set)}
    num_genre_feats = len(genre_set) + 1  # one extra slot for "genre unknown"
    
    def onehot_genre(songID):
        """
        One-hot encoding of genres.
        Data imputation: one extra entry for songs without genre info.
        Should try other options: 
            mean imputation, sampling from the distribution of genre popularity.
        """
        # builtin float: the np.float alias was removed in NumPy 1.24
        vec = np.zeros(num_genre_feats, dtype=float)
        if songID in song2genre:
            vec[genre_indices[song2genre[songID]]] = 1
        else:
            vec[-1] = 1
        return vec
    
    #X = np.array([features_MSD[sid] for sid in song_set])  # without using genre
    X = np.array([np.concatenate([features_MSD[sid], onehot_genre(sid)], axis=-1) for sid in song_set])
    # builtin bool: the np.bool alias was removed in NumPy 1.24
    Y = np.zeros((N, K), dtype=bool)
    
    for k, pl in enumerate(playlists):
        indices = [song_indices[sid] for sid in pl if sid in song_indices]
        Y[indices, k] = True

    return X, Y

In [None]:
def mean_normalised_reciprocal_rank(Y_true, Y_pred):
    """
    Mean over columns of the normalised reciprocal rank.

    For every column with at least one positive entry, the reciprocal ranks of the
    positives are summed and divided by the best achievable value (the positives
    occupying ranks 1..#positives). Columns without positives are skipped.
    Ranks are obtained from `calc_rank` applied to the predicted scores of the column.
    """
    n_positive = np.sum(Y_true, axis=0)
    scores = []
    for k in range(Y_true.shape[1]):
        ranks = calc_rank(Y_pred[:, k])[Y_true[:, k]]
        if len(ranks) == 0:
            continue
        best = np.sum([1./pos for pos in range(1, n_positive[k]+1)])
        achieved = np.sum([1./r for r in ranks])
        scores.append(achieved / best)  # 1.0 means the positives are ranked on top
    return np.mean(scores)

In [None]:
def eval_pl(Y_true, Y_pred):
    """
    Print column-wise (per playlist) ranking metrics for predictions Y_pred
    against the ground truth Y_true.

    AUC and MAP are macro-averaged over the columns that contain at least one
    positive; P@K is computed by `calc_precisionK` on the transposed matrices
    (all columns); NRR comes from `mean_normalised_reciprocal_rank`.
    """
    row_fmt = '%-15s %.4f'
    nzcol = np.nonzero(np.sum(Y_true, axis=0))[0]  # columns with at least one True
    print('Average over %d columns' % len(nzcol))

    mean_pak = np.mean(calc_precisionK(Y_true.T, Y_pred.T))
    print(row_fmt % ('Mean P@K:', mean_pak))

    mean_auc = roc_auc_score(Y_true[:, nzcol], Y_pred[:, nzcol], average='macro')
    print(row_fmt % ('Mean AUC:', mean_auc))

    mean_ap = average_precision_score(Y_true[:, nzcol], Y_pred[:, nzcol], average='macro')
    print(row_fmt % ('MAP:', mean_ap))

    mean_nrr = mean_normalised_reciprocal_rank(Y_true, Y_pred)
    print(row_fmt % ('Mean NRR:', mean_nrr))

## Setting I: hold a subset of songs, use all playlists

In [None]:
playlists1 = [t[1] for t in train_playlists + dev_playlists]

In [None]:
X_train, Y_train = gen_dataset_subset(playlists=playlists1, song_set=train_song_set, 
                                      features_MSD=song2Features, song2genre=song2genre)

In [None]:
X_dev, Y_dev = gen_dataset_subset(playlists=playlists1, song_set=dev_song_set, 
                                  features_MSD=song2Features, song2genre=song2genre)

Feature normalisation.

In [None]:
# Standardise features with statistics estimated on the training set only; the
# small constant keeps the division well defined for constant features.
X_train_mean = np.mean(X_train, axis=0, keepdims=True)
X_train_std = np.std(X_train, axis=0, keepdims=True) + 10 ** (-6)
for feats in (X_train, X_dev):
    # in-place on purpose: X_train and X_dev themselves are normalised
    feats -= X_train_mean
    feats /= X_train_std

In [None]:
print('Train: %15s %15s' % (X_train.shape, Y_train.shape))
print('Dev  : %15s %15s' % (X_dev.shape,   Y_dev.shape))

In [None]:
print(np.mean(np.mean(X_train, axis=0)))
print(np.mean( np.std(X_train, axis=0)) - 1)
print(np.mean(np.mean(X_dev, axis=0)))
print(np.mean( np.std(X_dev, axis=0)) - 1)

In [None]:
#np.sum(Y_train, axis=0)

In [None]:
#np.sum(Y_dev, axis=0)

### M1. BR - Independent logistic regression

In [None]:
br = BinaryRelevance(C=1, n_jobs=4)
br.fit(X_train, Y_train)

Evaluation: normalise **per playlist**.

In [None]:
print('Dev set:')
eval_pl(Y_dev, br.predict(X_dev))

In [None]:
print('Training set:')
eval_pl(Y_train, br.predict(X_train))

### M2. PC - Multilabel p-classification

P-Classification ~ P-norm push ranking.

In [None]:
pc = PClassificationMLC(C1=1, weighting='labels')
pc.fit(X_train, Y_train)

Evaluation: normalise **per playlist**.

In [None]:
print('Dev set:')
eval_pl(Y_dev, pc.predict(X_dev))

In [None]:
print('Training set:')
eval_pl(Y_train, pc.predict(X_train))

## Setting II: hold a subset of songs in a subset of playlists, use all songs

In [None]:
X, Y = gen_dataset_subset(playlists=[t[1] for t in train_playlists + dev_playlists], song_set=song_set, 
                          features_MSD=song2Features, song2genre=song2genre)

Set all entries corresponding to playlists in dev set to NaN, except those songs in dev playlists that we observed.

In [None]:
# Training labels for setting II: the columns of the dev playlists are unknown (NaN)
# except for the songs that were observed in them.
Y_train = Y.astype(float)  # astype returns a copy; np.nan requires a float dtype
# Dev playlists are the last columns of Y (playlists were passed as train + dev).
# Computing the start explicitly avoids `[-0:]` selecting every column when there
# are no dev playlists.
cols = np.arange(Y.shape[1] - len(dev_playlists), Y.shape[1])
Y_train[:, cols] = np.nan
song_indices = {sid: ix for ix, sid in enumerate(song_set)}
assert len(cols) == len(dev_playlists)
# NOTE(review): `dev_known_songs` pools the observed songs of all dev playlists, so a
# song held out of one playlist but observed in another is marked as known here.
known_pool = set(dev_known_songs)  # built once instead of once per playlist
num_known = 0
for j in range(len(cols)):
    pl = dev_playlists[j][1]
    rows = [song_indices[sid] for sid in (set(pl) & known_pool)]
    Y_train[rows, cols[j]] = 1
    num_known += len(rows)

In [None]:
np.sum(np.isnan(Y_train))

In [None]:
len(dev_playlists) * len(song_set) - num_known

In [None]:
len(dev_song_set) * Y.shape[1]

In [None]:
len(dev_song_set)
Y.shape

In [None]:
# Sanity check: every entry of the dev-playlist columns is NaN except the known
# ones, so the two printed numbers should agree.
print(np.sum(np.isnan(Y_train)), len(dev_playlists) * len(song_set) - num_known)

In [None]:
X_train = X

Feature normalisation.

In [None]:
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
# Setting II uses all songs: the test region is the block of dev-playlist columns
# (the last len(dev_playlists) columns) over every song, so the dev features are the
# full feature matrix and the dev labels are the true labels of those columns.
# NOTE(review): Y_dev still contains the songs that were observed during training;
# mask them out if the evaluation should cover held-out songs only.
X_dev = X_train
Y_dev = Y[:, Y.shape[1] - len(dev_playlists):]

In [None]:
print('Train: %15s %15s' % (X_train.shape, Y_train.shape))
print('Dev  : %15s %15s' % (X_dev.shape,   Y_dev.shape))

In [None]:
print(np.mean(np.mean(X_train, axis=0)))
print(np.mean( np.std(X_train, axis=0)) - 1)
print(np.mean(np.mean(X_dev, axis=0)))
print(np.mean( np.std(X_dev, axis=0)) - 1)

In [None]:
#np.sum(Y_train, axis=0)

In [None]:
#np.sum(Y_dev, axis=0)

### M3. Independent logistic regression

In [None]:
br2 = BinaryRelevance(C=1, n_jobs=4)
br2.fit(X_train, np.nan_to_num(Y_train))

In [None]:
col_start = -len(dev_playlists)

Evaluation: normalise **per playlist**.

In [None]:
print('Dev set:')
eval_pl(Y_dev, br2.predict(X_dev)[:, col_start:])

In [None]:
# Training-set evaluation: unknown (NaN) labels are treated as negatives.
# builtin bool: the np.bool alias was removed in NumPy 1.24
Y_train_gt = np.nan_to_num(Y_train).astype(bool)
Y_train_pred = br2.predict(X_train)
# NOTE(review): the dev columns stay in the evaluation (their known songs are
# positives in Y_train_gt) with constant zero scores; consider dropping these
# columns from both matrices instead.
Y_train_pred[:, col_start:] = 0   # remove test region
print('Training set:')
eval_pl(Y_train_gt, Y_train_pred)

### M4. Multilabel p-classification with some playlist fully observed

In [None]:
user_of_playlists2 = [t[0] for t in train_playlists + dev_playlists]
#user_of_playlists2

In [None]:
# same_user_mat[i, j] is True iff playlists i and j (i != j) belong to the same user.
# Users are mapped to integer codes first so that the pairwise comparison can be
# vectorised regardless of the type of the user ids.
user_code = {}
playlist_user_codes = np.array(
    [user_code.setdefault(u, len(user_code)) for u in user_of_playlists2], dtype=int)
same_user_mat = playlist_user_codes[:, None] == playlist_user_codes[None, :]
np.fill_diagonal(same_user_mat, False)  # a playlist is not paired with itself

In [None]:
#same_user_mat

In [None]:
pla = PClassificationMLC(C1=1, C2=1, C3=10, weighting='both', similarMat=same_user_mat)
pla.fit(X_train, Y_train)

In [None]:
col_start = -len(dev_playlists)

Evaluation: normalise **per playlist**.

In [None]:
eval_pl(Y_dev, pla.predict(X_dev)[:, col_start:])

In [None]:
# Training-set evaluation: unknown (NaN) labels are treated as negatives.
# builtin bool: the np.bool alias was removed in NumPy 1.24
Y_train_gt = np.nan_to_num(Y_train).astype(bool)
Y_train_pred = pla.predict(X_train)
# NOTE(review): the dev columns stay in the evaluation (their known songs are
# positives in Y_train_gt) with constant zero scores; consider dropping these
# columns from both matrices instead.
Y_train_pred[:, col_start:] = 0   # remove test region
eval_pl(Y_train_gt, Y_train_pred)

**Check if the regulariser is effective**

In [None]:
%%script false
rows, cols = np.nonzero(same_user_mat)
for row, col in zip(rows, cols):
    diff = pla.W[row] - pla.W[col]
    print('%g' % np.sqrt(np.dot(pla.W[row], pla.W[row])))
    print('%g' % np.sqrt(np.dot(pla.W[col], pla.W[col])))
    print('%g' % np.sqrt(np.dot(diff, diff)))
    print('------------------------------')

Compute matrix $M$ such that $M_{jk} = \sqrt{(w_j - w_k)^\top (w_j - w_k)}, \forall j, k$.

In [None]:
# Pairwise Euclidean distances between the rows of W, using
# ||w_j - w_k||^2 = ||w_j||^2 + ||w_k||^2 - 2 w_j^T w_k.
A = np.dot(pla.W, pla.W.T)                  # Gram matrix: A[j, k] = w_j^T w_k
B = np.tile(np.diag(A), (A.shape[0], 1))    # B[j, k] = ||w_k||^2
# Rounding error can make the squared distance slightly negative (notably on the
# diagonal); clip at zero so that sqrt does not produce NaN.
M = np.sqrt(np.maximum(-2 * A + (B + B.T), 0))

Normalise $M$ by the vector with maximum norm in $W$.

In [None]:
#aa = np.arange(6).reshape(3, 2)
#np.einsum('ij,ij->i', aa, aa)

In [None]:
denorm = np.sqrt(np.einsum('ij,ij->i', pla.W, pla.W))  # compute the norm for each row in W

In [None]:
M1 = M / np.max(denorm)

In [None]:
#plt.matshow(M1)

In [None]:
#user_of_playlists2

In [None]:
rows, cols = np.nonzero(same_user_mat)
M2 = M1[rows, cols]
print(np.min(M2), np.max(M2), np.mean(M2), np.std(M2))

In [None]:
# Distance statistics over pairs of playlists that belong to different users.
mat = same_user_mat.copy()
np.fill_diagonal(mat, 1)   # remove the diagonal from consideration
rows, cols = np.where(mat == 0)
M3 = M1[rows, cols]
stats_diff_user = (np.min(M3), np.max(M3), np.mean(M3), np.std(M3))
print(*stats_diff_user)

**Check performance per user**

In [None]:
user_set = sorted(set(user_of_playlists2))
#user_set

In [None]:
#user_of_playlists2

In [None]:
# Per-user evaluation on the dev playlists of every user with at least 2 playlists.
Y_pla = pla.predict(X_dev)[:, col_start:]
dev_col_start = len(train_playlists)  # dev playlists follow the training ones in user_of_playlists2
for u in user_set:
    # Compare ids in plain Python: np.object was removed in NumPy 1.24, and an object
    # array built from tuple ids would be expanded into a 2-D array by NumPy.
    uind = np.array([ix for ix, owner in enumerate(user_of_playlists2) if owner == u], dtype=int)
    ntotal = len(uind)
    if ntotal < 2: continue  # filtering out users with less than 2 playlists
    # Keep the dev playlists only, re-indexed relative to the first dev column.
    uind = uind[uind >= dev_col_start] - dev_col_start
    ntest = len(uind)
    #print(uind)
    if ntest < 1: continue
    ntrain = ntotal - ntest  # previously the total number of playlists was reported as #train
    print('--------------------')
    print('USER:', u)
    print('#train: %d, #test: %d' % (ntrain, ntest))
    eval_pl(Y_dev[:, uind], Y_pla[:, uind])
    print()