# Use a nice subset of AotM-2011 Playlists with MSD Audio Features

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os, sys
import gzip
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.sparse import lil_matrix, issparse
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

In [74]:
sys.path.append('src')
from BinaryRelevance import BinaryRelevance
from PClassificationMLC import PClassificationMLC
from evaluate import calc_F1, calc_precisionK

In [3]:
# Input locations, relative to the working directory.
data_dir = 'data/aotm-2011'
# gzipped pickle mapping song IDs to MSD audio features
ffeature = 'data/msd/songID2Features.pkl.gz'
# Playlists grouped by user ('aotm2011-subset.pkl' in the same directory is an alternative input).
faotm = os.path.join(data_dir, 'aotm2011-user-playlist.pkl')

## Load playlists

Load playlists.

In [4]:
# Read the pickled user -> playlists mapping; the context manager closes the file
# handle as soon as loading is done instead of leaving it to the garbage collector.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open(faotm, 'rb') as fd:
    user_playlists = pkl.load(fd)

In [5]:
# Dataset size: number of users, and number of playlists summed over all users.
print('#user    :', len(user_playlists))
print('#playlist:', np.sum([len(pls) for pls in user_playlists.values()]))

#user    : 14182
#playlist: 84710


In [6]:
# Length of every playlist, pooled over all users.
pl_lengths = []
for pls in user_playlists.values():
    pl_lengths.extend(len(pl) for pl in pls)
#plt.hist(pl_lengths, bins=100)
print('Average playlist length: %.1f' % np.mean(pl_lengths))

Average playlist length: 10.1


In [7]:
# Fix a deterministic (sorted) order of the user keys; it is reused as the row order below.
users = sorted(user_playlists)

In [15]:
# user -> set of distinct songs appearing in any of that user's playlists
songs_user = {u: set().union(*user_playlists[u]) for u in users}

Compute the number of playlists per user, and the number of songs covered by the user's playlists.

In [16]:
# Per-user summary table with one row per user; the two (initially empty) columns
# are filled in by the next two cells.
udf = pd.DataFrame(index=users, columns=['#playlist', '#song'])

In [17]:
# Number of playlists of each user, in the same order as the rows of `udf`.
udf['#playlist'] = [len(user_playlists[user]) for user in users]

In [18]:
# Number of distinct songs covered by each user's playlists, in the same order as the rows of `udf`.
udf['#song'] = [len(songs_user[user]) for user in users]

In [22]:
# Keep only the users who have exactly 50 playlists, then display them.
has_50_playlists = udf['#playlist'] == 50
udf_subset = udf[has_50_playlists]
udf_subset

Unnamed: 0,#playlist,#song
"(969886800.0, Kelly12345678906)",50,512
"(998920800.0, Mark Petruccelli)",50,474
"(1057759200.0, MattL)",50,515
"(1124719200.0, El Santo)",50,516


In [73]:
#udf.sort_values(by=['#playlist'], ascending=False).iloc[100:200]

In [31]:
# Pick the first user of the 50-playlist subset; the rest of the notebook works on
# this single user's playlists.
uid_subset = udf_subset.index[0]
uid_subset

(969886800.0, 'Kelly12345678906')

In [43]:
#udf[uid_subset]  # tuple are used as multiindex in pandas
#udf[[uid_subset]]

## Subset of data

Select a user with a *suitable number of playlists* (e.g. 50) and use that user's playlists as the data subset.

In [44]:
# All playlists of the selected user.
playlists_subset = user_playlists[uid_subset]

In [46]:
# Distinct songs of the selected user as a sorted list, so that row indices are reproducible.
song_set = list(songs_user[uid_subset])
song_set.sort()

In [47]:
len(song_set)  # number of distinct songs in the selected user's playlists

512

In [48]:
#song_set

## Load song features

Load `song_id` --> `feature array` mapping: map a song to the audio features of one of its corresponding tracks in MSD.

In [49]:
# Load the song ID -> feature array mapping; the context manager closes the gzip
# stream as soon as loading is done.
# NOTE: unpickling can execute arbitrary code, so only load this file from a trusted source.
with gzip.open(ffeature, 'rb') as fd:
    song2Features = pkl.load(fd)

The set of songs, which is the set of labels in this formulation.

## Split song-playlist matrix

Songs as rows, playlists as columns, split rows.

In [50]:
def gen_dataset_subset(playlists, song_set, features_MSD):
    """
    Create labelled dataset: rows are songs, columns are playlists.

    Input:
        - playlists: a sequence of playlists, each an iterable of songIDs;
                     every songID must also be in `song_set`
        - song_set: an ordered sequence of unique songIDs; it fixes the row order of X and Y
        - features_MSD: dictionary that maps songIDs to features from MSD
    Output:
        - (Feature, Label) pair (X, Y)
          X: #songs by #features
          Y: #songs by #playlists, boolean; Y[n, k] is True iff song n is in playlist k
    Raises:
        - KeyError if a song of `song_set` has no entry in `features_MSD`, or if a
          playlist contains a songID that is not in `song_set`
    """
    song_indices = {sid: ix for ix, sid in enumerate(song_set)}
    N = len(song_set)
    K = len(playlists)

    X = np.array([features_MSD[sid] for sid in song_set])
    # Use the builtin `bool`: the `np.bool` alias is deprecated since NumPy 1.20 and
    # does not exist in NumPy 1.24-1.26, whereas `bool` works on every version.
    Y = np.zeros((N, K), dtype=bool)

    for k, pl in enumerate(playlists):
        # rows of the songs in playlist k
        indices = [song_indices[sid] for sid in pl]
        Y[indices, k] = True

    return X, Y

In [63]:
# Song-by-feature matrix X and song-by-playlist label matrix Y for the selected user.
X, Y = gen_dataset_subset(playlists=playlists_subset, song_set=song_set, features_MSD=song2Features)

# data split: approximately 80/20 for training/dev
X_train, X_dev, Y_train, Y_dev = train_test_split(X, Y, test_size=0.2, random_state=89765432)

# feature normalisation: both splits are standardised in place with the statistics
# of the training split only; the small constant avoids division by zero
X_train_mean = X_train.mean(axis=0, keepdims=True)
X_train_std = X_train.std(axis=0, keepdims=True) + 10 ** (-6)
for X_split in (X_train, X_dev):
    X_split -= X_train_mean
    X_split /= X_train_std

In [64]:
# Shapes of the (feature, label) matrices of each split.
train_shapes = (X_train.shape, Y_train.shape)
dev_shapes = (X_dev.shape, Y_dev.shape)
print('Train: %15s %15s' % train_shapes)
print('Dev  : %15s %15s' % dev_shapes)

Train:      (409, 202)       (409, 50)
Dev  :      (103, 202)       (103, 50)


In [68]:
# Sanity check of the normalisation: average per-feature mean, and average per-feature
# standard deviation minus 1, first for the training split and then for the dev split.
print(X_train.mean(axis=0).mean())
print(X_train.std(axis=0).mean() - 1)
print(X_dev.mean(axis=0).mean())
print(X_dev.std(axis=0).mean() - 1)

-0.09543775861387005

## BR - Independent logistic regression

Independent logistic regression.

In [69]:
# Fit the binary-relevance baseline on the training split.
# NOTE(review): BinaryRelevance is a project class not shown here; n_jobs=4 presumably
# sets the number of parallel workers — confirm.
br = BinaryRelevance(n_jobs=4)
br.fit(X_train, Y_train)

In [70]:
# Predictions of the fitted baseline for the dev songs.
Y_br = br.predict(X_dev)

In [72]:
# Mean of the values returned by calc_precisionK on the dev split.
# NOTE(review): presumably one Precision@K value per example or per label — confirm in the `evaluate` module.
np.mean(calc_precisionK(Y_dev, Y_br))

0.08414239482200649

## PC - Multilabel p-classification

P-Classification ~ P-norm push ranking.

In [75]:
# Fit the p-classification model on the training split; no constructor arguments are
# passed, so the class defaults are used.
pc = PClassificationMLC()
pc.fit(X_train, Y_train)


C: 1, p: 1, weighting: True


In [76]:
# Predictions of the fitted p-classification model for the dev songs.
Y_pc = pc.predict(X_dev)

In [77]:
# Mean of the values returned by calc_precisionK on the dev split; computed the same
# way as for the binary-relevance predictions, so the two numbers are comparable.
np.mean(calc_precisionK(Y_dev, Y_pc))

0.1423948220064725

## Multilabel p-classification with unknowns in test set

In [169]:
# N_dev: number of dev songs (rows); K_dev: number of playlists (columns).
N_dev, K_dev = Y_dev.shape

In [170]:
#type(np.nan)

In [171]:
# Float copy of the dev labels so that unknown entries can be marked with NaN.
# `astype` already returns a new array, so no separate copy is needed. The builtin
# `float` is used because the `np.float` alias was removed in NumPy 1.24.
Y_dev_nan = Y_dev.astype(float)

In [172]:
np.random.seed(89673215)  # fixed seed: the same entries are masked on every run
rand_num = int(N_dev * 0.2)  # number of entries to hide in each column
all_rows = np.arange(N_dev)
ones = 0
for k in range(K_dev):
    # draw a fresh random subset of rows for this column and mark it as unknown
    randix = np.random.permutation(all_rows)[:rand_num]
    Y_dev_nan[randix, k] = np.nan
    # count how many of the hidden entries were positive in the true labels
    ones += Y_dev[randix, k].sum()

In [173]:
ones  # number of positive (True) entries of Y_dev that were masked as NaN in Y_dev_nan

206

In [174]:
# Total number of masked (NaN) entries in the dev label matrix.
np.isnan(Y_dev_nan).sum()

1000

In [176]:
#Y_dev_nan

In [182]:
# Stack the fully observed training labels on top of the partially masked dev labels.
# Rows are therefore in train-then-dev order, not in the original order of `X`.
# The builtin `float` is used because the `np.float` alias was removed in NumPy 1.24.
Y_nan = np.concatenate([Y_train.astype(float), Y_dev_nan], axis=0)

In [183]:
# NOTE(review): equal row counts do not mean the rows are aligned; Y_nan is in
# train-then-dev order because it is the concatenation of Y_train and Y_dev_nan.
print(X.shape, Y_nan.shape)

(512, 202) (512, 50)


In [184]:
# Y_nan stacks the training labels on top of the masked dev labels, so the features
# must be stacked in the same row order. The raw `X` must not be used here: it is in
# the original song order (train_test_split shuffles rows by default) and is not
# normalised, unlike the X_dev that is later passed to predict().
X_all = np.concatenate([X_train, X_dev], axis=0)
pc = PClassificationMLC()
pc.fit(X_all, Y_nan)


C: 1, p: 1, weighting: True


  T2 = np.multiply(Yp, np.exp(-T1p))  # N by K


In [76]:
# Predictions for the dev songs from the model fitted on the partially observed labels;
# this overwrites the Y_pc computed by the earlier, fully supervised model.
Y_pc = pc.predict(X_dev)

Prediction: for each example, use the minimum score among its positive entries as the threshold.  
Evaluation: compute micro-F1 over all unknown entries.

## Multilabel p-classification with some playlists fully observed

## Multilabel p-classification with (some playlists fully observed) and (unknowns in test set)