# Use a nice subset of AotM-2011 Playlists with MSD Audio Features

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os, sys
import gzip
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from scipy.sparse import lil_matrix, issparse
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
sys.path.append('src')
from BinaryRelevance import BinaryRelevance
from PClassificationMLC import PClassificationMLC
from evaluate import calc_F1, calc_precisionK, f1_score_nowarn

In [3]:
data_dir = 'data/aotm-2011'
#faotm = os.path.join(data_dir, 'aotm2011-subset.pkl')
faotm = os.path.join(data_dir, 'aotm2011-user-playlist.pkl')
ffeature = 'data/msd/songID2Features.pkl.gz'

## Load playlists

Load playlists.

In [4]:
user_playlists = pkl.load(open(faotm, 'rb'))

In [5]:
print('#user    :', len(user_playlists))
print('#playlist:', np.sum([len(user_playlists[u]) for u in user_playlists]))

#user    : 14182
#playlist: 84710


In [6]:
pl_lengths = [len(pl) for u in user_playlists for pl in user_playlists[u]]
#plt.hist(pl_lengths, bins=100)
print('Average playlist length: %.1f' % np.mean(pl_lengths))

Average playlist length: 10.1


In [7]:
users = sorted(user_playlists.keys())

In [8]:
songs_user = {u: {sid for pl in user_playlists[u] for sid in pl} for u in users}  # user: a set of songs

Compute the number of playlists per user, and the number of songs covered by the user's playlists.

In [9]:
udf = pd.DataFrame(index=users, columns=['#playlist', '#song'])

In [10]:
udf['#playlist'] = [len(user_playlists[u]) for u in users]

In [11]:
udf['#song'] = [len(songs_user[u]) for u in users]

In [12]:
udf_subset = udf[udf['#playlist'] == 50]
udf_subset

Unnamed: 0,#playlist,#song
"(969886800.0, Kelly12345678906)",50,512
"(998920800.0, Mark Petruccelli)",50,474
"(1057759200.0, MattL)",50,515
"(1124719200.0, El Santo)",50,516


In [13]:
#udf.sort_values(by=['#playlist'], ascending=False).iloc[100:200]

In [14]:
uid_subset = udf_subset.index[0]
uid_subset

(969886800.0, 'Kelly12345678906')

In [15]:
#udf[uid_subset]  # tuple are used as multiindex in pandas
#udf[[uid_subset]]

## Subset of data

The user whose playlists cover a *proper number of playlists*, e.g. 50.

In [16]:
playlists_subset = user_playlists[uid_subset]

In [17]:
song_set = sorted(songs_user[uid_subset])

In [18]:
len(song_set)

512

In [19]:
#song_set

## Load song features

Load `song_id` --> `feature array` mapping: map a song to the audio features of one of its corresponding tracks in MSD.

In [20]:
song2Features = pkl.load(gzip.open(ffeature, 'rb'))

The set of songs, which is the set of labels in this formulation.

## Split song-playlist matrix

Songs as rows, playlists as columns, split rows.

In [21]:
def gen_dataset_subset(playlists, song_set, features_MSD):
    """
    Create labelled dataset: rows are songs, columns are users.
    
    Input:
        - playlists: a set of playlists
        - song_set: a set of songIDs
        - features_MSD: dictionary that maps songIDs to features from MSD
    Output:
        - (Feature, Label) pair (X, Y)
          X: #songs by #features
          Y: #songs by #users
    """
    song_indices = {sid: ix for ix, sid in enumerate(song_set)}
    N = len(song_set)
    K = len(playlists)
    
    X = np.array([features_MSD[sid] for sid in song_set])
    Y = np.zeros((N, K), dtype=np.bool)
    
    for k in range(K):
        pl = playlists[k]
        indices = [song_indices[sid] for sid in pl]
        Y[indices, k] = True

    return X, Y

In [22]:
X, Y = gen_dataset_subset(playlists=playlists_subset, song_set=song_set, features_MSD=song2Features)

# data split: approximately 80/20 for training/dev
X_train, X_dev, Y_train, Y_dev = train_test_split(X, Y, test_size=0.2, random_state=8976321)

# feature normalisation
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = 10 * np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
X_dev   -= X_train_mean
X_dev   /= X_train_std

In [23]:
print('Train: %15s %15s' % (X_train.shape, Y_train.shape))
print('Dev  : %15s %15s' % (X_dev.shape,   Y_dev.shape))

Train:      (409, 202)       (409, 50)
Dev  :      (103, 202)       (103, 50)


In [24]:
print(np.mean(np.mean(X_train, axis=0)))
print(np.mean( np.std(X_train, axis=0)) - 0.1)
print(np.mean(np.mean(X_dev, axis=0)))
print(np.mean( np.std(X_dev, axis=0)) - 0.1)

2.66663099048e-17
-0.00346567911462
-0.00116424300265
-0.00576218173743


## BR - Independent logistic regression

Independent logistic regression.

In [25]:
br = BinaryRelevance(n_jobs=4)
br.fit(X_train, Y_train)

In [26]:
Y_br = br.predict(X_dev)

In [27]:
np.mean(calc_precisionK(Y_dev.T, Y_br.T))

0.13661904761904761

In [29]:
f1_score_nowarn(Y_dev.ravel(), (Y_br>=0).ravel(), average='binary')

0.0

In [30]:
precision_recall_fscore_support(Y_dev.ravel(), (Y_br>=0).ravel(), average='binary', warn_for=None)

(0.0, 0.0, 0.0, None)

In [31]:
#np.mean(calc_F1(Y_train, br.predict(X_train) >= 0))

In [32]:
%%script false
min_pos_score = []
for col in range(Y_dev.shape[1]):
    val = Y_br[:,col][Y_dev[:,col]]
    if len(val) > 0:
        min_pos_score.append(np.min(val))
    else:
        min_pos_score.append(np.nan)
print(np.array(min_pos_score))

In [33]:
%%script false
max_neg_score = []
for col in range(Y_dev.shape[1]):
    val = Y_br[:,col][np.logical_not(Y_dev[:,col])]
    if len(val) > 0:
        max_neg_score.append(np.max(val))
print(np.array(max_neg_score))

In [34]:
#print(np.array(min_pos_score)-np.array(max_neg_score))

## PC - Multilabel p-classification

P-Classification ~ P-norm push ranking.

In [35]:
pc1 = PClassificationMLC(C=1, weighting=True, verticalWeighting=True)
pc1.fit(X_train, Y_train)


C: 1, p: 1, weighting: True


In [36]:
X_test = X_dev
Y_test = Y_dev

In [37]:
np.sum(Y_dev, axis=0)

array([6, 1, 2, 5, 1, 2, 4, 1, 3, 1, 2, 1, 4, 3, 3, 4, 3, 3, 1, 2, 2, 1, 1,
       3, 3, 2, 5, 0, 3, 1, 2, 0, 2, 1, 1, 4, 4, 3, 1, 2, 2, 7, 3, 3, 0, 4,
       2, 2, 3, 3])

In [38]:
Y_pc = pc1.predict(X_test)

In [39]:
np.mean(calc_precisionK(Y_test.T, Y_pc.T))

0.13709523809523808

In [40]:
precision_recall_fscore_support(Y_dev.ravel(), (Y_pc>=0).ravel(), average='binary', warn_for=None)

(0.056320400500625784, 0.36885245901639346, 0.097719869706840393, None)

In [41]:
#np.mean(calc_F1(Y_dev, Y_pc >= 0))

In [42]:
#np.mean(calc_F1(Y_train, pc.predict(X_train) >= 0))

In [110]:
min_pos_score = []
for col in range(Y_test.shape[1]):
    val = Y_pc[:,col][Y_test[:,col]]
    if len(val) > 0:
        min_pos_score.append(np.min(val))
    else:
        min_pos_score.append(np.nan)
#plt.hist((np.array(min_pos_score)))
#plt.hist((np.nan_to_num(min_pos_score)), bins=30)
#print(np.array(min_pos_score))
#print()

In [111]:
max_neg_score = []
for col in range(Y_test.shape[1]):
    val = Y_pc[:,col][np.logical_not(Y_test[:,col])]
    if len(val) > 0:
        max_neg_score.append(np.max(val))
#plt.hist(np.array(max_neg_score), bins=30)
#print()

In [112]:
#plt.hist(np.nan_to_num(min_pos_score)-np.array(max_neg_score), bins=30)
#print()

## Multilabel p-classification with unknows in test set

In [46]:
N, K = Y.shape

In [47]:
#type(np.nan)

In [48]:
Y_nan = Y.copy().astype(np.float)
np.random.seed(8967321)
rand_num = int(0.2 * N)
ones = 0
for k in range(K):
    randix = np.random.permutation(np.arange(N))[:rand_num]
    Y_nan[randix, k] = np.nan
    ones += Y[randix, k].sum()

In [49]:
#np.sum(Y, axis=0)

In [50]:
#np.nansum(Y_nan, axis=0)

In [51]:
#np.sum(Y, axis=0) - np.nansum(Y_nan, axis=0)

In [52]:
#ones  # number of positive entries selected to be masked as NaN

In [53]:
#Y.shape

In [54]:
#Y_nan.shape

The number of NaN entries.

In [55]:
np.sum(np.isnan(Y_nan))

5100

In [56]:
#np.sum(Y)

Train: *keep running util no overflow warning occurred*.

In [57]:
pc2 = PClassificationMLC(weighting=True, verticalWeighting=True)
pc2.fit(X, Y_nan)


C: 1, p: 1, weighting: True


Prediction: use the minimum of positive entry score of the same example as threshold.  
Evaluation: use F1 on all unknown entries (as a 1D array).

In [58]:
Y_pred2 = pc2.predict(X)

In [59]:
pos_index = np.nan_to_num(Y_nan).astype(np.bool)
nan_index = np.isnan(Y_nan)

In [60]:
ground_truths = Y[nan_index]

In [61]:
thresholds = []
preds = []
for k in range(K):
    val = Y_pred2[:, k][pos_index[:, k]]
    th = np.min(val)
    thresholds.append(th)
    preds += (Y_pred2[nan_index[:,k], k] >= th).tolist()

In [62]:
f1_score_nowarn(ground_truths, preds, average='binary')

0.06377551020408162

In [63]:
precision_recall_fscore_support(ground_truths, preds, average='binary', warn_for=None)

(0.037313432835820892, 0.21929824561403508, 0.06377551020408162, None)

## Multilabel p-classification with some playlist fully observed

In [64]:
N, K = Y.shape

In [65]:
Y_nan_part = Y.copy().astype(np.float)
np.random.seed(8967321)
nan_num = int(0.4 * N)
indices = np.arange(N)[-nan_num:]
ones = 0
for k in range(int(K/2), K):
    Y_nan_part[indices, k] = np.nan
    ones += Y[indices, k].sum()

In [66]:
#np.sum(Y, axis=0)

In [67]:
#np.nansum(Y_nan_part, axis=0)

In [68]:
#np.sum(Y, axis=0) - np.nansum(Y_nan_part, axis=0)

In [69]:
#np.sum(np.isnan(Y_nan_part),axis=0)

In [70]:
#ones  # number of positive entries selected to be masked as NaN

In [71]:
#Y.shape

In [72]:
#Y_nan.shape

The number of NaN entries.

In [73]:
np.sum(np.isnan(Y_nan_part))

5100

In [74]:
#np.sum(Y)

Train: *keep running util no overflow warning occurred*.

In [75]:
pc3 = PClassificationMLC(weighting=True, verticalWeighting=True)
pc3.fit(X, Y_nan_part)


C: 1, p: 1, weighting: True


Prediction: use the minimum of positive entry score of the same example as threshold.  
Evaluation: use F1 on all unknown entries (as a 1D array).

In [76]:
Y_pred3 = pc3.predict(X)

In [77]:
pos_index = np.nan_to_num(Y_nan_part).astype(np.bool)
nan_index = np.isnan(Y_nan_part)

In [78]:
ground_truths = Y[nan_index]

In [79]:
thresholds = []
preds = []
for k in range(int(K/2), K):
    val = Y_pred3[:, k][pos_index[:, k]]
    th = np.min(val)
    #th = np.mean(val)
    thresholds.append(th)
    preds += (Y_pred3[nan_index[:,k], k] >= th).tolist()

In [80]:
#np.sum(ground_truths)

In [81]:
#np.sum(preds)

In [83]:
f1_score_nowarn(ground_truths, preds, average='binary')

0.041189931350114416

In [84]:
len(preds)

5100

In [85]:
precision_recall_fscore_support(ground_truths, preds, average='binary', warn_for=None)

(0.028125000000000001, 0.076923076923076927, 0.041189931350114416, None)

## Multilabel p-classification with (some playlist fully observed) and (unknowns in test set)

In [86]:
N, K = Y.shape

In [87]:
Y_nan_part2 = Y.copy().astype(np.float)
np.random.seed(8967321)
rand_num = int(0.4 * N)
ones = 0
for k in range(int(K/2), K):
    randix = np.random.permutation(np.arange(N))[:rand_num]
    Y_nan_part2[randix, k] = np.nan
    ones += Y[randix, k].sum()

In [88]:
#np.sum(Y, axis=0)

In [89]:
#np.nansum(Y_nan_part, axis=0)

In [90]:
#np.sum(Y, axis=0) - np.nansum(Y_nan_part, axis=0)

In [91]:
#ones  # number of positive entries selected to be masked as NaN

In [92]:
#Y.shape

In [93]:
#Y_nan.shape

The number of NaN entries.

In [94]:
np.sum(np.isnan(Y_nan_part2))

5100

In [95]:
#np.sum(Y)

In [97]:
pc4 = PClassificationMLC(weighting=True, verticalWeighting=True)
pc4.fit(X, Y_nan_part2)


C: 1, p: 1, weighting: True


Prediction: use the minimum of positive entry score of the same example as threshold.  
Evaluation: use F1 on all unknown entries (as a 1D array).

In [98]:
Y_pred4 = pc4.predict(X)

In [99]:
pos_index = np.nan_to_num(Y_nan_part2).astype(np.bool)
nan_index = np.isnan(Y_nan_part2)

In [100]:
ground_truths = Y[nan_index]

In [101]:
thresholds = []
preds = []
for k in range(int(K/2), K):
    val = Y_pred4[:, k][pos_index[:, k]]
    th = np.min(val)
    #th = np.mean(val)
    thresholds.append(th)
    preds += (Y_pred4[nan_index[:,k], k] >= th).tolist()

In [102]:
#np.sum(ground_truths)

In [103]:
#np.sum(preds)

In [104]:
f1_score_nowarn(ground_truths, preds, average='binary')

0.029250457038391225

In [105]:
precision_recall_fscore_support(ground_truths, preds, average='binary', warn_for=None)

(0.018518518518518517, 0.069565217391304349, 0.029250457038391225, None)

In [106]:
np.sum(ground_truths)

115

In [107]:
np.sum(preds)

432

In [108]:
np.sum(np.logical_and(ground_truths, preds))

8