# Train/Dev/Test split using AotM-2011 Playlists & MSD Audio Features

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os, sys
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.sparse import lil_matrix, issparse

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Input locations (relative paths).
data_dir = 'data/aotm-2011'
# pickle file holding the playlists
faotm = os.path.join(data_dir, 'aotm2011-subset.pkl')
# pickle file holding the song -> feature mapping
ffeature = 'data/msd/songID2Features.pkl'

## Data loading

Load playlists.

In [None]:
# Load the pickled playlists.  The context manager closes the file handle,
# which a bare open() call would leave open.
# NOTE: only unpickle files from a trusted source.
with open(faotm, 'rb') as fd:
    playlists = pkl.load(fd)

In [None]:
print('#Playlists: %d' % len(playlists))

In [None]:
playlists[0]

In [None]:
#print('#Songs: %d' % len({songID for p in playlists for songID in p['filtered_lists'][0]}))

In [None]:
# Histogram of the playlist lengths, plus their average.
lengths = list(map(len, playlists))
plt.hist(lengths, bins=20)
print('Average playlist length: %.1f' % np.mean(lengths))

**NOTE**: some playlists contain duplicate songs (compare the two totals computed below).

In [None]:
np.sum([len(pl) for pl in playlists])

In [None]:
np.sum([len(set(pl)) for pl in playlists])

In [None]:
lengths = [len(pl) for pl in playlists]

In [None]:
(np.min(lengths), np.max(lengths), np.mean(lengths))

Load `song_id` --> `feature array` mapping: map a song to the audio features of one of its corresponding tracks in MSD.

In [None]:
# Load the pickled song -> feature mapping.  The context manager closes the
# file handle, which a bare open() call would leave open.
# NOTE: only unpickle files from a trusted source.
with open(ffeature, 'rb') as fd:
    song2Features = pkl.load(fd)

The set of songs, which is the set of labels in this formulation.

In [None]:
#song_set = sorted(song2Features.keys())  # use MSD songs as label space
song_set = sorted({sid for pl in playlists for sid in pl})   # use the intersection of MSD and AotM as label space

In [None]:
len(song_set)

## Setting 1

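The labels are songs: each playlist is one example, represented by the audio features of its seed (first) song, and its label vector marks which songs of the label space appear in that playlist.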

In [None]:
# songID -> column index of that song in the label matrix (position in song_set)
label_indices = dict(zip(song_set, range(len(song_set))))

In [None]:
list(label_indices.items())[:10]

In [None]:
def gen_training_set(playlists, label_indices, features):
    """
        Create the labelled dataset: rows are playlists, columns are songs

        Input:
            - playlists: sequence of playlists, each a non-empty sequence of songIDs
            - label_indices: a dictionary that maps a songID to the index of the corresponding label
            - features: a dictionary that maps a songID to its feature vector

        Output:
            - (Feature, Label) pair (X, Y), each with one row per playlist
              X (numpy array) comprises the features for each seed song (the 1st in playlist)
              Y (CSR sparse matrix, int8) comprises the indicators of whether the given song
              is present in the respective playlist; songs that are not in label_indices
              are ignored

        Raises:
            - IndexError if a playlist is empty (it has no seed song)
            - KeyError if a seed song has no entry in features
    """

    N = len(playlists)
    K = len(label_indices)

    X = []
    # LIL format for the row-by-row fill; converted to CSR on return
    Y = lil_matrix((N, K), dtype=np.int8)

    for i, playlist in enumerate(playlists):
        # progress report every 1000 playlists
        if (i + 1) % 1000 == 0:
            sys.stdout.write('\r%d / %d' % (i + 1, N))
            sys.stdout.flush()

        # the seed song is the first song of the playlist
        X.append(features[playlist[0]])

        indices = [label_indices[s] for s in playlist if s in label_indices]
        if indices:
            # nothing to mark when no song of the playlist is in the label space
            Y[i, indices] = 1

    return np.array(X), Y.tocsr()

In [None]:
#test_dict = {1: 0, 2: 1, 3: 2}
#[test_dict[s] for s in [1, 2, 5] if s in test_dict]

In [None]:
# Cache files for the Setting 1 train/dev/test split (X: features, Y: labels).
fdir = os.path.join(data_dir, 'setting1')
fxtrain = os.path.join(fdir, 'X_train_audio.pkl')
fytrain = os.path.join(fdir, 'Y_train_audio.pkl')
fxdev   = os.path.join(fdir, 'X_dev_audio.pkl')
fydev   = os.path.join(fdir, 'Y_dev_audio.pkl')
fxtest  = os.path.join(fdir, 'X_test_audio.pkl')
fytest  = os.path.join(fdir, 'Y_test_audio.pkl')

In [None]:
# Load the Setting 1 split from the cache files if all of them exist;
# otherwise build the dataset, split and normalise it, and write the cache.
split_files = [fxtrain, fytrain, fxdev, fydev, fxtest, fytest]

if all(os.path.exists(fname) for fname in split_files):
    # NOTE: the unsplit X and Y are not (re)defined on this path.
    cached = []
    for fname in split_files:
        # context manager so that every file handle is closed after reading
        with open(fname, 'rb') as fd:
            cached.append(pkl.load(fd))
    X_train, Y_train, X_dev, Y_dev, X_test, Y_test = cached
else:
    # generate dataset
    X, Y = gen_training_set(playlists=playlists, label_indices=label_indices, features=song2Features)

    # data split: 70% for training, the remaining 30% is split 35/65 into dev/test,
    # i.e. approximately 70/10/20 for training/dev/test
    # by fixing random seed, the same playlists will be in the test set each time
    X_train, X_other, Y_train, Y_other = train_test_split(X, Y, test_size=0.3, random_state=123456789)
    X_dev,   X_test,  Y_dev,   Y_test  = train_test_split(X_other, Y_other, test_size=0.65, random_state=987654321)

    # feature normalisation with the statistics of the training split only;
    # the small constant avoids division by zero for constant features.
    # Computed out of place so that integer-valued feature arrays also work.
    X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
    X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
    X_train = (X_train - X_train_mean) / X_train_std
    X_dev   = (X_dev   - X_train_mean) / X_train_std
    X_test  = (X_test  - X_train_mean) / X_train_std

    # save to files, creating the output directory first if it is missing
    for fname, obj in zip(split_files, [X_train, Y_train, X_dev, Y_dev, X_test, Y_test]):
        out_dir = os.path.dirname(fname)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(fname, 'wb') as fd:
            pkl.dump(obj, fd)

In [None]:
print('Train: %15s %15s' % (X_train.shape, Y_train.shape))
print('Dev  : %15s %15s' % (X_dev.shape,   Y_dev.shape))
print('Test : %15s %15s' % (X_test.shape,  Y_test.shape))

In [None]:
np.mean(np.mean(X_train, axis=0))

In [None]:
np.mean(np.std(X_train, axis=0)) - 1

In [None]:
np.mean(np.mean(X_dev, axis=0))

In [None]:
np.mean(np.std(X_dev, axis=0)) - 1

In [None]:
np.mean(np.mean(X_test, axis=0))

In [None]:
np.mean(np.std(X_test, axis=0)) - 1

## Setting 2

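The labels are playlists: each song is one example, represented by its audio features, and its label vector marks which playlists contain that song.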

In [None]:
def gen_training_set2(playlists, features):
    """
        Create the labelled dataset: rows are songs, columns are playlists

        Input:
            - playlists: sequence of playlists, each a sequence of songIDs
            - features: a dictionary that maps a songID to its feature vector

        Output:
            - (Feature, Label) pair (X, Y), each with one row per distinct song
              found in the playlists (rows are ordered by sorted songID)
              X (numpy array) comprises the features for each song
              Y (CSR sparse matrix, int8) comprises the indicators of whether the given song
              is present in the respective playlist

        Raises:
            - KeyError if a song in the playlists has no entry in features
    """

    song_set = sorted({sid for pl in playlists for sid in pl})
    N = len(song_set)
    K = len(playlists)

    # songID -> indices of the playlists that contain it; iterating over set(pl)
    # records a playlist once even if the song is repeated inside it
    songInPlaylist = {sid: [] for sid in song_set}
    for j, pl in enumerate(playlists):
        for sid in set(pl):
            songInPlaylist[sid].append(j)

    X = []
    # LIL format for the row-by-row fill; converted to CSR on return
    Y = lil_matrix((N, K), dtype=np.int8)
    for i, sid in enumerate(song_set):
        # progress report every 1000 songs
        if (i + 1) % 1000 == 0:
            sys.stdout.write('\r%d / %d' % (i + 1, N))
            sys.stdout.flush()
        X.append(features[sid])
        Y[i, songInPlaylist[sid]] = 1

    return np.array(X), Y.tocsr()

In [None]:
# Cache files for the Setting 2 train/dev/test split (X: features, Y: labels).
# These reuse (and overwrite) the path variable names of Setting 1.
fdir = os.path.join(data_dir, 'setting2')
fxtrain = os.path.join(fdir, 'X_train_audio.pkl')
fytrain = os.path.join(fdir, 'Y_train_audio.pkl')
fxdev   = os.path.join(fdir, 'X_dev_audio.pkl')
fydev   = os.path.join(fdir, 'Y_dev_audio.pkl')
fxtest  = os.path.join(fdir, 'X_test_audio.pkl')
fytest  = os.path.join(fdir, 'Y_test_audio.pkl')

In [None]:
# Load the Setting 2 split from the cache files if all of them exist;
# otherwise build the dataset, split and normalise it, and write the cache.
split_files = [fxtrain, fytrain, fxdev, fydev, fxtest, fytest]

if all(os.path.exists(fname) for fname in split_files):
    # NOTE: the unsplit X and Y are not (re)defined on this path.
    cached = []
    for fname in split_files:
        # context manager so that every file handle is closed after reading
        with open(fname, 'rb') as fd:
            cached.append(pkl.load(fd))
    X_train, Y_train, X_dev, Y_dev, X_test, Y_test = cached
else:
    # generate dataset
    X, Y = gen_training_set2(playlists=playlists, features=song2Features)

    # data split: 70% for training, the remaining 30% is split 35/65 into dev/test,
    # i.e. approximately 70/10/20 for training/dev/test
    # by fixing random seed, the same songs (rows) will be in the test set each time
    X_train, X_other, Y_train, Y_other = train_test_split(X, Y, test_size=0.3, random_state=59)
    X_dev,   X_test,  Y_dev,   Y_test  = train_test_split(X_other, Y_other, test_size=0.65, random_state=71)

    # feature normalisation with the statistics of the training split only;
    # the small constant avoids division by zero for constant features.
    # Computed out of place so that integer-valued feature arrays also work.
    X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
    X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
    X_train = (X_train - X_train_mean) / X_train_std
    X_dev   = (X_dev   - X_train_mean) / X_train_std
    X_test  = (X_test  - X_train_mean) / X_train_std

    # save to files, creating the output directory first if it is missing
    for fname, obj in zip(split_files, [X_train, Y_train, X_dev, Y_dev, X_test, Y_test]):
        out_dir = os.path.dirname(fname)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(fname, 'wb') as fd:
            pkl.dump(obj, fd)

In [None]:
print('Train: %15s %15s' % (X_train.shape, Y_train.shape))
print('Dev  : %15s %15s' % (X_dev.shape,   Y_dev.shape))
print('Test : %15s %15s' % (X_test.shape,  Y_test.shape))

In [None]:
# Total number of positive entries in the label matrix.  Summing the three
# splits gives the same value as Y.sum() on the unsplit matrix, and also works
# when the split was loaded from the cache files, where Y is not regenerated
# (and may still hold the label matrix of Setting 1).
Y_train.sum() + Y_dev.sum() + Y_test.sum()

In [None]:
np.mean(np.mean(X_train, axis=0))

In [None]:
np.mean(np.std(X_train, axis=0)) - 1

In [None]:
np.mean(np.mean(X_dev, axis=0))

In [None]:
np.mean(np.std(X_dev, axis=0)) - 1

In [None]:
np.mean(np.mean(X_test, axis=0))

In [None]:
np.mean(np.std(X_test, axis=0)) - 1

### Statistics

Map each song to the indices of the playlists that contain it.

In [None]:
# All distinct song IDs, in sorted order.
song_set = sorted({sid for pl in playlists for sid in pl})
K = len(playlists)

# songID -> indices of the playlists it occurs in (one entry per occurrence)
songInPlaylist = {sid: [] for sid in song_set}
for j, pl in enumerate(playlists):
    for sid in pl:
        songInPlaylist[sid].append(j)

In [None]:
# Split the song IDs into train/dev/test, carrying each song's position in
# song_set along as the second array (dummy_*).
# NOTE(review): the test_size/random_state values match those of the Setting 2
# split, so this presumably reproduces which song ends up in which row of
# X_train / X_dev / X_test -- confirm.
S_train, S_other, dummy_train, dummy_other = train_test_split(song_set, np.arange(len(song_set)), 
                                                              test_size=0.3, random_state=59)
S_dev, S_test, dummy_dev, dummy_test = train_test_split(S_other, dummy_other, test_size=0.65, random_state=71)

In [None]:
len(S_train)

In [None]:
len(S_test)

In [None]:
ix = 0
sid = S_train[ix]
#np.equal(song2Features[sid], X_train[ix])

In [None]:
songInPlaylist[song_set[dummy_train[0]]]

In [None]:
# np.nonzero() needs an array argument; called without one it raises a TypeError.
# NOTE(review): presumably the intent was to list the playlist indices flagged
# in the label row of the first training song, for comparison with the
# songInPlaylist lookup in the previous cell -- confirm.
Y_train[0].nonzero()[1]

In [None]:
f1 = pd.read_csv('data/f1.txt', names=['F1', '0'])

In [None]:
f1 = f1['F1']

In [None]:
pak = pd.read_csv('data/pak.txt', names=['PaK', '0'])
pak = pak['PaK']

In [None]:
# Scatter Precision@K against F1 on identical axis ranges, with the
# diagonal y = x drawn as a dashed reference line.
pad = 0.00005
xmin = np.min([np.min(f1), np.min(pak)]) - pad
xmax = np.max([np.max(f1), np.max(pak)]) + pad
limits = [xmin, xmax]
plt.xlim(limits)
plt.ylim(limits)
plt.plot(limits, limits, ls='--', c='g')
plt.scatter(f1, pak)
plt.xlabel('F1')
plt.ylabel('Precision@K')