# A simple example of generating playlists by multilabel learning (TopPush)

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, f1_score, make_scorer, label_ranking_loss
from scipy.sparse import coo_matrix

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
sys.path.append('src')
from TopPushMLC import TopPushMLC
from evaluate import evaluatePrecision, evalPred

In [None]:
# Root directory of the datasets used in this notebook
data_dir = 'data'
# pickled playlists
faotm = os.path.join(data_dir, 'aotm-2011/aotm-2011-subset.pkl')
# pickled `song_id` --> `track_id`(s) mapping
fmap  = os.path.join(data_dir, 'aotm-2011/songID2TrackID.pkl')
# text file with one tab-separated (track_id, tag) record per line
ftag  = os.path.join(data_dir, 'msd/msd_tagtraum_cd2c.cls')

## Data loading

Load playlists.

In [None]:
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle as soon as loading is done.
with open(faotm, 'rb') as f:
    playlists = pkl.load(f)

In [None]:
print('#Playlists: %d' % len(playlists))

In [None]:
playlists[0]

In [None]:
#print('#Songs: %d' % len({songID for p in playlists for songID in p['filtered_lists'][0]}))

In [None]:
# Distribution of playlist lengths (each playlist is a list of song IDs).
lengths = list(map(len, playlists))
mean_length = np.mean(lengths)
plt.hist(lengths, bins=20)
print('Average playlist length: %.1f' % mean_length)

Load `song_id` --> `track_id` mapping: a song may correspond to multiple tracks.

In [None]:
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle as soon as loading is done.
with open(fmap, 'rb') as f:
    song2TrackID = pkl.load(f)

In [None]:
{ k : song2TrackID[k] for k in list(song2TrackID.keys())[:10] }

Load song tags, build `track_id` --> `tag` mapping.

In [None]:
track2Tags = dict()

In [None]:
# Parse the tag file into `track2Tags`: one "<track_id>\t<tag>" record per line.
with open(ftag) as f:
    for line in f:
        record = line.strip()
        # skip comment lines ('#') and blank lines; a blank line would otherwise
        # fail the two-value unpacking below
        if not record or record.startswith('#'):
            continue
        tid, tag = record.split('\t')
        # a later record for the same track overwrites the earlier one
        track2Tags[tid] = tag

In [None]:
print('#(Track, Tag): %d' % len(track2Tags))

In [None]:
{ k : track2Tags[k] for k in list(track2Tags.keys())[:10] }

## Data cleaning

Use the subset of playlists such that the first song (i.e. the *seed* song) in each playlist has tag(s).

In [None]:
subset_ix = []

In [None]:
# Keep the playlists whose seed (first) song has at least one tagged track.
# Both containers are (re)built here, so re-running this cell does not append
# duplicate indices to `subset_ix`.
seedSong2Tag = {}
subset_ix = []
for ix, songIDs in enumerate(playlists):
    # each playlist is the list of its song IDs; the first one is the seed song
    seedSongID   = songIDs[0]
    seedTrackIDs = song2TrackID[seedSongID]

    # a song can have multiple tracks: collect the tags of those tracks that have one
    seedTags = [track2Tags[trackID] for trackID in seedTrackIDs if trackID in track2Tags]
    if not seedTags:
        # no track of the seed song is tagged, this playlist is not used
        continue

    seedSong2Tag[seedSongID] = seedTags
    subset_ix.append(ix)

In [None]:
#seedSong2Tag

In [None]:
playlists_subset = [playlists[ix] for ix in subset_ix]

In [None]:
print('#Playlists used: %d' % len(subset_ix))

In [None]:
playlists_subset[0]

The set of unique songs. **In multilabel learning, we have a label for each song in this set**.

In [None]:
song_set = sorted({songID for p in playlists_subset for songID in p})

In [None]:
print('#Songs used: %d' % len(song_set))

In [None]:
print(song_set[:10])

## Data analysis

For the most part, playlists contain fewer than 10 songs. The most common playlist length is 2 songs.

In [None]:
# Length distribution of the playlists kept after data cleaning.
playlist_lengths = list(map(len, playlists_subset))
mean_playlist_length = np.mean(playlist_lengths)
plt.hist(playlist_lengths, bins=20)
print('Average playlist length: %.1f' % mean_playlist_length)

## One-hot tag encoding

Indicator of tags: `tag` --> `index` mapping.

In [None]:
# the set of unique tags
tag_set = sorted(set(track2Tags.values()))

In [None]:
print('#Tags: %d' % len(tag_set))

In [None]:
tag_indicator = { tag: ix for ix, tag in enumerate(tag_set) }

In [None]:
tag_indicator

## Feature extraction

Build features (1-hot encoding of tag) for a song given its `song_id`.

In [None]:
def gen_features(song_id, song2TrackID = song2TrackID, tag_indicator = tag_indicator):
    """
        Generate the binary tag-indicator feature vector for a given song ID

        Input:
            - song_id: ID of the song, must be a key of `song2TrackID`
            - song2TrackID: mapping from song ID to the track IDs of that song
            - tag_indicator: mapping from tag to its index in the feature vector

        Output:
            - 1-D float array of length len(tag_set); entry i is 1 if at least one
              tagged track of the song has the tag with index i, else 0
              (more than one entry is set if the song's tracks have different tags)

        Raises:
            - KeyError if `song_id` is not in `song2TrackID`
            - AssertionError if none of the song's tracks has a tag
    """

    # the builtin `float` replaces the `np.float` alias, which was removed from NumPy
    features = np.zeros(len(tag_set), dtype = float)

    n_tagged = 0
    for trackID in song2TrackID[song_id]:
        if trackID not in track2Tags:
            continue
        n_tagged += 1
        features[tag_indicator[track2Tags[trackID]]] = 1

    # must have at least one tag for the song, else useless
    assert n_tagged >= 1, 'song %r has no tagged track' % (song_id,)

    return features

In [None]:
def gen_feature_map(song_id, seed):
    """
        Build the feature map of a (label, query) pair

        Input:
            - song_id: the label, i.e. a candidate song ID (currently unused)
            - seed: the query, i.e. the seed song ID

        Output:
            - the feature vector of the seed song, as computed by gen_features
    """

    # trivial feature map that depends on the query only; a possible alternative
    # is gen_features(song_id) - gen_features(seed)
    seed_features = gen_features(seed)
    return seed_features

In [None]:
def gen_training_set(playlists = playlists_subset, song_set = song_set):
    """
        Create the labelled (multilabel) dataset for a collection of playlists

        Input:
            - playlists: which playlists to create features for, the first song of
              each playlist is its seed song
            - song_set: sequence of song IDs, each song is one label (a column of Y)

        Output:
            - (Feature, Label) pair (X, Y), with # num playlists rows
              X is a dense float array with len(tag_set) columns, row i is the
                feature map of the seed song of playlist i
              Y is an int8 CSR matrix with len(song_set) columns, Y[i, k] is 1 if
                song_set[k] is present in playlist i, else 0
    """

    N = len(playlists)
    D = len(tag_set)
    K = len(song_set)

    # the builtin `float` replaces the `np.float` alias, which was removed from NumPy
    X = np.zeros((N, D), dtype = float)
    # empty (N, K) sparse matrix, converted to LIL to be filled row by row;
    # unlike seeding it with a placeholder entry at (0, 0), this also works when N is 0
    Y = coo_matrix((N, K), dtype=np.int8).tolil()

    for i, playlist in enumerate(playlists):
        seed = playlist[0]
        # build the set once per playlist so each of the K membership tests is O(1)
        members = set(playlist)

        X[i, :] = gen_feature_map(None, seed)
        Y[i, :] = [int(sid in members) for sid in song_set]

    return X, Y.tocsr()

In [None]:
gen_feature_map(song_set[100], playlists_subset[0][0])

## Training & Test

Train a logistic regression model for each label.

In [None]:
# Import the submodule explicitly: `import sklearn as sk` alone does not guarantee
# that the `sk.model_selection` attribute has been loaded.
from sklearn.model_selection import train_test_split

X, Y = gen_training_set()
# by fixing random seed, the same playlists will be in the test set each time
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.33, random_state = 31)

In [None]:
X.shape

In [None]:
Y.shape

In [None]:
clf = OneVsRestClassifier(LogisticRegression(verbose=1))
clf.fit(X_train, Y_train)

In [None]:
# save the training features; the context manager closes the file so that the
# pickle is completely written to disk
with open(os.path.join(data_dir, 'aotm-2011/XTrain_tag.pkl'), 'wb') as f:
    pkl.dump(X_train, f)

In [None]:
# save the training labels; the context manager closes the file so that the
# pickle is completely written to disk
with open(os.path.join(data_dir, 'aotm-2011/YTrain.pkl'), 'wb') as f:
    pkl.dump(Y_train, f)

In [None]:
# save the test features; the context manager closes the file so that the
# pickle is completely written to disk
with open(os.path.join(data_dir, 'aotm-2011/XTest_tag.pkl'), 'wb') as f:
    pkl.dump(X_test, f)

In [None]:
# save the test labels; the context manager closes the file so that the
# pickle is completely written to disk
with open(os.path.join(data_dir, 'aotm-2011/YTest.pkl'), 'wb') as f:
    pkl.dump(Y_test, f)

In [None]:
# save the trained classifier; the context manager closes the file so that the
# pickle is completely written to disk
with open(os.path.join(data_dir, 'aotm-2011/br-base.pkl'), 'wb') as f:
    pkl.dump(clf, f)

In [None]:
def _batched_precision(predictor, X, Y, batch_size = 200):
    """
        Evaluate the precision metrics of a predictor on (X, Y), batch by batch

        Input:
            - predictor: fitted model with a `decision_function` method
            - X: feature matrix, one row per example
            - Y: sparse label matrix with the same number of rows as X
            - batch_size: number of rows scored and evaluated at a time

        Output:
            - three lists (Precision@3, Precision@5, Precision@K), with one value
              per batch
    """
    p3, p5, pk = [], [], []

    N = X.shape[0]
    N_batch = int((N-1) / batch_size) + 1
    for i in range(N_batch):
        ix0 = i * batch_size
        ix1 = min((i+1) * batch_size, N)
        preds = predictor.decision_function(X[ix0:ix1])
        # only the labels of the current batch are converted to a dense array
        evaldict = evaluatePrecision(Y[ix0:ix1].toarray(), preds, verbose=-1)
        # the first element of each entry is used -- presumably the mean over
        # the batch, TODO confirm against evaluatePrecision
        p3.append(evaldict['Precision@3'][0])
        p5.append(evaldict['Precision@5'][0])
        pk.append(evaldict['Precision@K'][0])
        # progress indicator, rewritten in place on the same line
        sys.stdout.write('\r%d / %d' % (i+1, N_batch)); sys.stdout.flush()
    print()

    return p3, p5, pk


def print_results(predictor, X_train, Y_train, X_test, Y_test):
    """
        Compute and print performance results (nothing is saved or returned)

        Input:
            - predictor: fitted model with a `decision_function` method
            - X_train, Y_train: training features and sparse training labels
            - X_test, Y_test: test features and sparse test labels

        Output:
            - None, Precision@3, Precision@5 and Precision@K on the training set
              and on the test set are printed to stdout
    """
    p3_train, p5_train, pk_train = _batched_precision(predictor, X_train, Y_train)
    p3_test, p5_test, pk_test = _batched_precision(predictor, X_test, Y_test)

    # NOTE(review): each number below is the unweighted mean of the per-batch
    # values, so a smaller last batch counts as much as a full one -- confirm
    # that this is intended.  Ranking loss is not reported.
    print('Training set:')
    print('Precision@3: %.4f' % np.mean(p3_train))
    print('Precision@5: %.4f' % np.mean(p5_train))
    print('Precision@k: %.4f' % np.mean(pk_train))
    print()
    print('Test set:')
    print('Precision@3: %.4f' % np.mean(p3_test))
    print('Precision@5: %.4f' % np.mean(p5_test))
    print('Precision@k: %.4f' % np.mean(pk_test))

In [None]:
print_results(clf, X_train, Y_train, X_test, Y_test)

In [None]:
clf = TopPushMLC(C=3000)
clf.fit(X_train, Y_train)

In [None]:
print_results(clf, X_train, Y_train, X_test, Y_test)