In [1]:
# This notebook trains logistic regression to predict genre tags
# of musical tags based on latent vector inputs.
#
# The genre tags come from the FMA.
#
# The latent vectors are produced by running WMF on 
# user-track playcounts from the MSD taste profile subset.

In [2]:
import numpy as np
import numpy
import os
import pickle
import scipy
from scipy import sparse
import sklearn
from sklearn import preprocessing

In [3]:
import pandas as pd
import ast

# Function borrowed from: https://github.com/mdeff/fma/blob/master/utils.py
def load(filepath):

    filename = os.path.basename(filepath)

    if 'features' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'echonest' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'genres' in filename:
        return pd.read_csv(filepath, index_col=0)

    if 'tracks' in filename:
        tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

        COLUMNS = [('track', 'tags'), ('album', 'tags'), ('artist', 'tags'),
                   ('track', 'genres'), ('track', 'genres_all')]
        for column in COLUMNS:
            tracks[column] = tracks[column].map(ast.literal_eval)

        COLUMNS = [('track', 'date_created'), ('track', 'date_recorded'),
                   ('album', 'date_created'), ('album', 'date_released'),
                   ('artist', 'date_created'), ('artist', 'active_year_begin'),
                   ('artist', 'active_year_end')]
        for column in COLUMNS:
            tracks[column] = pd.to_datetime(tracks[column])

        SUBSETS = ('small', 'medium', 'large')
        tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                'category', categories=SUBSETS, ordered=True)

        COLUMNS = [('track', 'genre_top'), ('track', 'license'),
                   ('album', 'type'), ('album', 'information'),
                   ('artist', 'bio')]
        for column in COLUMNS:
            tracks[column] = tracks[column].astype('category')

        return tracks

In [4]:
# FMA metadata can be downloaded using a link provided at:
# https://github.com/mdeff/fma
#fma_metadata_directory = '/path/to/fma/metadata'
fma_metadata_directory = '/media/datasets/home/devin/data/fma/fma_metadata/'
fma_tracks_file = os.path.join(fma_metadata_directory, 'tracks.csv')
tracks = load(fma_tracks_file)
len(tracks)

106574

In [5]:
fma_genres_file = os.path.join(fma_metadata_directory, 'genres.csv')
genres = load(fma_genres_file)
print(len(genres))
top_level_genres = set(
    y[1]['top_level']
    for y in genres.iterrows()
)
print(len(top_level_genres))
genre_id_to_index = {
    y[0]: i
    for i, y in enumerate(genres.iterrows())
}
genre_names = [r[1]['title'] for r in genres.iterrows()]

163
16


In [6]:
def get_track_metadata(y):
    return {
        'track_title': y['track']['title'],
        'tags': y['track']['tags'],
        'genres': y['track']['genres'],
        'genres_all': y['track']['genres_all'],
        'artist_id': y['artist']['id'],
        'artist': y['artist']['name']
    }

fma_matching_dir = '../../matchings/fma_lfm-1b/'
matched_fma_ids_fname = os.path.join(fma_matching_dir, 'matched_fma_ids.txt')
matched_fma_ids = set([int(line.strip()) for line in open(matched_fma_ids_fname)])

# Using itertuples is faster apparently, but this won't take too long...
fma_id_to_metadata = {
    y[0]: get_track_metadata(y[1])
    for y in tracks.iterrows()
    if y[0] in matched_fma_ids
}
print(len(fma_id_to_metadata))
print(len(matched_fma_ids))

31713
32683


In [7]:
def genre_ids_to_many_hot(genre_ids):
    indices = [genre_id_to_index[genre_id] for genre_id in genre_ids]
    many_hot = [False] * 163
    for i in indices:
        many_hot[i] = True
    return np.array(many_hot)


def get_tag_data():
    """
    Function for getting MSD tag data.
    
    Returns train, valid, test.
    Each are lists of ternary tuples.
    The first tuple element is artist_trackname.
    The second tuple element is the latent factors matrix index of the track.
    The third tuple element is an array of True/False values,
    corresponding to FMA genre tags.
    """
    matrix_artist_tracknames_fname = '../../matchings/both/matched_artists_tracks.txt'
    matrix_artist_tracknames = [line.strip() for line in open(matrix_artist_tracknames_fname)]
    artist_trackname_to_matrix_index = {
        artist_trackname: index
        for index, artist_trackname in enumerate(matrix_artist_tracknames)
    }
    
    fma_ids_fname = '../../matchings/fma_lfm-1b/artist_trackname_to_fma_ids.txt'
    fma_artists_tracks_fname = '../../matchings/fma_lfm-1b/matched_artists_tracks.txt'
    fma_artists_tracks = [line.strip() for line in open(fma_artists_tracks_fname)]
    fma_ids = [line.strip() for line in open(fma_ids_fname)]
    artist_trackname_to_fma_id = {
        # We pick the first matching FMA id when there is more than one matching.
        artist_trackname: int(fma_id.split('\t')[0])
        for artist_trackname, fma_id in zip(fma_artists_tracks, fma_ids)
    }
 
    split_dir = '../../split/fma/'
    train_artist_trackname_fname = os.path.join(split_dir, 'train_artist_tracknames.txt')
    valid_artist_trackname_fname = os.path.join(split_dir, 'valid_artist_tracknames.txt')
    test_artist_trackname_fname = os.path.join(split_dir, 'test_artist_tracknames.txt')

    train_artist_tracknames = [line.strip() for line in open(train_artist_trackname_fname)]
    valid_artist_tracknames = [line.strip() for line in open(valid_artist_trackname_fname)]
    test_artist_tracknames = [line.strip() for line in open(test_artist_trackname_fname)]
    
    print(len(train_artist_tracknames))
    print(len(valid_artist_tracknames))
    print(len(test_artist_tracknames))
    
    train_artist_tracknames = filter(
        lambda at: artist_trackname_to_fma_id[at] in fma_id_to_metadata,
        train_artist_tracknames
    )
    
    valid_artist_tracknames = filter(
        lambda at: artist_trackname_to_fma_id[at] in fma_id_to_metadata,
        valid_artist_tracknames
    )
    
    test_artist_tracknames = filter(
        lambda at: artist_trackname_to_fma_id[at] in fma_id_to_metadata,
        test_artist_tracknames
    )
    
    train_x = [artist_trackname_to_matrix_index[artist_trackname] for artist_trackname in train_artist_tracknames]
    valid_x = [artist_trackname_to_matrix_index[artist_trackname] for artist_trackname in valid_artist_tracknames]
    test_x = [artist_trackname_to_matrix_index[artist_trackname] for artist_trackname in test_artist_tracknames]
    
    train_y = [
        genre_ids_to_many_hot(
            fma_id_to_metadata[
                artist_trackname_to_fma_id[artist_trackname]
            ]['genres']
        )
        for artist_trackname in train_artist_tracknames
    ]
    
    valid_y = [
        genre_ids_to_many_hot(
            fma_id_to_metadata[
                artist_trackname_to_fma_id[artist_trackname]
            ]['genres']
        )
        for artist_trackname in valid_artist_tracknames
    ]
    
    test_y = [
        genre_ids_to_many_hot(
            fma_id_to_metadata[
                artist_trackname_to_fma_id[artist_trackname]
            ]['genres']
        )
        for artist_trackname in test_artist_tracknames
    ]

    tr = [(at, x, y) for at, x, y in zip(train_artist_tracknames, train_x, train_y)]
    va = [(at, x, y) for at, x, y in zip(valid_artist_tracknames, valid_x, valid_y)]
    te = [(at, x, y) for at, x, y in zip(test_artist_tracknames, test_x, test_y)]

    print([len(x) for x in [tr, va, te]])
    
    return tr, va, te

In [8]:
tr, va, te = get_tag_data()

25258
2806
3119
[24519, 2728, 3019]


In [9]:
tr[10]

('titus andronicus\tmy time outside the womb',
 112409,
 array([False, False, False, False, False, False, False, False, False,
        False, False,  True, False, False, False, False, False, False,
        False, False, False, False,  True, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False,  True, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False,

In [10]:
song_factors_fname = '../output/factors_merged_v.npy'
song_factors = np.load(song_factors_fname)
song_factors.shape

(661392, 80)

In [11]:
tr_at, tr_x, tr_y = zip(*tr)
va_at, va_x, va_y = zip(*va)
te_at, te_x, te_y = zip(*te)

tr_x_feats = np.array([
    song_factors[matrix_index] for matrix_index in tr_x
])
va_x_feats = np.array([
    song_factors[matrix_index] for matrix_index in va_x
])
te_x_feats = np.array([
    song_factors[matrix_index] for matrix_index in te_x
])

tr_y = np.array(tr_y)
va_y = np.array(va_y)
te_y = np.array(te_y)

In [12]:
from keras.models import Sequential
from keras.layers import Dense

# 163-class logistic regression in Keras
model = Sequential()
model.add(Dense(163, activation='sigmoid', input_dim=tr_x_feats.shape[1]))
model.compile(optimizer='rmsprop', loss='binary_crossentropy')

Using TensorFlow backend.


In [13]:
model.fit(tr_x_feats, tr_y, epochs=1, validation_data=(va_x_feats, va_y))

Train on 24519 samples, validate on 2728 samples
Epoch 1/1


<keras.callbacks.History at 0x7f746e48e8d0>

In [14]:
model.fit(tr_x_feats, tr_y, epochs=50, validation_data=(va_x_feats, va_y))

Train on 24519 samples, validate on 2728 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f74e40dd110>

In [15]:
proba_va = model.predict_proba(va_x_feats)
classes_va = np.round(proba_va)

proba_tr = model.predict_proba(tr_x_feats)
classes_tr = np.round(proba_tr)

proba_te = model.predict_proba(te_x_feats)
classes_te = np.round(proba_te)

  32/3019 [..............................] - ETA: 0s0s

In [16]:
from sklearn.metrics import roc_auc_score
# AUC calculated on a subset of the tags.
# (Some tags apparently don't have examples in the test set.)
indices = range(34) + range(36, 54) + range(36, 54) + range(56, 70)
tr_auc = roc_auc_score(tr_y[:,indices], proba_tr[:,indices])
print('training auc: {}'.format(tr_auc))
va_auc = roc_auc_score(va_y[:,indices], proba_va[:,indices])
print('validation auc: {}'.format(va_auc))
te_auc = roc_auc_score(te_y[:,indices], proba_te[:,indices])
print('test auc: {}'.format(te_auc))

training auc: 0.790827185892
validation auc: 0.78934022433
test auc: 0.780232305826


In [17]:
# Per-tag AUCs.
te_aucs = []
for i in range(163):
    try:
        te_aucs.append(roc_auc_score(te_y[:,[i]], proba_te[:,[i]]))
    except Exception as e:
        te_aucs.append(0)
te_instances = np.sum(te_y, axis=0)
indices = np.argsort(te_aucs)
for i in indices:
    print('{0:.2f}\t'.format(te_aucs[i]) +
          '{}\t{}'.format(te_instances[i], genre_names[i])
         )

0.00	0	Deep Funk
0.00	0	Spoken Word
0.00	0	Be-Bop
0.00	0	Radio Theater
0.00	0	Bollywood
0.00	0	Symphony
0.00	0	Western Swing
0.00	0	Christmas
0.00	0	N. Indian Traditional
0.00	0	Radio Art
0.00	0	Klezmer
0.00	0	Salsa
0.00	0	Turkish
0.00	0	Tango
0.00	0	Interview
0.00	0	Talk Radio
0.26	1	South Indian Traditional
0.39	1	Banter
0.42	3	Indian
0.44	17	Krautrock
0.44	4	Spoken
0.51	3	Country & Western
0.54	4	Asia-Far East
0.56	31	Minimal Electronic
0.56	1	Musical Theater
0.57	3	Romany (Gypsy)
0.58	2	Gospel
0.59	14	Sound Art
0.60	4	Chamber Music
0.61	1	Polka
0.61	10	Novelty
0.61	3	Reggae - Dancehall
0.62	10	20th Century Classical
0.62	17	Unclassifiable
0.63	2	Nerdcore
0.64	27	Chill-out
0.64	45	Field Recordings
0.65	40	Instrumental
0.67	66	Post-Rock
0.67	43	Minimalism
0.67	4	Middle East
0.68	71	Soundtrack
0.68	20	Free-Jazz
0.68	41	Sound Collage
0.68	19	Sound Poetry
0.68	3	Jungle
0.69	23	Musique Concrete
0.69	10	New Wave
0.70	5	Nu-Jazz
0.70	4	Opera
0.70	23	Free-Folk
0.71	189	Experimental Pop
0.71	

In [18]:
# Restricting to genres with 20 tracks or more in test set.
te_instances = np.sum(te_y, axis=0)
many_examples_indices = [
    i
    for i in range(163)
    if te_instances[i] >= 20
]
many_examples_indices_set = set(many_examples_indices)

te_auc = roc_auc_score(te_y[:,many_examples_indices], proba_te[:,many_examples_indices])
print('test auc: {}'.format(te_auc))

for i in indices:
    if i in many_examples_indices_set:
        print('{0:.2f}\t'.format(te_aucs[i]) +
              '{}\t{}'.format(te_instances[i], genre_names[i])
             )

test auc: 0.776925890549
0.56	31	Minimal Electronic
0.64	27	Chill-out
0.64	45	Field Recordings
0.65	40	Instrumental
0.67	66	Post-Rock
0.67	43	Minimalism
0.68	71	Soundtrack
0.68	20	Free-Jazz
0.68	41	Sound Collage
0.69	23	Musique Concrete
0.70	23	Free-Folk
0.71	189	Experimental Pop
0.71	510	Experimental
0.71	42	Freak-Folk
0.71	88	Electroacoustic
0.72	20	Electro-Punk
0.72	174	Lo-Fi
0.73	124	Noise
0.73	186	Ambient
0.73	29	Funk
0.74	186	Indie-Rock
0.74	39	Shoegaze
0.74	52	Post-Punk
0.74	29	Breakbeat
0.74	75	Psych-Folk
0.75	200	Ambient Electronic
0.75	21	No Wave
0.75	218	Pop
0.75	53	Synth Pop
0.76	66	International
0.76	100	Singer-Songwriter
0.78	50	Dance
0.78	801	Electronic
0.78	31	Progressive
0.78	33	Blues
0.79	23	Latin America
0.79	129	Avant-Garde
0.79	204	Folk
0.79	61	Techno
0.80	32	Alternative Hip-Hop
0.80	75	Industrial
0.80	287	Rock
0.80	108	Psych-Rock
0.80	61	Jazz
0.80	60	Dubstep
0.81	44	Hip-Hop Beats
0.81	24	Reggae - Dub
0.81	40	Classical
0.81	148	Hip-Hop
0.82	26	House
0.83	20	Audio C