In [1]:
# COLAB = True
COLAB = False
RANDOM_SEED = 42

In [2]:
import sys
path = ''
if COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive/')
    path = '/content/gdrive/MyDrive/fma/'
    sys.path.append('/content/gdrive/MyDrive/fma')

In [3]:
import time
import os
import IPython.display as ipd
from tqdm import notebook
import numpy as np
import pandas as pd
import ast
from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
%matplotlib inline


In [4]:
def load(filepath):
    filename = os.path.basename(filepath)
    if 'features' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])
    if 'echonest' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])
    if 'genres' in filename:
        return pd.read_csv(filepath, index_col=0)
    if 'tracks' in filename:
        tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

        COLUMNS = [('track', 'tags'), ('album', 'tags'), ('artist', 'tags'),
                   ('track', 'genres'), ('track', 'genres_all')]
        for column in COLUMNS:
            tracks[column] = tracks[column].map(ast.literal_eval)

        COLUMNS = [('track', 'date_created'), ('track', 'date_recorded'),
                   ('album', 'date_created'), ('album', 'date_released'),
                   ('artist', 'date_created'), ('artist', 'active_year_begin'),
                   ('artist', 'active_year_end')]
        for column in COLUMNS:
            tracks[column] = pd.to_datetime(tracks[column])

        SUBSETS = ('small', 'medium', 'large')
        try:
            tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                'category', categories=SUBSETS, ordered=True)
        except (ValueError, TypeError):
            tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                pd.CategoricalDtype(categories=SUBSETS, ordered=True))

        COLUMNS = [('track', 'genre_top'), ('track', 'license'),
                   ('album', 'type'), ('album', 'information'),
                   ('artist', 'bio')]
        for column in COLUMNS:
            tracks[column] = tracks[column].astype('category')
        return tracks

In [5]:
# size = 'small'
size = 'medium'
# size = 'large'
os.environ['AUDIO_DIR'] = f'./data/fma_{size}/'
tracks = load(f'{path}data/fma_metadata/tracks.csv')
features = load(f'{path}data/fma_metadata/features.csv')
echonest = load(f'{path}data/fma_metadata/echonest.csv')
subset = tracks.index[tracks['set', 'subset'] <= size]
features_all = features.join(echonest, how='inner').sort_index(axis=1)
features_all = features.loc[subset]
tracks = tracks.loc[subset]
tracks.shape, features_all.shape

((25000, 52), (25000, 518))

In [6]:
train = tracks.index[tracks['set', 'split'] == 'training']
validation = tracks.index[tracks['set', 'split'] == 'validation']
test = tracks.index[tracks['set', 'split'] == 'test']
print(f'Train: {len(train)}, Validation: {len(validation)}, Test: {len(test)}')

top_genres = list(LabelEncoder().fit(tracks['track', 'genre_top']).classes_)
all_genres = list(MultiLabelBinarizer().fit(tracks['track', 'genres_all']).classes_)
print(f'All genres: ({len(all_genres)}), Top genres ({len(top_genres)}): {top_genres}')

Train: 19922, Validation: 2505, Test: 2573
All genres: (151), Top genres (16): ['Blues', 'Classical', 'Country', 'Easy Listening', 'Electronic', 'Experimental', 'Folk', 'Hip-Hop', 'Instrumental', 'International', 'Jazz', 'Old-Time / Historic', 'Pop', 'Rock', 'Soul-RnB', 'Spoken']


In [7]:
def pre_process(tracks, features, columns, multi_label=False, verbose=False):
    if not multi_label:
        encoder = LabelEncoder()
        labels = tracks['track', 'genre_top']
    else:
        encoder = MultiLabelBinarizer()
        labels = tracks['track', 'genres_all']

    # Split in training, validation and testing sets.
    y_train = encoder.fit_transform(labels[train])
    y_validation = encoder.transform(labels[validation])
    y_test = encoder.transform(labels[test])
    X_train = features.loc[train, columns].values
    X_validation = features.loc[validation, columns].values
    X_test = features.loc[test, columns].values
    X_train, y_train = shuffle(X_train, y_train, random_state=RANDOM_SEED)
    # Standardize features by removing the mean and scaling to unit variance.
    scaler = StandardScaler(copy=False)
    scaler.fit_transform(X_train)
    scaler.transform(X_validation)
    scaler.transform(X_test)
    return y_train, y_validation, y_test, X_train, X_validation, X_test


def test_classifiers_features(classifiers, feature_sets, multi_label=False):

    columns = list(classifiers.keys()).insert(0, 'number of features')
    scores = pd.DataFrame(columns=columns, index=feature_sets.keys())
    times = pd.DataFrame(columns=classifiers.keys(), index=feature_sets.keys())
    for fset_name, fset in notebook.tqdm(feature_sets.items(), desc='features'):
        y_train, y_validation, y_test, X_train, X_validation, X_test = pre_process(
            tracks, features_all, fset, multi_label)
        scores.loc[fset_name, 'number of features'] = X_train.shape[1]
        for clf_name, clf in classifiers.items():
            t = time.process_time()
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)
            scores.loc[fset_name, clf_name] = score
            times.loc[fset_name, clf_name] = time.process_time() - t
    return scores, times


def format_scores(scores):
    def highlight(s):
        is_max = s == max(s[1:])
        return ['background-color: yellow' if v else '' for v in is_max]
    scores = scores.style.apply(highlight, axis=1)
    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])


classifiers = {
    'LR': LogisticRegression(),
    'kNN': KNeighborsClassifier(n_neighbors=200),
    'SVC (RBF)': SVC(kernel='rbf', gamma='scale'),
    'SVC (Linear)': SVC(kernel='linear'),
    'SVC (Polynomial^2)': SVC(kernel='poly', degree=2, gamma='scale'),
    'SVC (Polynomial^3)': SVC(kernel='poly', degree=3, gamma='scale'),
    'DT': DecisionTreeClassifier(max_depth=5),
    'MLP': MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000)
}

feature_sets = {'mfcc': ['mfcc'],
                'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],
                'all fma features': ['tonnetz', 'zcr', 'chroma_cqt', 'chroma_cens', 'spectral_contrast',
                                     'spectral_rolloff', 'spectral_centroid', 'mfcc', 'rmse', 'chroma_stft', 'spectral_bandwidth']}

scores, times = test_classifiers_features(classifiers, feature_sets)

ipd.display(format_scores(scores))
ipd.display(times.style.format('{:.4f}'))


features:   0%|          | 0/3 [00:00<?, ?it/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,number of features,LR,kNN,SVC (RBF),SVC (Linear),SVC (Polynomial^2),SVC (Polynomial^3),DT,MLP
mfcc,140.0,58.03%,54.99%,60.98%,59.19%,55.23%,56.35%,45.82%,50.37%
mfcc/contrast/centroid,196.0,60.47%,55.23%,63.39%,60.28%,58.76%,58.10%,47.61%,51.69%
all fma features,518.0,60.63%,51.77%,62.88%,59.08%,57.83%,54.96%,47.30%,56.43%


Unnamed: 0,LR,kNN,SVC (RBF),SVC (Linear),SVC (Polynomial^2),SVC (Polynomial^3),DT,MLP
mfcc,14.125,3.1094,42.7656,129.8906,39.7344,44.1406,1.4844,592.4062
mfcc/contrast/centroid,14.375,3.1094,49.4531,173.2969,51.6562,58.9531,2.0938,358.1875
all fma features,19.8906,4.625,115.3281,489.0469,142.2031,166.2969,5.4531,175.0312
