# [FMA: A Dataset For Music Analysis](https://github.com/mdeff/fma)

Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.

## Baselines

* This notebook evaluates standard classifiers from scikit-learn on the provided features.
* Moreover, it evaluates Deep Learning models on both audio and spectrograms.

In [None]:
import time
import os

import IPython.display as ipd
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
import keras
from tensorflow.keras.models import Model
from keras.layers import  Flatten, Reshape, MaxPooling2D, concatenate, Add
from keras.layers import Input,AveragePooling2D,Activation, Dense, Conv1D, Conv2D, MaxPooling1D

from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier

import utils

In [None]:
# Root directory of the audio files, read from the environment (None if unset).
AUDIO_DIR = os.getenv('AUDIO_DIR')

# Load the three metadata tables, in this order.
tracks, features, echonest = map(utils.load, (
    'data/fma_metadata/tracks.csv',
    'data/fma_metadata/features.csv',
    'data/fma_metadata/echonest.csv',
))

# The feature table must be aligned with the track table, and every Echonest
# row must refer to a known track.
np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

# Shapes of the three tables (shown as the cell output).
tracks.shape, features.shape, echonest.shape

## Subset

In [None]:
# Keep the ids of the tracks whose ('set', 'subset') value compares <= 'medium'.
# NOTE(review): presumably an ordered categorical (small < medium < large), so
# this selects the small and medium subsets -- confirm against utils.load.
subset = tracks.index[tracks['set', 'subset'] <= 'medium']

# Every selected id must be present in both tables.
assert subset.isin(tracks.index).all()
assert subset.isin(features.index).all()

# The inner join with the Echonest table is only used to print its shape:
# features_all is overwritten below with the audio features alone.
features_all = features.join(echonest, how='inner').sort_index(axis=1)
print('Not enough Echonest features: {}'.format(features_all.shape))

# Restrict the metadata and the features to the selected tracks.
tracks = tracks.loc[subset]
features_all = features.loc[subset]

# Shapes after sub-setting (shown as the cell output).
tracks.shape, features_all.shape

In [None]:
# Track ids of each predefined split, taken from the ('set', 'split') column.
train, val, test = (
    tracks.index[tracks['set', 'split'] == split_name]
    for split_name in ('training', 'validation', 'test')
)

print('{} training examples, {} validation examples, {} testing examples'.format(
    len(train), len(val), len(test)))

# Distinct top-level genres, as ordered by LabelEncoder.
genres = LabelEncoder().fit(tracks['track', 'genre_top'])
genres = list(genres.classes_)
print('Top genres ({}): {}'.format(len(genres), genres))

# Distinct genres over the per-track genre lists; this is the final value of
# `genres` left behind by the cell.
genres = MultiLabelBinarizer().fit(tracks['track', 'genres_all'])
genres = list(genres.classes_)
print('All genres ({}): {}'.format(len(genres), genres))

## 1 Multiple classifiers and feature sets

Todo:
* Cross-validation for hyper-parameters.
* Dimensionality reduction?

### 1.1 Pre-processing

In [None]:
def pre_process(tracks, features, columns, multi_label=False, verbose=False):
    """Build the label vectors and standardized feature matrices of each split.

    The rows of each split are selected with the notebook-level ``train``,
    ``val`` and ``test`` indices, which must be defined before calling.

    Parameters
    ----------
    tracks : DataFrame
        Track metadata; ``('track', 'genre_top')`` is read in single-label
        mode and ``('track', 'genres_all')`` in multi-label mode.
    features : DataFrame
        Feature table; rows are selected with ``features.loc[split, columns]``.
    columns : label or list of labels
        Column selection forwarded to ``features.loc``.
    multi_label : bool
        If False, labels are integers produced by ``LabelEncoder``. If True,
        labels are an indicator matrix produced by ``MultiLabelBinarizer``.
    verbose : bool
        Currently unused; kept so that existing callers keep working.

    Returns
    -------
    y_train, y_val, y_test, X_train, X_val, X_test
        The training arrays are shuffled (fixed seed); all feature matrices
        are standardized with statistics fitted on the training set only.
    """
    if not multi_label:
        # Assign an integer value to each genre.
        enc = LabelEncoder()
        labels = tracks['track', 'genre_top']
    else:
        # Create an indicator matrix.
        enc = MultiLabelBinarizer()
        labels = tracks['track', 'genres_all']

    # Split in training, validation and testing sets. The encoder is fitted on
    # the training labels only.
    y_train = enc.fit_transform(labels[train])
    y_val = enc.transform(labels[val])
    y_test = enc.transform(labels[test])
    X_train = features.loc[train, columns].values
    X_val = features.loc[val, columns].values
    X_test = features.loc[test, columns].values

    # Number of target classes. `len(y_train[0])` gave the same number for the
    # indicator matrix but raised a TypeError in single-label mode, where
    # y_train is a 1-D array of integers.
    print(len(enc.classes_))

    X_train, y_train = shuffle(X_train, y_train, random_state=42)

    # Standardize features by removing the mean and scaling to unit variance.
    # Keep the returned arrays instead of relying on in-place modification,
    # which silently does nothing whenever scikit-learn has to copy its input.
    scaler = StandardScaler(copy=False)
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    return y_train, y_val, y_test, X_train, X_val, X_test

### 1.2 Single genre

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so nothing in
# it runs and neither test_classifiers_features nor format_scores is defined.
# NOTE(review): before re-enabling, `list(classifiers.keys()).insert(0, 'dim')`
# evaluates to None (list.insert works in place), so `columns` would be None.
"""def test_classifiers_features(classifiers, feature_sets, multi_label=False):
    columns = list(classifiers.keys()).insert(0, 'dim')
    scores = pd.DataFrame(columns=columns, index=feature_sets.keys())
    times = pd.DataFrame(columns=classifiers.keys(), index=feature_sets.keys())
    for fset_name, fset in tqdm_notebook(feature_sets.items(), desc='features'):
        y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, fset, multi_label)
        scores.loc[fset_name, 'dim'] = X_train.shape[1]
        for clf_name, clf in classifiers.items():  # tqdm_notebook(classifiers.items(), desc='classifiers', leave=False):
            t = time.process_time()
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)
            scores.loc[fset_name, clf_name] = score
            times.loc[fset_name, clf_name] = time.process_time() - t
    return scores, times

def format_scores(scores):
    def highlight(s):
        is_max = s == max(s[1:])
        return ['background-color: yellow' if v else '' for v in is_max]
    scores = scores.style.apply(highlight, axis=1)
    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])"""

In [None]:
# Disabled cell: the single-genre experiment below is wrapped in a string
# literal, so no classifier is trained and `scores` / `times` are not created.
# It calls test_classifiers_features and format_scores, which in this file
# also exist only inside a string literal.
"""classifiers = {
    'LR': LogisticRegression(),
    'kNN': KNeighborsClassifier(n_neighbors=200),
    'SVCrbf': SVC(kernel='rbf'),
    'SVCpoly1': SVC(kernel='poly', degree=1),
    'linSVC1': SVC(kernel="linear"),
    'linSVC2': LinearSVC(),
    #GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    'DT': DecisionTreeClassifier(max_depth=5),
    'RF': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    'AdaBoost': AdaBoostClassifier(n_estimators=10),
    'MLP1': MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000),
    'MLP2': MLPClassifier(hidden_layer_sizes=(200, 50), max_iter=2000),
    'NB': GaussianNB(),
    'QDA': QuadraticDiscriminantAnalysis(),
}

feature_sets = {
#    'echonest_audio': ('echonest', 'audio_features'),
#    'echonest_social': ('echonest', 'social_features'),
#    'echonest_temporal': ('echonest', 'temporal_features'),
#    'echonest_audio/social': ('echonest', ('audio_features', 'social_features')),
#    'echonest_all': ('echonest', ('audio_features', 'social_features', 'temporal_features')),
}
for name in features.columns.levels[0]:
    feature_sets[name] = name
feature_sets.update({
    'mfcc/contrast': ['mfcc', 'spectral_contrast'],
    'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],
    'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
    'all_non-echonest': list(features.columns.levels[0])
})

scores, times = test_classifiers_features(classifiers, feature_sets)

ipd.display(format_scores(scores))
ipd.display(times.style.format('{:.4f}'))"""

### 1.3 Multiple genres

Todo:
* Ignore rare genres? Count them higher up in the genre tree? On the other hand, it's not many tracks.

In [None]:
# Disabled cell: the multi-label experiment below is wrapped in a string
# literal, so nothing in it is executed.
# It calls test_classifiers_features and format_scores, which in this file
# also exist only inside a string literal.
"""classifiers = {
    #LogisticRegression(),
    'LR': OneVsRestClassifier(LogisticRegression()),
    'SVC': OneVsRestClassifier(SVC()),
    'MLP': MLPClassifier(max_iter=700),
}

feature_sets = {
#    'echonest_audio': ('echonest', 'audio_features'),
#    'echonest_temporal': ('echonest', 'temporal_features'),
    'mfcc': 'mfcc',
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
}

scores, times = test_classifiers_features(classifiers, feature_sets, multi_label=True)

ipd.display(format_scores(scores))
ipd.display(times.style.format('{:.4f}'))"""

## 2 Deep learning on raw audio

Other architectures:
* [Learning Features of Music from Scratch (MusicNet)](https://arxiv.org/abs/1611.09827), John Thickstun, Zaid Harchaoui, Sham Kakade.

In [None]:
# Binarized top-genre labels (one row per track), indexed by track id so that
# the sample loaders can look labels up by id.
labels_onehot = pd.DataFrame(
    LabelBinarizer().fit_transform(tracks['track', 'genre_top']),
    index=tracks.index,
)

Load audio samples in parallel using `multiprocessing` so as to maximize CPU usage when decoding MP3s and making some optional pre-processing. There are multiple ways to load a waveform from a compressed MP3:
* librosa uses audioread in the backend which can use many native libraries, e.g. ffmpeg
    * resampling is very slow --> use `kaiser_fast`
    * does not work with multi-processing, for keras `fit_generator()`
* pydub is a high-level interface for audio modification, uses ffmpeg to load
    * store a temporary `.wav`
* directly pipe ffmpeg output
    * fastest method
* [pyAV](https://github.com/mikeboers/PyAV) may be a faster alternative by linking to ffmpeg libraries

In [None]:
# Just be sure that everything is fine. Multiprocessing is tricky to debug.
# Decode one track (id 2) directly with the ffmpeg-based loader.
utils.FfmpegLoader().load(utils.get_audio_path(AUDIO_DIR, 2))
# Build the batch loader class around the one-hot labels and the audio loader.
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, utils.FfmpegLoader())
# Fetch one batch of two training tracks and show the shape of its first
# element (presumably the audio batch -- confirm against utils).
SampleLoader(train, batch_size=2).__next__()[0].shape

In [None]:
# Keras parameters.
try:
    # Number of CPUs this process is allowed to run on.
    NB_WORKER = len(os.sched_getaffinity(0))
except AttributeError:
    # os.sched_getaffinity does not exist on every platform (e.g. macOS,
    # Windows): fall back to the total CPU count, and to 1 if it is unknown.
    NB_WORKER = os.cpu_count() or 1

# Keyword arguments shared by the fit/evaluate generator calls of the notebook.
params = {'pickle_safe': True, 'nb_worker': NB_WORKER, 'max_q_size': 10}

### 2.1 Fully connected neural network

* Two layers with 10 hiddens is no better than random, ~11%.

Optimize data loading to be CPU / GPU bound, not IO bound. Larger batches mean reduced training time, so increase the batch size until memory is exhausted. Number of workers and queue size have no influence on speed.

In [None]:
# Disabled cell: the fully connected baseline below is wrapped in a string
# literal, so no model is built or trained.
# NOTE(review): it uses argument names such as `output_dim` and `nb_epoch`,
# which look like an older Keras API than the keyword style of the nnet1 /
# nnet2 cells -- check before re-enabling.
'''
loader = utils.FfmpegLoader(sampling_rate=2000)
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)
print('Dimensionality: {}'.format(loader.shape))

keras.backend.clear_session()

model = keras.models.Sequential()
model.add(Dense(output_dim=1000, input_shape=loader.shape))
model.add(Activation("relu"))
model.add(Dense(output_dim=100))
model.add(Activation("relu"))
model.add(Dense(output_dim=labels_onehot.shape[1]))
model.add(Activation("softmax"))

optimizer = keras.optimizers.SGD(lr=0.1, momentum=0.9, nesterov=True)
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

model.fit_generator(SampleLoader(train, batch_size=64), train.size, nb_epoch=2, **params)
loss = model.evaluate_generator(SampleLoader(val, batch_size=64), val.size, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=64), test.size, **params)
#Y = model.predict_generator(SampleLoader(test, batch_size=64), test.size, **params);

loss
'''

### 2.2 Convolutional neural network

* Architecture: [End-to-end learning for music audio](http://www.mirlab.org/conference_papers/International_Conference/ICASSP%202014/papers/p7014-dieleman.pdf), Sander Dieleman, Benjamin Schrauwen.
* Missing: track segmentation and class averaging (majority voting)
* Compared with log-scaled mel-spectrograms instead of strided convolution as first layer.
* Larger net: http://benanne.github.io/2014/08/05/spotify-cnns.html

In [None]:
# Disabled cell: the 1-D convolutional network on raw audio below is wrapped
# in a string literal, so no model is built or trained.
# NOTE(review): it uses argument names such as `subsample_length` and
# `nb_epoch`, which look like an older Keras API than the keyword style of the
# nnet1 / nnet2 cells -- check before re-enabling.
'''
loader = utils.FfmpegLoader(sampling_rate=16000)
#loader = utils.LibrosaLoader(sampling_rate=16000)
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)

keras.backend.clear_session()

model = keras.models.Sequential()
model.add(Reshape((-1, 1), input_shape=loader.shape))
print(model.output_shape)

model.add(Conv1D(128, 512, subsample_length=512))
print(model.output_shape)
model.add(Activation("relu"))

model.add(Conv1D(32, 8))
print(model.output_shape)
model.add(Activation("relu"))
model.add(MaxPooling1D(4))

model.add(Conv1D(32, 8))
print(model.output_shape)
model.add(Activation("relu"))
model.add(MaxPooling1D(4))

print(model.output_shape)
#model.add(Dropout(0.25))
model.add(Flatten())
print(model.output_shape)
model.add(Dense(100))
model.add(Activation("relu"))
print(model.output_shape)
model.add(Dense(labels_onehot.shape[1]))
model.add(Activation("softmax"))
print(model.output_shape)

optimizer = keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True)
#optimizer = keras.optimizers.Adam()#lr=1e-5)#, momentum=0.9, nesterov=True)
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

model.fit_generator(SampleLoader(train, batch_size=10), train.size, nb_epoch=20, **params)
loss = model.evaluate_generator(SampleLoader(val, batch_size=10), val.size, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=10), test.size, **params)

loss
'''

### 2.3 Our network (nnet1) from paper

In [None]:

# nnet1: a 2-D convolutional network built with the Keras functional API.
# The model is only defined, summarized and compiled here; it is not trained.

# NOTE(review): loader and SampleLoader are created but not used in this cell.
loader = utils.FfmpegLoader(sampling_rate=16000)
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)

keras.backend.clear_session()



# One input sample is 128 x 513 with a single channel.
# NOTE(review): presumably 128 time frames x 513 frequency bins -- confirm.
input_shape=[128,513,1]
inputs = Input(shape=input_shape)

# Shape comments below exclude the batch axis and assume Keras' default
# 'valid' padding and unit strides -- TODO confirm with nnet1.summary().
# The first kernel spans all 513 columns, collapsing the second axis to 1.
conv1=Conv2D(kernel_size=(4,513),filters=128,activation="relu")(inputs)  # (125, 1, 128)
pool1=MaxPooling2D( pool_size=(2, 1))(conv1)  # (62, 1, 128)
conv2=Conv2D(kernel_size=(4,1),filters=128,activation="relu")(pool1)  # (59, 1, 128)
pool2=MaxPooling2D( pool_size=(2, 1))(conv2)  # (29, 1, 128)
conv3=Conv2D(kernel_size=(4,1),filters=256,activation="relu")(pool2)  # (26, 1, 256)
# Average pooling layer; the window covers all 26 remaining rows.
average_pool = AveragePooling2D(pool_size=(26,1))(conv3)
# Max pooling layer over the same window.
max_pool = MaxPooling2D(pool_size=(26,1))(conv3)

# Concatenate the average pooling and max pooling outputs
# (axis 2 counts the batch axis, so they are stacked along the width axis).
merged = concatenate([average_pool, max_pool], axis=2)

# Flatten the output for the fully connected layers
flatten = Flatten()(merged)

# Dense layers
dense1 = Dense(300, activation='relu')(flatten)
dense2 = Dense(150, activation='relu')(dense1)
# NOTE(review): the output size is hard-coded to 10, whereas the MFCC model in
# this file uses labels_onehot.shape[1] -- confirm it matches the label count.
dense3 = Dense(10, activation='softmax')(dense2)

nnet1 = Model(inputs=inputs, outputs=dense3, name= "nnet1")
nnet1.summary()

nnet1.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])

print(nnet1.output_shape)

### 2.4 Our network (nnet2) from paper

In [None]:
# nnet2: like nnet1 but without intermediate pooling and with a residual
# (shortcut) connection around the second and third convolutions.
# The model is only defined, summarized and compiled here; it is not trained.

# NOTE(review): loader and SampleLoader are created but not used in this cell.
loader = utils.FfmpegLoader(sampling_rate=16000)
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)

keras.backend.clear_session()

# One input sample is 128 x 513 with a single channel.
# NOTE(review): presumably 128 time frames x 513 frequency bins -- confirm.
input_shape = [128, 513, 1]
inputs = Input(shape=input_shape)

# The first convolution must NOT be padded: its (4, 513) kernel has to collapse
# the 513-wide axis, giving a (125, 1, 256) map (128 - 4 + 1 = 125 rows). This
# is the size the (125, 1) pooling windows below are written for. With
# padding="same" the map stayed 128 x 513, so the pooling left 513 columns and
# ignored the last 3 rows, producing a huge first dense layer.
conv1 = Conv2D(kernel_size=(4, 513), filters=256, activation="relu", padding="valid")(inputs)
# "same" padding keeps the (125, 1, 256) shape so that conv1 and conv3 can be added.
conv2 = Conv2D(kernel_size=(4, 1), filters=256, activation="relu", padding="same")(conv1)
conv3 = Conv2D(kernel_size=(4, 1), filters=256, activation="relu", padding="same")(conv2)

# Residual block: element-wise sum of the first and third convolution outputs.
res = Add()([conv1, conv3])

# Average pooling layer; the window covers all 125 rows.
average_pool = AveragePooling2D(pool_size=(125, 1))(res)
# Max pooling layer over the same window.
max_pool = MaxPooling2D(pool_size=(125, 1))(res)

# Concatenate the average pooling and max pooling outputs
# (axis 2 counts the batch axis, so they are stacked along the width axis).
merged = concatenate([average_pool, max_pool], axis=2)

# Flatten the output for the fully connected layers
flatten = Flatten()(merged)

# Dense layers
dense1 = Dense(300, activation='relu')(flatten)
dense2 = Dense(150, activation='relu')(dense1)
# NOTE(review): the output size is hard-coded to 10, whereas the MFCC model in
# this file uses labels_onehot.shape[1] -- confirm it matches the label count.
dense3 = Dense(10, activation='softmax')(dense2)

nnet2 = Model(inputs=inputs, outputs=dense3, name="nnet2")
nnet2.summary()

nnet2.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])

print(nnet2.output_shape)

### 2.5 Generate the STFT small dataset

#### Let's try to get all the spectrograms for all the audio in the dataset

Remember to create a directory named "fma_small_stft" inside your data folder.
Some tracks of the original small dataset are corrupted or very short (1-2 seconds); these tracks will be replaced
by a script below with other good tracks of the same genre from the dataset.

In [None]:
# Procedure:
# 1 - get all the track ids for the small subset
# 2 - load each file
# 3 - compute its STFT
# 4 - save the result as a file

import librosa
import librosa.display
import utils
import os
import IPython.display as ipd
import numpy as np
import matplotlib.pyplot as plt

# Root directory of the audio files, read from the environment (None if unset).
AUDIO_DIR = os.environ.get('AUDIO_DIR')
print("audio directory: ",AUDIO_DIR)
# Detailed plan for the following cells:
# 1 select the tracks of fma_small using tracks.csv
#   NOTE(review): this cell keeps every track of the small subset, not only
#   the training split, and does not extract the top genre label here.
# 2 open each track with utils.get_audio_path(AUDIO_DIR, track_id)
# 3 divide it into 3 s clips (TODO: 50% overlap between clips)
# 4 for each clip compute the STFT using a frame of length 1024
#   with an overlap of 50% (a hop of length 512), kept as an array of shape (513, 128)
# 5 each song gives an array of shape (10, 513, 128), saved as a file in the fma_small_stft folder

print("Loading tracks.csv...")
# Load the tracks file (can take some time). This rebinds `tracks` to the full,
# unfiltered table.
tracks = utils.load('data/fma_metadata/tracks.csv')

# Keep only the rows whose ('set', 'subset') value compares <= 'small'.
small = tracks[tracks['set', 'subset'] <= 'small']
# Shape of the selection (shown as the cell output).
small.shape

In [None]:
# Compute and save the STFT magnitudes of every track of the small subset.
# Tracks that raise any exception are skipped and their file name is recorded
# in error_list.

#get the track ids
track_ids = np.array(small.index)
print("Track ids shape: ",track_ids.shape, "track ids content:",track_ids)

# File names of the tracks that could not be processed.
error_list = []

#open each track audio
for track_id in track_ids:
    print("track id: ",track_id)
    
    try:
        #open the file using the utils
        filename = utils.get_audio_path(AUDIO_DIR, track_id)
        print('File: {}'.format(filename))

        # NOTE(review): sr=None presumably keeps each file's native sampling
        # rate, so clip lengths in samples may differ between tracks -- confirm.
        x, sr = librosa.load(filename, sr=None, mono=True) #load the MONO instead of stereo as in paper
        print('Duration: {:.2f}s, {} samples'.format(x.shape[-1] / sr, x.size))

        #now lets try to calculate stft on each 3s clip of the 30s song

        stft_list = []

        # Clip start times 0, 3, ..., 27 seconds: ten clips per track.
        for i in range (0,28,3):
            start, end = i, i+3
            #print('start:',start,' end:',end)

            #extract clip (TODO: IMPLEMENT 50% OVERLAPPING TO GET THE DOUBLE OF THE CLIPS)
            clip = x[start*sr:end*sr]

            #calculate stft using a window of 1024 sample and a hop length of 512 sample
            stft = np.abs(librosa.stft(clip, n_fft=1024, hop_length=512))
            #print("Shape of the stft:",stft.shape)

            # Force 513 rows (frequency bins) and keep only the first 128 columns (frames).
            # NOTE(review): 128 frames x 512 samples is about 3 s only near
            # 22 kHz; at higher sampling rates the end of the clip is dropped -- confirm.
            stft = librosa.util.fix_length(stft, size=513, axis=0)[:, :128]
            #print("Shape of the resized stft:",stft.shape)

            stft_list.append(stft)

        stft_array = np.array(stft_list) #the final array containing the stft for each clip of the song
        print("shape of the stft vector: ", stft_array.shape)


        # np.save appends the '.npy' extension to this path.
        save_filename = './data/fma_small_stft/' + str(track_id)
        print("Saving the stft vector in file:",save_filename) 

        #save the stft array in a file with the name 'track_id.npy'
        np.save(save_filename,stft_array)
    except Exception as e: #skip the songs which give error (file corrupted or song too short)
        print("ERROR on file: ",filename,"\nError information:", e)
        error_list.append(filename)
# Disabled plotting code, kept as a string literal: show the STFT of a single clip.
'''
fig, ax = plt.subplots()
clip_number = 5 #a number from 0 to 9
img = librosa.display.specshow(librosa.amplitude_to_db(stft_array[clip_number], ref=np.max), y_axis='log', x_axis='time', ax=ax)
ax.set_title('STFT Power spectrogram start: {}, end: {}'.format(3*clip_number,3*clip_number+3))
fig.colorbar(img, ax=ax, format="%+2.0f dB")

'''
    
    

In [None]:
# Let's see the tracks which had errors (couldn't be decoded or processed).
# error_list holds the file names recorded by the STFT loop.

error_array = np.array(error_list)

# Extract only the id from each file name: drop the directory, then everything
# from the first dot on. os.path.basename handles the platform's path
# separator, and the comprehension avoids rebinding the builtin `id` in the
# notebook namespace.
error_id_array = [
    os.path.basename(path).split('.')[0] for path in error_array
]

print("tracks with error (saved in error_list.npy): ",error_id_array)
np.save('./data/error_list',error_id_array)


### Replace tracks with errors with good ones from the dataset

In [None]:
import shutil

# Overwrite the audio file of every failing track with the file of a good
# track of the same top genre, so that the STFT extraction can be retried.
# WARNING: this modifies the audio dataset on disk.
# NOTE(review): the replacement is chosen regardless of its train / validation
# / test split, so the same audio can end up in two splits -- confirm that
# this is acceptable.

# error_id_array holds ids parsed from file names, i.e. strings, whereas
# `small` is looked up with integer track ids elsewhere in this notebook.
# Matching the strings directly selects nothing, so convert them first.
error_track_ids = [int(error_id) for error_id in error_id_array]

#let's see the genres of the error tracks
genre_top = small['track']['genre_top']
error_tracks_genre = genre_top[genre_top.index.isin(error_track_ids)]

i = 0

for track_id, genre in error_tracks_genre.items():
    print("track with error:")
    print("track id:",track_id,"genre:",genre)

    # Other tracks of the dataset with the same genre, excluding the failing
    # tracks themselves so that a bad file is never used as a replacement.
    same_genre_tracks = genre_top[
        (genre_top == genre) & ~genre_top.index.isin(error_track_ids)
    ]
    print("Some tracks with same genre:")
    ipd.display(same_genre_tracks.head())

    if same_genre_tracks.empty:
        # Nothing to copy from (e.g. missing genre): leave the file untouched.
        print("no replacement available for track", track_id)
        print("")
        continue

    # Pick the i-th candidate; wrap around instead of running out of range.
    new_track = same_genre_tracks.iloc[[i % len(same_genre_tracks)]]
    print("track",track_id,"replaced with another track:")
    ipd.display(new_track)
    new_id = new_track.index[0] #id of the good track (to be copied into the bad one)
    print("new id:",new_id)

    # Copy the file of the good track over the file of the error track.
    source_filename = utils.get_audio_path(AUDIO_DIR, new_id)
    dest_filename = utils.get_audio_path(AUDIO_DIR, track_id)

    print("copy from:",source_filename," to:",dest_filename)
    shutil.copy2(source_filename, dest_filename)

    i += 1 #next track index (to ensure not to copy the same good track again and again)
    print("")


### Retry to create STFT for the tracks with errors

In [None]:
# Retry the STFT extraction for the tracks that failed in the first pass.
# Tracks that fail again are recorded, by file name, in a fresh error_list.

#get the track ids
# The ids were parsed from file names, so they are strings: cast them to int.
track_ids = [int(numeric_string) for numeric_string in error_id_array]
print("Track ids length: ",len(track_ids), "track ids content:",track_ids)

error_list = []

#open each track audio
for track_id in track_ids:
    print("track id: ",track_id)

    try:
        #open the file using the utils
        filename = utils.get_audio_path(AUDIO_DIR, track_id)
        print('File: {}'.format(filename))

        # NOTE(review): sr=None presumably keeps each file's native sampling
        # rate, so clip lengths in samples may differ between tracks -- confirm.
        x, sr = librosa.load(filename, sr=None, mono=True) #load the MONO instead of stereo as in paper
        print('Duration: {:.2f}s, {} samples'.format(x.shape[-1] / sr, x.size))

        #now lets try to calculate stft on each 3s clip of the 30s song

        stft_list = []

        # Clip start times 0, 3, ..., 27 seconds: ten clips per track.
        for i in range(0, 28, 3):
            start, end = i, i+3

            #extract clip (TODO: IMPLEMENT 50% OVERLAPPING TO GET THE DOUBLE OF THE CLIPS)
            clip = x[start*sr:end*sr]

            #calculate stft using a window of 1024 samples and a hop length of 512 samples
            stft = np.abs(librosa.stft(clip, n_fft=1024, hop_length=512))

            # Force 513 rows (frequency bins) and keep only the first 128 columns (frames).
            stft = librosa.util.fix_length(stft, size=513, axis=0)[:, :128]

            stft_list.append(stft)

        stft_array = np.array(stft_list) #the final array containing the stft for each clip of the song
        print("shape of the stft vector: ", stft_array.shape)


        # np.save appends the '.npy' extension to this path.
        save_filename = './data/fma_small_stft/' + str(track_id)
        print("Saving the stft vector in file:",save_filename)

        #save the stft array in a file with the name 'track_id.npy'
        np.save(save_filename,stft_array)
    except Exception as e:
        # Skip the songs which give an error (file corrupted or song too
        # short). A bare `except:` would also swallow KeyboardInterrupt, making
        # the loop impossible to stop, and would hide the cause of the failure.
        print("ERROR on file: ",filename,"\nError information:", e)
        error_list.append(filename)
# Disabled plotting code, kept as a string literal: show the STFT of a single clip.
'''
fig, ax = plt.subplots()
clip_number = 5 #a number from 0 to 9
img = librosa.display.specshow(librosa.amplitude_to_db(stft_array[clip_number], ref=np.max), y_axis='log', x_axis='time', ax=ax)
ax.set_title('STFT Power spectrogram start: {}, end: {}'.format(3*clip_number,3*clip_number+3))
fig.colorbar(img, ax=ax, format="%+2.0f dB")

'''

## 3 Deep learning on extracted audio features

Look at:
* Pre-processing in Keras: https://github.com/keunwoochoi/kapre
* Convolutional Recurrent Neural Networks for Music Classification: https://github.com/keunwoochoi/icassp_2017
* Music Auto-Tagger: https://github.com/keunwoochoi/music-auto_tagging-keras
* Pre-processor: https://github.com/bmcfee/pumpp

### 3.1 ConvNet on MFCC

* Architecture: [Automatic Musical Pattern Feature Extraction Using Convolutional Neural Network](http://www.iaeng.org/publication/IMECS2010/IMECS2010_pp546-550.pdf), Tom LH. Li, Antoni B. Chan and Andy HW. Chun
* Missing: track segmentation and majority voting.
* Best seen: 17.6%

In [None]:
class MfccLoader(utils.Loader):
    """Loader that turns an audio file into a matrix of 13 MFCCs per frame."""

    # Decodes the audio at 22050 Hz; the same rate is passed to librosa below.
    raw_loader = utils.FfmpegLoader(sampling_rate=22050)
    #shape = (13, 190)  # For segmented tracks.
    # Declared shape of one sample: (n_mfcc, n_frames).
    # NOTE(review): 2582 frames presumably corresponds to a full-length clip
    # at this hop length -- confirm.
    shape = (13, 2582)

    def load(self, filename):
        """Decode `filename` and return its MFCC matrix (13 rows)."""
        import librosa
        x = self.raw_loader.load(filename)
        # Each MFCC frame spans 23ms on the audio signal with 50% overlap with the adjacent frames.
        # The signal is passed as `y=`: recent librosa releases made the
        # arguments of feature functions keyword-only, so a positional signal
        # raises a TypeError there, while the keyword form works everywhere.
        mfcc = librosa.feature.mfcc(y=x, sr=22050, n_mfcc=13, n_fft=512, hop_length=256)
        return mfcc

# Instantiate the MFCC loader and build the batch loader class around it.
loader = MfccLoader()
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)
# Sanity check on track 2: shape of the first row of the returned MFCC matrix
# (shown as the cell output).
loader.load(utils.get_audio_path(AUDIO_DIR, 2))[0].shape

In [None]:
# ConvNet on MFCC: three strided convolutions followed by a softmax layer,
# trained and evaluated with the batch generator built around MfccLoader.
keras.backend.clear_session()

model = keras.models.Sequential()
# Append a channel axis to each sample of shape loader.shape.
model.add(Reshape((*loader.shape, 1),  input_shape=loader.shape))
print(model.output_shape)

# NOTE(review): the three positional arguments plus `subsample=` look like the
# legacy (filters, rows, cols) convolution signature, while the nnet1 / nnet2
# cells use `filters=` / `kernel_size=` -- confirm that the installed Keras
# still accepts this form.
model.add(Conv2D(3, 13, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Conv2D(15, 1, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Conv2D(65, 1, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Flatten())
print(model.output_shape)
# One output per column of the one-hot label table.
model.add(Dense(labels_onehot.shape[1]))
model.add(Activation("softmax"))
print(model.output_shape)

optimizer = keras.optimizers.SGD(1e-3)#lr=0.01, momentum=0.9, nesterov=True)
#optimizer = keras.optimizers.Adam()#lr=1e-5)#
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): `nb_epoch` and the keys of `params` also look like legacy Keras
# argument names -- confirm against the installed version.
model.fit_generator(SampleLoader(train, batch_size=16), train.size, nb_epoch=20, **params)
# The validation result is overwritten by the test result on the next line, so
# only the test metrics are displayed at the end of the cell.
loss = model.evaluate_generator(SampleLoader(val, batch_size=16), val.size, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=16), test.size, **params)
#Y = model.predict_generator(loader, test.size, pickle_safe=True, nb_worker=NB_WORKER, max_q_size=5)

# Test metrics (shown as the cell output).
loss