# Preprocess the data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Relative directory that holds the FMA metadata CSV files.
path = '../data/fma_metadata/'

# Both files use their first column as the row index. tracks.csv has a
# two-row column header and features.csv a three-row one, so the resulting
# frames have multi-level columns.
tracks = pd.read_csv(
    path + 'tracks.csv',
    header=[0, 1],
    index_col=0,
)
features = pd.read_csv(
    path + 'features.csv',
    header=[0, 1, 2],
    index_col=0,
)

In [5]:
# Restrict the metadata to the "small" subset, then pull out the genre labels
# and MFCC features for each of the train / validation / test splits.
small = tracks[('set', 'subset')].eq('small')

# FMA has already assigned every track to one of the three sets
# (training, validation, test); build one boolean mask per set.
train, val, test = (
    tracks[('set', 'split')].eq(split_name)
    for split_name in ('training', 'validation', 'test')
)

# Genre labels (top-level genre) for each split of the small subset
y_train, y_val, y_test = (
    tracks.loc[small & split_mask, ('track', 'genre_top')]
    for split_mask in (train, val, test)
)

# MFCC feature columns for the same rows
X_train, X_val, X_test = (
    features.loc[small & split_mask, 'mfcc']
    for split_mask in (train, val, test)
)

# Summary of split sizes, feature count and number of distinct genres
print('{} training examples'.format(len(y_train)))
print('{} cross validation examples'.format(len(y_val)))
print('{} testing examples'.format(len(y_test)))
print('{} features, {} classes'.format(
    len(X_train.columns),
    len(np.unique(y_train)),
))

# Displayed as the cell result: shape of the training features as an array
X_train.to_numpy().shape

6400 training examples
800 cross validation examples
800 testing examples
140 features, 8 classes


(6400, 140)