# Music Genre Classification

## Data Pre-Processing for Recurrent NN

Fabrizio Niro - Jacopo Signò

GTZAN Dataset - Music Genre Classification 

https://www.kaggle.com/datasets/andradaolteanu/gtzan-dataset-music-genre-classification

In [1]:
import librosa as lr
import pandas as pd
import numpy as np
import IPython.display as ipyd
from collections import Counter
import math
import os
from pathlib import Path
import pickle
import torch
import sklearn
import torch.nn.functional as tf
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sys

The function "tracksplit" splits an audio track, loaded into an array, into n subtracks of equal length (trailing samples that do not fill a whole subtrack are dropped)

In [2]:
def tracksplit(track, nsplit = 10):
    """Cut ``track`` into ``nsplit`` consecutive pieces of equal length.

    Each piece holds ``len(track) // nsplit`` items; leftover items at the
    end of ``track`` that do not fill a whole piece are discarded.

    Returns a list with the ``nsplit`` slices, in their original order.
    """
    piece_len = len(track) // nsplit
    return [track[k * piece_len:(k + 1) * piece_len] for k in range(nsplit)]

We iterate through the dataset folders and split every audio track into 10 subtracks of about 3 seconds each

In [3]:
# Load every audio file of every genre folder and cut it into subtracks
# (tracksplit's default of 10 pieces), collecting them in one flat list.
# The folder/file order is whatever os.listdir returns; the id/label cell
# further down relies on the same folder order.
sec3_subtracks = []

for directory in os.listdir('Data/genres_original'):

    genre_dir = os.path.join('Data/genres_original', directory)

    for file in os.listdir(genre_dir):

        track_path = os.path.join(genre_dir, file)

        try:
            track, _ = lr.load(track_path)

        except Exception:
            # The file could not be decoded. Fall back to the sibling file whose
            # name is this one with the last character before '.wav' replaced
            # by '0', so the genre still contributes the same number of tracks.
            # Exception (not a bare except) keeps Ctrl-C / SystemExit working.
            fallback_path = os.path.join(genre_dir, file[:-5] + '0.wav')
            print(f'Could not load {track_path}; using {fallback_path} instead')
            track, _ = lr.load(fallback_path)

        sec3_subtracks.extend(tracksplit(track))


  return f(*args, **kwargs)


The lengths of the arrays generated from the .wav files differ slightly across the dataset. We make them uniform by trimming every subtrack to the minimum length found

In [4]:
# Number of samples in every subtrack, followed by the distinct values found.
sec3_len = [len(subtrack) for subtrack in sec3_subtracks]

Counter(sec3_len).keys()

dict_keys([66150, 66179, 66792, 66167, 66176, 66418, 66506, 66814, 66352, 66616, 66134, 66140, 67012, 66968, 66308, 66528, 67228, 66924, 66198, 66682, 66748, 66594, 67210, 67580, 66770, 66000, 66440, 66330, 66374, 66880, 66110, 67034, 66946])

In [5]:
# Shortest subtrack length found in the whole set.
min_len = min(sec3_len)
min_len

66000

In [6]:
# Cut every subtrack down to the common length min_len.
sec3_subtr_trim = [subtrack[:min_len] for subtrack in sec3_subtracks]

# Sanity check: list the distinct lengths that remain after trimming.
sec3_len_trim = [len(trimmed) for trimmed in sec3_subtr_trim]

Counter(sec3_len_trim).keys()

dict_keys([66000])

In [7]:
Path('Processed_data/RNN').mkdir(parents=True, exist_ok=True)

In [8]:
# Persist the trimmed subtracks. The target file is named "sec3_subtr_trim",
# so the length-normalized list is written, not the raw `sec3_subtracks`.
with open('Processed_data/sec3_subtr_trim', 'wb') as f:
    pickle.dump(sec3_subtr_trim, f)

The pandas dataframe "data" stores, for each row, the subtrack id, the genre label and the sample array of the split track

In [9]:
# Build one id and one genre label per subtrack, genre folder by genre folder.
# The folders are visited in the same os.listdir order used when the audio
# was loaded, so row k here describes subtrack k of the flat subtrack list.
sec3_track_id = []
label = []

for genre in os.listdir('Data/genres_original'):

    # Every file in the folder yielded 10 subtracks (tracksplit's default),
    # so derive the per-genre count from the folder instead of assuming 1000.
    n_files = len(os.listdir(os.path.join('Data/genres_original', genre)))

    for i in range(n_files * 10):

        sec3_track_id.append(f'{genre}_{i}')
        label.append(genre)

In [10]:
# Assemble the dataframe column by column: subtrack id, genre label and the
# trimmed sample array of each subtrack.
data = pd.DataFrame()
for column, values in (
    ('sec3_track_id', sec3_track_id),
    ('label', label),
    ('sec3_subtracks', sec3_subtr_trim),
):
    data[column] = values

In [11]:
with open('Processed_data/data_id_lab_subt', 'wb') as f:
    pickle.dump(data, f)

In [12]:
with open('Processed_data/data_id_lab_subt', 'rb') as f:
    data = pickle.load(f)

In [13]:
data

Unnamed: 0,sec3_track_id,label,sec3_subtracks
0,reggae_0,reggae,"[-0.032226562, -0.04006958, -0.10153198, -0.14..."
1,reggae_1,reggae,"[0.21435547, 0.19308472, 0.19226074, 0.2298278..."
2,reggae_2,reggae,"[0.0730896, 0.06726074, 0.055755615, 0.0475769..."
3,reggae_3,reggae,"[-0.09701538, -0.079589844, -0.049835205, -0.0..."
4,reggae_4,reggae,"[-0.027557373, -0.061676025, -0.09249878, -0.1..."
...,...,...,...
9995,rock_995,rock,"[-0.11187744, -0.103637695, -0.08721924, -0.05..."
9996,rock_996,rock,"[-0.009063721, 0.0043029785, 0.0033874512, -0...."
9997,rock_997,rock,"[-0.041290283, -0.060668945, -0.06274414, -0.0..."
9998,rock_998,rock,"[0.08023071, 0.069732666, 0.03161621, 0.001800..."


We compute the Mel-Frequency Cepstral Coefficients (MFCCs) of each split track. They are the features fed to the network

In [14]:
# One MFCC matrix per subtrack, computed with librosa's default settings.
sec3_mfcc = [lr.feature.mfcc(y=subtrack) for subtrack in data['sec3_subtracks']]

In [15]:
with open('Processed_data/sec3_mfcc', 'wb') as f: 
    pickle.dump(sec3_mfcc, f)

In [16]:
with open('Processed_data/sec3_mfcc', 'rb') as f:
    sec3_mfcc = pickle.load(f)

We stack the MFCC matrices of all the split tracks into a single NumPy array (float32)

In [17]:
sec3_mfcc = np.array(sec3_mfcc, dtype='float32')

In [18]:
# librosa returns each MFCC matrix as (coefficients, frames); the network
# input is laid out as (frames, coefficients), i.e. (N, 129, 20) here.
# The two axes must be swapped with a transpose: reshape((-1, 129, 20)) keeps
# the memory order and would mix coefficients and frames inside each row.
# Run this cell once per freshly built array (a second run swaps the axes back).
sec3_mfcc = np.ascontiguousarray(sec3_mfcc.transpose(0, 2, 1))

In [19]:
sec3_mfcc.shape

(10000, 129, 20)

We create the target array by encoding the genre labels as integer class indices (label encoding rather than one-hot)

In [20]:
# Map each genre name to an integer class index with scikit-learn's
# LabelEncoder; `y` is the resulting target vector, one entry per subtrack.
le = preprocessing.LabelEncoder()
labels = data['label'].to_numpy()
y = target = le.fit_transform(labels)

y

array([8, 8, 8, ..., 9, 9, 9])

In [21]:
y.dtype

dtype('int64')

In [22]:
scaler = StandardScaler()

# Standardize every MFCC matrix on its own: the scaler is re-fitted on each
# matrix, so only that matrix's column statistics are used to scale it.
sec3_mfcc_scaled = [scaler.fit_transform(mfcc_matrix) for mfcc_matrix in sec3_mfcc]

In [23]:
len(y)

10000

In [24]:
len(sec3_mfcc_scaled)

10000

In [25]:
sec3_mfcc_scaled[0].dtype

dtype('float32')

In [26]:
# Pair every scaled MFCC matrix (as float32) with its integer genre label.
data_ok = [
    [features.astype('float32'), genre_idx]
    for features, genre_idx in zip(sec3_mfcc_scaled, y)
]

In [27]:
len(data_ok)

10000

In [28]:
data_ok[0][0].dtype

dtype('float32')

In [29]:
len(data_ok[0])

2

In [30]:
data_ok[0][0].shape

(129, 20)

In [31]:
# Shuffle the [features, label] pairs and hold out 20% of them as the test
# set; random_state=42 makes the shuffle reproducible.
# NOTE(review): the split is made over individual subtracks, so pieces cut
# from the same original recording can presumably end up in both sets —
# confirm whether a per-recording (grouped) split is wanted instead.
data_train, data_test = train_test_split(
    data_ok, test_size=0.2, random_state=42)

In [32]:
len(data_train)

8000

In [33]:
data_train[0][0].dtype

dtype('float32')

In [34]:
data_test[0][0].dtype

dtype('float32')

In [35]:
len(data_train[0])

2

In [36]:
data_train[0][0].shape

(129, 20)

In [37]:
# sys.getsizeof is shallow: this is the size of the list object itself, not
# of the arrays it references.
sys.getsizeof(data_train)

67224

In [38]:
# sys.getsizeof is shallow: this is the size of the list object itself, not
# of the arrays it references.
sys.getsizeof(data_ok)

85176

In [39]:
len(data_test)

2000

In [40]:
with open('Processed_data/RNN/data_train', 'wb') as f: 
    pickle.dump(data_train, f)

In [41]:
with open('Processed_data/RNN/data_test', 'wb') as f: 
    pickle.dump(data_test, f)