# Music Genre Classification

## Data Pre-Processing for Recurrent NN

Fabrizio Niro - Jacopo Signò

GTZAN Dataset - Music Genre Classification 

https://www.kaggle.com/datasets/andradaolteanu/gtzan-dataset-music-genre-classification

In [1]:
import librosa as lr
import pandas as pd
import numpy as np
import IPython.display as ipyd
from collections import Counter
import math
import os
from pathlib import Path
import pickle
import torch
import sklearn
import torch.nn.functional as tf
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

The function "tracksplit" splits an audio track, loaded into an array, into n subtracks of equal length (any leftover samples at the end are dropped)

In [2]:
def tracksplit(track, nsplit=10):
    """Split a track into ``nsplit`` consecutive subtracks of equal length.

    Every subtrack holds ``len(track) // nsplit`` items; items left over
    after the last full subtrack are discarded.

    Args:
        track: Sliceable sequence of samples (anything supporting ``len``
            and slicing).
        nsplit: Number of subtracks to produce.

    Returns:
        A list with ``nsplit`` slices of ``track``, in their original order.
    """
    chunk = len(track) // nsplit
    # The k-th subtrack starts right where the previous one ended.
    return [track[k * chunk:(k + 1) * chunk] for k in range(nsplit)]

We iterate through the dataset folders and split every audio track into 10 subtracks of about 3 seconds each

In [3]:
# Load every file of every genre folder and cut it into subtracks.
# Subtracks are collected genre by genre, in the order os.listdir returns
# the folders and files.
sec3_subtracks = []

for directory in os.listdir('Data/genres_original'):

    for file in os.listdir(f'Data/genres_original/{directory}'):

        try:
            track, _ = lr.load(f'Data/genres_original/{directory}/{file}')

        except Exception:
            # `except Exception` instead of a bare `except`, so that
            # KeyboardInterrupt / SystemExit can still stop this long loop.
            # Fallback when a file cannot be loaded: drop the last five
            # characters of its name and load the sibling ending in '0.wav'
            # (e.g. 'x.00054.wav' -> 'x.00050.wav').
            track, _ = lr.load(f'Data/genres_original/{directory}/{file[:-5]}0.wav')

        # tracksplit's default produces 10 subtracks per track
        sec3_subtracks.extend(tracksplit(track))


  return f(*args, **kwargs)


The arrays generated from the .wav files differ slightly in length across the set. We make the lengths uniform by trimming every subtrack to the minimum length found

In [4]:
# Number of samples in each subtrack
sec3_len = [len(subtrack) for subtrack in sec3_subtracks]

# Distinct lengths found across the set
Counter(sec3_len).keys()

dict_keys([66179, 66968, 66167, 66528, 66176, 66352, 66140, 66134, 67012, 67228, 66308, 66330, 66682, 66880, 66110, 66374, 66418, 66616, 66814, 66792, 66506, 66150, 66440, 66924, 66000, 66770, 67580, 66594, 67210, 66198, 66748, 67034, 66946])

In [5]:
# Shortest subtrack length in the set
min_len = min(sec3_len)
min_len

66000

In [6]:
# Keep only the first `min_len` samples of every subtrack
sec3_subtr_trim = [subtrack[:min_len] for subtrack in sec3_subtracks]

# Sanity check: lengths after trimming
sec3_len_trim = [len(trimmed) for trimmed in sec3_subtr_trim]

# Distinct lengths that remain after trimming
Counter(sec3_len_trim).keys()

dict_keys([66000])

In [7]:
# Make sure the output folder (and its parent) exists; do nothing if it
# is already there.
rnn_output_dir = Path('Processed_data/RNN')
rnn_output_dir.mkdir(parents=True, exist_ok=True)

In [8]:
# Persist the trimmed subtracks. The file is named after `sec3_subtr_trim`,
# so that list (not the untrimmed `sec3_subtracks`) is what gets stored.
with open('Processed_data/sec3_subtr_trim', 'wb') as f:
    pickle.dump(sec3_subtr_trim, f)

The pandas DataFrame "data" stores, for each row, the subtrack id, the genre label and the array of samples of the split track

In [9]:
# Build one id and one label per subtrack, genre by genre.
# The genres are visited in os.listdir order, the same call used when the
# subtracks were loaded, so ids/labels are meant to line up with them.

# Number of subtracks generated per genre folder.
# NOTE(review): presumably 100 tracks x 10 subtracks each; confirm against
# the dataset before reusing with a different layout.
SUBTRACKS_PER_GENRE = 1000

sec3_track_id = []
label = []

# The loop variable is a notebook-level global: naming it `dir` would shadow
# the builtin dir() for every later cell, hence `genre`.
for genre in os.listdir('Data/genres_original'):

    for i in range(SUBTRACKS_PER_GENRE):

        sec3_track_id.append(f'{genre}_{i}')
        label.append(genre)

In [10]:
# One row per subtrack: its id, its genre label and its array of samples
data = pd.DataFrame({
    'sec3_track_id': sec3_track_id,
    'label': label,
    'sec3_subtracks': sec3_subtr_trim,
})

In [11]:
# Save the DataFrame to disk so later runs can skip the audio loading
with open('Processed_data/data_id_lab_subt', 'wb') as out_file:
    pickle.dump(data, out_file)

In [12]:
# Reload the DataFrame written by this notebook (pickle must only be used
# on files we produced ourselves)
with open('Processed_data/data_id_lab_subt', 'rb') as in_file:
    data = pickle.load(in_file)

In [13]:
data

Unnamed: 0,sec3_track_id,label,sec3_subtracks
0,blues_0,blues,"[0.0073242188, 0.016601562, 0.0076293945, -0.0..."
1,blues_1,blues,"[-0.072753906, -0.055389404, -0.036102295, -0...."
2,blues_2,blues,"[0.06997681, 0.14709473, 0.2263794, 0.28271484..."
3,blues_3,blues,"[-0.31854248, -0.2897339, -0.25097656, -0.2348..."
4,blues_4,blues,"[0.19113159, 0.12878418, 0.06561279, -0.004669..."
...,...,...,...
9995,rock_995,rock,"[-0.004211426, 0.013061523, 0.007232666, -0.00..."
9996,rock_996,rock,"[0.007659912, 0.0037231445, 0.004058838, 0.010..."
9997,rock_997,rock,"[0.028564453, 0.03237915, 0.042114258, 0.05493..."
9998,rock_998,rock,"[-0.08312988, -0.07098389, -0.029724121, -0.01..."


We compute the Mel-Frequency Cepstral Coefficients (MFCCs) of each split track. They are the features that will be fed to the network

In [14]:
# One MFCC matrix per subtrack, in row order of the DataFrame
# (librosa's default parameters are used)
sec3_mfcc = [lr.feature.mfcc(y=subtrack) for subtrack in data['sec3_subtracks']]

In [15]:
# Save the list of MFCC matrices so they do not have to be recomputed
with open('Processed_data/sec3_mfcc', 'wb') as out_file:
    pickle.dump(sec3_mfcc, out_file)

In [16]:
# Reload the MFCC matrices written by this notebook
with open('Processed_data/sec3_mfcc', 'rb') as in_file:
    sec3_mfcc = pickle.load(in_file)

We store the MFCC matrices of all the split tracks in a single torch tensor

In [17]:
# Stack the per-subtrack MFCC matrices into a single float64 NumPy array
sec3_mfcc = np.array(sec3_mfcc, dtype=np.float64)

In [18]:
# Turn the NumPy array of MFCC matrices into a torch tensor; the name
# `sec3_mfcc` is rebound to the tensor.
sec3_mfcc = torch.tensor(sec3_mfcc)

We create the target array by encoding the genre labels as integer class indices (the one-hot encoding step is left commented out)

In [19]:
# Genre names, one per subtrack
labels = np.array(data['label'])

# Map every distinct genre name to an integer class index
le = preprocessing.LabelEncoder()
target = torch.tensor(le.fit_transform(labels))

# One-hot variant of the target, currently disabled:
#target_one_hot = tf.one_hot(target.to(torch.int64), 10)

In [20]:
target

tensor([0, 0, 0,  ..., 9, 9, 9], dtype=torch.int32)

In [21]:
#target_one_hot

In [22]:
# Features: every MFCC matrix flattened into a single row (dimension 0,
# the subtrack index, is kept). Targets: the encoded labels.
X, y = torch.flatten(sec3_mfcc, start_dim=1), target

In [23]:
X

tensor([[-2.4244e+02, -2.1299e+02, -1.9541e+02,  ...,  2.2183e+00,
          8.2816e+00,  8.8715e+00],
        [-1.6625e+02, -1.5054e+02, -1.6450e+02,  ...,  6.3761e-02,
          2.9251e+00,  1.2711e+01],
        [-1.3069e+02, -1.2486e+02, -1.5329e+02,  ...,  6.7889e+00,
          6.1143e+00,  1.3908e-01],
        ...,
        [-2.7123e+02, -2.6610e+02, -2.7025e+02,  ...,  4.8574e-01,
         -2.0561e+00, -4.5958e+00],
        [-1.1438e+02, -9.3377e+01, -1.0054e+02,  ..., -3.8260e+00,
         -2.2336e+00, -1.5710e+00],
        [-2.6820e+02, -2.5146e+02, -2.5395e+02,  ..., -2.1837e+00,
         -4.0726e+00, -5.3789e+00]], dtype=torch.float64)

In [24]:
y

tensor([0, 0, 0,  ..., 9, 9, 9], dtype=torch.int32)

In [25]:
# Standardise every feature column using statistics learned from X
scaler = StandardScaler().fit(X)

X_scaled = scaler.transform(X)

In [26]:
X_scaled

array([[-0.78764284, -0.65046513, -0.41381591, ...,  0.34597108,
         1.02128406,  1.23036079],
       [-0.07341539, -0.1001311 , -0.15433941, ...,  0.11068252,
         0.43703148,  1.69340506],
       [ 0.2600198 ,  0.12611122, -0.06023841, ...,  0.84511728,
         0.78488638,  0.17719336],
       ...,
       [-1.05758505, -1.11845248, -1.04208863, ...,  0.15676552,
        -0.106284  , -0.39384866],
       [ 0.41285305,  0.40357573,  0.38266885, ..., -0.31410316,
        -0.12564417, -0.02905444],
       [-1.02921939, -0.98946415, -0.90524509, ..., -0.13476045,
        -0.32624082, -0.48829588]])

In [27]:
# Split the *unscaled* features first and standardise afterwards: fitting the
# scaler on all rows before splitting lets test-set statistics leak into the
# training data. The split itself (test_size, random_state) is unchanged, so
# the same rows end up in each subset.
X_train, X_test, y_train, y_test = train_test_split(
    np.asarray(X), y, test_size=0.33, random_state=42)

split_scaler = StandardScaler()
X_train = split_scaler.fit_transform(X_train)  # mean/std learned from train rows only
X_test = split_scaler.transform(X_test)        # reuse the training statistics

In [28]:
# Shapes of the training features and targets
print(f'{X_train.shape} {y_train.shape}')

(6700, 2580) torch.Size([6700])


In [29]:
# Shapes of the test features and targets
print(f'{X_test.shape} {y_test.shape}')

(3300, 2580) torch.Size([3300])


In [30]:
# Write each split to its own pickle file under Processed_data/RNN
rnn_outputs = (
    ('Processed_data/RNN/X_train', X_train),
    ('Processed_data/RNN/y_train', y_train),
    ('Processed_data/RNN/X_test', X_test),
    ('Processed_data/RNN/y_test', y_test),
)

for out_path, payload in rnn_outputs:
    with open(out_path, 'wb') as f:
        pickle.dump(payload, f)