# Hand-crafted features for GTZAN

> The goal of this notebook is to create several audio feature descriptors for the GTZAN dataset, of the kind that have been used for many years as input to machine learning algorithms. We are going to use timbral-texture-based features and tempo-based features for this. The main goal is to produce these features, classify with them, and then compare the results with our proposed deep learning approach, which uses CNNs on the raw audio.

In [1]:
import os
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import kurtosis
from scipy.stats import skew

from sklearn.preprocessing import MinMaxScaler

In [2]:
# Root directory of the GTZAN dataset; it is expected to hold one
# sub-folder per genre name (the keys of `genres`).
gtzan_dir = '../data/genres/'

In [3]:
# Parameters
# Sample count for one clip: 30 * 22050 (presumably 30 s at 22050 Hz -- confirm).
song_samples = 30 * 22050

# Genre name -> integer class label. Labels follow the position in the tuple,
# and the dict keeps this order when iterated.
genres = {
    name: label
    for label, name in enumerate((
        'metal', 'disco', 'classical', 'hiphop', 'jazz',
        'country', 'pop', 'blues', 'reggae', 'rock',
    ))
}

In [4]:
def _get_moments(descriptors):
    """Collapse each frame-level descriptor into four summary statistics.

    For every ``name -> 1-D array`` entry, the result holds ``name_mean``,
    ``name_std``, ``name_kurtosis`` and ``name_skew``.
    """
    result = {}
    for k, v in descriptors.items():
        result['{}_mean'.format(k)] = np.mean(v)
        result['{}_std'.format(k)] = np.std(v)
        result['{}_kurtosis'.format(k)] = kurtosis(v)
        result['{}_skew'.format(k)] = skew(v)
    return result


def get_features(y, sr, n_fft = 1024, hop_length = 512):
    """Compute aggregated hand-crafted audio features for one signal.

    Frame-level descriptors (spectral centroid, spectral roll-off, onset
    strength stored under ``'flux'``, RMS energy, zero-crossing rate and
    13 MFCCs) are computed with librosa and each one is summarized by its
    mean, standard deviation, kurtosis and skewness. A global tempo
    estimate is added under ``'tempo'``.

    Parameters
    ----------
    y : audio time series, as returned by ``librosa.load``.
    sr : sampling rate of ``y``.
    n_fft : analysis window / frame length used for the frame-level features.
    hop_length : hop between successive frames.

    Returns
    -------
    dict
        ``'<descriptor>_<stat>'`` -> scalar for every descriptor and
        statistic, plus ``'tempo'``.
    """
    # Newer librosa releases renamed feature.rmse to feature.rms and moved
    # beat.tempo to feature.tempo; prefer the new names and fall back to the
    # old ones so the notebook runs on either.
    rms_fn = getattr(librosa.feature, 'rms', None) or librosa.feature.rmse
    tempo_fn = getattr(librosa.feature, 'tempo', None) or librosa.beat.tempo

    # The audio is always passed by keyword: recent librosa versions reject
    # positional audio arguments, while older ones accept ``y=`` as well.
    # Key order is kept as in the original dictionary (and the historical
    # 'roloff' spelling is preserved because it names the output columns).
    features = {
        'centroid': librosa.feature.spectral_centroid(
            y=y, sr=sr, n_fft=n_fft, hop_length=hop_length).ravel(),
        'roloff': librosa.feature.spectral_rolloff(
            y=y, sr=sr, n_fft=n_fft, hop_length=hop_length).ravel(),
        # hop_length is forwarded so the onset envelope uses the same framing
        # as the other descriptors when a non-default hop is requested.
        'flux': librosa.onset.onset_strength(
            y=y, sr=sr, hop_length=hop_length).ravel(),
        'rmse': rms_fn(
            y=y, frame_length=n_fft, hop_length=hop_length).ravel(),
        'zcr': librosa.feature.zero_crossing_rate(
            y=y, frame_length=n_fft, hop_length=hop_length).ravel(),
    }

    # MFCC treatment: one descriptor per coefficient. ``sr`` must be passed
    # explicitly, otherwise librosa assumes its default sampling rate.
    mfcc = librosa.feature.mfcc(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mfcc=13)
    for idx, v_mfcc in enumerate(mfcc):
        features['mfcc_{}'.format(idx)] = v_mfcc.ravel()

    # Get statistics from the vectors
    dict_agg_features = _get_moments(features)
    dict_agg_features['tempo'] = tempo_fn(y=y, sr=sr, hop_length=hop_length)[0]

    return dict_agg_features

In [5]:
def read_process_songs(src_dir, debug = True):
    """Extract features for every audio file under the per-genre folders.

    For each genre name in ``genres``, walks ``<src_dir>/<genre>``, loads
    every file found with ``librosa.load`` and computes its features with
    ``get_features``. The genre's integer label is stored under ``'genre'``.

    Parameters
    ----------
    src_dir : directory containing one sub-folder per genre name.
    debug : when true, print the path of each file before it is loaded.

    Returns
    -------
    list of dict
        One feature dictionary per processed file.
    """
    # Empty array of dicts with the processed features from all files
    arr_features = []

    # Read files from the folders
    for genre_name, genre_id in genres.items():
        # os.path.join works whether or not src_dir ends with a separator
        folder = os.path.join(src_dir, genre_name)

        for root, subdirs, files in os.walk(folder):
            # Sort in place so the traversal (and therefore the row order of
            # the result) does not depend on the file system's listing order.
            subdirs.sort()
            for file in sorted(files):
                # Build the path from `root`, not `folder`, so files found in
                # nested sub-directories resolve correctly.
                file_name = os.path.join(root, file)

                # Debug process: report before loading so a failing file is
                # identifiable from the output.
                if debug:
                    print("Reading file: {}".format(file_name))

                # Read the audio file
                signal, sr = librosa.load(file_name)

                # Append the result to the data structure
                features = get_features(signal, sr)
                features['genre'] = genre_id
                arr_features.append(features)
    return arr_features

In [6]:
# Extract the features of every song as a list of dicts (one dict per file);
# per-file debug printing is turned off.
features = read_process_songs(gtzan_dir, debug=False)

In [7]:
# Convert the list of feature dicts into a table: one row per song,
# one column per dictionary key.
df_features = pd.DataFrame(features)

In [8]:
# Sanity check: (number of rows, number of columns) of the feature table
df_features.shape

(1000, 74)

In [9]:
# Preview the first rows of the feature table
df_features.head()

Unnamed: 0,centroid_kurtosis,centroid_mean,centroid_skew,centroid_std,flux_kurtosis,flux_mean,flux_skew,flux_std,genre,mfcc_0_kurtosis,...,rmse_std,roloff_kurtosis,roloff_mean,roloff_skew,roloff_std,tempo,zcr_kurtosis,zcr_mean,zcr_skew,zcr_std
0,10.042047,1269.216477,2.260234,446.092946,6.292978,1.178202,2.346743,1.185736,2,-0.245464,...,0.030878,9.517677,2359.925818,2.693269,1111.068026,151.999081,3.351442,0.076088,0.957668,0.03038
1,0.826854,1838.910308,-0.096383,301.668493,17.150263,0.961513,3.297531,0.454976,2,0.844655,...,0.020861,0.301991,3550.580386,0.098658,717.360465,129.199219,1.260865,0.111567,0.179186,0.02629
2,0.10944,1315.36078,0.653077,326.553231,7.258358,1.103384,2.285265,0.781201,2,-1.331976,...,0.039326,0.185604,2455.501264,0.685868,726.483974,135.999178,0.527575,0.079525,0.807671,0.025371
3,0.294949,1885.865295,-0.384094,400.593715,10.510427,1.183209,2.869308,1.264878,2,1.316842,...,0.019045,0.461511,3650.585711,-0.376932,690.153455,95.703125,-0.154639,0.123387,-0.018563,0.049699
4,-0.131708,1472.965682,0.333442,240.199445,20.731936,0.876369,4.204318,0.544919,2,0.352991,...,0.002197,-0.349541,3170.343655,0.417666,649.128638,117.453835,0.144632,0.080679,0.511944,0.022168


In [10]:
# Save the feature table to CSV; index=False leaves the row index out of the file
df_features.to_csv('../data/GTZAN_Features.csv', index=False)