In [17]:
import pandas as pd
import numpy as np
import librosa
import librosa.display
from python_speech_features import sigproc
from scipy.fftpack import dct
import os

In [10]:
def read_fma_audio(path):
    """Walk ``path`` recursively and describe every ``.mp3`` / ``.wav`` file found.

    :param path: root directory to scan; sub-directories are visited via ``os.walk``.
    :returns: a list with one dict per matching file, holding the keys
        ``'file_path'`` (directory joined with the file name), ``'duration'``
        (the value returned by ``librosa.get_duration``) and ``'format'``
        (the text after the last ``.`` in the file name).
    """
    audio_extensions = (".mp3", ".wav")
    records = []
    for folder, _subfolders, filenames in os.walk(path):
        for filename in filenames:
            if not filename.endswith(audio_extensions):
                continue
            full_path = os.path.join(folder, filename)
            # sr=None is passed so librosa does not resample to its default rate
            samples, native_rate = librosa.load(full_path, sr=None)
            length = librosa.get_duration(y=samples, sr=native_rate)
            records.append({
                'file_path': full_path,
                'duration': length,
                'format': filename.rsplit('.', 1)[-1],
            })
    return records

In [11]:
# Root folder of the audio sample; every .mp3/.wav below it is scanned.
dataset_path = 'noob_sample'
audios = read_fma_audio(path=dataset_path)

In [12]:
# One row per scanned audio file; columns come from the keys of each record dict.
audio_df = pd.DataFrame(data=audios)
audio_df.head(n=5)

Unnamed: 0,file_path,duration,format
0,noob_sample/000/000498.mp3,29.976576,mp3
1,noob_sample/000/000549.mp3,29.976576,mp3
2,noob_sample/000/000568.mp3,29.976576,mp3
3,noob_sample/000/000517.mp3,30.002698,mp3
4,noob_sample/000/000540.mp3,29.976576,mp3


## Metadata ##

In [3]:
# Each metadata table is loaded right after its path is declared.
# Except for genres, the first CSV column is used as the row index and the
# leading header rows are combined into a column MultiIndex.

# tracks: two header rows
tracks_file = 'fma_metadata/tracks.csv'
tracks_df = pd.read_csv(tracks_file, index_col=0, header=[0, 1])

# genres: flat table, default header and index
genres_file = 'fma_metadata/genres.csv'
genres_df = pd.read_csv(genres_file)

# features: three header rows
features_file = 'fma_metadata/features.csv'
features_df = pd.read_csv(features_file, index_col=0, header=[0, 1, 2])

# echonest: three header rows
echonest_file = 'fma_metadata/echonest.csv'
echonest_df = pd.read_csv(echonest_file, index_col=0, header=[0, 1, 2])


In [4]:
# Preview the first five rows of the track metadata.
tracks_df.head(n=5)

Unnamed: 0_level_0,album,album,album,album,album,album,album,album,album,album,...,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,tags,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,[],...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
3,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,[],...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
5,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,[],...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
10,0,2008-11-26 01:45:08,2008-02-06 00:00:00,,4,6,,47632,,[],...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
20,0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,[],...,,978,en,Attribution-NonCommercial-NoDerivatives (aka M...,361,,3,,[],Spiritual Level


In [5]:
# Preview the first five rows of the genre table.
genres_df.head(n=5)

Unnamed: 0,genre_id,#tracks,parent,title,top_level
0,1,8693,38,Avant-Garde,38
1,2,5271,0,International,2
2,3,1752,0,Blues,3
3,4,4126,0,Jazz,4
4,5,4106,0,Classical,5


In [6]:
# Preview the first five rows of the precomputed feature table.
features_df.head(n=5)

feature,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,7.180653,5.230309,0.249321,1.34762,1.482478,0.531371,1.481593,2.691455,0.866868,1.341231,...,0.054125,0.012226,0.012111,5.75889,0.459473,0.085629,0.071289,0.0,2.089872,0.061448
3,1.888963,0.760539,0.345297,2.295201,1.654031,0.067592,1.366848,1.054094,0.108103,0.619185,...,0.063831,0.014212,0.01774,2.824694,0.466309,0.084578,0.063965,0.0,1.716724,0.06933
5,0.527563,-0.077654,-0.27961,0.685883,1.93757,0.880839,-0.923192,-0.927232,0.666617,1.038546,...,0.04073,0.012691,0.014759,6.808415,0.375,0.053114,0.041504,0.0,2.193303,0.044861
10,3.702245,-0.291193,2.196742,-0.234449,1.367364,0.998411,1.770694,1.604566,0.521217,1.982386,...,0.074358,0.017952,0.013921,21.434212,0.452148,0.077515,0.071777,0.0,3.542325,0.0408
20,-0.193837,-0.198527,0.201546,0.258556,0.775204,0.084794,-0.289294,-0.81641,0.043851,-0.804761,...,0.095003,0.022492,0.021355,16.669037,0.469727,0.047225,0.040039,0.000977,3.189831,0.030993


In [7]:
# Preview the first five rows of the Echonest table.
echonest_df.head(n=5)

Unnamed: 0_level_0,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest
Unnamed: 0_level_1,audio_features,audio_features,audio_features,audio_features,audio_features,audio_features,audio_features,audio_features,metadata,metadata,...,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features
Unnamed: 0_level_2,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence,album_date,album_name,...,214,215,216,217,218,219,220,221,222,223
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,0.416675,0.675894,0.634476,0.010628,0.177647,0.15931,165.922,0.576661,,,...,-1.992303,6.805694,0.23307,0.19288,0.027455,0.06408,3.67696,3.61288,13.31669,262.929749
3,0.374408,0.528643,0.817461,0.001851,0.10588,0.461818,126.957,0.26924,,,...,-1.582331,8.889308,0.258464,0.220905,0.081368,0.06413,6.08277,6.01864,16.673548,325.581085
5,0.043567,0.745566,0.70147,0.000697,0.373143,0.124595,100.26,0.621661,,,...,-2.288358,11.527109,0.256821,0.23782,0.060122,0.06014,5.92649,5.86635,16.013849,356.755737
10,0.95167,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.96359,2008-03-11,Constant Hitmaker,...,-3.662988,21.508228,0.283352,0.26707,0.125704,0.08082,8.41401,8.33319,21.317064,483.403809
134,0.452217,0.513238,0.56041,0.019443,0.096567,0.525519,114.29,0.894072,,,...,-1.452696,2.356398,0.234686,0.19955,0.149332,0.0644,11.26707,11.20267,26.45418,751.147705


## Feature Extraction ##

In [18]:
def calculate_nfft(sample_rate, window_len):
    """Return the smallest power of two that is >= the window length in samples.

    :param sample_rate: sample rate of the signal.
    :param window_len: analysis window length in seconds.
    :returns: an int power of two; 1 when the window spans one sample or less.
    """
    samples_per_window = sample_rate * window_len
    size = 1
    # keep doubling until the FFT size covers a whole window
    while size < samples_per_window:
        size <<= 1
    return size

def hz2mel(hz):
    """Map a frequency in Hertz onto the Mel scale.

    :param hz: frequency in Hz; a np array is converted element-wise.
    :returns: the corresponding value in Mels, with the same shape as the input.
    """
    scaled = hz / 700.
    return np.log10(scaled + 1) * 2595

def mel2hz(mel):
    """Map a Mel-scale value back to a frequency in Hertz.

    :param mel: value in Mels; a np array is converted element-wise.
    :returns: the corresponding frequency in Hz, with the same shape as the input.
    """
    exponent = mel / 2595.0
    return (10 ** exponent - 1) * 700


def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
    """Build a Mel filterbank matrix with one filter per row and one FFT bin per column.

    Each filter rises linearly from its left edge bin to its centre bin and
    falls linearly from the centre bin to its right edge bin; the edges are
    spaced evenly on the Mel scale between lowfreq and highfreq.

    :param nfilt: number of filters in the bank, default 20.
    :param nfft: FFT size, default 512.
    :param samplerate: sample rate of the signal in Hz; used to map Hz to FFT bins.
    :param lowfreq: lowest band edge of the filters in Hz, default 0.
    :param highfreq: highest band edge in Hz; samplerate/2 is used when it is None (or 0).
    :returns: a np array of shape (nfilt, nfft//2 + 1) holding the filters.
    :raises AssertionError: if highfreq is greater than samplerate/2.
    """
    highfreq = highfreq or samplerate/2
    assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"

    # nfilt filters need nfilt+2 edge points, evenly spaced in Mels
    mel_edges = np.linspace(hz2mel(lowfreq), hz2mel(highfreq), nfilt+2)
    # convert the edges back to Hz and then to (floored) FFT bin numbers
    fft_bins = np.floor((nfft+1)*mel2hz(mel_edges)/samplerate)

    filters = np.zeros([nfilt, nfft//2+1])
    for row in range(nfilt):
        left, centre, right = fft_bins[row], fft_bins[row+1], fft_bins[row+2]
        # rising slope: left edge up to (but excluding) the centre bin
        for col in range(int(left), int(centre)):
            filters[row, col] = (col - left) / (centre - left)
        # falling slope: centre bin down to (but excluding) the right edge
        for col in range(int(centre), int(right)):
            filters[row, col] = (right - col) / (right - centre)
    return filters

def fbank(signal, samplerate=16000, winlen=0.025, winstep=0.01,
          nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
          winfunc=lambda x: np.ones((x,))):
    """Compute Mel-filterbank energies and per-frame total energy for a signal.

    :param signal: the audio signal to analyse.
    :param samplerate: sample rate of the signal in Hz.
    :param winlen: analysis window length in seconds.
    :param winstep: step between successive windows in seconds.
    :param nfilt: number of Mel filters.
    :param nfft: FFT size.
    :param lowfreq: lowest band edge of the Mel filters in Hz.
    :param highfreq: highest band edge in Hz; samplerate/2 when None (or 0).
    :param preemph: coefficient handed to ``sigproc.preemphasis``.
    :param winfunc: window function handed to ``sigproc.framesig``.
    :returns: a tuple ``(feat, energy)``: the power spectrum projected onto the
        filterbank, and the power spectrum summed along axis 1. Exact zeros in
        both are replaced by machine epsilon so a later log stays finite.
    """
    highfreq = highfreq or samplerate / 2
    tiny = np.finfo(float).eps

    # pre-emphasise, cut into frames, then take each frame's power spectrum
    emphasized = sigproc.preemphasis(signal, preemph)
    frame_len = winlen * samplerate
    frame_step = winstep * samplerate
    frames = sigproc.framesig(emphasized, frame_len, frame_step, winfunc)
    power_spectrum = sigproc.powspec(frames, nfft)

    # total energy per frame
    frame_energy = np.sum(power_spectrum, 1)
    frame_energy = np.where(frame_energy == 0, tiny, frame_energy)

    # energy captured by each Mel filter
    mel_filters = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    band_energy = np.dot(power_spectrum, mel_filters.T)
    band_energy = np.where(band_energy == 0, tiny, band_energy)

    return band_energy, frame_energy

def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
             nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
             winfunc=lambda x:np.ones((x,))):
    """Return the natural log of the Mel-filterbank energies of an audio signal.

    All arguments are forwarded unchanged to ``fbank``; the per-frame total
    energy that ``fbank`` also returns is discarded.

    :param signal: the audio signal from which to compute features.
    :param samplerate: sample rate of the signal in Hz.
    :param winlen: analysis window length in seconds, default 0.025 (25 ms).
    :param winstep: step between successive windows in seconds, default 0.01 (10 ms).
    :param nfilt: number of filters in the filterbank, default 26.
    :param nfft: FFT size, default 512.
    :param lowfreq: lowest band edge of the Mel filters in Hz, default 0.
    :param highfreq: highest band edge of the Mel filters in Hz, default samplerate/2.
    :param preemph: pre-emphasis coefficient, default 0.97.
    :param winfunc: analysis window applied to each frame; the default is an
        all-ones (rectangular) window. e.g. winfunc=np.hamming
    :returns: ``np.log`` of the filterbank energies returned by ``fbank``.
    """
    mel_energies, _ = fbank(signal, samplerate, winlen, winstep, nfilt, nfft,
                            lowfreq, highfreq, preemph, winfunc)
    return np.log(mel_energies)

In [19]:
def mfcc(signal, sample_rate= 16000, window_len= 0.025, hop_step= 0.01, num_ceptra= 13, nfilter= 26, nfft= None, low_frequency= 0, high_frequency= None, preemphasis= 0.97, ceplifter=22, appendEnergy= True, window_func= lambda x: np.ones((x,))):
    """Compute MFCC features from an audio signal.

    The signal is turned into Mel-filterbank energies by ``fbank``; their
    natural log is decorrelated with an orthonormal type-2 DCT and the first
    ``num_ceptra`` coefficients of every frame are kept.

    signal; audio signal from which to compute features.
    sample_rate; sample rate in Hz of the signal.
    window_len; length of the analysis window in seconds.
    hop_step; step between successive windows in seconds.
    num_ceptra; number of cepstral coefficients to keep per frame. The DCT
        output has ``nfilter`` columns, so larger values return ``nfilter`` columns.
    nfilter; the number of filters in the Mel filterbank.
    nfft; FFT size. When None it is derived with ``calculate_nfft`` so that it
        covers a whole analysis window at the given sample rate.
    low_frequency; lowest band edge of the Mel filters in Hz.
    high_frequency; highest band edge of the Mel filters in Hz (``fbank``
        falls back to sample_rate/2 when it is None).
    preemphasis; pre-emphasis coefficient forwarded to ``fbank``.
    ceplifter; accepted for interface compatibility but NOT applied: this
        implementation performs no liftering of the cepstral coefficients.
    appendEnergy; if True, the first cepstral coefficient of every frame is
        replaced by the log of that frame's total energy.
    window_func; the analysis window to apply to each frame.

    returns; a 2-D np array with one row per frame and up to ``num_ceptra`` columns.
    """
    # Only the FFT size is conditional. The rest of the pipeline must run
    # whether or not the caller supplied nfft; previously it was nested under
    # this `if`, so an explicit nfft raised UnboundLocalError on `return feat`.
    if nfft is None:
        nfft = calculate_nfft(sample_rate, window_len)

    feat, energy = fbank(signal, sample_rate, window_len, hop_step, nfilter, nfft,
                         low_frequency, high_frequency, preemphasis, window_func)
    feat = np.log(feat)
    feat = dct(feat, type=2, axis=1, norm='ortho')[:, :num_ceptra]
    if appendEnergy:
        # replace first cepstral coefficient with log of frame energy
        feat[:, 0] = np.log(energy)
    return feat
        

In [23]:
def _mfcc_from_path(file_path):
    """Decode one audio file (sr=None, so no resampling is requested) and return its MFCC matrix."""
    samples, native_rate = librosa.load(file_path, sr=None)
    return mfcc(samples, native_rate)

# Every file is decoded again here; only paths and durations were kept by the earlier scan.
audio_df['mfcc_features'] = audio_df['file_path'].apply(_mfcc_from_path)

In [24]:
# Preview the first five rows, now including the per-file MFCC matrices.
audio_df.head(n=5)

Unnamed: 0,file_path,duration,format,mfcc_features
0,noob_sample/000/000498.mp3,29.976576,mp3,"[[-9.03683552756391, -5.362598577831097, 0.122..."
1,noob_sample/000/000549.mp3,29.976576,mp3,"[[-8.485954063460438, 7.236494273534735, 0.896..."
2,noob_sample/000/000568.mp3,29.976576,mp3,"[[-6.532540114163912, 7.612464399946411, 1.655..."
3,noob_sample/000/000517.mp3,30.002698,mp3,"[[-4.540820773706524, 7.36644395944907, -3.987..."
4,noob_sample/000/000540.mp3,29.976576,mp3,"[[-5.096697088478837, 7.950631824877814, -1.86..."
