In [13]:
from essentia.streaming import *
import essentia.standard as es
import essentia
import numpy as np

In [22]:
def melspectrogram(audio, sampleRate=16000, frameSize=512, hopSize=256,
                   window='hann', zeroPadding=0, center=True,
                   numberBands=(128, 96, 48, 32, 24, 16, 8),
                   lowFrequencyBound=0, highFrequencyBound=None,
                   weighting='linear', warpingFormula='slaneyMel', normalize='unit_tri'):
    """Compute mel-band frames of ``audio`` at several band resolutions.

    The signal is cut into frames, each frame is windowed and turned into a
    spectrum once, and that spectrum is then fed to one ``es.MelBands``
    instance per requested band count.

    Args:
        audio: input signal, passed as-is to ``es.FrameGenerator``.
        sampleRate: sample rate handed to ``es.MelBands``; also used to derive
            the default ``highFrequencyBound``.
        frameSize: frame length in samples.
        hopSize: hop between consecutive frames in samples.
        window: window type for ``es.Windowing``.
        zeroPadding: number of zeros ``es.Windowing`` appends to each frame.
        center: forwarded to ``es.FrameGenerator`` as ``startFromZero=not center``.
        numberBands: iterable of mel band counts. Repeated values are ignored
            (first occurrence wins) so that no frame is stored twice under the
            same key.
        lowFrequencyBound: lower frequency bound of the mel filterbank.
        highFrequencyBound: upper frequency bound of the mel filterbank;
            ``None`` means ``sampleRate / 2``.
        weighting, warpingFormula, normalize: forwarded to ``es.MelBands``.

    Returns:
        essentia.Pool: for every band count ``n`` two descriptors, each with
        one entry per frame:
        ``'mel_<n>_db'`` (mel bands through ``UnaryOperator(type='lin2db', scale=2)``)
        and ``'mel_<n>_log1+10kx'`` (mel bands through the identity operator
        configured with ``shift=1, scale=10000`` followed by ``log10``).
    """
    if highFrequencyBound is None:
        highFrequencyBound = sampleRate / 2

    # Drop repeated band counts while keeping the caller's order: a repeated
    # value would otherwise append every frame twice to the same pool key.
    bandCounts = list(dict.fromkeys(numberBands))

    # The spectrum of a (frameSize + zeroPadding)-sample frame has this many bins.
    spectrumSize = (frameSize + zeroPadding) // 2 + 1

    windowing = es.Windowing(type=window, normalized=False, zeroPadding=zeroPadding)
    spectrum = es.Spectrum()
    melbands = {
        nBands: es.MelBands(numberBands=nBands,
                            sampleRate=sampleRate,
                            lowFrequencyBound=lowFrequencyBound,
                            highFrequencyBound=highFrequencyBound,
                            inputSize=spectrumSize,
                            weighting=weighting,
                            normalize=normalize,
                            warpingFormula=warpingFormula,
                            type='power')
        for nBands in bandCounts
    }

    # Element-wise post-processing operators shared by all band resolutions.
    norm10k = es.UnaryOperator(type='identity', shift=1, scale=10000)
    log10 = es.UnaryOperator(type='log10')
    amp2db = es.UnaryOperator(type='lin2db', scale=2)

    results = essentia.Pool()

    for frame in es.FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize,
                                   startFromZero=not center):
        # The spectrum is computed once per frame and reused for every filterbank.
        spectrumFrame = spectrum(windowing(frame))

        for nBands, melbank in melbands.items():
            melFrame = melbank(spectrumFrame)
            prefix = 'mel_' + str(nBands)
            results.add(prefix + '_db', amp2db(melFrame))
            results.add(prefix + '_log1+10kx', log10(norm10k(melFrame)))

    return results


def cut_audio(filename, sampleRate=16000, segment_duration=None):
    """Load ``filename`` as mono audio and return its time-centered segment.

    Args:
        filename: path of the audio file, passed to ``es.MonoLoader``.
        sampleRate: sample rate requested from ``es.MonoLoader``; also used to
            convert ``segment_duration`` to a number of samples.
        segment_duration: length of the segment in seconds. ``None`` (or any
            other falsy value such as ``0``) returns the whole signal.

    Returns:
        The slice of the loaded signal that is ``segment_duration`` seconds
        long and centered in the file, or the full signal when no duration is
        given.

    Raises:
        ValueError: if ``segment_duration`` is negative or longer than the
            loaded audio.
    """
    audio = es.MonoLoader(filename=filename, sampleRate=sampleRate)()
    total_samples = len(audio)

    if segment_duration:
        # A negative duration used to slip through the bounds check below and
        # silently produce an empty slice; reject it explicitly instead.
        if segment_duration < 0:
            raise ValueError('Segment duration must not be negative')

        segment_samples = round(segment_duration * sampleRate)
        # Leave the same amount of audio (up to one sample) on both sides.
        segment_start = (total_samples - segment_samples) // 2
        segment_end = segment_start + segment_samples
    else:
        segment_start = 0
        segment_end = total_samples

    if segment_start < 0 or segment_end > total_samples:
        raise ValueError('Segment duration is larger than the input audio duration')

    return audio[segment_start:segment_end]


def analyze_mel(filename, segment_duration=None, maxFrequency=11025, replaygain=True,
                numberBands=(128, 96, 48, 32, 24, 16, 8)):
    """Extract log-compressed mel bands from a file with a streaming network.

    The file is first loaded completely to measure its duration and,
    optionally, its replay gain. A second, streaming pass then loads only the
    time-centered segment with that gain applied and computes, per frame,
    ``log10(1 + 10000 * melbands)`` for every requested number of bands.

    Args:
        filename: path of the audio file to analyze.
        segment_duration: length in seconds of the centered segment to
            analyze. ``None`` (or any falsy value) analyzes the whole file.
        maxFrequency: upper frequency bound of the mel filterbanks.
        replaygain: when truthy, the gain computed by ``es.ReplayGain`` on the
            whole file is applied; otherwise a fixed value of -6 is used.
        numberBands: iterable of mel band counts to compute. Repeated values
            are ignored. The default reproduces the original fixed set.

    Returns:
        essentia.Pool: one descriptor ``'mel<n>'`` per band count ``n``, with
        one entry per analyzed frame.

    Raises:
        ValueError: if ``segment_duration`` is negative or longer than the
            input audio.
    """
    # Every loader and filterbank below works at this rate; passing it
    # explicitly keeps the duration arithmetic and the algorithms in sync.
    sample_rate = 44100
    frame_size = 2048
    hop_size = 1024
    # Each frame is zero-padded to twice its length before the FFT.
    zero_padding = frame_size
    spectrum_size = (frame_size + zero_padding) // 2 + 1

    # Compute replay gain and duration on the entire file, then load the
    # segment that is centered in time with replaygain applied
    audio = es.MonoLoader(filename=filename, sampleRate=sample_rate)()
    total_duration = len(audio) / sample_rate

    if replaygain:
        gain_db = es.ReplayGain(sampleRate=sample_rate)(audio)
    else:
        gain_db = -6  # Default replaygain value in EasyLoader

    if segment_duration:
        if segment_duration < 0:
            raise ValueError('Segment duration must not be negative')
        # Compare the requested duration directly: re-deriving it from
        # start + duration can exceed the total by a rounding error.
        if segment_duration > total_duration:
            raise ValueError('Segment duration is larger than the input audio duration')
        segment_start = (total_duration - segment_duration) / 2
        segment_end = segment_start + segment_duration
    else:
        segment_start = 0
        segment_end = total_duration

    loader_mel = EasyLoader(filename=filename, sampleRate=sample_rate,
                            replayGain=gain_db,
                            startTime=segment_start, endTime=segment_end)

    # Shared front end: framing -> windowing (with zero padding) -> spectrum
    framecutter_mel = FrameCutter(frameSize=frame_size, hopSize=hop_size)
    window_mel = Windowing(type='hann', zeroPadding=zero_padding)
    spectrum_mel = Spectrum()

    p = essentia.Pool()

    loader_mel.audio >> framecutter_mel.signal
    framecutter_mel.frame >> window_mel.frame >> spectrum_mel.frame

    # One branch per band count: MelBands -> (1 + 10000 * x) -> log10 -> pool.
    # The algorithm objects are collected in a list so that they are
    # guaranteed to stay referenced until the network has finished running.
    branches = []
    for n_bands in dict.fromkeys(numberBands):
        melbands = MelBands(numberBands=n_bands,
                            sampleRate=sample_rate,
                            lowFrequencyBound=0,
                            highFrequencyBound=maxFrequency,
                            inputSize=spectrum_size)
        # Normalize Mel bands: log10(1+x*10000)
        norm = UnaryOperator(type='identity', shift=1, scale=10000)
        log10 = UnaryOperator(type='log10')

        spectrum_mel.spectrum >> melbands.spectrum
        melbands.bands >> norm.array >> log10.array >> (p, 'mel' + str(n_bands))

        branches.append((melbands, norm, log10))

    essentia.run(loader_mel)

    return p

In [23]:
# Run the streaming extractor on the example file; with the default
# segment_duration=None the whole file is analyzed.
pool = analyze_mel('576923.mp3')

In [24]:
# Take the 48-band descriptor and transpose it so the band axis comes first
# (the shape printed further down is (48, n_frames)).
mel_ess = pool['mel48'].T

In [25]:
# Load the precomputed array to compare against — presumably the reference
# mel-spectrogram of the same track (same file stem); verify its provenance.
mel_melon = np.load('576923.npy')

In [26]:
# Shape of the loaded reference array
mel_melon.shape

(48, 1876)

In [27]:
# Shape of the Essentia result; the second dimension differs from the
# reference array above, so the two are not frame-aligned as computed here.
mel_ess.shape

(48, 2157)

In [28]:
# Inspect the reference values (all displayed entries are negative)
mel_melon

array([[ -7.887066 , -28.748133 , -18.874838 , ..., -18.29318  ,
        -20.040993 , -15.670321 ],
       [ -5.681465 , -25.458563 , -18.32325  , ..., -14.090767 ,
        -14.78177  , -12.724834 ],
       [ -1.1125975, -11.245893 , -11.762764 , ...,  -3.8788598,
         -5.2051177, -12.451994 ],
       ...,
       [-35.191166 , -27.399254 , -31.73117  , ..., -39.381298 ,
        -40.27823  , -39.283855 ],
       [-35.241993 , -28.77065  , -29.981926 , ..., -42.0037   ,
        -41.793472 , -41.58875  ],
       [-37.730556 , -41.596817 , -41.02214  , ..., -47.232635 ,
        -45.44609  , -45.186172 ]], dtype=float32)

In [29]:
# Inspect the Essentia values (all displayed entries are non-negative, so the
# two arrays are evidently on different scales)
mel_ess

array([[0.0000000e+00, 3.1063152e-07, 3.6240343e-07, ..., 3.1573690e-02,
        1.8236548e-02, 0.0000000e+00],
       [0.0000000e+00, 2.0708771e-07, 3.1063152e-07, ..., 1.7096120e-01,
        2.6123753e-02, 0.0000000e+00],
       [0.0000000e+00, 1.0354386e-07, 5.6949091e-07, ..., 2.2004661e-01,
        1.6786341e-02, 0.0000000e+00],
       ...,
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 2.8237587e-04,
        1.9672891e-05, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 3.5056160e-04,
        2.6040505e-05, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 5.3540588e-04,
        6.4813634e-05, 0.0000000e+00]], dtype=float32)