In [1]:
import librosa
import matplotlib.pyplot as plt
import numpy as np

In [2]:
hop_length = 512
frame_size = 512
audio_path = '/home/gab/Área de Trabalho/birdclef/train_audio/abethr1/XC128013.ogg'
x, sr = librosa.load(audio_path)

In [3]:
S = librosa.stft(x, n_fft=frame_size, hop_length=hop_length)

In [4]:
M = np.abs(S) # vetor de magnitudes
f_bins = librosa.fft_frequencies(sr=sr, n_fft = frame_size) # vetor de frequencias

In [5]:
def spectral_centroid(magnitude, frequencias):
    centroides = np.zeros(magnitude.shape[1])
    
    for i in range(magnitude.shape[1]):
        num = np.sum(frequencias*magnitude[:, i])
        den = np.sum(magnitude[:, i])
        
        if den != 0:
            centroides[i] = num / den
        else:
            centroides[i] = 0
    
    return centroides

spectral_centroid(M, f_bins)


array([4689.98411711, 2952.29914276, 3653.89310196, ..., 4690.53689301,
       2989.07597697, 3833.75657996])

In [6]:
librosa.feature.spectral_centroid(S=M, freq=f_bins)[0]

array([4689.98391365, 2952.29920089, 3653.89324244, ..., 4690.53678682,
       2989.07608219, 3833.75685536])

In [7]:
def spectral_rolloff(magnitude, freq, pcent = 0.85):
    n_frames = magnitude.shape[1]
    rolloff = np.zeros(n_frames)

    for i in range(n_frames):
        f_mag = magnitude[:, i]
        sum = np.sum(f_mag)
        accumulated_energy = 0.0

        for j, mag in enumerate(f_mag):
            accumulated_energy += mag
            if accumulated_energy >= pcent * sum:
                rolloff[i] = freq[j]
                break
    
    return rolloff

spectral_rolloff(M, f_bins)

array([8656.34765625, 7450.48828125, 7235.15625   , ..., 8957.8125    ,
       7881.15234375, 8096.484375  ])

In [8]:
librosa.feature.spectral_rolloff(S=M, freq=f_bins)

array([[8656.34765625, 7450.48828125, 7235.15625   , ..., 8957.8125    ,
        7881.15234375, 8096.484375  ]])

In [9]:
def spectral_flux(magnitude):
    normal_mag = librosa.amplitude_to_db(magnitude)
    n_frames = magnitude.shape[1]
    sf = np.zeros(n_frames)
    
    for i in range(n_frames):
        sf[i] = np.sum((normal_mag[:, i] - normal_mag[:, i-1])**2)
    return sf

spectral_flux(M)

array([24718.9765625 , 55315.4453125 , 35996.6484375 , ...,
       17806.90429688, 23791.92578125, 58441.8671875 ])

In [10]:
def spectral_flux(magnitude):
    normal_mag = librosa.amplitude_to_db(magnitude)
    diff = np.diff(normal_mag, axis=1)
    sf = np.sum(diff**2, axis=0)
    return sf

spectral_flux(M)

array([55315.445, 35996.65 , 21118.71 , ..., 17806.904, 23791.926,
       58441.867], dtype=float32)

In [23]:
def sign(arg):
    if arg >= 0:
        return 1
    return 0

def time_domain_zero_crossings(signal, frame_size):
    sign_array_function = np.vectorize(sign)
    sign_array = sign_array_function(signal)
    zt = np.zeros((len(signal) + frame_size - 1) // frame_size)
    
    for t in range(0, len(signal), frame_size):
        frame = signal[t : t + frame_size]
        diff = np.abs(np.diff(sign_array[t : t + frame_size]))
        crossings = np.sum(diff)
        zt[t // frame_size] = 0.5 * crossings
    
    return zt

time_domain_zero_crossings(x, frame_size)

array([104.5,  71.5,  74. , ...,  63. ,  47. ,   6.5])

In [33]:
def low_energy(signal, frame_size):
    rms = np.zeros((len(signal) + frame_size - 1) // frame_size)
    
    for t in range(0, len(signal), frame_size):
        frame = signal[t : t + frame_size]
        square = np.square(frame)
        mean_square = np.mean(square)
        rms[t // frame_size] = np.sqrt(mean_square)
    
    avg_rms = np.mean(rms)
    low_energy_count = np.sum(rms < avg_rms)
    total_windows = len(rms)
    low_energy_percentage = low_energy_count / total_windows * 100
    
    return low_energy_percentage

low_energy(x, frame_size)

87.2264631043257

In [14]:
mel_spec = librosa.feature.melspectrogram(y=x, sr=sr, n_fft=frame_size, hop_length=hop_length)
mfccs = librosa.feature.mfcc(S=librosa.power_to_db(mel_spec), n_mfcc=5)
mfccs

array([[-7.2047156e+02, -5.0806726e+02, -4.4377539e+02, ...,
        -4.7200241e+02, -4.1448135e+02, -5.9052924e+02],
       [ 0.0000000e+00,  7.7803917e+01,  1.4349536e+01, ...,
         1.5689369e+00,  6.1597099e+01,  2.8675579e+01],
       [ 0.0000000e+00,  3.1201195e+01,  4.0430775e-01, ...,
        -5.6525631e+00,  3.0952339e+01,  1.0882376e+01],
       [ 0.0000000e+00,  2.9159851e+01,  2.0130863e+01, ...,
         1.3992163e+01,  2.8165968e+01,  3.4795479e+01],
       [ 0.0000000e+00,  1.3773541e+01, -3.3239133e+00, ...,
         1.5036448e+01,  2.0941603e+01,  3.2032043e+01]], dtype=float32)

In [30]:
def texture_window(analysis_windows, n=43):
    
    w = np.lib.stride_tricks.sliding_window_view(analysis_windows, window_shape=(n,))
    means = np.mean(w, axis=1)
    variances = np.var(w, axis=1)
    
    return np.mean(means), np.var(variances)

In [39]:
#label = "label"
y, sr = librosa.load(audio_path)
S = librosa.stft(y, n_fft=frame_size, hop_length=hop_length)
M = np.abs(S) # vetor de magnitudes
f_bins = librosa.fft_frequencies(sr=sr, n_fft = frame_size) # vetor de frequencias

features = np.array([])

mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=frame_size, hop_length=hop_length)

mfccs = librosa.feature.mfcc(S=librosa.power_to_db(mel_spec), n_mfcc=5)
s_centroid = spectral_centroid(M, f_bins)
s_rolloff = spectral_rolloff(M, f_bins)
s_flux = spectral_flux(M)
zc = time_domain_zero_crossings(y, frame_size)
low_energy_feature = low_energy(y, frame_size)

for mfcc in mfccs:
    texture_mean, texture_var = texture_window(mfcc)
    features = np.append(features, texture_mean)
    features = np.append(features, texture_var)

centroid_mean, centroid_var = texture_window(s_centroid)
features = np.append(features, centroid_mean)
features = np.append(features, centroid_var)

rolloff_mean, rolloff_var = texture_window(s_rolloff)
features = np.append(features, rolloff_mean)
features = np.append(features, rolloff_var)

flux_mean, flux_var = texture_window(s_flux)
features = np.append(features, flux_mean)
features = np.append(features, flux_var)

zc_mean, zc_var = texture_window(zc)
features = np.append(features, zc_mean)
features = np.append(features, zc_var)

features = np.append(features, low_energy_feature)

#features = np.append(features, label)

features

array([-4.52008209e+02,  9.21541094e+04,  3.86678755e-01,  7.01175078e+04,
       -1.32696609e+01,  4.41765742e+04,  4.12331486e+00,  1.23858750e+04,
        5.51054478e-01,  4.69406484e+04,  4.21191507e+03,  6.79293907e+10,
        8.01719779e+03,  6.09582370e+12,  1.76443066e+04,  4.55652544e+14,
        7.01478008e+01,  1.28515989e+04,  8.72264631e+01])