In [73]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Gaussian blur a tensor using 2D convolution and extract predominant peaks
    # Define the Gaussian kernel
def gaussian_blur(x, sigma):
    kernel_size = 2 * int(2 * sigma) + 1
    kernel = torch.tensor([torch.exp(torch.tensor(-0.5 * (i - kernel_size // 2) ** 2 / sigma ** 2)) for i in range(kernel_size)])
    kernel = kernel / kernel.sum()
    kernel = kernel.view(1, 1, 1, -1)
    kernel = kernel.repeat(x.size(1), 1, 1, 1)
    kernel = kernel.to(x.device, x.dtype)

    # Blur the tensor by convolving it with the Gaussian kernel
    x = F.pad(x, (kernel_size // 2, kernel_size // 2, 0, 0), mode='reflect')
    x = F.conv2d(x, kernel, stride=1, groups=x.size(1))

    # max pooling with stride 1 to find predominant peaks
    x = F.max_pool2d(x, kernel_size=kernel_size, padding=kernel_size // 2)
    
    return x


# From 2-d tensor get a mask of local maxima. Apply the mask to the tensor
def local_maxima(x, window_size):
    # Find the maximum value in each local window
    x_local_max = F.max_pool2d(x, kernel_size=window_size, stride=1, padding=window_size // 2)
    
    # Generate a binary mask of local maxima
    mask = (x == x_local_max).float()
    
    # Apply the mask to the tensor
    x = x * mask
    
    return x

In [71]:
import librosa
import numpy as np
import matplotlib.pyplot as plt

y, sr = librosa.load(librosa.ex("nutcracker"))
# Random snippet of the audio
r = np.random.randint(0, len(y)-sr)
y = y[r:r+2*sr]
spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=64)
spec = librosa.power_to_db(spec, ref=np.max)

peaks = gaussian_blur(torch.tensor(spec).unsqueeze(0).unsqueeze(0).float(), sigma=0.7)

plt.figure(figsize=(10, 4))
plt.imshow(spec, aspect='auto', origin='lower')
plt.colorbar()
plt.title('Original Spectrogram')
plt.show()

plt.figure(figsize=(10, 4))
plt.imshow(peaks.squeeze().detach().numpy(), aspect='auto', origin='lower')
plt.colorbar()
plt.title('Blurred spectrogram')
plt.show()


RuntimeError: The size of tensor a (89) must match the size of tensor b (91) at non-singleton dimension 3