In [None]:
import torch
import torchaudio
import torchaudio.functional as F
import librosa
import librosa.display
import os
from src.config_defaults import *
import IPython.display
import numpy as np

%load_ext autoreload
%autoreload 2

# Make every relative path used below resolve against the project work dir.
os.chdir(PATH_WORK_DIR)

# Load one clip from PATH_IRMAS_TEST; sr=None keeps the file's native sampling rate.
audio_path = str(Path(PATH_IRMAS_TEST) / "(02) dont kill the whale-1.wav")
audio, sr = librosa.load(path=audio_path, sr=None)

In [None]:
import matplotlib.pyplot as plt

def play_audio(audio, rate):
    """Show an inline audio player for ``audio`` sampled at ``rate`` Hz."""
    player = IPython.display.Audio(data=audio, rate=rate)
    IPython.display.display(player)
    
def plot_audio_and_spectrogram(audio=None, spectrogram=None, sr=16_000):
    """Plot a waveform and its log-frequency dB spectrogram.

    Either input may be omitted; the missing one is derived from the other.

    Args:
        audio: waveform samples as accepted by ``librosa.stft``, or None to
            reconstruct the waveform from ``spectrogram`` via ``librosa.istft``.
        spectrogram: STFT matrix as accepted by ``librosa.istft``, or None to
            compute it from ``audio`` via ``librosa.stft``.
        sr: sampling rate in Hz used to scale the axes of both plots.

    Raises:
        ValueError: if both ``audio`` and ``spectrogram`` are None.
    """
    if audio is None and spectrogram is None:
        raise ValueError("Provide at least one of `audio` or `spectrogram`.")
    if audio is None:
        # No waveform means no known target length, so let istft infer the
        # length from the frames (the old `len(audio)` crashed on None).
        audio = librosa.istft(spectrogram)
    if spectrogram is None:
        spectrogram = librosa.stft(audio)

    librosa.display.waveshow(y=audio, sr=sr)
    # Flush the waveform figure so the spectrogram is drawn on a fresh one.
    plt.show()
    magnitude = np.abs(spectrogram)
    reference_power = np.max(magnitude)
    # Pass sr explicitly; specshow otherwise assumes 22050 Hz for its axes.
    librosa.display.specshow(
        librosa.amplitude_to_db(magnitude, ref=reference_power),
        sr=sr,
        y_axis='log',
        x_axis='time',
    )
    
print(audio.shape)

# Seeded generator so the synthetic noise (and the plot) is reproducible.
rng = np.random.default_rng(0)
noise = rng.normal(0, 0.05, audio.shape[0])

audio_with_noise = audio + noise

# Noisy waveform first, clean waveform drawn on top of it for comparison.
librosa.display.waveshow(y=audio_with_noise, sr=sr)
librosa.display.waveshow(y=audio, sr=sr)
# Close this figure so the later waveform plot does not land on the same axes.
plt.show()

# A bare Audio(...) expression in the middle of a cell is never rendered,
# so go through play_audio to actually display the player.
play_audio(audio, sr)

# Round-trip through STFT -> inverse STFT and listen to the reconstruction.
reconstructed_audio = librosa.istft(librosa.stft(audio), length=len(audio))
play_audio(reconstructed_audio, sr)

# Pass the clip's own sampling rate instead of the helper's 16 kHz default.
plot_audio_and_spectrogram(audio, sr=sr)
# librosa.segment.cross_similarity(audio, reconstructed_audio)

In [None]:

# Split the STFT of the clip into harmonic and percussive components.
spectrogram = librosa.stft(audio)
spectrogram_harmonic, spectrogram_percussive = librosa.decompose.hpss(spectrogram, margin=5)

# Pre-compute a global reference power from the input spectrum so that all
# three panels share one dB scale.
reference_power = np.max(np.abs(spectrogram))

fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)

panels = (
    ('Full spectrogram', spectrogram),
    ('Harmonic spectrogram', spectrogram_harmonic),
    ('Percussive spectrogram', spectrogram_percussive),
)
meshes = []
for panel_ax, (panel_title, panel_spec) in zip(ax, panels):
    # sr is passed explicitly; specshow otherwise assumes 22050 Hz and
    # mislabels the time and frequency axes of a natively loaded clip.
    mesh = librosa.display.specshow(
        librosa.amplitude_to_db(np.abs(panel_spec), ref=reference_power),
        sr=sr,
        y_axis='log',
        x_axis='time',
        ax=panel_ax,
    )
    panel_ax.set(title=panel_title)
    # Hides inner tick labels; leaves the bottom panel's labels untouched.
    panel_ax.label_outer()
    meshes.append(mesh)

# The colorbar is driven by the full-spectrogram panel, as before.
img = meshes[0]
fig.colorbar(img, ax=ax)


# Back to the time domain, one player per component.
y_harmonic = librosa.istft(spectrogram_harmonic, length=len(audio))
play_audio(y_harmonic, sr)

y_percussive = librosa.istft(spectrogram_percussive, length=len(audio))
play_audio(y_percussive, sr)

In [None]:
# Time-stretch the clip with rate=2.0 (a rate above 1 speeds the signal up).
y_fast = librosa.effects.time_stretch(y=audio, rate=2.0)

In [None]:
from abc import ABC, abstractmethod

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision.transforms.functional as F
from torchaudio.transforms import (
    FrequencyMasking,
    MelScale,
    Spectrogram,
    TimeMasking,
    TimeStretch,
)
from transformers import ASTFeatureExtractor

import src.config_defaults as config_defaults
from src.utils_functions import EnumStr, MultiEnum
import matplotlib.pyplot as plt

sr_ours = 16_000
# One upper mel bound for both panels so their frequency axes line up.
# NOTE(review): assumes orig_sr >= 2 * mel_fmax for the first panel — confirm.
mel_fmax = 16_000

audio_path = str(Path(PATH_IRMAS_TEST, "01 - Canto das três raças-2.wav"))
audio, orig_sr = librosa.load(audio_path, sr=None)
audio_mono = librosa.to_mono(audio)

# Listen to the clip at its native rate, then after resampling to sr_ours.
play_audio(audio_mono, rate=orig_sr)
audio_resampled = librosa.resample(
    audio_mono,
    orig_sr=orig_sr,
    target_sr=sr_ours,
)
play_audio(audio_resampled, rate=sr_ours)


### PLOTS

fig, (ax_orig, ax_low) = plt.subplots(1, 2, figsize=(20, 6), dpi=80)

sr = orig_sr
D = np.abs(librosa.stft(audio_mono))**2  # power STFT; not used by the plots in this cell
# Build the filterbank with the same fmax the axis is drawn with; with the
# default upper bound (sr / 2) the mel axis labels would not match the data.
mel_spectro = librosa.feature.melspectrogram(y=audio_mono, sr=sr, n_mels=128, fmax=mel_fmax)
S_dB = librosa.power_to_db(mel_spectro, ref=np.max)
# Keep this panel's own mappable for the colorbar instead of reusing an
# unrelated `img` handle left over from another cell's figure.
mesh_orig = librosa.display.specshow(
    S_dB, x_axis='time', y_axis='mel', sr=sr, fmax=mel_fmax, ax=ax_orig
)
fig.colorbar(mesh_orig, ax=ax_orig, format='%+2.0f dB')
ax_orig.set_title(f"Mel spectrogram for {sr}hz")

### subsampled ###############################

sr = sr_ours
n_fft = 400
hop_length = 160
D = np.abs(librosa.stft(audio_resampled))**2  # power STFT; not used by the plots in this cell
# NOTE(review): mel_fmax is above sr / 2 here, so the top mel bands should carry
# no energy — this looks intentional for comparing the two panels; confirm.
mel_spectro = librosa.feature.melspectrogram(
    y=audio_resampled, sr=sr, n_mels=128, fmax=mel_fmax, hop_length=hop_length, n_fft=n_fft
)
S_dB = librosa.power_to_db(mel_spectro, ref=np.max)
# specshow must be told the hop length; it otherwise assumes 512 samples per
# frame and stretches the time axis of this 160-hop spectrogram.
mesh_low = librosa.display.specshow(
    S_dB, x_axis='time', y_axis='mel', sr=sr, fmax=mel_fmax, hop_length=hop_length, ax=ax_low
)
fig.colorbar(mesh_low, ax=ax_low, format='%+2.0f dB')
ax_low.set_title(f"Mel spectrogram for {sr}hz")
plt.show()




In [None]:
# """
# window_shift = int(sample_frequency * frame_shift * MILLISECONDS_TO_SECONDS) = 160
# window_size = int(sample_frequency * frame_length * MILLISECONDS_TO_SECONDS) = 400
# """
# Feature extractor loaded from the default pretrained AST checkpoint tag.
ast = ASTFeatureExtractor.from_pretrained(
    DEFAULT_AST_PRETRAINED_TAG,
    num_mel_bins=128,
    do_normalize=True,
)

def get_ast_spect(audio):
    """Run the module-level ``ast`` feature extractor on ``audio``.

    The waveform is always declared to the extractor as 16 kHz audio. The
    ``input_values`` entry of the result is returned as a NumPy array with
    singleton dimensions squeezed out.
    """
    extractor_kwargs = dict(
        sampling_rate=16_000,
        return_tensors="np",
        do_normalize=False,
    )
    encoded = ast(audio, **extractor_kwargs)
    return encoded["input_values"].squeeze()
    

# NOTE(review): `audio` was loaded at its native rate while get_ast_spect always
# declares 16 kHz — presumably a deliberate comparison with the resampled
# clip; confirm.
spectrogram = get_ast_spect(audio)
spectrogram_low = get_ast_spect(audio_resampled)
print("max values", spectrogram.max(), spectrogram_low.max())

# Frame shift of the AST front end in samples (160 at 16 kHz).
ast_hop_length = 160

ast_panels = (
    (spectrogram, f"Mel spectrogram for {sr}hz"),
    (spectrogram_low, "Mel spectrogram for low"),
)
for features, panel_title in ast_panels:
    # The extractor yields (frames, mel bins) while specshow expects
    # (mel bins, frames), hence the transpose. The returned mappable feeds
    # the colorbar instead of a stale `img` from another cell's figure.
    mesh = librosa.display.specshow(
        features.T,
        x_axis='time',
        y_axis='mel',
        sr=sr,
        fmax=16_000,
        hop_length=ast_hop_length,
    )
    plt.colorbar(mesh, format='%+2.0f dB')
    plt.title(panel_title)
    plt.show()


In [None]:
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
import torch
from torchaudio.transforms import (
    TimeStretch,
)
from torchaudio.transforms import Spectrogram

def plot_spectrogram_torch(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=None):
    """Show a power spectrogram in dB as an image with a colorbar.

    Args:
        spec: power spectrogram handed to ``librosa.power_to_db``.
        title: figure title; a generic one is used when empty or None.
        ylabel: label of the vertical axis.
        aspect: aspect setting forwarded to ``imshow``.
        xmax: optional right limit of the frame axis; ignored when falsy.
    """
    figure, axis = plt.subplots(1, 1)
    heading = title if title else "Spectrogram (db)"
    axis.set_title(heading)
    axis.set_ylabel(ylabel)
    axis.set_xlabel("frame")
    spec_db = librosa.power_to_db(spec)
    image = axis.imshow(spec_db, origin="lower", aspect=aspect)
    if xmax:
        axis.set_xlim((0, xmax))
    figure.colorbar(image, ax=axis)
    # Non-blocking so the call returns immediately.
    plt.show(block=False)
    
waveform, sample_rate = torchaudio.load(audio_path, normalize=True)
# Down-mix to mono while keeping a leading channel dimension.
waveform = torch.mean(waveform, dim=0).unsqueeze(dim=0)
print(waveform.size())

stft_size = 800
stft_hop = stft_size // 2      # same hop torchaudio would pick by default
stft_bins = stft_size // 2 + 1  # number of frequency bins of an 800-point STFT

# TimeStretch runs a phase vocoder, so it needs the complex STFT (power=None);
# power spectrograms are derived from it afterwards for plotting.
pyspec = Spectrogram(n_fft=stft_size, hop_length=stft_hop, power=None)
spec_complex = pyspec(waveform)
# TimeStretch is configured through n_freq (bin count); it has no n_fft argument.
ts = TimeStretch(hop_length=stft_hop, n_freq=stft_bins)
spec = spec_complex.abs().pow(2)
spec_ts = ts(spec_complex, 0.5).abs().pow(2)

# plot_spectrogram_torch(spec)


stretch_panels = (
    (spec, f"Spectrogram for {sample_rate}hz"),
    (spec_ts, f"Time-stretched spectrogram for {sample_rate}hz"),
)
for power_spec, panel_title in stretch_panels:
    # These are linear-frequency STFT bins, so draw a linear axis scaled by
    # the file's own sample rate and the hop used above. The returned
    # mappable feeds the colorbar (no stale `img` handle).
    mesh = librosa.display.specshow(
        librosa.power_to_db(power_spec[0].numpy()),
        x_axis='time',
        y_axis='linear',
        sr=sample_rate,
        hop_length=stft_hop,
    )
    plt.colorbar(mesh, format='%+2.0f dB')
    plt.title(panel_title)
    plt.show()


