### Load audio, extract image for training

In [66]:
import numpy as np
import librosa, librosa.display
import matplotlib.pyplot as plt
import torch
import torchaudio
import soundfile as sf
import librosa
import numpy
import skimage.io

# Path of the audio sample to process, relative to the working directory.
file = "Samples/Bassdrums/mau5_kick_06_A#m.wav"

def scale_minmax(X, min=0.0, max=1.0):
    """Linearly rescale ``X`` so that its values span ``[min, max]``.

    The parameter names shadow the ``min``/``max`` builtins; they are kept
    because they are part of the keyword interface.

    Args:
        X: NumPy array to rescale.
        min: Value the smallest element of ``X`` is mapped to.
        max: Value the largest element of ``X`` is mapped to.

    Returns:
        Array with the same shape as ``X``. When every element of ``X`` is
        equal there is no range to stretch, so every element maps to ``min``.
    """
    lowest = X.min()
    span = X.max() - lowest
    if span == 0:
        # Flat input (e.g. a silent clip): normalising would compute 0/0 and
        # fill the result with NaNs, so return a constant array instead.
        return numpy.full(X.shape, float(min))
    X_std = (X - lowest) / span
    return X_std * (max - min) + min

def spectrogram_image(y, sr, out, hop_length, n_mels, n_fft=4096):
    """Render a log-mel spectrogram of ``y`` and save it as an 8-bit image.

    Args:
        y: Audio samples passed to ``librosa.feature.melspectrogram``.
        sr: Sample rate of ``y``.
        out: Destination passed to ``skimage.io.imsave``.
        hop_length: Number of samples between successive spectrogram frames.
        n_mels: Number of mel bands (the image height).
        n_fft: FFT window size. Defaults to 4096, the value that used to be
            hard-coded here.

    Returns:
        None. The image is written to ``out``.
    """
    # use log-melspectrogram
    mels = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels,
                                          n_fft=n_fft, hop_length=hop_length)
    mels = numpy.log(mels + 1e-9)  # add small number to avoid log(0)

    # min-max scale to fit inside 8-bit range
    img = scale_minmax(mels, 0, 255).astype(numpy.uint8)
    img = numpy.flip(img, axis=0)  # put low frequencies at the bottom in image
    img = 255 - img  # invert. make black==more energy

    # save the image
    skimage.io.imsave(out, img)


# Script/cell entry point: load the sample, cut a fixed-length window from it
# and compute the magnitude of its STFT.
# NOTE(review): `out` and `n_mels` are assigned below but never used in this
# block, and spectrogram_image() is not called here, so this block does not
# write 'out.png' -- confirm whether that call was dropped by accident.
if __name__ == '__main__':
    # settings
    hop_length = 1024 # number of samples per time-step in spectrogram
    n_mels = 128 # number of bins in spectrogram. Height of image
    time_steps = 384 # number of time-steps. Width of image

    # load audio from the module-level `file` path at a 44.1 kHz sample rate
    path = file
    y, sr = librosa.load(path, sr=44100)
    out = 'out.png'
    

    # extract a fixed length window
    start_sample = 0 # starting at beginning
    length_samples = time_steps*hop_length
    # slicing does not pad: if y is shorter than the window, sample is shorter too
    sample = y[start_sample:start_sample+length_samples]
    n_fft=4096
    stft = librosa.stft(sample, n_fft=n_fft, hop_length=hop_length)
    # calculate abs values on complex numbers to get magnitude
    spectrogram = np.abs(stft)
    

In [60]:
import imageio
# Read 'out.png' from the working directory into an array.
# NOTE(review): presumably this is the image rendered by spectrogram_image();
# confirm it exists before running this cell.
im = imageio.imread('out.png')


In [64]:
# `im` is not a magnitude STFT, so it cannot be fed to griffinlim directly:
# spectrogram_image() stored an inverted, vertically flipped, min-max-scaled
# 8-bit log-mel power spectrogram. Undo those steps in reverse order first.
log_mels = numpy.flip(255.0 - im.astype(numpy.float32), axis=0) / 255.0

# NOTE(review): min-max scaling discarded the absolute log range. The floor is
# assumed to be log(1e-9) (the epsilon added before the log) and the peak 0.0;
# store the real min/max alongside the image if faithful dynamics matter.
log_floor = numpy.log(1e-9)
mel_power = numpy.exp(log_mels * (0.0 - log_floor) + log_floor)

# Invert the mel power spectrogram (Griffin-Lim under the hood) with the same
# analysis settings that were used to render the image.
y_inv = librosa.feature.inverse.mel_to_audio(mel_power, sr=sr, n_fft=n_fft,
                                             hop_length=hop_length)

# The absolute level is unknown (see note above), so peak-normalise to keep
# the PCM output from clipping.
peak = numpy.max(numpy.abs(y_inv))
if peak > 0:
    y_inv = y_inv / peak

# Write at the sample rate the audio was loaded with; a different rate would
# shift the pitch and duration on playback.
sf.write('stereo_file1.wav', y_inv, sr, 'PCM_24')