<a href="https://colab.research.google.com/github/franciscobarber/notebooks/blob/sound/AE_CS_P.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the spoken-digit recordings repository; later cells read the WAV files
# from /content/free-spoken-digit-dataset/recordings/.
!git clone -l -s https://franciscobarber@github.com/franciscobarber/free-spoken-digit-dataset.git

In [None]:
!pip install "torch>=2.0<3.0" #2.0.1
!pip install pesq
!pip install "torchmetrics>=1.0<2.0" #1.0.2
!pip install "keras>=2.0<3.0" #2.12.0
!pip install "tensorflow>=2.0<3.0" #2.12.0
!pip install git+https://github.com/yoyololicon/spectrogram-inversion

In [None]:
import torch
from torchmetrics.audio import PerceptualEvaluationSpeechQuality
from torch_specinv import griffin_lim, L_BFGS, RTISI_LA
import keras
from keras import layers
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
# Smoke test: compute narrow-band PESQ between two random 8000-sample signals
# to check that torchmetrics and its PESQ backend are installed and working.
import torch
from torchmetrics.audio import PerceptualEvaluationSpeechQuality
g = torch.manual_seed(1)  # fix the RNG so the two random signals are repeatable
preds = torch.randn(8000)
target = torch.randn(8000)
nb_pesq = PerceptualEvaluationSpeechQuality(8000, 'nb')  # 8000 Hz, narrow-band mode
nb_pesq(preds, target)

tensor(2.2076)

In [None]:
#@title Biblioteca espectrograma
%matplotlib inline
import IPython.display
from ipywidgets import interact, interactive, fixed

# Packages we're using
import numpy as np
import matplotlib.pyplot as plt
import copy
from scipy.io import wavfile
from scipy.signal import butter, lfilter
import scipy.ndimage
# Most of the Spectrograms and Inversion are taken from: https://gist.github.com/kastnerkyle/179d6e9a88202ab0a2fe


def butter_bandpass(lowcut, highcut, fs, order=5):
    """
    Design a Butterworth band-pass filter.

    Parameters
    ----------
    lowcut, highcut : float
        Lower and upper band edges, in the same unit as ``fs``.
    fs : float
        Sampling rate; band edges are normalised by the Nyquist rate ``fs / 2``.
    order : int
        Filter order passed to ``scipy.signal.butter``.

    Returns
    -------
    (b, a) : the filter coefficients returned by ``scipy.signal.butter``.
    """
    nyquist = 0.5 * fs
    band_edges = [lowcut / nyquist, highcut / nyquist]
    return butter(order, band_edges, btype="band")


def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """
    Band-pass filter ``data`` with a Butterworth filter.

    The coefficients come from ``butter_bandpass(lowcut, highcut, fs, order)``
    and are applied with ``scipy.signal.lfilter``; the filtered signal is
    returned.
    """
    numerator, denominator = butter_bandpass(lowcut, highcut, fs, order=order)
    return lfilter(numerator, denominator, data)


def overlap(X, window_size, window_step):
    """
    Slice a 1D signal into overlapping frames.

    Parameters
    ----------
    X : ndarray, shape=(n_samples,)
        Signal to cut into frames.
    window_size : int
        Frame length; must be even.
    window_step : int
        Hop between the starts of consecutive frames.

    Returns
    -------
    frames : ndarray, shape=(n_windows, window_size)
        One frame per row, copied from the zero-padded signal.

    Raises
    ------
    ValueError
        If ``window_size`` is odd.
    """
    if window_size % 2 != 0:
        raise ValueError("Window size must be even!")

    # Zero-pad up to the next multiple of window_size. When the length is
    # already a multiple, a full extra window of zeros is appended.
    pad_length = window_size - len(X) % window_size
    padded = np.hstack((X, np.zeros((pad_length))))

    n_windows = (len(padded) - window_size) // window_step
    frames = np.ndarray((n_windows, window_size), dtype=padded.dtype)

    # Copy each window into its own row.
    for row, begin in zip(frames, window_step * np.arange(n_windows)):
        row[:] = padded[begin:begin + window_size]

    return frames


def stft(
    X, fftsize=128, step=65, mean_normalize=True, real=False, compute_onesided=True
):
    """
    Compute the STFT of a 1D real-valued signal.

    Parameters
    ----------
    X : ndarray, shape=(n_samples,)
        Input signal. It is NOT modified (the mean is removed on a copy).
    fftsize : int
        Frame length handed to ``overlap`` and used as the FFT size.
    step : int
        Hop between consecutive frames.
    mean_normalize : bool
        Subtract the signal mean before framing.
    real : bool
        Use ``np.fft.rfft`` instead of ``np.fft.fft``.
    compute_onesided : bool
        Keep only the first ``fftsize // 2`` bins of each frame.

    Returns
    -------
    ndarray, shape=(n_windows, n_bins)
        Complex spectrum of each Hamming-windowed frame.
    """
    if real:
        local_fft = np.fft.rfft
        cut = -1
    else:
        local_fft = np.fft.fft
        cut = None
    if compute_onesided:
        # Takes precedence over the value chosen above.
        cut = fftsize // 2
    if mean_normalize:
        # Out-of-place on purpose: `X -= X.mean()` silently altered the
        # caller's array and raised a casting error for integer input.
        X = X - X.mean()

    X = overlap(X, fftsize, step)

    size = fftsize
    # Hamming window, written out explicitly.
    win = 0.54 - 0.46 * np.cos(2 * np.pi * np.arange(size) / (size - 1))
    X = X * win[None]
    X = local_fft(X)[:, :cut]
    return X


def pretty_spectrogram(d, log=True, thresh=5, fft_size=512, step_size=64):
    """
    Build a magnitude spectrogram of signal ``d``.

    log: when True, normalise the spectrogram to a maximum of 1, take log10
         and floor the result at ``-thresh``
    thresh: floor value; in the linear (log=False) case every magnitude below
            ``thresh`` is raised to ``thresh``
    fft_size, step_size: frame length and hop forwarded to ``stft``
    """
    magnitude = np.abs(
        stft(d, fftsize=fft_size, step=step_size, real=False, compute_onesided=True)
    )

    # Same equality test as before, so only a value equal to True selects the
    # log branch.
    if log == True:
        magnitude /= magnitude.max()  # volume normalize to max 1
        magnitude = np.log10(magnitude)
        floor = -thresh
    else:
        floor = thresh

    # Raise everything below the floor up to the floor.
    magnitude[magnitude < floor] = floor
    return magnitude


# Also mostly modified or taken from https://gist.github.com/kastnerkyle/179d6e9a88202ab0a2fe
def invert_pretty_spectrogram(
    X_s, log=True, fft_size=512, step_size=512 // 4, n_iter=10
):
    """
    Invert a spectrogram produced by ``pretty_spectrogram`` back to a waveform.

    Parameters
    ----------
    X_s : ndarray, shape=(n_windows, n_bins)
        One-sided magnitude spectrogram (log10 scale when ``log`` is true).
    log : bool
        Undo the log10 applied by ``pretty_spectrogram`` before inverting.
    fft_size : int
        FFT size used when the spectrogram was computed.
    step_size : int
        Hop between frames. The default used to be ``512 / 4`` (the float
        128.0), which cannot be used as an array size or slice bound
        downstream; it is now an integer and any value passed in is coerced
        to ``int``.
    n_iter : int
        Number of phase-estimation iterations.

    Returns
    -------
    ndarray
        Real-valued reconstructed signal.
    """
    step_size = int(step_size)

    if log:
        X_s = np.power(10, X_s)

    # Mirror the one-sided spectrum to rebuild a two-sided one.
    X_s = np.concatenate([X_s, X_s[:, ::-1]], axis=1)
    X_t = iterate_invert_spectrogram(X_s, fft_size, step_size, n_iter=n_iter)
    return X_t


def iterate_invert_spectrogram(X_s, fftsize, step, n_iter=10, verbose=False):
    """
    Iteratively estimate a signal whose STFT magnitude matches ``X_s``.

    Each pass inverts the current complex estimate, re-analyses the result
    with ``stft`` and keeps only its phase, which is recombined with the
    target magnitudes. The first pass starts from zero phase.

    Under MSR-LA License
    Based on MATLAB implementation from Spectrogram Inversion Toolbox
    References
    ----------
    D. Griffin and J. Lim. Signal estimation from modified
    short-time Fourier transform. IEEE Trans. Acoust. Speech
    Signal Process., 32(2):236-243, 1984.
    Malcolm Slaney, Daniel Naar and Richard F. Lyon. Auditory
    Model Inversion for Sound Separation. Proc. IEEE-ICASSP,
    Adelaide, 1994, II.77-80.
    Xinglei Zhu, G. Beauregard, L. Wyse. Real-Time Signal
    Estimation from Modified Short-Time Fourier Transform
    Magnitude Spectra. IEEE Transactions on Audio Speech and
    Language Processing, 08/2007.
    """
    # Small floor that keeps the phase normalisation away from division by 0.
    floor = np.max(X_s) / 1e8
    estimate = copy.deepcopy(X_s)
    for iteration in range(n_iter):
        if verbose:
            print("Runnning iter %i" % iteration)
        # Only the very first inversion discards the phase. The offset search
        # stays enabled on every pass (the MATLAB original disabled it after
        # the first one, but enabling it gave much better results here).
        signal = invert_spectrogram(
            estimate, step, calculate_offset=True, set_zero_phase=(iteration == 0)
        )
        reanalysis = stft(signal, fftsize=fftsize, step=step, compute_onesided=False)
        unit_phase = reanalysis / np.maximum(floor, np.abs(reanalysis))
        estimate = X_s * unit_phase[: len(X_s)]
    signal = invert_spectrogram(
        estimate, step, calculate_offset=True, set_zero_phase=False
    )
    return np.real(signal)


def invert_spectrogram(X_s, step, calculate_offset=True, set_zero_phase=True):
    """
    Overlap-add inversion of a two-sided spectrogram.

    Parameters
    ----------
    X_s : ndarray, shape=(n_windows, 2 * size)
        Spectrum of each frame; ``size`` is the synthesis window length.
    step : int
        Hop between consecutive frames in the output signal.
    calculate_offset : bool
        For every frame after the first, shift the frame by the lag returned
        by ``xcorr_offset`` before adding it to the output.
    set_zero_phase : bool
        Use only the real part of each spectral slice.

    Returns
    -------
    ndarray, shape=(n_windows * step + size,)
        Reconstructed signal, normalised by the accumulated window weight.

    Under MSR-LA License
    Based on MATLAB implementation from Spectrogram Inversion Toolbox
    References
    ----------
    D. Griffin and J. Lim. Signal estimation from modified
    short-time Fourier transform. IEEE Trans. Acoust. Speech
    Signal Process., 32(2):236-243, 1984.
    Malcolm Slaney, Daniel Naar and Richard F. Lyon. Auditory
    Model Inversion for Sound Separation. Proc. IEEE-ICASSP,
    Adelaide, 1994, II.77-80.
    Xinglei Zhu, G. Beauregard, L. Wyse. Real-Time Signal
    Estimation from Modified Short-Time Fourier Transform
    Magnitude Spectra. IEEE Transactions on Audio Speech and
    Language Processing, 08/2007.
    """
    size = int(X_s.shape[1] // 2)
    wave = np.zeros((X_s.shape[0] * step + size))
    # Getting overflow warnings with 32 bit...
    wave = wave.astype("float64")
    total_windowing_sum = np.zeros((X_s.shape[0] * step + size))
    # Hamming window, written out explicitly.
    win = 0.54 - 0.46 * np.cos(2 * np.pi * np.arange(size) / (size - 1))

    est_start = int(size // 2) - 1
    est_end = est_start + size
    for i in range(X_s.shape[0]):
        wave_start = int(step * i)
        wave_end = wave_start + size
        if set_zero_phase:
            spectral_slice = X_s[i].real + 0j
        else:
            # already complex
            spectral_slice = X_s[i]

        # Don't need fftshift due to different impl.
        wave_est = np.real(np.fft.ifft(spectral_slice))[::-1]
        if calculate_offset and i > 0:
            offset_size = size - step
            if offset_size <= 0:
                # The message used to contain "\%", an invalid escape sequence
                # that also printed a stray backslash.
                print(
                    "WARNING: Large step size >50% detected! "
                    "This code works best with high overlap - try "
                    "with 75% or greater"
                )
                offset_size = step
            offset = xcorr_offset(
                wave[wave_start : wave_start + offset_size],
                wave_est[est_start : est_start + offset_size],
            )
        else:
            offset = 0
        wave[wave_start:wave_end] += (
            win * wave_est[est_start - offset : est_end - offset]
        )
        total_windowing_sum[wave_start:wave_end] += win
    # The epsilon avoids division by zero where no window contributed.
    wave = np.real(wave) / (total_windowing_sum + 1e-6)
    return wave

def xcorr_offset(x1, x2):
    """
    Return the lag at which the cross-correlation of two signals peaks.

    Both inputs have their mean removed and are correlated in float32. The
    outermost ``len(x2) // 2`` lags on each side are excluded from the search.
    The result is the index of the best lag minus ``len(x1)``.

    Under MSR-LA License
    Based on MATLAB implementation from Spectrogram Inversion Toolbox
    References
    ----------
    D. Griffin and J. Lim. Signal estimation from modified
    short-time Fourier transform. IEEE Trans. Acoust. Speech
    Signal Process., 32(2):236-243, 1984.
    Malcolm Slaney, Daniel Naar and Richard F. Lyon. Auditory
    Model Inversion for Sound Separation. Proc. IEEE-ICASSP,
    Adelaide, 1994, II.77-80.
    Xinglei Zhu, G. Beauregard, L. Wyse. Real-Time Signal
    Estimation from Modified Short-Time Fourier Transform
    Magnitude Spectra. IEEE Transactions on Audio Speech and
    Language Processing, 08/2007.
    """
    reference = (x1 - x1.mean()).astype("float32")
    candidate = (x2 - x2.mean())[::-1].astype("float32")
    guard = len(x2) // 2

    # Convolving with the reversed signal yields the cross-correlation.
    scores = np.convolve(reference, candidate)
    # Rule out the lags at both edges.
    scores[:guard] = -1e30
    scores[-guard:] = -1e30
    return scores.argmax() - len(x1)

import scipy.io.wavfile as wav

### Parameters ###
# Settings for a demo round trip on one recording: waveform -> log spectrogram
# -> waveform. Only fft_size, step_size and spec_thresh are used by the calls
# in this cell.
fft_size = 512  # window size for the FFT
step_size = fft_size // 16  # distance to slide along the window (in time)
spec_thresh = 4  # threshold for spectrograms (lower filters out more noise)
lowcut = 500  # Hz # Low cut for our butter bandpass filter
highcut = 4000  # Hz # High cut for our butter bandpass filter
# For mels
n_mel_freq_components = 64  # number of mel frequency channels
shorten_factor = 10  # how much should we compress the x-axis (time)
start_freq = 50  # Hz # What frequency to start sampling our melS from
end_freq = 4000
audio_path='/content/free-spoken-digit-dataset/recordings/0_jackson_0.wav'
data_rate, data = wav.read(audio_path)
# astype() makes a float copy of the samples for the spectrogram.
wav_spectrogram = pretty_spectrogram(
data.astype("float64"),
fft_size=fft_size,
step_size=step_size,
log=True,
thresh=spec_thresh,
)

# Invert from the spectrogram back to a waveform
recovered_audio_orig = invert_pretty_spectrogram(
    wav_spectrogram, fft_size=fft_size, step_size=step_size, log=True, n_iter=10
)

In [None]:
from os import listdir
import scipy.io.wavfile as wav
from os.path import isfile, join
import librosa
import librosa.display
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

def create_specs(fft, time_long, step_size, log_ref, files_permutation):
  """
  Build the train/test spectrogram segments from every recording.

  Each WAV file is padded with low-level Gaussian noise to a multiple of
  ``time_long`` samples, transformed with a 256-point Hann STFT, converted
  with ``librosa.power_to_db`` and cut into one segment per ``time_long``
  samples.

  Parameters
  ----------
  fft, time_long, step_size : int
      Only used to size the empty seed arrays:
      ``(0, fft/2 + 1, time_long/step_size + 1)``. The STFT itself uses a
      fixed 256-sample window and torch's default hop.
  log_ref : float
      ``ref`` argument of ``librosa.power_to_db``.
  files_permutation : sequence of int
      Permutation of the file indices; file ``i`` goes to the training set
      when ``files_permutation[i] < ceil(0.8 * n_files)``, else to the test set.

  Returns
  -------
  (train_list, test_list) : ndarrays of shape (n_segments, n_freq, n_frames)
  """
  audio_dir='/content/free-spoken-digit-dataset/recordings/'
  file_names = [f for f in listdir(audio_dir) if isfile(join(audio_dir, f)) and '.wav' in f]

  # Collect segments in lists and concatenate once at the end; calling
  # np.append inside the loop re-copied everything gathered so far on every
  # file. The empty float64 seed keeps the result shape and dtype unchanged,
  # including when a split ends up empty.
  seed_shape = [0,int(fft/2)+1,int(time_long/step_size)+1]
  train_parts = [np.zeros(seed_shape)]
  test_parts = [np.zeros(seed_shape)]

  sp_sz = int(time_long)
  train_cutoff = np.ceil(len(file_names)*0.8)
  windowsize = 256
  window = torch.hann_window(windowsize)

  for i, file_name in enumerate(file_names):
    sample_rate, samples = wav.read(audio_dir + file_name)
    # Pad with noise so the length is a multiple of the segment size.
    samples = np.append(samples, np.random.randn(sp_sz-samples.shape[0]%sp_sz)*10, axis=0)

    y = torch.from_numpy(samples.astype("float32"))
    # return_complex=False is deprecated in torch; take the magnitude of the
    # complex STFT instead (phase is discarded either way).
    mag = torch.stft(y, windowsize, window=window, return_complex=True).abs()

    ms = np.expand_dims(librosa.power_to_db(mag.numpy(), ref=log_ref), axis=0)
    n_ms = samples.shape[0]//sp_sz
    # Split along the time axis into n_ms segments and stack them on axis 0.
    ms2 = np.concatenate(np.split(ms, n_ms, axis=2))

    if files_permutation[i] < train_cutoff:
      train_parts.append(ms2)
    else:
      test_parts.append(ms2)

  return np.concatenate(train_parts, axis=0), np.concatenate(test_parts, axis=0)

class specData():
  """Plain container for the spectrogram settings shared with the autoencoder."""

  def __init__(self,fft,time_long,fft_step_size_ratio,clip=1e0):
    """Store the FFT size, segment length, FFT/step ratio and scaling value."""
    self.fft, self.time_long = fft, time_long
    self.fft_step_size_ratio = fft_step_size_ratio
    self.clip = clip

  def set_clip(self,clip):
    """Replace the stored scaling value."""
    self.clip = clip

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Sat Jan 30 18:19:12 2021

@author: barberot
"""


from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import librosa
import librosa.display
import scipy.io.wavfile as wav
from os import listdir
from os.path import isfile, join
import keras
from keras.layers import Activation, Dense, Input, GaussianNoise
from keras.layers import Conv2D, Flatten, BatchNormalization, Dropout
from keras.layers import Reshape, Conv2DTranspose
from keras.models import Model
from keras import backend as K
from keras.datasets import mnist
import numpy as np
from keras import regularizers
np.random.seed(1337)
import time
from keras.models import model_from_json
import scipy
import sklearn
import torch
import torchmetrics
from torchmetrics.audio import PerceptualEvaluationSpeechQuality
class myautoencoder():
  """
  Dense-encoder autoencoder for fixed-size spectrogram segments.

  The encoder is a single linear Dense layer applied to the flattened
  segment. The decoder depends on ``mode``:

  * ``'lp'``   - one Dense layer back to the segment shape;
  * ``'mlp'``  - two Dense(500, relu) + BatchNormalization blocks, then the
                 Dense projection back to the segment shape;
  * ``'conv'`` - Dense projection followed by a stack of Conv2DTranspose
                 layers.

  In every mode the output goes through a sigmoid.
  """

  # Window length used for every torch.stft / griffin_lim call in this class.
  STFT_WINDOW = 256

  def __init__(self,compression_rate,spec, x_train, x_test, mode = 'conv', random_encoder = False, pretrain_encoder = False, encoder_w=None, std = 0, log_ref=1e-5):
    """
    Build the Keras model and store the scaled training/validation data.

    Parameters
    ----------
    compression_rate : float
        Latent size is ``int(compression_rate * spec.time_long)``.
    spec : specData
        Provides ``time_long`` and ``clip`` (the data is divided by ``clip``).
    x_train, x_test : ndarray, shape=(n, n_freq, n_frames)
        Spectrogram segments; reshaped to ``(n, n_freq, n_frames, 1)``.
    mode : {'conv', 'lp', 'mlp'}
        Decoder architecture, see the class docstring.
    random_encoder : bool
        Freeze the encoder layer at its initial weights.
    pretrain_encoder : bool
        Load ``encoder_w`` into the encoder layer and freeze it.
    encoder_w : list of ndarray or None
        Encoder weights used when ``pretrain_encoder`` is true.
    std : float
        Unused; kept for interface compatibility.
    log_ref : float
        Reference value for the dB conversions done at evaluation time.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported values.
    """
    if mode not in ('conv', 'lp', 'mlp'):
      raise ValueError("mode must be 'conv', 'lp' or 'mlp', got %r" % (mode,))
    self.mode = mode
    self.time_long = int(spec.time_long)
    # Hard-coded analysis settings; NOTE(review): these ignore spec.fft and
    # only match data created with fft=256, step_size=64, time_long=1023.
    self.fft = 256
    self.step_size = 64
    self.time = 1023
    self.clip = spec.clip
    self.encoder_w = encoder_w
    self.compression_rate = compression_rate
    self.log_ref = log_ref
    self.batch_size = 32

    image_size = x_train.shape
    self.x_train = np.reshape(x_train, [-1, image_size[1], image_size[2], 1])
    self.x_test = np.reshape(x_test, [-1, image_size[1], image_size[2], 1])
    self.x_train = self.x_train.astype('float16') / self.clip
    self.x_test = self.x_test.astype('float16') / self.clip

    # Network parameters
    input_shape = (image_size[1], image_size[2], 1)
    kernel_size = (int(10*self.fft/512),int(5*self.time/16))
    latent_dim = int(self.compression_rate*self.time_long)
    # Decoder number of CNN layers and filters per layer (applied reversed)
    layer_filters = [16, 32]

    # Shape the decoder's Dense projection is reshaped to.
    n_freq = int(self.fft/2)+1
    n_frames = int(self.time_long/self.step_size)+1
    if self.mode == 'conv':
      n_channels = int(self.time_long/(self.step_size*2))
    else:
      n_channels = 1

    # Encoder: flatten + one linear Dense layer.
    inputs = Input(shape=input_shape, name='encoder_input')
    self.layer = Dense(latent_dim, name='latent_vector')
    latent = self.layer(Flatten()(inputs))
    if pretrain_encoder:
      self.layer.set_weights(self.encoder_w)
      self.layer.trainable = False
    if random_encoder:
      self.layer.trainable = False
    encoder = Model(inputs, latent, name='encoder')

    # Decoder
    latent_inputs = Input(shape=(latent_dim,), name='decoder_input')
    x = latent_inputs
    if self.mode == 'mlp':
      # These hidden layers used to be attached to the flattened encoder
      # input and then thrown away, which made 'mlp' identical to 'lp'.
      # They now sit between the latent vector and the output projection.
      for _ in range(2):
        x = Dense(500, activation="relu")(x)
        x = BatchNormalization()(x)
    x = Dense(n_freq * n_frames * n_channels)(x)
    x = Reshape((n_freq, n_frames, n_channels))(x)

    if self.mode == 'conv':
      # The transposed-convolution stack is only part of the 'conv' decoder,
      # so it is only built in that mode.
      for filters in layer_filters[::-1]:
        x = Conv2DTranspose(filters=filters,
                            kernel_size=kernel_size,
                            strides=(1,1),
                            activation='relu',
                            padding='same')(x)
      x = Conv2DTranspose(filters=1,
                          kernel_size=kernel_size,
                          padding='same')(x)

    outputs = Activation('sigmoid', name='decoder_output')(x)
    decoder = Model(latent_inputs, outputs, name='decoder')

    # Autoencoder = Encoder + Decoder
    self.autoencoder = Model(inputs, decoder(encoder(inputs)), name='autoencoder')

  def train(self, epochs, loss):
    """
    Compile and fit the autoencoder on the stored training data.

    Returns ``(encoder_w, history)``. ``encoder_w`` is refreshed from the
    trained encoder layer only in 'lp' mode; otherwise it is the value given
    to the constructor.
    """
    self.epochs = epochs
    self.loss = loss
    # `learning_rate` replaces the deprecated `lr` keyword.
    optimizer = keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.95, beta_2=0.999, amsgrad=False)
    self.autoencoder.compile(loss=self.loss, optimizer=optimizer)

    history = self.autoencoder.fit(x = self.x_train,
                    y = self.x_train,
                    validation_data=(self.x_test,self.x_test), verbose=1,
                    epochs=self.epochs,
                    batch_size=self.batch_size)
    if self.mode == 'lp':
      self.encoder_w=self.layer.get_weights()

    return self.encoder_w, history

  def save_autoencoder(self,files_name):
    """Write the architecture to ``<files_name>.json`` and the weights to ``<files_name>.h5``."""
    with open(files_name+".json", "w") as json_file:
      json_file.write(self.autoencoder.to_json())
    # serialize weights to HDF5
    self.autoencoder.save_weights(files_name+".h5")
    print("Saved autoencoder model  to disk")

  def load_autoencoder(self,files_name):
    """Replace ``self.autoencoder`` with the model stored by ``save_autoencoder``."""
    # `with` closes the file even when reading or parsing fails.
    with open(files_name+".json", 'r') as json_file:
      loaded_model_json = json_file.read()
    self.autoencoder = model_from_json(loaded_model_json)
    # load weights into new model
    self.autoencoder.load_weights(files_name+".h5")
    print("Loaded model g2 from disk")

  def _decode(self, samples):
    """
    Run one recording through the autoencoder.

    Returns ``(padded_samples, decoded)`` where ``padded_samples`` is the
    input padded with noise to a multiple of ``time_long`` samples and
    ``decoded`` is the model output converted back with
    ``librosa.db_to_power``, shape ``(n_freq, n_frames_total)``.
    """
    sp_sz = int(self.time_long)
    samples = np.append(samples, np.random.randn(sp_sz-samples.shape[0]%sp_sz)*10, axis=0)

    y = torch.from_numpy(samples.astype("float32"))
    window = torch.hann_window(self.STFT_WINDOW)
    # return_complex=False is deprecated in torch; take the magnitude of the
    # complex STFT instead.
    mag = torch.stft(y, self.STFT_WINDOW, window=window, return_complex=True).abs()

    ms = librosa.power_to_db(mag.numpy(), ref=self.log_ref)
    n_ms = samples.shape[0]//sp_sz
    # Cut along time into n_ms segments and stack them as a batch.
    ms2 = np.concatenate(np.split(np.expand_dims(ms, axis=0), n_ms, axis=2))
    msrs = np.reshape(ms2/self.clip, [-1, ms2.shape[1], ms2.shape[2], 1])

    x_decoded = self.autoencoder.predict(msrs)
    # Re-join the decoded segments along the time axis and undo the scaling.
    pieces = np.split(x_decoded, x_decoded.shape[0], axis=0)
    joined = np.concatenate(pieces, axis=2)
    joined = np.reshape(joined, [joined.shape[1], joined.shape[2]])*self.clip
    decoded = librosa.core.db_to_power(joined, ref=self.log_ref)
    # Keep float32 so the dtype matches the float32 Hann window given to
    # griffin_lim, whatever NumPy's scalar promotion rules produced above.
    return samples, decoded.astype("float32")

  def audio_evaluation(self, num_audios, files_permutation):
    """
    Print average reconstruction metrics over held-out recordings.

    At most ``num_audios`` test files are evaluated. A file belongs to the
    test set when ``files_permutation[i] >= ceil(0.8 * n_files)`` for its
    index ``i`` in the directory listing, which is the same rule
    ``create_specs`` uses. The previous lookup
    ``file_names[files_permutation[i + cutoff]]`` selected a different set
    that mostly consisted of training files.

    Metrics: narrow-band PESQ, Pearson correlation of the spectrograms and of
    the waveforms, and the relative spectral error. Files whose reconstruction
    or scoring fails are reported and skipped.
    """
    audio_dir='/content/free-spoken-digit-dataset/recordings/'
    file_names = [f for f in listdir(audio_dir) if isfile(join(audio_dir, f)) and '.wav' in f]
    cutoff = np.ceil(len(file_names)*.8)
    test_files = [name for idx, name in enumerate(file_names)
                  if files_permutation[idx] >= cutoff]

    window = torch.hann_window(self.STFT_WINDOW)
    p = []
    pearson = []
    pearson_s = []
    SMSE = []

    for file_name in test_files[:num_audios]:
      sample_rate, samples = wav.read(audio_dir + file_name)
      samples, comp3 = self._decode(samples)

      try:
        yhat = griffin_lim(torch.from_numpy(comp3), maxiter=100, alpha=0.3, window=window)
        # Re-seed torch after each inversion, as the original code did.
        torch.manual_seed(1)
        reference = torch.from_numpy(samples)
        nb_pesq = PerceptualEvaluationSpeechQuality(8000, 'nb')
        yhatn = yhat.numpy()
        n_out = yhatn.shape[0]
        pesq_value = nb_pesq(yhat.type('torch.ShortTensor'), reference[:n_out]).float()

        b = pretty_spectrogram(samples[:n_out].astype("float32")
            ,fft_size=self.fft,step_size=self.step_size,log=False)
        c = pretty_spectrogram(yhatn.astype("float32")
            ,fft_size=self.fft,step_size=self.step_size,log=False)
        spectral_r = scipy.stats.pearsonr(b.reshape(-1),c.reshape(-1)).statistic
        waveform_r = scipy.stats.pearsonr(samples[:n_out],yhatn).statistic
        spectral_err = np.linalg.norm(b-c)/np.linalg.norm(b)
      except Exception as exc:
        # Best effort: report the failure instead of hiding it, then go on.
        print('Skipping %s: %s' % (file_name, exc))
        continue

      # Record all metrics together so every list has one entry per file
      # (PESQ used to be appended twice).
      p.append(pesq_value)
      pearson_s.append(spectral_r)
      pearson.append(waveform_r)
      SMSE.append(spectral_err)

    if not p:
      # Nothing to average; avoids a ZeroDivisionError below.
      print('No audio could be evaluated')
      return

    print('PESQ',sum(p)/len(p))
    print('PEARSON SPECTRAL',sum(pearson_s)/len(pearson_s))
    print('PEARSON',sum(pearson)/len(pearson))
    print('SMSE',sum(SMSE)/len(SMSE))

  def audio_hearing(self, audio_name):
    """
    Reconstruct one recording through the autoencoder.

    ``audio_name`` is a file name inside the recordings directory. Returns the
    reconstructed waveform as a NumPy array, or ``None`` (after printing the
    reason) when the spectrogram inversion fails.
    """
    audio_dir='/content/free-spoken-digit-dataset/recordings/'
    sample_rate, samples = wav.read(audio_dir + audio_name)
    samples, comp3 = self._decode(samples)

    window = torch.hann_window(self.STFT_WINDOW)
    try:
      yhat = griffin_lim(torch.from_numpy(comp3), maxiter=100, alpha=0.3, window=window)
    except Exception as exc:
      # Same best-effort contract as before (None on failure), but the cause
      # is no longer swallowed silently.
      print('Could not reconstruct %s: %s' % (audio_name, exc))
      return None
    return yhat.numpy()



In [None]:
# `audio` is only assigned further down, by the cell that calls
# mlp25.audio_hearing(...). Guard the lookup so Restart & Run All does not
# stop here with a NameError on a fresh kernel.
if "audio" in globals():
    print(audio.shape)

(6080,)


In [None]:
# Spectrogram settings and train/test data construction.
fft = 256
time_long = 1023  # samples per segment (int(time_long) is the segment size in create_specs)

step_size = 64
fft_step_size_ratio = int(fft/step_size)
log_ref = 5e-0  # `ref` value for the power_to_db / db_to_power conversions
audio_dir='/content/free-spoken-digit-dataset/recordings/'
file_names = [f for f in listdir(audio_dir) if isfile(join(audio_dir, f)) and '.wav' in f]
# Random permutation of the file indices; it decides the train/test split
# inside create_specs and is passed again to audio_evaluation.
files_permutation = np.random.permutation(len(file_names))

X_train, X_test = create_specs(fft=fft, time_long=time_long, step_size= step_size, log_ref= log_ref,files_permutation=files_permutation)
spec = specData(fft,time_long,fft_step_size_ratio)
# Scale factor: the training maximum rounded up; the autoencoder divides its
# inputs by spec.clip.
clip=np.ceil(np.amax(X_train))
spec.set_clip(clip)

Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:863.)
  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]


In [None]:
# Train the 'mlp' autoencoder; the latent size is int(compression_rate * time_long).
compression_rate = 0.125

mlp25 = myautoencoder(compression_rate= compression_rate, spec= spec,
                    x_train=X_train, x_test=X_test, mode='mlp',log_ref= log_ref)
# 100 epochs with mean-squared-error loss.
encoder_w, historymlp25 = mlp25.train(100, 'mse')
# NOTE(review): the file name says "625" while compression_rate is 0.125 — confirm the label.
mlp25.save_autoencoder("CR_625_mlp16_n")
#mlp25.load_autoencoder("CR_25_mlp_n")



#files_permutation=np.load('files_permutation.npy')

In [None]:
#mlp25.save_autoencoder("CR_675_mlp_n")
# Reconstruct one recording through the trained autoencoder. audio_hearing
# returns nothing (None) when the inversion raises, so `audio` may be None.
audio = mlp25.audio_hearing("9_george_0.wav")



100%|██████████| 200/200 [00:00<00:00, 354.09it/s, SC=-19.2, loss=1.05e+6]


In [None]:
# Save the reconstruction as an 8000 Hz WAV file (`wavfile` is scipy.io.wavfile).
# NOTE(review): `audio` is whatever audio_hearing returned; confirm its dtype
# and amplitude range are what the WAV consumer expects.
wavfile.write('nine.wav', 8000, audio)

In [None]:
# Play the reconstructed waveform inline at 8000 Hz.
import IPython.display as ipd




from IPython.display import Audio

Audio(audio, rate=8000)

In [None]:
# Print the averaged PESQ / Pearson / SMSE metrics for 10 recordings, reusing
# the permutation that defined the train/test split.
mlp25.audio_evaluation(10, files_permutation)