# Investigating pre-trained audio autoencoders

https://github.com/archinetai/archisound

In [None]:
import numpy as np
import torch
import librosa
from archisound import ArchiSound
from IPython.display import Audio, display

# Load an audio file

In [None]:
# Relative path of the recording that every experiment in this notebook loads.
samplesound_path: str = '../data/raw/nus-48e/ADIZ/sing/09.wav'

In [None]:
# Sample rate (Hz) requested from librosa when loading the recording.
resample_rate: int = 48_000

In [None]:
# Load the recording at the requested rate; `sr` is the rate librosa reports back.
waveform, sr = librosa.load(path=samplesound_path, sr=resample_rate)

In [None]:
# Playback widget for the untouched source recording.
display(Audio(data=waveform, rate=sr))

In [None]:
# Inspect the array shape before adding a channel axis.
waveform.shape

In [None]:
# Give the waveform a leading axis of length 1 (one row holding all samples).
# reshape(1, -1) is used instead of expand_dims so that re-running this cell is
# harmless: expand_dims would stack one more axis on every execution.
waveform = waveform.reshape(1, -1)
waveform.shape

In [None]:
# Stack the waveform on top of itself along axis 0, giving two identical rows.
waveform_st = np.concatenate((waveform,) * 2, axis=0)
waveform_st.shape

# Prep Autoencoders

We will try out the three pre-trained models from archisound: autoencoder1d-AT-v1, dmae1d-ATC32-v3, and dmae1d-ATC64-v2.

In [None]:
def reconstruct1(autoencoder, waveform, blocksize = 262144, limit_segment = 5, **decode_kwargs):
    """Encode and decode `waveform` block by block and return the reconstruction.

    The input is cut into consecutive, non-overlapping blocks of `blocksize`
    samples. Each block goes through `autoencoder.encode` and then
    `autoencoder.decode`, and the decoded block is written back at the same
    offset. Samples after the last complete block are dropped.

    Args:
        autoencoder: Object exposing `encode(x)` and `decode(z, **kwargs)`.
        waveform: 2-D NumPy array whose second axis is split into blocks
            (the notebook passes a (channels, samples) array).
        blocksize: Number of samples per block.
        limit_segment: Maximum number of blocks to process. Negative values
            are treated as 0 instead of yielding a silent all-zero output.
        **decode_kwargs: Extra keyword arguments forwarded unchanged to
            `autoencoder.decode` (for example `num_steps=20`).

    Returns:
        NumPy array with the decoded audio, `n_blocks * blocksize` samples long
        on its last axis. `torch.squeeze` removes every length-1 axis, so the
        batch axis added here (and any other singleton axis) is not present.
    """
    with torch.no_grad():
        # Prepend a batch axis of size 1: (rows, samples) -> (1, rows, samples).
        x = torch.from_numpy(waveform).unsqueeze(0)
        available_blocks = x.size(2) // blocksize
        # Clamp so that a negative limit cannot produce a negative slice bound.
        n_blocks = max(0, min(limit_segment, available_blocks))
        total_samples = n_blocks * blocksize
        x = x[:, :, :total_samples]
        y_pred = torch.zeros_like(x)
        for start in range(0, total_samples, blocksize):
            stop = start + blocksize
            # Original author's note on the latent shape: [1, 32, 8192]
            z_segment = autoencoder.encode(x[:, :, start:stop])
            y_pred[:, :, start:stop] = autoencoder.decode(z_segment, **decode_kwargs)
        return torch.squeeze(y_pred).detach().numpy()

In [None]:
def reconstruct2(autoencoder, waveform, blocksize = 262144, limit_segment = 5, num_steps = 20):
    """Block-wise reconstruction for decoders that take a `num_steps` argument.

    The input is cut into consecutive, non-overlapping blocks of `blocksize`
    samples. Each block is encoded with `autoencoder.encode` and decoded with
    `autoencoder.decode(z, num_steps=num_steps)`; the decoded block is written
    back at the same offset. Samples after the last complete block are dropped.

    Args:
        autoencoder: Object exposing `encode(x)` and `decode(z, num_steps=...)`.
        waveform: 2-D NumPy array whose second axis is split into blocks
            (the notebook passes a (channels, samples) array).
        blocksize: Number of samples per block.
        limit_segment: Maximum number of blocks to process. Negative values
            are treated as 0 instead of yielding a silent all-zero output.
        num_steps: Value passed to `decode` as `num_steps`; defaults to the
            previously hard-coded 20.

    Returns:
        NumPy array with the decoded audio, `n_blocks * blocksize` samples long
        on its last axis. `torch.squeeze` removes every length-1 axis, so the
        batch axis added here (and any other singleton axis) is not present.
    """
    with torch.no_grad():
        # Prepend a batch axis of size 1: (rows, samples) -> (1, rows, samples).
        x = torch.from_numpy(waveform).unsqueeze(0)
        available_blocks = x.size(2) // blocksize
        # Clamp so that a negative limit cannot produce a negative slice bound.
        n_blocks = max(0, min(limit_segment, available_blocks))
        total_samples = n_blocks * blocksize
        x = x[:, :, :total_samples]
        y_pred = torch.zeros_like(x)
        for start in range(0, total_samples, blocksize):
            stop = start + blocksize
            z_segment = autoencoder.encode(x[:, :, start:stop])
            y_pred[:, :, start:stop] = autoencoder.decode(z_segment, num_steps=num_steps)
        return torch.squeeze(y_pred).detach().numpy()

## autoencoder1d-AT-v1 Reconstruction

In [None]:
# Fetch the pretrained model and put it in evaluation mode in one expression.
autoencoder = ArchiSound.from_pretrained('autoencoder1d-AT-v1').eval()
autoencoder.zero_grad()

In [None]:
# Reconstruct with the function's default block size and block limit.
y1_pred = reconstruct1(autoencoder=autoencoder, waveform=waveform_st)

In [None]:
# Listen to the reconstruction.
display(Audio(data=y1_pred, rate=sr))

# comments:
- click artifacts at the start of the block sample
- reconstructed sound is not great (autoencoder1d-AT-v1)

## dmae1d-ATC32-v3 Reconstruction

In [None]:
# Replace the current model with the pretrained dmae1d-ATC32-v3 checkpoint, in evaluation mode.
autoencoder = ArchiSound.from_pretrained('dmae1d-ATC32-v3').eval()
autoencoder.zero_grad()

In [None]:
# Reconstruct with the default block size and block limit, then listen.
y2_pred = reconstruct2(autoencoder=autoencoder, waveform=waveform_st)
display(Audio(data=y2_pred, rate=sr))

# comments:
- click artifacts at the start of the block sample
- reconstructed sound is very bad (dmae1d-ATC32-v3)

## dmae1d-ATC64-v2 Reconstruction

In [None]:
# Replace the current model with the pretrained dmae1d-ATC64-v2 checkpoint, in evaluation mode.
autoencoder = ArchiSound.from_pretrained("dmae1d-ATC64-v2").eval()
autoencoder.zero_grad()

In [None]:
# Reconstruct with the default block size and block limit, then listen.
y3_pred = reconstruct2(autoencoder=autoencoder, waveform=waveform_st)
display(Audio(data=y3_pred, rate=sr))

# comments:
- click artifacts at the start of the block sample
- reconstructed sound is very bad (dmae1d-ATC64-v2)

Findings: autoencoder1d-AT-v1 seems to be the best-sounding of the three models.

# Investigations with varying blocksize
The start of each block is expected to produce a click; we just need to judge the quality while ignoring the clicks.

In [None]:
# Reload autoencoder1d-AT-v1, since `autoencoder` was rebound to other models above.
autoencoder = ArchiSound.from_pretrained('autoencoder1d-AT-v1').eval()
autoencoder.zero_grad()

In [None]:
# 2**17 = 131072 samples per block; 10 blocks cap the input at 1,310,720 samples.
y4_pred = reconstruct1(
    autoencoder,
    waveform_st,
    blocksize=2 ** 17,
    limit_segment=10,
)
display(Audio(data=y4_pred, rate=sr))

Sounds okay

In [None]:
# 2**16 = 65536 samples per block; 20 blocks cap the input at 1,310,720 samples.
y5_pred = reconstruct1(
    autoencoder,
    waveform_st,
    blocksize=2 ** 16,
    limit_segment=20,
)
display(Audio(data=y5_pred, rate=sr))

Sounds okay

In [None]:
# 2**15 = 32768 samples per block; 40 blocks cap the input at 1,310,720 samples.
y6_pred = reconstruct1(
    autoencoder,
    waveform_st,
    blocksize=2 ** 15,
    limit_segment=40,
)
display(Audio(data=y6_pred, rate=sr))

Sounds okay

In [None]:
# 2**14 = 16384 samples per block; 80 blocks cap the input at 1,310,720 samples.
y7_pred = reconstruct1(
    autoencoder,
    waveform_st,
    blocksize=2 ** 14,
    limit_segment=80,
)
display(Audio(data=y7_pred, rate=sr))

slight high frequency noise

In [None]:
# 2**13 = 8192 samples per block; 160 blocks cap the input at 1,310,720 samples.
y8_pred = reconstruct1(
    autoencoder,
    waveform_st,
    blocksize=2 ** 13,
    limit_segment=160,
)
display(Audio(data=y8_pred, rate=sr))

high frequency noise, degraded audio

In [None]:
# 2**12 = 4096 samples per block; 320 blocks cap the input at 1,310,720 samples.
y9_pred = reconstruct1(
    autoencoder,
    waveform_st,
    blocksize=2 ** 12,
    limit_segment=320,
)
display(Audio(data=y9_pred, rate=sr))

high frequency noise, very degraded audio

# Investigating oversampling audio reconstruction

In [None]:
# Reload the same file at 96 kHz and rebuild the two-row array.
# Note: this rebinds resample_rate, waveform, sr and waveform_st for all later cells.
resample_rate = 96_000
waveform, sr = librosa.load(path=samplesound_path, sr=resample_rate)
waveform = waveform[np.newaxis, ...]
waveform_st = np.concatenate((waveform,) * 2, axis=0)

In [None]:
# Playback widget for the reloaded two-row input.
display(Audio(data=waveform_st, rate=sr))

In [None]:
# 2**15 = 32768 samples per block, at most 80 blocks.
y10_pred = reconstruct1(
    autoencoder,
    waveform_st,
    blocksize=2 ** 15,
    limit_segment=80,
)
display(Audio(data=y10_pred, rate=sr))

In [None]:
# 2**14 = 16384 samples per block, at most 80 blocks.
y11_pred = reconstruct1(
    autoencoder,
    waveform_st,
    blocksize=2 ** 14,
    limit_segment=80,
)
display(Audio(data=y11_pred, rate=sr))

In [None]:
# Reload the same file at 192 kHz and rebuild the two-row array.
# Note: this rebinds resample_rate, waveform, sr and waveform_st for all later cells.
resample_rate = 192_000
waveform, sr = librosa.load(path=samplesound_path, sr=resample_rate)
waveform = waveform[np.newaxis, ...]
waveform_st = np.concatenate((waveform,) * 2, axis=0)

In [None]:
# 2**15 = 32768 samples per block, at most 80 blocks.
y12_pred = reconstruct1(
    autoencoder,
    waveform_st,
    blocksize=2 ** 15,
    limit_segment=80,
)
display(Audio(data=y12_pred, rate=sr))

It kind of worked for oversampled audio while keeping the input block size at 32768 samples, which is about 0.17 s at a 192000 Hz sample rate.
A block size of 32768 samples at 48000 Hz is 0.68 s, which might hurt plugin performance.