# Investigating RAVE's autoencoder

In [None]:
import torch
import numpy as np
import librosa
from IPython.display import Audio, display

In [None]:
# Target sampling rate handed to librosa; the file is resampled to it on load.
resample_rate = 48000
# One sample from the processed NUS prediction set.
samplesound_path = '../data/processed/nus/predict/x/09_1.wav'
# `sr` is the sampling rate librosa reports for the returned signal.
waveform, sr = librosa.load(
    samplesound_path,
    sr=resample_rate,
)

In [None]:
# Prepend a channel axis, then stack the signal on itself along that axis
# to obtain a two-channel copy for playback.
waveform = np.expand_dims(waveform, 0)
waveform_st = np.concatenate((waveform, waveform))

In [None]:
# Play back the duplicated-channel input signal.
display(Audio(data=waveform_st, rate=sr))

In [None]:
# TorchScript export of the pre-trained autoencoder (VCTK checkpoint).
ae_path = '../models/pre-trained/VCTK.ts'
ae_model = torch.jit.load(
    ae_path,
)

In [None]:
def reconstruct1(autoencoder, waveform, blocksize = 262144, limit_segment = 5):
    """Run ``waveform`` through ``autoencoder`` one fixed-size block at a time.

    The signal is cut into consecutive, non-overlapping blocks of
    ``blocksize`` samples. At most ``limit_segment`` blocks are processed;
    trailing samples that do not fill a whole block are dropped.

    Args:
        autoencoder: Callable applied to each block. It receives a tensor of
            shape ``(1, channels, blocksize)`` and its result is written into
            the output buffer (broadcast across channels if it has fewer).
        waveform: NumPy array of shape ``(channels, samples)``. A 1-D array
            of shape ``(samples,)`` is also accepted and treated as mono.
        blocksize: Number of samples per block.
        limit_segment: Maximum number of blocks to process. Values below
            zero are treated as zero.

    Returns:
        NumPy array holding the reconstructed blocks. The buffer is allocated
        with twice the input channel count and then squeezed, so a mono input
        yields shape ``(2, n_blocks * blocksize)``.
    """
    x = torch.from_numpy(waveform)
    if x.dim() == 1:
        # Accept a bare mono signal instead of failing on the missing
        # channel axis further down.
        x = x.unsqueeze(0)
    x = x.unsqueeze(0)  # add the batch axis -> (1, channels, samples)

    available_blocks = x.size(2) // blocksize
    # Clamp at zero: a negative limit would otherwise turn into a negative
    # slice bound and return a zero-filled buffer that was never processed.
    n_blocks = max(0, min(limit_segment, available_blocks))
    x = x[:, :, : n_blocks * blocksize]

    # Output buffer has twice the input channels.
    # NOTE(review): presumably because the exported model emits stereo for a
    # mono input -- confirm against the model's actual output shape.
    y_pred = x.new_zeros((x.size(0), 2 * x.size(1), x.size(2)))

    # Gradients are never needed here, so disable them once for the whole loop.
    with torch.no_grad():
        for block_index in range(n_blocks):
            start = block_index * blocksize
            stop = start + blocksize
            y_pred[:, :, start:stop] = autoencoder(x[:, :, start:stop])

    return torch.squeeze(y_pred).detach().numpy()

In [None]:
# Reconstruct the sample in blocks of 32768 (2**15) samples, at most 40 blocks.
y1_pred = reconstruct1(
    autoencoder=ae_model,
    waveform=waveform,
    blocksize=32768,
    limit_segment=40,
)

In [None]:
# Play back the autoencoder's reconstruction.
display(Audio(data=y1_pred, rate=sr))

# Comments
- When trying out VCTK audio, the reconstruction sounds reasonable, but it still has quite a few artifacts.
- The RAVE autoencoder is able to avoid clicks at block boundaries during block-wise processing, probably due to its use of `cached_conv`.
- For the NUS wav, the audio reconstruction sounds weird; this is somewhat expected, as the model was trained only on VCTK audio.