# Exploring features from RVC HuBERT

## Noise scaling
Add random Gaussian noise to the speech features. Run the cell below, then use the slider to adjust the noise scale.

In [6]:
# Setup
from svc_helper.svc.rvc import RVCModel
from svc_helper.sfeatures.models import RVCHubertModel
from huggingface_hub import hf_hub_download
import librosa
import soundfile as sf
import torch
import IPython.display as ipd
import ipywidgets as widgets

# Clip that every inference cell below converts, and the model wrapper that does it.
input_path = 'test_speech.wav'
rvc_model = RVCModel()

# Download the demo voice checkpoint and its matching index file from the
# Hugging Face Hub.
test_model_path = hf_hub_download(
    repo_id='therealvul/RVCv2',
    filename='RainbowDashS1/RainbowDashS1.pth',
)
test_index_path = hf_hub_download(
    repo_id='therealvul/RVCv2',
    filename='RainbowDashS1/added_IVF1357_Flat_nprobe_1_RainbowDashS1_v2.index',
)

rvc_model.load_model(model_path=test_model_path, index_path=test_index_path)

# Slider read by the noise-scaling inference cell each time that cell runs.
noise_scale = widgets.FloatSlider(value=0.1, min=0, max=3.0)
display(noise_scale)

2024-07-18 14:47:27 | INFO | svc_helper.svc.rvc.modules.vc.modules | Get sid: D:\hf_cache\hub\models--therealvul--RVCv2\snapshots\87778762d011892db45370f0dd963be836d55a08\RainbowDashS1\RainbowDashS1.pth
2024-07-18 14:47:27 | INFO | svc_helper.svc.rvc.modules.vc.modules | Loading: D:\hf_cache\hub\models--therealvul--RVCv2\snapshots\87778762d011892db45370f0dd963be836d55a08\RainbowDashS1\RainbowDashS1.pth


FloatSlider(value=0.1, max=3.0)

In [8]:
# Infer and play audio
# The transform looks up noise_scale.value when it is called, so re-running
# this cell picks up the slider's current position.
wav_opt = rvc_model.infer_file(
    input_path,
    transpose=15,
    feature_transform=lambda feats: feats + noise_scale.value * torch.randn_like(feats),
)
ipd.Audio(wav_opt, rate=rvc_model.output_sample_rate())

2024-07-18 14:47:59 | INFO | svc_helper.svc.rvc.modules.vc.pipeline | Loading rmvpe model
  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


## Gaussian smoothing

In [111]:
import torch
import torch.nn.functional as F
import math

# Controls whose current values gaussian_smooth reads on every call.
slider_sigma = widgets.FloatSlider(
    description='Sigma', value=0.1, min=0, max=30.0,
)
slider_kernel_size = widgets.IntSlider(
    description='Kernel Size', value=5, min=1, max=15,
)
def gaussian_smooth(tensor, sigma=None, kernel_size=None):
    """Smooth features along the frame axis with a normalized 1D Gaussian kernel.

    Every feature channel is filtered independently. The sequence is
    reflect-padded first, so the output has the same shape as the input.

    Args:
        tensor: Feature tensor laid out as (batch, frames, channels). The
            number of frames must be larger than ``kernel_size // 2`` for the
            reflect padding to be valid.
        sigma: Standard deviation of the Gaussian, in frames. ``None`` (the
            default) reads the current value of ``slider_sigma``; pass a
            number to get a result that does not depend on widget state.
            A value of 0 disables smoothing.
        kernel_size: Number of kernel taps; an even value is bumped to the
            next odd number. ``None`` (the default) reads
            ``slider_kernel_size``.

    Returns:
        The smoothed tensor, with the same shape, dtype and device as
        ``tensor``. When sigma is 0 the input tensor itself is returned.
    """
    # Only consult the notebook sliders when the caller gave no explicit value,
    # so existing single-argument callers keep their interactive behaviour.
    if sigma is None:
        sigma = slider_sigma.value
    if kernel_size is None:
        kernel_size = slider_kernel_size.value

    if sigma == 0.0:
        return tensor

    # Ensure odd kernel size so the kernel has a well-defined centre tap
    kernel_size = kernel_size if kernel_size % 2 == 1 else kernel_size + 1
    half_width = kernel_size // 2

    # Sample the Gaussian at integer offsets centred on zero, then normalize
    # so the taps sum to one (a constant signal is left unchanged).
    offsets = torch.arange(-half_width, half_width + 1, dtype=torch.float32)
    kernel = torch.exp(-0.5 * (offsets / sigma) ** 2)
    kernel = kernel / kernel.sum()

    # conv1d weight layout is (out_channels, in_channels, taps); match the
    # input's dtype and device.
    kernel = kernel.view(1, 1, -1).to(tensor.dtype).to(tensor.device)

    # Fold batch and channel together so each (batch, channel) pair becomes an
    # independent single-channel sequence: (batch * channels, 1, frames).
    batch, frames, channels = tensor.shape
    signal = tensor.transpose(1, 2).reshape(batch * channels, 1, frames)

    # Reflect-pad the frame axis so the valid convolution returns `frames` samples
    signal = F.pad(signal, (half_width, half_width), mode='reflect')
    smoothed = F.conv1d(signal, kernel)

    # Restore the original (batch, frames, channels) layout
    return smoothed.reshape(batch, channels, frames).transpose(1, 2)

# Show both controls under this cell (sigma first, then kernel size).
display(slider_sigma, slider_kernel_size)


FloatSlider(value=0.1, description='Sigma', max=30.0)

IntSlider(value=5, description='Kernel Size', max=15, min=1)

In [115]:
# Convert the clip with Gaussian-smoothed features and play the result.
wav_opt = rvc_model.infer_file(
    input_path,
    transpose=15,
    feature_transform=gaussian_smooth,
)
ipd.Audio(wav_opt, rate=rvc_model.output_sample_rate())



## Laplacian filter

In [18]:
import torch.nn.functional as F
def laplacian_filter(tensor, kernel_size=3):
    """Filter ``tensor`` along dim 1 with a zero-sum, Laplacian-style kernel.

    The kernel is ``[1, -2, 1]`` for size 3. For any other size it is all
    ones with the centre tap set to ``1 - kernel_size``. Dim 1 is
    reflect-padded so the output keeps the input's shape.

    Args:
        tensor: 3D tensor. Dim 0 has to be 1, because after the permute it
            becomes the convolution's input-channel axis and the kernel has a
            single input channel. Dim 1 is the axis that gets filtered; each
            index along dim 2 is filtered independently.
        kernel_size: Number of kernel taps; an even value is raised to the
            next odd number.

    Returns:
        The filtered tensor, same shape, dtype and device as ``tensor``.
    """
    # Force an odd tap count so there is a single centre tap
    if kernel_size % 2 != 1:
        kernel_size += 1
    centre = kernel_size // 2

    if kernel_size == 3:
        # Classic second-difference stencil
        taps = torch.tensor([1, -2, 1]).float()
    else:
        # Wider approximation: ones everywhere, centre chosen so the taps sum to zero
        taps = torch.ones(kernel_size)
        taps[centre] = 1 - kernel_size

    # conv1d weight layout (out_channels, in_channels, taps), matched to the input
    weight = taps.reshape(1, 1, -1).to(tensor.dtype).to(tensor.device)

    # Reflect-pad dim 1 by half the kernel on each side; dim 2 is left unpadded
    padded = F.pad(tensor, (0, 0, centre, centre), mode='reflect')

    # Move dim 2 to the front so every index along it is convolved on its own
    per_channel = padded.permute(2, 0, 1)
    response = F.conv1d(per_channel, weight)

    # Undo the permute to get back the original axis order
    return response.permute(1, 2, 0)

In [19]:
# Subtract the Laplacian response (kernel size 5, weight 1.0) from the
# features before conversion, then play the result.
wav_opt = rvc_model.infer_file(
    input_path,
    transpose=15,
    feature_transform=lambda feats: feats - laplacian_filter(feats, 5) * 1.0,
)
ipd.Audio(wav_opt, rate=rvc_model.output_sample_rate())

  File "c:\Users\vul\AppData\Local\Programs\Python\Python310\lib\site-packages\svc_helper\svc\rvc\modules\vc\modules.py", line 186, in vc_single
    audio_opt = self.pipeline.pipeline(
  File "c:\Users\vul\AppData\Local\Programs\Python\Python310\lib\site-packages\svc_helper\svc\rvc\modules\vc\pipeline.py", line 503, in pipeline
    self.vc(
  File "c:\Users\vul\AppData\Local\Programs\Python\Python310\lib\site-packages\svc_helper\svc\rvc\modules\vc\pipeline.py", line 227, in vc
    feats = feature_transform(feats)
  File "C:\Users\vul\AppData\Local\Temp\ipykernel_18672\870044823.py", line 2, in <lambda>
    feature_transform=lambda t: t - laplacian_filter(t, 5)*1.0)
  File "C:\Users\vul\AppData\Local\Temp\ipykernel_18672\1141140495.py", line 17, in laplacian_filter
    padded_tensor = F.pad(tensor, (0, 0, padding, padding), mode='reflect')
NameError: name 'F' is not defined



ValueError: No audio data found. Expecting filename, url, or data.

## PCA perturbations

In [14]:
import torch
import numpy as np
from sklearn.decomposition import PCA

def pca_perturbation(tensor, n_components=50,
    apply_perturbations = [0]):
    """Shift features by fixed offsets along their leading principal components.

    A PCA is fitted on the rows of ``tensor`` itself (every leading dimension
    is flattened into rows; the last dimension is the feature axis). The
    projected scores are offset and mapped back to feature space. Because the
    features are rebuilt from only ``n_components`` components, the output is
    a low-rank reconstruction and can differ from the input even when every
    offset is zero.

    Side effect: prints the cumulative explained-variance ratio of the fit.

    Args:
        tensor: Feature tensor whose last dimension is the feature axis.
        n_components: Number of principal components kept by the PCA.
        apply_perturbations: Sequence of constant offsets; entry ``i`` is
            added to the score of component ``i`` for every row. Components
            beyond the end of the sequence are left unshifted. The default
            list is only read, never mutated.

    Returns:
        Tensor with the shape, dtype and device of ``tensor``.
    """
    # Reshape the tensor to 2D (rows x features) for PCA
    original_shape = tensor.shape
    flattened = tensor.reshape(-1, original_shape[-1])

    # Fit the PCA and project. detach() keeps .numpy() working for tensors
    # that are tracking gradients.
    pca = PCA(n_components=n_components)
    scores = pca.fit_transform(flattened.detach().cpu().numpy())
    print(f'cumulative variance: {np.cumsum(pca.explained_variance_ratio_)}')

    # Convert back to PyTorch tensor
    scores = torch.from_numpy(scores).float()

    # Deterministic per-component offsets, broadcast over every row
    offsets = torch.zeros_like(scores)
    offsets[:, :len(apply_perturbations)] = torch.tensor(apply_perturbations)

    # Apply the offsets in PCA space
    shifted_scores = scores + offsets

    # Transform back to original feature space
    reconstructed = torch.from_numpy(
        pca.inverse_transform(shifted_scores.numpy())
    ).float()

    # Restore the original shape, and match the input's dtype as well as its
    # device so the result can stand in for the original features.
    perturbed_tensor = reconstructed.reshape(original_shape)
    return perturbed_tensor.to(device=tensor.device, dtype=tensor.dtype)

In [17]:
# Convert using features rebuilt from 10 principal components, with zero
# offsets on the first three, then play the result.
wav_opt = rvc_model.infer_file(
    input_path,
    transpose=15,
    feature_transform=lambda feats: pca_perturbation(
        feats, n_components=10, apply_perturbations=[0,0,0]
    ),
)
ipd.Audio(wav_opt, rate=rvc_model.output_sample_rate())



cumulative variance: [0.06605308 0.12012193 0.16754012 0.21204172 0.250226   0.28693747
 0.32076536 0.35291796 0.38420927 0.4140085 ]


## Data augmentation

In [24]:
import librosa
import IPython.display as ipd
from svc_helper.augmentation.pedalboard import PedalboardRandomAugmentor
# Build the random augmentor, run the test singing clip through it, and play
# the augmented audio.
augmentor = PedalboardRandomAugmentor({'limit': 1.0})
audio, sr = librosa.load('tests/test_singing.wav')
augmented_audio = augmentor.process(audio, sr)
# normalize=False: play the samples as produced rather than letting the
# player rescale them.
ipd.Audio(augmented_audio, rate=sr, normalize=False)