In [1]:
# Record which GPU the benchmark ran on: query device names and keep only the first line.
!nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1

NVIDIA GeForce RTX 4090


In [2]:
import io
import time
import torch
import numpy as np
import PIL
import torchaudio
import matplotlib.pyplot as plt
import cdpam
from IPython.display import Audio
from transformers import EncodecModel, AutoProcessor
from torchvision.transforms import ToPILImage, PILToTensor
from datasets import load_dataset, Image 
from walloc import walloc
from spauq.core.metrics import spauq_eval
class Config:
    """Empty class usable as a bare attribute container (no fields or methods defined)."""

In [3]:
# The model and its preprocessor are both loaded from the same checkpoint id.
ENCODEC_CHECKPOINT = "facebook/encodec_48khz"
codec = EncodecModel.from_pretrained(ENCODEC_CHECKPOINT)
processor = AutoProcessor.from_pretrained(ENCODEC_CHECKPOINT)

  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)


In [4]:
# CDPAM instance built with its default arguments; encodec_compress calls its
# forward() on (reference, reconstruction) pairs to produce the 'CDPAM' metric.
cdpam_loss = cdpam.CDPAM()

  state = torch.load(modfolder,map_location="cpu")['state']


In [5]:
# Validation split of the MUSDB segments dataset. The compress functions below
# read each row's sample['audio_mix']['bytes'] entry.
MUSDB = load_dataset("danjacobellis/musdb_segments", split='validation')

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

In [6]:
def pad(audio, p=2**16):
    """Zero-pad the last axis of ``audio`` on the right up to a multiple of ``p``.

    Args:
        audio: tensor whose last dimension is the one to pad. Any rank >= 1 is
            accepted (the original (B, C, L) layout keeps working unchanged).
        p: block size the last dimension should be a multiple of.

    Returns:
        ``audio`` itself when its last dimension is already a multiple of ``p``;
        otherwise a new tensor with zeros appended along the last axis.
    """
    length = audio.shape[-1]
    # Distance to the next multiple of p; evaluates to 0 when already aligned.
    padding_size = -length % p
    if padding_size > 0:
        audio = torch.nn.functional.pad(audio, (0, padding_size), mode='constant', value=0)
    return audio

In [7]:
def encodec_compress(sample):
    """Round-trip one dataset row through the EnCodec model and score the result.

    Relies on the notebook globals ``codec``, ``processor``, ``cdpam_loss`` and
    ``device``; ``codec`` must already have been moved to ``device``.

    Args:
        sample: dataset row; ``sample['audio_mix']['bytes']`` is passed to
            ``torchaudio.load``.

    Returns:
        dict with keys:
            compressed: the encoder output, with ``audio_codes`` moved to CPU.
            encode_time: seconds spent on resampling to the codec rate, the
                processor call, ``codec.encode`` and copying the codes to CPU.
            decode_time: seconds spent on moving the codes back to ``device``,
                ``codec.decode``, resampling back to ``fs``, rescaling and
                clamping.
            bps: 10 * (number of codes) / (number of input samples).
            L: number of samples per channel in the loaded audio.
            PSNR, SSDR, SRDR, CDPAM: reconstruction quality metrics.
    """
    with torch.no_grad():
        dev = torch.device(device)
        on_cuda = dev.type == "cuda"

        # Normalise: remove the mean, scale by the absolute peak, then halve so
        # the reference signal lies within [-0.5, 0.5].
        x, fs = torchaudio.load(sample['audio_mix']['bytes'], normalize=False)
        x = x.to(torch.float)
        x = x - x.mean()
        max_abs = x.abs().max()
        x = x / (max_abs + 1e-8)
        x = x / 2
        L = x.shape[-1]

        # CUDA kernels launch asynchronously: drain pending work before starting
        # a timer and before reading it, otherwise GPU time is under-counted.
        if on_cuda:
            torch.cuda.synchronize(dev)
        t0 = time.perf_counter()
        # The codec is fed 2*x, i.e. the signal rescaled to [-1, 1].
        x_pre = torchaudio.transforms.Resample(fs, processor.sampling_rate)(2 * x)
        x_pre = processor(raw_audio=x_pre, sampling_rate=processor.sampling_rate, return_tensors="pt")
        Y = codec.encode(
            input_values=x_pre['input_values'].to(device),
            padding_mask=x_pre['padding_mask'].to(device),
            bandwidth=12.0,
        )
        Y['audio_codes'] = Y['audio_codes'].to("cpu")
        if on_cuda:
            torch.cuda.synchronize(dev)
        encode_time = time.perf_counter() - t0

        if on_cuda:
            torch.cuda.synchronize(dev)
        t0 = time.perf_counter()
        x_hat = codec.decode(
            audio_codes=Y['audio_codes'].to(device),
            audio_scales=Y['audio_scales'],
            padding_mask=x_pre['padding_mask']
        )['audio_values']
        x_hat = torchaudio.transforms.Resample(processor.sampling_rate, fs).to(device)(x_hat)
        # Undo the 2x gain, drop the batch axis, trim to the original length and
        # clamp to the range of the reference signal.
        x_hat = x_hat / 2
        x_hat = x_hat[0, :, :L].clamp(-0.5, 0.5)
        if on_cuda:
            torch.cuda.synchronize(dev)
        decode_time = time.perf_counter() - t0

        # presumably 10 bits per code index (1024-entry codebooks) — TODO confirm
        bps = 10 * Y['audio_codes'].numel() / (x.numel())
        # -10*log10(MSE): PSNR relative to a peak value of 1.
        PSNR = -10 * np.log10(torch.nn.functional.mse_loss(x, x_hat.to("cpu")))
        SDR = spauq_eval(x, x_hat.to("cpu"), fs=fs)
        SSDR = SDR['SSR']
        SRDR = SDR['SRR']
        # NOTE(review): CDPAM receives the full-rate, [-0.5, 0.5]-scaled signal
        # here; confirm this matches the scale/sample rate the model expects.
        # Local name chosen so it does not shadow the imported `cdpam` module.
        cdpam_score = cdpam_loss.forward(x.to(device), x_hat).mean().item()

    return {
        'compressed': Y,
        'encode_time': encode_time,
        'decode_time': decode_time,
        'bps': bps,
        'L': L,
        'PSNR': PSNR,
        'SSDR': SSDR,
        'SRDR': SRDR,
        'CDPAM': cdpam_score
    }

In [8]:
# GPU pass: put the codec on CUDA and run the full compress-and-evaluate
# function over every row of the dataset.
device = "cuda"
codec = codec.to(device)
gpu = MUSDB.map(encodec_compress, writer_batch_size=16)

Map:   0%|          | 0/262 [00:00<?, ? examples/s]



In [9]:
def encodec_compress_cpu(sample):
    """Time the EnCodec encode/decode round trip for one dataset row.

    Performs the same timed steps as ``encodec_compress`` but computes no
    quality metrics; the decoded audio is discarded. Relies on the notebook
    globals ``codec``, ``processor`` and ``device`` (the caller sets
    ``device = "cpu"`` and moves ``codec`` there before mapping this function).

    Args:
        sample: dataset row; ``sample['audio_mix']['bytes']`` is passed to
            ``torchaudio.load``.

    Returns:
        dict with ``cpu_encode_time`` and ``cpu_decode_time`` in seconds.
    """
    with torch.no_grad():
        dev = torch.device(device)
        # The function reads the global `device`; if it is ever run while that
        # is a CUDA device, synchronise so the timers still cover the GPU work.
        on_cuda = dev.type == "cuda"

        # Normalise: remove the mean, scale by the absolute peak, then halve.
        x, fs = torchaudio.load(sample['audio_mix']['bytes'], normalize=False)
        x = x.to(torch.float)
        x = x - x.mean()
        max_abs = x.abs().max()
        x = x / (max_abs + 1e-8)
        x = x / 2
        L = x.shape[-1]

        if on_cuda:
            torch.cuda.synchronize(dev)
        t0 = time.perf_counter()
        x_pre = torchaudio.transforms.Resample(fs, processor.sampling_rate)(2 * x)
        x_pre = processor(raw_audio=x_pre, sampling_rate=processor.sampling_rate, return_tensors="pt")
        Y = codec.encode(
            input_values=x_pre['input_values'].to(device),
            padding_mask=x_pre['padding_mask'].to(device),
            bandwidth=12.0,
        )
        Y['audio_codes'] = Y['audio_codes'].to("cpu")
        if on_cuda:
            torch.cuda.synchronize(dev)
        encode_time = time.perf_counter() - t0

        if on_cuda:
            torch.cuda.synchronize(dev)
        t0 = time.perf_counter()
        x_hat = codec.decode(
            audio_codes=Y['audio_codes'].to(device),
            audio_scales=Y['audio_scales'],
            padding_mask=x_pre['padding_mask']
        )['audio_values']
        x_hat = torchaudio.transforms.Resample(processor.sampling_rate, fs).to(device)(x_hat)
        # Post-processing is kept inside the timed region so the measured work
        # matches what encodec_compress times, even though x_hat is unused.
        x_hat = x_hat / 2
        x_hat = x_hat[0, :, :L].clamp(-0.5, 0.5)
        if on_cuda:
            torch.cuda.synchronize(dev)
        decode_time = time.perf_counter() - t0

    return {
        'cpu_encode_time': encode_time,
        'cpu_decode_time': decode_time,
    }

In [10]:
# CPU pass: switch the codec to eval mode on the CPU, collect timings only,
# then attach the two CPU timing columns to the GPU results.
device = "cpu"
codec = codec.eval().to(device)
cpu = MUSDB.map(encodec_compress_cpu, writer_batch_size=16)
combined = (
    gpu
    .add_column('cpu_encode_time', cpu['cpu_encode_time'])
    .add_column('cpu_decode_time', cpu['cpu_decode_time'])
)

Map:   0%|          | 0/262 [00:00<?, ? examples/s]

In [11]:
# Columns of `combined` to summarise, in reporting order.
metrics = (
    ['encode_time', 'decode_time']              # timings from the "cuda" run
    + ['bps']                                   # bit rate
    + ['PSNR', 'SSDR', 'SRDR', 'CDPAM']         # reconstruction quality
    + ['cpu_encode_time', 'cpu_decode_time']    # timings from the "cpu" run
)

In [12]:
# Print the mean of each metric column over all rows of `combined`.
for metric in metrics:
    print(f"{metric}: {np.mean(combined[metric])}")

encode_time: 0.3939989122725625
decode_time: 0.33626718284519574
bps: 0.14019012451171875
PSNR: 31.95164082796519
SSDR: 22.69493529722898
SRDR: 6.691856596921452
CDPAM: 2.065623472935146e-05
cpu_encode_time: 1.5419972561697923
cpu_decode_time: 1.4067453200580509


In [None]:
# Upload the merged GPU + CPU results as the 'validation' split of this Hub dataset repo.
# NOTE(review): presumably needs Hugging Face write credentials in the environment — confirm.
combined.push_to_hub("danjacobellis/MUSDB_encodec_12kbps",split='validation')