In [2]:
import random
import torch

from datasets import load_dataset, Audio
from dotenv import load_dotenv
from IPython.display import Audio as AudioDisplay
from transformers import DacModel, AutoProcessor

# Pull in settings from a local .env file (python-dotenv) before anything else runs.
load_dotenv()

# Fix Python's RNG so the random picks made further down repeat across runs.
SEED = 0
random.seed(SEED)

In [3]:
# Dataset identifier handed to load_dataset() as the source of clean recordings.
data_path_input = "JacobLinCool/VoiceBank-DEMAND-16k"
# Repository the processed dataset is pushed to at the end of the notebook.
data_path_output = "gokulkarthik/vb-demand-synthetic"
# Checkpoint used for both the DAC model and its processor.
model_path = "descript/dac_16khz"

In [4]:
# Load every split and keep only the utterance id and the clean recording.
ds = load_dataset(data_path_input).select_columns(['id', 'clean'])
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'clean'],
        num_rows: 11572
    })
    test: Dataset({
        features: ['id', 'clean'],
        num_rows: 824
    })
})

In [5]:
# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The processor defines the sampling rate used throughout the notebook.
processor = AutoProcessor.from_pretrained(model_path)
sampling_rate = processor.sampling_rate

# Load the codec weights and move them to the selected device.
model = DacModel.from_pretrained(model_path)
model = model.to(device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


# Test a sample

In [6]:
# Pick one random test example to listen to.
# randrange() excludes its upper bound, so the index is always valid;
# randint(0, len(...)) includes it and could index one past the last row.
sample_idx = random.randrange(len(ds['test']))
sample = ds['test'][sample_idx]

# Wrap the decoded waveform in a tensor (shares memory with the NumPy array).
clean = torch.from_numpy(sample['clean']['array'])
print(clean.shape)
sample

torch.Size([44418])


{'id': 'p257_002',
 'clean': {'path': 'p257_002.wav',
  'array': array([ 0.00048828,  0.00076294,  0.00048828, ..., -0.01205444,
         -0.01153564, -0.00946045], shape=(44418,)),
  'sampling_rate': 16000}}

In [7]:
# Listen to the untouched clip; playback uses the processor's sampling rate.
AudioDisplay(clean, rate=sampling_rate)

In [8]:
def dac_forward(audio_sample, n_quantizers=12):
    """Round-trip one waveform through the DAC model and return the reconstruction.

    The waveform is prepared by ``processor``, run through ``model`` with the
    requested ``n_quantizers``, and the result is copied into a zero buffer
    shaped like the processed input. A reconstruction longer than the
    processed input is therefore truncated, and a shorter one is zero-padded
    at the end.

    Args:
        audio_sample: Waveform handed to the processor as ``raw_audio``.
        n_quantizers: Forwarded unchanged to the model call.

    Returns:
        The ``[0][0]`` slice of the aligned reconstruction as a NumPy array;
        its length equals the last dimension of the processed input.
    """
    encoded = processor(
        raw_audio=audio_sample,
        sampling_rate=processor.sampling_rate,
        return_tensors="pt",
    )
    inputs = encoded["input_values"].to(device)
    target_len = inputs.shape[-1]

    with torch.inference_mode():
        decoded = model(inputs, n_quantizers=n_quantizers).audio_values
        # Add a leading axis so the reconstruction can be sliced with the same
        # three indices as ``inputs``.
        decoded = decoded.unsqueeze(0)
        n_keep = min(decoded.shape[-1], target_len)
        aligned = torch.zeros_like(inputs)
        aligned[..., :n_keep] = decoded[..., :n_keep]
        waveform = aligned.detach().cpu().numpy()[0][0]
    return waveform

In [9]:
# Reconstruct the sample clip with n_quantizers=2, print the returned length
# and play the result.
noisy = dac_forward(clean, n_quantizers=2)
print(noisy.shape)
AudioDisplay(noisy, rate=sampling_rate)

(44480,)


In [10]:
# Same clip reconstructed with n_quantizers=4 (rebinds `noisy`).
noisy = dac_forward(clean, n_quantizers=4)
print(noisy.shape)
AudioDisplay(noisy, rate=sampling_rate)

(44480,)


In [11]:
# Same clip reconstructed with n_quantizers=8 (rebinds `noisy`).
noisy = dac_forward(clean, n_quantizers=8)
print(noisy.shape)
AudioDisplay(noisy, rate=sampling_rate)

(44480,)


# Transform dataset

In [12]:
def add_artifacts(row, quantizer_choices=(4, 8)):
    """Build a degraded ("noisy") counterpart of a row's clean audio with the DAC model.

    The clean waveform is prepared by ``processor`` and run through ``model``
    with a randomly chosen number of quantizers. The reconstruction is copied
    into a zero buffer shaped like the processed input, so it is truncated or
    zero-padded to that length. The returned ``clean`` entry is the processed
    input itself rather than the raw waveform, which keeps both arrays the
    same length.

    Args:
        row: Dataset row whose ``'clean'`` entry provides ``'array'`` and
            ``'sampling_rate'``.
        quantizer_choices: Candidate ``n_quantizers`` values; one is drawn per
            row with ``random.choice`` (module-level RNG, seeded elsewhere in
            the notebook).

    Returns:
        dict with ``'clean'`` and ``'noisy'`` entries, each holding an
        ``'array'`` (NumPy) and the notebook's ``sampling_rate``.

    Raises:
        ValueError: If the row's audio was not decoded at ``sampling_rate``;
            the result would otherwise be labelled with the wrong rate.
    """
    audio = row['clean']
    source_rate = audio['sampling_rate']
    if source_rate != sampling_rate:
        raise ValueError(
            f"add_artifacts expects audio sampled at {sampling_rate} Hz, got {source_rate} Hz"
        )

    n_quantizers = random.choice(quantizer_choices)
    inputs = processor(raw_audio=audio['array'], sampling_rate=sampling_rate, return_tensors="pt")["input_values"].to(device)
    with torch.inference_mode():
        # audio_values is 2-D (see the DacModel docs: batch, time), so the
        # channel axis is re-inserted at dim 1 to line up with ``inputs``.
        # For the single-row batches used here this equals unsqueeze(0).
        outputs_predicted = model(inputs, n_quantizers=n_quantizers).audio_values.unsqueeze(1)
        outputs = torch.zeros_like(inputs)
        min_length = min(outputs_predicted.shape[-1], outputs.shape[-1])
        outputs[:, :, :min_length] = outputs_predicted[:, :, :min_length]
        outputs = outputs.detach().cpu().numpy()[0][0]
        inputs = inputs.detach().cpu().numpy()[0][0]
    return {
        'clean': {"array": inputs, "sampling_rate": sampling_rate},
        'noisy': {"array": outputs, "sampling_rate": sampling_rate},
    }

In [13]:
# Apply add_artifacts to every row of every split, in a single process.
# NOTE(review): this rebinds `ds`, so re-running the cell feeds already
# processed rows back through the model; restart from the load cell instead.
ds = ds.map(add_artifacts, num_proc=1)

Map:   0%|          | 0/11572 [00:00<?, ? examples/s]

Map:   0%|          | 0/824 [00:00<?, ? examples/s]

In [14]:
# Declare both waveform columns as Audio features at the codec's sampling rate.
for column in ('clean', 'noisy'):
    ds = ds.cast_column(column, Audio(sampling_rate=sampling_rate))
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'clean', 'noisy'],
        num_rows: 11572
    })
    test: Dataset({
        features: ['id', 'clean', 'noisy'],
        num_rows: 824
    })
})

In [15]:
# Inspect the first processed test row (id plus the clean and noisy audio).
ds["test"][0]

{'id': 'p232_001',
 'clean': {'path': None,
  'array': array([0.00213623, 0.00323486, 0.00244141, ..., 0.        , 0.        ,
         0.        ], shape=(28160,)),
  'sampling_rate': 16000},
 'noisy': {'path': None,
  'array': array([0.0005188 , 0.00082397, 0.0010376 , ..., 0.        , 0.        ,
         0.        ], shape=(28160,)),
  'sampling_rate': 16000}}

In [16]:
# Listen to the 'clean' audio stored for the first test row.
AudioDisplay(ds["test"][0]["clean"]["array"], rate=sampling_rate)

In [17]:
# Listen to the 'noisy' (codec-reconstructed) audio stored for the same row.
AudioDisplay(ds["test"][0]["noisy"]["array"], rate=sampling_rate)

In [18]:
# Upload every split of the processed dataset to the repository named by
# data_path_output.
# NOTE(review): Hub credentials presumably come from the environment loaded
# by load_dotenv() — confirm before running.
ds.push_to_hub(data_path_output)

Uploading the dataset shards:   0%|          | 0/5 [00:00<?, ?it/s]

Map:   0%|          | 0/2315 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/441M [00:00<?, ?B/s]

Map:   0%|          | 0/2315 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/414M [00:00<?, ?B/s]

Map:   0%|          | 0/2314 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/426M [00:00<?, ?B/s]

Map:   0%|          | 0/2314 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/443M [00:00<?, ?B/s]

Map:   0%|          | 0/2314 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/411M [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/824 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/132M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/gokulkarthik/vb-demand-synthetic/commit/8e1c1b46b9fa950ce8c71096e4695334cb60d8ea', commit_message='Upload dataset', commit_description='', oid='8e1c1b46b9fa950ce8c71096e4695334cb60d8ea', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/gokulkarthik/vb-demand-synthetic', endpoint='https://huggingface.co', repo_type='dataset', repo_id='gokulkarthik/vb-demand-synthetic'), pr_revision=None, pr_num=None)