In [2]:
from datasets import DatasetDict
from audio_augmentator.utils import get_speech_timestamps, collect_chunks, tensor_normalization, signal_energy_noise_search
import torch
import numpy as np
import soundfile as sf
import os

In [3]:
sf.__libsndfile_version__

'1.2.0'

In [4]:
noises_dataset_path = 'noises_dataset'
noises_dataset = DatasetDict.load_from_disk(noises_dataset_path)

In [5]:
# %%timeit
noises_dataset

DatasetDict({
    household_noises: Dataset({
        features: ['audio'],
        num_rows: 1676
    })
    speech_noises: Dataset({
        features: ['audio'],
        num_rows: 4049
    })
    background_music_noises: Dataset({
        features: ['audio'],
        num_rows: 1972
    })
    pets_noises: Dataset({
        features: ['audio'],
        num_rows: 862
    })
})

In [6]:
# %%timeit
house = noises_dataset['household_noises']

In [7]:
# %%timeit
speech = noises_dataset['speech_noises']

In [8]:
# %%timeit 's'
speech[1000]

{'audio': {'path': 'ru_0018.wav',
  'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         -3.05175781e-05,  0.00000000e+00,  0.00000000e+00]),
  'sampling_rate': 16000}}

In [9]:
silero_vad_model_path = "silero_vad.jit"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torch.jit.load(silero_vad_model_path,
                       map_location=device)
model = model.to(device)


In [10]:
# speech[]

In [11]:
def preprocess_speech(
        speech_array: np.ndarray,
        vad_model
) -> torch.tensor:
    noise_to_mix_tensor = torch.from_numpy(np.float32(speech_array))
    noise_to_mix_tensor = tensor_normalization(noise_to_mix_tensor)

    speech_timestamps = get_speech_timestamps(
        input_audio=noise_to_mix_tensor,
        silero_vad_model=vad_model
    )
    if len(speech_timestamps) >= 1:
        noise_to_mix_tensor = collect_chunks(
            speech_timestamps,
            noise_to_mix_tensor
        )
    noise_to_mix_tensor = torch.unsqueeze(noise_to_mix_tensor, 0)
    return noise_to_mix_tensor

def preprocess_other(
        audio_array: np.ndarray
):
    noise_to_mix_array = signal_energy_noise_search(audio_array)
    noise_to_mix_array = np.float32(noise_to_mix_array)
    noise_to_mix_tensor = torch.unsqueeze(torch.from_numpy(noise_to_mix_array), 0)
    return noise_to_mix_tensor

def speech_mapper(example):
    example['audio']['array'] = preprocess_speech(example['audio']['array'], vad_model=model)
    return example

In [14]:
speech.map(speech_mapper, num_proc=4)
# speech_mapper(speech[1001])
# speech.map

Map (num_proc=4):   0%|          | 0/4049 [00:00<?, ? examples/s]

Process ForkPoolWorker-35:
Traceback (most recent call last):
  File "/home/limp/miniconda3/envs/smarthome/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
Process ForkPoolWorker-36:
  File "/home/limp/miniconda3/envs/smarthome/lib/python3.9/site-packages/multiprocess/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)


TimeoutError: 

In [None]:
speech[1000]['audio']['array']

array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
       -3.05175781e-05,  0.00000000e+00,  0.00000000e+00])

In [20]:
preprocess_speech(speech[133]['audio']['array'],model)

tensor([[-0.0040, -0.0037, -0.0029,  ...,  0.0084,  0.0086,  0.0093]])

In [26]:
base_audio = house[133]['audio']['array']
processed_audio = preprocess_other(house[133]['audio']['array'])

In [27]:
import IPython

IPython.display.Audio(base_audio, rate=16000)

In [28]:
import IPython

IPython.display.Audio(processed_audio, rate=16000)