In [158]:
import torch
import torchaudio
import librosa
import librosa.display
import os
from src.config.config_defaults import ConfigDefault, get_default_config
import numpy as np
from pathlib import Path
from src.utils.utils_dataset import get_example_val_sample
import pyrootutils
%load_ext autoreload
%autoreload 2
import IPython
# Locate the directory marked by a ".project-root" file (searching from the
# current directory) and switch into it -- presumably so project-relative
# paths resolve the same way wherever the notebook was launched from.
path_workdir: Path = Path(
    pyrootutils.find_root(search_from=os.curdir, indicator=".project-root")
)
os.chdir(path_workdir)

config = get_default_config()

# Sample rate (Hz) used for loading, processing and playback in this notebook.
sr = 16_000
original_audio = get_example_val_sample(sr)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [159]:
def play_audio(audio, rate=sr):
    """Show an inline audio player for ``audio`` in the cell output.

    Args:
        audio: Samples handed to ``IPython.display.Audio`` as ``data``.
        rate: Playback sample rate. The default is the value ``sr`` had when
            this function was defined.
    """
    player = IPython.display.Audio(data=audio, rate=rate)
    IPython.display.display(player)


# Cell output: the SoX effect names reported by torchaudio.
torchaudio.sox_effects.effect_names()

['allpass',
 'band',
 'bandpass',
 'bandreject',
 'bass',
 'bend',
 'biquad',
 'chorus',
 'channels',
 'compand',
 'contrast',
 'dcshift',
 'deemph',
 'delay',
 'dither',
 'divide',
 'downsample',
 'earwax',
 'echo',
 'echos',
 'equalizer',
 'fade',
 'fir',
 'firfit',
 'flanger',
 'gain',
 'highpass',
 'hilbert',
 'loudness',
 'lowpass',
 'mcompand',
 'norm',
 'oops',
 'overdrive',
 'pad',
 'phaser',
 'pitch',
 'rate',
 'remix',
 'repeat',
 'reverb',
 'reverse',
 'riaa',
 'silence',
 'sinc',
 'speed',
 'stat',
 'stats',
 'stretch',
 'swap',
 'synth',
 'tempo',
 'treble',
 'tremolo',
 'trim',
 'upsample',
 'vad',
 'vol']

In [293]:
import wavaugment as augment
# Add a leading axis so the tensor matches channels_first=True below
# (assumes original_audio is a 1-D mono signal -- TODO confirm).
audio = torch.tensor(original_audio).unsqueeze(0)

# SoX effect reference: https://sox.sourceforge.net/sox.html
#
# reverb synopsis (everything after the flag is positional, in this order):
#   reverb [-w|--wet-only] [reverberance (50%) [HF-damping (50%)
#          [room-scale (100%) [stereo-depth (100%)
#          [pre-delay (0ms) [wet-gain (0dB)]]]]]]
#
#   -w / --wet-only : output only the reverberated ("wet") signal.
#   reverberance    : overall decay of the reverb; higher values give a longer,
#                     "wetter" tail.
#   HF-damping      : damping of high frequencies in the tail; higher values
#                     sound less bright.
#   room-scale      : size of the simulated room.
#   stereo-depth    : stereo spread of the reverb.
#   pre-delay       : delay in milliseconds between the dry signal and the
#                     start of the reverb.
#   wet-gain        : level of the reverberated signal in dB.
#
# Notebook notes: wet-gain limit is 10; "max w = 100" (unclear which argument
# this refers to -- TODO confirm).
# NOTE(review): reverb has no -p/-r/-l/-h/-t flags; -w is the only flag in the
# synopsis above, all other settings are positional.
effects = [
    # ['pitch', str(np.random.uniform(-1000, 1000))],  # pitch shift
    # ['tempo', str(np.random.uniform(0.6, 1.4))],  # tempo change
    # ['bass', str(np.random.uniform(-30, 30))],  # add bass
    # ['treble', str(np.random.uniform(-30, 30))],  # add treble
    # ['contrast', str(np.random.uniform(limit, limit+1))],  # increase contrast, [0, 100], this has an effect even when 0 is included
    # ['dcshift', str(np.random.uniform(-0.5, 0.5))],  # shift DC component of audio
    # ['highpass', str(np.random.uniform(100, 500))],  # high pass filter
    # ['lowpass', str(np.random.uniform(100, 500))],  # low pass filter
    # band pass filter: SoX arguments are `frequency width`
    ['bandpass', str(np.random.uniform(998, 999)), str(np.random.uniform(1000, 1001))],
    # ['delay', str(np.random.uniform(0, 1))],  # delay effect
    # ['reverb', str(np.random.uniform(0, 100)), str(np.random.uniform(0, 100)), str(np.random.uniform(0, 100)), str(np.random.uniform(0, 100)), str(np.random.uniform(0, 200)), str(np.random.uniform(0, 10))],  # add reverb
]
waveform, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
    audio, sr, effects, channels_first=True
)

# Reference playback of the untouched clip.
play_audio(original_audio)

random_pitch_shift = lambda: np.random.randint(-400, +400)
random_room_size = lambda: np.random.randint(0, 101)

# chain = augment.EffectChain().pitch("-q", random_pitch_shift).reverb(50, 50, random_room_size)
# chain_runner = augment.ChainRunner(chain)
# audio = chain_runner(waveform)

# Play at the rate returned by apply_effects_tensor rather than assuming `sr`:
# an effect that changes the sample rate would otherwise be heard at the wrong
# speed and pitch.
play_audio(waveform.squeeze(), rate=sample_rate)



In [346]:
import audiomentations
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
import audiomentations as AA



# Augmentation pipeline under evaluation; each transform fires independently
# with its own probability `p`.
train_transforms = AA.Compose(
    [
        # Additive noise
        AA.AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
        AA.AddGaussianSNR(min_snr_in_db=0.5, max_snr_in_db=1.0, p=0.5),

        # Time and frequency domain transforms
        # leave_length_unchanged=False: the output length may differ from the input.
        AA.TimeStretch(min_rate=0.8, max_rate=1.2, p=1, leave_length_unchanged=False),
        AA.PitchShift(min_semitones=-2, max_semitones=2, p=0.5),
        AA.Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),
        AA.Normalize(p=0.5),
        AA.TimeMask(min_band_part=0.2, max_band_part=0.5, p=0.5),

        # Filter and equalizer effects
        # AA.LowPassFilter(min_cutoff_freq=500, max_cutoff_freq=1000, p=0.5),
        # AA.HighPassFilter(min_cutoff_freq=500, max_cutoff_freq=1000, p=0.5),
        # AA.BandPassFilter(min_center_freq=1000, max_center_freq=4000, p=0.5),
        AA.SevenBandParametricEQ(p=1),

        # Volume and clipping
        AA.LoudnessNormalization(min_lufs_in_db=-16, p=0.5),
        # AA.Gain(min_gain_in_db=-15, max_gain_in_db=15, p=0.5),
        AA.ClippingDistortion(min_percentile_threshold=0, max_percentile_threshold=30, p=0.5),
    ],
    p=1.0,
)

# Use the same `sr` the clip was loaded with instead of a second hard-coded
# 16000, so the transforms always see the clip's actual sample rate.
augmented_samples = train_transforms(samples=original_audio, sample_rate=sr)
play_audio(augmented_samples.squeeze())
