In [None]:
import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
from IPython.display import Audio, display
import os
import cv2 as cv
import scipy as sp
import pickle

In [None]:
!pip install audiomentations

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting audiomentations
  Downloading audiomentations-0.28.0-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.0/66.0 KB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: audiomentations
Successfully installed audiomentations-0.28.0


In [None]:
# Mount Google Drive so the dataset stored on it can be read and written from
# this runtime.
# NOTE(review): the mount point is absolute (/content/gdrive) while later cells
# use the relative path 'gdrive/MyDrive/...'; that only resolves when the
# working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift

def augment_input(audio_path, num_samples=4, return_fb=True):
    """Create randomly augmented variants of one audio file.

    The file is read with ``librosa.load(audio_path, sr=None)`` and passed
    ``num_samples`` times through an audiomentations ``Compose`` chain made of
    ``TimeStretch`` (rate 0.8-1.2, p=0.7), ``PitchShift`` (+/-4 semitones,
    p=0.5) and ``Shift`` (fraction fixed at 0.5, p=0.5, fade enabled).

    Parameters
    ----------
    audio_path : str
        Path of the audio file to load.
    num_samples : int, optional
        Number of augmented variants to generate.
    return_fb : bool, optional
        If false, return the augmented waveforms themselves. Otherwise return
        the features computed by ``generate_fb_and_mfcc``.

    Returns
    -------
    list or numpy.ndarray
        ``return_fb`` false: a list with ``num_samples`` augmented waveforms.
        ``return_fb`` true: ``np.dstack`` of the feature matrices of the
        un-augmented signal (index 0 on the last axis) followed by those of
        each variant. NOTE(review): ``np.dstack`` needs every feature matrix to
        have the same shape, i.e. the augmentations must preserve the signal
        length — confirm for the installed audiomentations version.
    """
    pipeline = Compose([
        TimeStretch(min_rate=0.8, max_rate=1.2, p=0.7),
        PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
        Shift(min_fraction=0.5, max_fraction=0.5, p=0.5, fade=True),
    ])

    waveform, rate = librosa.load(audio_path, sr=None)

    # Generate every variant up front, before any feature extraction.
    variants = [
        pipeline(samples=waveform, sample_rate=rate)
        for _ in range(num_samples)
    ]

    if not return_fb:
        return variants

    # Clean signal first, then each augmented variant in generation order.
    feature_stack = [
        generate_fb_and_mfcc(clip, rate)
        for clip in [waveform, *variants]
    ]
    return np.dstack(feature_stack)

In [None]:
def generate_fb_and_mfcc(signal, sample_rate, nfft=512, nfilt=40):
    """Compute log Mel filter-bank energies of a 1-D audio signal.

    Despite the name, only filter-bank features are returned; no MFCC (DCT)
    step is performed.

    Steps: pre-emphasis (coefficient 0.97) -> 25 ms frames every 10 ms ->
    Hamming window -> power spectrum -> triangular Mel filter bank ->
    ``20 * log10``.

    Parameters
    ----------
    signal : array_like
        1-D sequence of audio samples with at least one element.
    sample_rate : int or float
        Sampling rate of ``signal`` in Hz.
    nfft : int, optional
        Minimum FFT size. If a frame holds more samples than ``nfft`` (for the
        default 512 this happens above roughly 20.5 kHz), the size is raised
        to the next power of two that fits the frame; otherwise
        ``np.fft.rfft`` would silently crop every frame to ``nfft`` samples.
    nfilt : int, optional
        Number of triangular Mel filters.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_frames, nfilt)`` with ``num_frames >= 1``.
    """
    signal = np.asarray(signal)

    # Pre-emphasis: y[0] = x[0], y[t] = x[t] - 0.97 * x[t - 1]
    pre_emphasis = 0.97
    emphasized_signal = np.append(
        signal[0],
        signal[1:] - pre_emphasis * signal[:-1])

    # Framing parameters, converted from seconds to samples
    frame_size = 0.025
    frame_stride = 0.01
    frame_length = int(round(frame_size * sample_rate))
    frame_step = int(round(frame_stride * sample_rate))
    signal_length = len(emphasized_signal)

    # Always produce at least one frame (the bare formula yields 0 when the
    # signal is exactly one frame long).
    # NOTE(review): apart from that clamp the original frame-count formula is
    # kept so the output shape is unchanged for existing inputs; with it, the
    # samples after the end of the last frame are not framed.
    num_frames = max(
        1,
        int(np.ceil(
            float(np.abs(signal_length - frame_length)) / frame_step)))

    # Zero-pad so that every frame index below is valid
    pad_signal_length = num_frames * frame_step + frame_length
    pad_signal = np.append(
        emphasized_signal,
        np.zeros(pad_signal_length - signal_length))

    # indices[f, s] = position in pad_signal of sample s of frame f
    frame_starts = np.arange(num_frames) * frame_step
    indices = frame_starts[:, np.newaxis] + np.arange(frame_length)
    frames = pad_signal[indices]

    # Window
    frames = frames * np.hamming(frame_length)

    # Never let the FFT crop a frame: grow to the next power of two if needed
    if frame_length > nfft:
        nfft = 1 << (frame_length - 1).bit_length()

    # Magnitude of the FFT, then power spectrum
    mag_frames = np.absolute(np.fft.rfft(frames, nfft))
    pow_frames = (1.0 / nfft) * (mag_frames ** 2)

    # Mel filter bank: nfilt triangles equally spaced on the Mel scale
    # between 0 Hz and the Nyquist frequency
    low_freq_mel = 0
    high_freq_mel = 2595 * np.log10(1 + (sample_rate / 2) / 700)
    mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
    hz_points = 700 * (10 ** (mel_points / 2595) - 1)
    # FFT-bin position of every filter edge/centre
    bins = np.floor((nfft + 1) * hz_points / sample_rate)

    fbank = np.zeros((nfilt, nfft // 2 + 1))
    for m in range(1, nfilt + 1):
        left = int(bins[m - 1])
        center = int(bins[m])
        right = int(bins[m + 1])

        # Rising slope (empty range, hence no division, when left == center)
        for k in range(left, center):
            fbank[m - 1, k] = (k - bins[m - 1]) / (bins[m] - bins[m - 1])
        # Falling slope
        for k in range(center, right):
            fbank[m - 1, k] = (bins[m + 1] - k) / (bins[m + 1] - bins[m])
    filter_banks = np.dot(pow_frames, fbank.T)

    # Numerical stability: avoid log10(0)
    filter_banks = np.where(
        filter_banks == 0,
        np.finfo(float).eps,
        filter_banks)

    # dB
    filter_banks = 20 * np.log10(filter_banks)
    return filter_banks

In [None]:
# Build augmented filter-bank features for the first `counter` clips and
# persist them to Drive.
audio_dir = 'gdrive/MyDrive/audio_clean_v2/de_trim'

# os.listdir returns entries in arbitrary order; sort so that the subset
# selected below is the same on every run.
listdir = sorted(os.listdir(audio_dir))
counter = 1000

# Not used in this cell; kept because other cells may read them.
fmin = 0
fmax = 4000
X = []
y = []

for audio in listdir[:counter]:
    fb_augmented = augment_input(
        os.path.join(audio_dir, audio), num_samples=4, return_fb=True)
    X.append(fb_augmented)

# Despite the `_npy` suffix this file is a pickle of the Python list `X`.
filename = 'gdrive/MyDrive/audio_clean_v2/de_augmented_npy'
# The context manager closes the file even if pickling fails.
with open(filename, 'wb') as fileObject:
    pickle.dump(X, fileObject)
