In [9]:
import torch
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as transforms

import math

from IPython.display import Audio
import matplotlib.pyplot as plt
import os
from torchaudio.utils import download_asset

from scipy.io import wavfile
import math
import numpy as np
from scipy import signal
from pathlib import Path
import scipy.signal as sps
from scipy.signal import butter, lfilter
import soundfile as sf
import pydub
import uuid
from pydub import AudioSegment, effects

# Record the library versions this notebook was run with (one per line).
print(torch.__version__, torchaudio.__version__, sep="\n")

2.0.1+cu117
2.0.2+cu117




In [12]:
def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None):
    """Plot every channel of a waveform against time, one subplot per channel.

    Args:
        waveform: Object exposing ``.numpy()`` that yields a 2-D array laid
            out as (channels, frames), e.g. a torch tensor.
        sample_rate: Samples per second; used to convert frame indices into
            the time axis.
        title: Figure-level title.
        xlim: Optional x-axis limits passed to ``set_xlim`` on every subplot;
            ignored when falsy.
    """
    samples = waveform.numpy()
    channel_count, frame_count = samples.shape
    seconds = torch.arange(0, frame_count) / sample_rate

    fig, panels = plt.subplots(channel_count, 1)
    # With a single row plt.subplots returns a bare Axes; wrap it so the loop
    # below can treat both cases the same way.
    panels = [panels] if channel_count == 1 else panels
    label_channels = channel_count > 1

    for number, (panel, channel) in enumerate(zip(panels, samples), start=1):
        panel.plot(seconds, channel, linewidth=1)
        panel.grid(True)
        if label_channels:
            panel.set_ylabel(f"Channel {number}")
        if xlim:
            panel.set_xlim(xlim)

    fig.suptitle(title)
    plt.show(block=False)

In [13]:
def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
    """Draw a spectrogram for every channel of a waveform, one subplot each.

    Args:
        waveform: Object exposing ``.numpy()`` that yields a 2-D array laid
            out as (channels, frames), e.g. a torch tensor.
        sample_rate: Samples per second, forwarded to ``Axes.specgram`` as
            ``Fs``.
        title: Figure-level title.
        xlim: Optional x-axis limits passed to ``set_xlim`` on every subplot;
            ignored when falsy.
    """
    samples = waveform.numpy()
    channel_count, _frame_count = samples.shape

    fig, panels = plt.subplots(channel_count, 1)
    # With a single row plt.subplots returns a bare Axes; wrap it so the loop
    # below can treat both cases the same way.
    panels = [panels] if channel_count == 1 else panels
    label_channels = channel_count > 1

    for number, (panel, channel) in enumerate(zip(panels, samples), start=1):
        panel.specgram(channel, Fs=sample_rate)
        if label_channels:
            panel.set_ylabel(f"Channel {number}")
        if xlim:
            panel.set_xlim(xlim)

    fig.suptitle(title)
    plt.show(block=False)

In [14]:
# Locate the raw speech and noise recordings.
# os.listdir returns every directory entry (including things such as
# .ipynb_checkpoints or .DS_Store) in arbitrary order. Keep only .wav files so
# the printed counts are accurate, and sort them so that the cells which index
# into these lists see a stable order.
speech_fpath = "sample_audio/speech/"
speech_clips = sorted(name for name in os.listdir(speech_fpath) if name.lower().endswith(".wav"))
noise_fpath = "sample_audio/noise/"
noise_clips = sorted(name for name in os.listdir(noise_fpath) if name.lower().endswith(".wav"))
print("No. of .wav files in speech folder = ",len(speech_clips))
print("No. of .wav files in noise folder = ",len(noise_clips))

No. of .wav files in speech folder =  10
No. of .wav files in noise folder =  10


In [8]:
# Shuffle the speech recordings by renaming them to "<k>_speech.wav", where the
# k values are a random permutation of 0..n-1.
import random
speech_ran_list = random.sample(range(len(speech_clips)), len(speech_clips))
print(speech_ran_list)

# Rename in two phases. Renaming straight to "<k>_speech.wav" can hit a file
# that already carries that name (e.g. when this cell is run a second time),
# which silently overwrites it on POSIX and raises on Windows. Moving every
# file to a unique temporary name first guarantees the final names are free.
staged_speech_names = []
for clip_name in speech_clips:
    staged_name = uuid.uuid4().hex + ".tmp"
    os.rename(speech_fpath + clip_name, speech_fpath + staged_name)
    staged_speech_names.append(staged_name)

for staged_name, new_index in zip(staged_speech_names, speech_ran_list):
    os.rename(speech_fpath + staged_name, speech_fpath + str(new_index) + "_speech.wav")

[0, 4, 3, 6, 7, 1, 8, 9, 2, 5]


In [10]:
def butter_lowpass(cutoff, fs, order=5):
    """Design a digital Butterworth low-pass filter.

    Args:
        cutoff: Cut-off frequency, in the same units as ``fs``.
        fs: Sampling frequency of the signal the filter will be applied to.
        order: Filter order.

    Returns:
        Whatever ``scipy.signal.butter`` returns for these settings (the
        numerator/denominator coefficient pair in its default output mode).
    """
    design_options = dict(fs=fs, btype='low', analog=False)
    return butter(order, cutoff, **design_options)


def butter_lowpass_filter(data, cutoff, fs, order=5):
    """Low-pass filter ``data`` with a Butterworth filter.

    Args:
        data: Samples to filter; handed to ``scipy.signal.lfilter`` unchanged,
            so filtering runs along lfilter's default axis.
        cutoff: Cut-off frequency, in the same units as ``fs``.
        fs: Sampling frequency of ``data``.
        order: Filter order.

    Returns:
        The filtered samples as returned by ``lfilter``.
    """
    numerator, denominator = butter_lowpass(cutoff, fs, order=order)
    return lfilter(numerator, denominator, data)

In [17]:
# Output folders for the processed (test) speech and noise clips.
test_speech_fpath = "sample_audio/test_speech/"
test_noise_fpath = "sample_audio/test_noise/"
# Create the output folders when they are missing so the notebook runs from a
# fresh checkout instead of failing with FileNotFoundError.
os.makedirs(test_speech_fpath, exist_ok=True)
os.makedirs(test_noise_fpath, exist_ok=True)
# os.listdir returns every entry in arbitrary order; keep only .wav files so
# the printed counts are accurate, and sort for a stable order.
test_speech_clips = sorted(name for name in os.listdir(test_speech_fpath) if name.lower().endswith(".wav"))
test_noise_clips = sorted(name for name in os.listdir(test_noise_fpath) if name.lower().endswith(".wav"))
print("No. of .wav files in test speech folder = ",len(test_speech_clips))
print("No. of .wav files in test noise folder = ",len(test_noise_clips))

No. of .wav files in test speech folder =  0
No. of .wav files in test noise folder =  0


In [18]:
# Re-read the raw speech and noise folders before processing them.
# os.listdir returns every directory entry in arbitrary order; keep only .wav
# files (the processing loops open each entry as a WAV file) and sort them so
# the index used in the output file names is reproducible.
speech_fpath = "sample_audio/speech/"
speech_clips = sorted(name for name in os.listdir(speech_fpath) if name.lower().endswith(".wav"))
noise_fpath = "sample_audio/noise/"
noise_clips = sorted(name for name in os.listdir(noise_fpath) if name.lower().endswith(".wav"))
print("No. of .wav files in speech folder = ",len(speech_clips))
print("No. of .wav files in noise folder = ",len(noise_clips))

No. of .wav files in speech folder =  10
No. of .wav files in noise folder =  10


In [19]:
# Convert every raw speech clip to 16 kHz mono 16-bit PCM and peak-normalise it.
# Results are written to test_speech_fpath as "speech_<i>.wav".

def _pcm_to_unit_float(pcm):
    """Scale samples returned by scipy.io.wavfile.read to floats in [-1, 1)."""
    if np.issubdtype(pcm.dtype, np.floating):
        # Float WAV data is already stored on a unit scale.
        return pcm.astype(np.float64)
    if pcm.dtype == np.uint8:
        # 8-bit WAV is unsigned with its zero level at 128.
        return (pcm.astype(np.float64) - 128.0) / 128.0
    # Signed integer PCM: divide by the full-scale magnitude (2**15 for int16).
    # The previous divisor of 2**14 produced values up to +/-2, which clip
    # when written back as PCM_16.
    return pcm.astype(np.float64) / float(-int(np.iinfo(pcm.dtype).min))


Fs2 = 16000  # target sample rate in Hz
Max_Signal_Frequency = Fs2 / 2  # Nyquist frequency of the target rate

for i in range(0, len(speech_clips)):
    samplerate, data = wavfile.read(speech_fpath + speech_clips[i])
    Fs1 = samplerate

    Original_signal = _pcm_to_unit_float(data)
    if Original_signal.ndim > 1:
        # wavfile.read returns (frames, channels) for multi-channel files.
        # Average to mono so filtering and resampling both run along time and
        # the file written below has the expected single-channel layout.
        Original_signal = Original_signal.mean(axis=1)

    # scipy.signal.resample changes the sample spacing by len(x) / num, so the
    # count that yields exactly Fs2 is N * Fs2 / Fs1 (not (N - 1) * Fs2 / Fs1).
    N = len(Original_signal)
    New_sample_amount = int(round(N * Fs2 / Fs1))

    if Fs1 > Fs2:
        # Band-limit before decimating. Only valid when the cutoff is below the
        # input Nyquist frequency; butter raises ValueError otherwise, which is
        # what happened for clips that were already at 16 kHz.
        Anti_Aliased_signal = np.asarray(
            butter_lowpass_filter(Original_signal, Max_Signal_Frequency, Fs1)
        )
    else:
        # Same rate or upsampling: there is nothing above the target Nyquist.
        Anti_Aliased_signal = Original_signal

    Down_sampled_signal = np.asarray(sps.resample(Anti_Aliased_signal, New_sample_amount))

    out_path = test_speech_fpath + "speech_" + str(i) + ".wav"
    sf.write(out_path, Down_sampled_signal, Fs2, 'PCM_16')

    # Peak-normalise with pydub, overwriting the file that was just written.
    rawsound = AudioSegment.from_wav(out_path)
    normalizedsound = effects.normalize(rawsound)
    normalizedsound.export(out_path, format='wav')

In [20]:
# Peak-normalise every raw noise clip and write it to test_noise_fpath as
# "noise_norm<i>.wav".
for i in range(0, len(noise_clips)):
    rawsound = AudioSegment.from_wav(noise_fpath + noise_clips[i])
    # Bring the noise to the same format as the processed speech (16 kHz,
    # mono). The mixing cell adds the two waveforms sample by sample and saves
    # the result at 16000 Hz, so a different rate or channel count here would
    # produce noise at the wrong speed/pitch or a shape mismatch.
    rawsound = rawsound.set_frame_rate(16000).set_channels(1)
    normalizedsound = effects.normalize(rawsound)
    normalizedsound.export(test_noise_fpath + "noise_norm" + str(i) + ".wav", format='wav')

In [22]:
# Re-read all four folders after the processing cells have written their output.
# os.listdir returns every directory entry in arbitrary order; keep only .wav
# files so the printed counts are accurate, and sort for a stable order.
test_speech_fpath = "sample_audio/test_speech/"
test_speech_clips = sorted(name for name in os.listdir(test_speech_fpath) if name.lower().endswith(".wav"))
test_noise_fpath = "sample_audio/test_noise/"
test_noise_clips = sorted(name for name in os.listdir(test_noise_fpath) if name.lower().endswith(".wav"))
speech_fpath = "sample_audio/speech/"
speech_clips = sorted(name for name in os.listdir(speech_fpath) if name.lower().endswith(".wav"))
noise_fpath = "sample_audio/noise/"
noise_clips = sorted(name for name in os.listdir(noise_fpath) if name.lower().endswith(".wav"))
print("No. of .wav files in speech folder = ",len(speech_clips))
print("No. of .wav files in noise folder = ",len(noise_clips))
print("No. of .wav files in test speech folder = ",len(test_speech_clips))
print("No. of .wav files in test noise folder = ",len(test_noise_clips))

No. of .wav files in speech folder =  10
No. of .wav files in noise folder =  10
No. of .wav files in test speech folder =  10
No. of .wav files in test noise folder =  10


In [23]:
# Shuffle the processed noise clips by renaming them to "<k>_noise.wav", where
# the k values are a random permutation of 0..n-1.
import random
noise_ran_list = random.sample(range(len(test_noise_clips)), len(test_noise_clips))
print(noise_ran_list)

# Rename in two phases. Renaming straight to "<k>_noise.wav" can hit a file
# that already carries that name (e.g. when this cell is run a second time),
# which silently overwrites it on POSIX and raises on Windows. Moving every
# file to a unique temporary name first guarantees the final names are free.
staged_noise_names = []
for clip_name in test_noise_clips:
    staged_name = uuid.uuid4().hex + ".tmp"
    os.rename(test_noise_fpath + clip_name, test_noise_fpath + staged_name)
    staged_noise_names.append(staged_name)

for staged_name, new_index in zip(staged_noise_names, noise_ran_list):
    os.rename(test_noise_fpath + staged_name, test_noise_fpath + str(new_index) + "_noise.wav")

[6, 4, 1, 2, 0, 8, 9, 3, 7, 5]


In [26]:
# Re-read all four folders before mixing speech with noise.
# os.listdir returns every directory entry in arbitrary order; keep only .wav
# files (the mixing cell loads each entry as audio) and sort them so the
# speech/noise pairing by list index is reproducible.
test_speech_fpath = "sample_audio/test_speech/"
test_speech_clips = sorted(name for name in os.listdir(test_speech_fpath) if name.lower().endswith(".wav"))
test_noise_fpath = "sample_audio/test_noise/"
test_noise_clips = sorted(name for name in os.listdir(test_noise_fpath) if name.lower().endswith(".wav"))
speech_fpath = "sample_audio/speech/"
speech_clips = sorted(name for name in os.listdir(speech_fpath) if name.lower().endswith(".wav"))
noise_fpath = "sample_audio/noise/"
noise_clips = sorted(name for name in os.listdir(noise_fpath) if name.lower().endswith(".wav"))
print("No. of .wav files in speech folder = ",len(speech_clips))
print("No. of .wav files in noise folder = ",len(noise_clips))
print("No. of .wav files in test speech folder = ",len(test_speech_clips))
print("No. of .wav files in test noise folder = ",len(test_noise_clips))

No. of .wav files in speech folder =  10
No. of .wav files in noise folder =  10
No. of .wav files in test speech folder =  10
No. of .wav files in test noise folder =  10


In [27]:
# Mix every processed speech clip with a noise clip at several SNR levels.
# For speech clip i the results are written to
# "sample_audio/noisy_speech/SNR_<level>dB/SNR_<level>dB_<i>.wav".

SNR_LEVELS_DB = [-3, -6, -9]
snr_dbs = torch.tensor(SNR_LEVELS_DB)

# Make sure the per-SNR output folders exist before saving into them.
for snr_db in SNR_LEVELS_DB:
    os.makedirs(f"sample_audio/noisy_speech/SNR_{snr_db}dB/", exist_ok=True)

# Pair processed speech and noise clips by position. Iterating over the lists
# that are actually indexed (instead of len(speech_clips)) avoids an IndexError
# or silently skipped clips when the folder sizes differ.
for i, (speech_name, noise_name) in enumerate(zip(test_speech_clips, test_noise_clips)):
    # Load waveforms together with their sample rates
    speech, speech_rate = torchaudio.load(test_speech_fpath + speech_name)
    noise, noise_rate = torchaudio.load(test_noise_fpath + noise_name)

    # add_noise needs both signals to have the same shape, and the (3,)-shaped
    # SNR tensor broadcasts against a single channel. Averaging the channels
    # leaves mono input unchanged.
    speech = speech.mean(dim=0, keepdim=True)
    noise = noise.mean(dim=0, keepdim=True)

    # Adding waveforms sample by sample is only meaningful at a common rate.
    if noise_rate != speech_rate:
        noise = F.resample(noise, noise_rate, speech_rate)

    num_frames = speech.shape[1]
    if noise.shape[1] == 0:
        raise ValueError("Noise clip " + noise_name + " contains no samples")
    if noise.shape[1] < num_frames:
        # Repeat short noise so a full-length segment can be cut from it;
        # random.randint would otherwise raise on a negative upper bound.
        noise = noise.repeat(1, math.ceil(num_frames / noise.shape[1]))

    # From a random point in the noise waveform, cut a segment as long as the speech
    first = random.randint(0, noise.shape[1] - num_frames)
    noise = noise[:, first:first + num_frames]

    # One row per SNR level, in the order of SNR_LEVELS_DB
    noisy_speeches = F.add_noise(speech, noise, snr_dbs)

    for row, snr_db in enumerate(SNR_LEVELS_DB):
        noisy_speech = noisy_speeches[row:row + 1]

        # At negative SNR the noise is louder than the speech, so the sum can
        # exceed the [-1, 1] range that 16-bit PCM can represent. Scaling the
        # whole mixture down keeps the SNR unchanged and avoids clipping.
        peak = noisy_speech.abs().max()
        if peak > 1.0:
            noisy_speech = noisy_speech / peak

        torchaudio.save(
            f"sample_audio/noisy_speech/SNR_{snr_db}dB/SNR_{snr_db}dB_{i}.wav",
            noisy_speech,
            speech_rate,
            encoding="PCM_S",
            bits_per_sample=16,
        )