In [7]:
import librosa
import numpy as np
import noisereduce as nr
import webrtcvad
from scipy.signal import find_peaks
from sklearn.preprocessing import StandardScaler
from scipy.signal import find_peaks
from scipy.io.wavfile import write

def reduce_noise(audio_data):
    stft = librosa.stft(audio_data)
    stft_db = librosa.amplitude_to_db(np.abs(stft))
    mean_noise = np.mean(stft_db[:, :200], axis=1)
    stft_db -= mean_noise[:, np.newaxis]
    noise_reduced_audio = librosa.griffinlim(librosa.db_to_amplitude(stft_db))
    return noise_reduced_audio

def vad_energy_based(audio_data, threshold=0.05):
    active_segments = []  # Initialize an empty list to store active voice segments
    segment_start = 0  # Variable to store the start index of an active segment
    is_previous_active = False  # Flag to keep track of the previous state of voice activity

    for i, sample in enumerate(audio_data):
        energy = np.sum(sample ** 2)  # Calculate energy of the current sample
        is_active = energy > threshold  # Check if the current sample indicates voice activity

        if is_active:
            if not is_previous_active:
                segment_start = i  # Start of a new active segment
            is_previous_active = True
        else:
            if is_previous_active:
                active_segments.append(audio_data[segment_start:i])  # Append active segment to list
            is_previous_active = False

    # Check if the last segment is active
    if is_previous_active:
        active_segments.append(audio_data[segment_start:])

    # Concatenate all active segments into a single numpy array
    if active_segments:
        active_voice_array = np.concatenate(active_segments)
        return active_voice_array
    else:
        return np.array([])  # Return an empty array if no active segments are found




In [16]:
# Load the WAV audio file
audio_file = 'output_file.wav'
y, sr = librosa.load(audio_file, sr=None)

def generateOutputAudioFile(outputDestination, inputArray):

    # Scale the values in the array to the range [-32768, 32767] (for 16-bit PCM audio)
    scaled_array = np.int16(inputArray*32767)

    # Write the array to a WAV file
    write(outputDestination, sr, scaled_array)

print(y.shape)


(2980160,)


In [18]:

# Resample the audio to a common sampling rate (e.g., 16 kHz) - Preprocessing 1 Desampling
target_sr = 16000
print(sr)
y_resampled = librosa.resample(y, orig_sr=sr, target_sr=16000)
print(y_resampled.shape)

generateOutputAudioFile('pre_processed_audio/pre_processing_1_output.wav', y_resampled)

16000
(2980160,)


In [19]:

# Normalize the audio to ensure consistent amplitude levels - Preprocessing 2 Normalization
y_normalized = librosa.util.normalize(y_resampled)

print(y_normalized.shape)
generateOutputAudioFile('pre_processed_audio/pre_processing_2_output.wav', y_normalized)


In [32]:

# Remove silence using a threshold (e.g., -40 dB) - Preprocessing 3 Silence Removal
y_trimmed, _ = librosa.effects.trim(y_normalized, top_db=3)

print(y_trimmed.shape)

generateOutputAudioFile('pre_processed_audio/pre_processing_3_output.wav', y_trimmed)


In [33]:

# Perform noise reduction using the NoiseReduce library - Preprocessing 4 Noise Reduction
y_denoised = reduce_noise(y_trimmed) 
print(y_denoised.shape)


generateOutputAudioFile('pre_processed_audio/pre_processing_4_output.wav', y_denoised)


(2785280,)


In [22]:

# speech_segments = perform_vad(y_denoised, sr) - Preprocessing 5 Voice Activity Detection (VAD):
active_voice_array = vad_energy_based(y_denoised)
print("Active Voice Array:", active_voice_array.shape)
y_preprocessed5 = active_voice_array
# y_preprocessed5 = np.concatenate(speech_segments)
print(y_preprocessed5.shape)

generateOutputAudioFile('pre_processed_audio/pre_processing_5_output.wav', active_voice_array)


Active Voice Array: (1663428,)


In [23]:

# Find peaks in the audio signal - Preprocessing 6  Dynamic Range Compression:
peaks, _ = find_peaks(np.abs(y_preprocessed5), height=0.5)

# Apply compression by reducing the amplitude of peaks
compression_factor = 0.5
y_compressed = np.copy(y_preprocessed5)

y_compressed[peaks] *= compression_factor
print(y_compressed.shape)
# Now y_compressed contains the audio with dynamic range compression applied

generateOutputAudioFile('pre_processed_audio/pre_processing_6_output.wav', y_compressed)




(1663428,)


In [24]:

# Initialize StandardScaler for feature scaling - Preprocessing 7
scaler = StandardScaler()

# Reshape y_compressed to a 2D array (assuming it's a 1D array representing audio signal)
y_reshaped = y_compressed.reshape(-1, 1)

# Scale the features (audio samples) using the scaler
scaled_audio = scaler.fit_transform(y_reshaped)

# Reshape scaled_audio back to 1D array (if needed)
scaled_audio = scaled_audio.ravel()

# Now scaled_audio contains the scaled audio signal with zero mean and unit variance
print(y_reshaped.shape)

generateOutputAudioFile('pre_processed_audio/pre_processing_7_output.wav', scaled_audio)




(1663428, 1)
