In [20]:
import librosa
import numpy as np
import noisereduce as nr
import webrtcvad
from scipy.signal import find_peaks
from sklearn.preprocessing import StandardScaler
from scipy.signal import find_peaks
import os

def reduce_noise(audio_data, noise_frames=200, random_state=0):
    """
    Reduce stationary noise by subtracting a per-frequency noise floor in the dB domain.

    The noise floor is the mean dB magnitude of the first `noise_frames` STFT frames,
    so the start of the recording is assumed to contain noise only.

    Args:
    - audio_data (numpy.ndarray): Input audio signal.
    - noise_frames (int): Number of leading STFT frames used to estimate the noise floor.
    - random_state (int or None): Seed for the random phase initialisation of Griffin-Lim.
      A fixed seed makes the output reproducible between runs; pass None for an unseeded run.

    Returns:
    - noise_reduced_audio (numpy.ndarray): Time-domain signal rebuilt from the adjusted magnitudes.

    Raises:
    - ValueError: If `noise_frames` is smaller than 1.
    """
    if noise_frames < 1:
        raise ValueError("noise_frames must be at least 1")

    stft = librosa.stft(audio_data)
    # Only the magnitude is kept; the phase is discarded here and re-estimated by Griffin-Lim below
    magnitude_db = librosa.amplitude_to_db(np.abs(stft))

    # One noise-floor value per frequency bin (slicing simply uses every frame if fewer exist)
    noise_floor_db = np.mean(magnitude_db[:, :noise_frames], axis=1, keepdims=True)
    cleaned_db = magnitude_db - noise_floor_db

    noise_reduced_audio = librosa.griffinlim(
        librosa.db_to_amplitude(cleaned_db),
        random_state=random_state,
    )
    return noise_reduced_audio

import numpy as np
import librosa

def perform_vad(audio, sr, frame_duration=30, energy_threshold=0.05):
    """
    Perform Voice Activity Detection (VAD) on audio using energy thresholding.

    The signal is cut into consecutive frames of `frame_duration` milliseconds (the last
    frame may be shorter). A frame counts as speech when its mean squared amplitude
    exceeds `energy_threshold`; runs of adjacent speech frames are merged into one segment.

    Args:
    - audio (numpy.ndarray): Input audio signal (1D sequence of samples).
    - sr (int): Sampling rate of `audio` in Hz.
    - frame_duration (float): Duration of each frame in milliseconds.
    - energy_threshold (float): Energy threshold for classifying speech.

    Returns:
    - segments (list): List of (start_sample, end_sample) tuples, one per contiguous
      speech region. `end_sample` is exclusive, so `audio[start:end]` is the region.
      Empty when no frame exceeds the threshold or the input is empty.

    Raises:
    - ValueError: If `frame_duration` is too short to hold a single sample at `sr`.
    """
    samples_per_frame = int(sr * frame_duration / 1000)
    if samples_per_frame < 1:
        raise ValueError("frame_duration is too short for the given sampling rate")

    # Work in float64 so that squaring integer PCM data cannot overflow
    signal = np.asarray(audio, dtype=np.float64)
    total_samples = signal.shape[0]
    if total_samples == 0:
        return []

    # Fixed-size frames; reduceat sums each [start, next_start) slice, the last one up to the end
    frame_starts = np.arange(0, total_samples, samples_per_frame)
    frame_lengths = np.minimum(samples_per_frame, total_samples - frame_starts)
    energy = np.add.reduceat(signal ** 2, frame_starts) / frame_lengths

    # Apply thresholding to classify frames as speech or silence
    is_speech = energy > energy_threshold

    # Pad with silence on both sides so every speech run has a rising and a falling edge;
    # edges then alternate: run start (inclusive), run end (exclusive), run start, ...
    padded = np.concatenate(([False], is_speech, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    first_frames, end_frames = edges[::2], edges[1::2]

    segments = [
        # Clamp the end because the final frame may be shorter than samples_per_frame
        (int(first) * samples_per_frame, min(int(end) * samples_per_frame, total_samples))
        for first, end in zip(first_frames, end_frames)
    ]
    return segments

def convertNumpytoWav(audio_file):
    """
    Load an audio file, run energy-based VAD on it and print the detected speech segments.

    NOTE(review): despite its name, this function does not write a WAV file; the name is
    kept so existing callers keep working.

    Args:
    - audio_file (str): Path of the audio file to analyse.

    Returns:
    - speech_segments (list): Segments returned by `perform_vad` for the loaded audio.
    """
    # sr=None keeps the file's native sampling rate, which perform_vad needs for frame sizing
    audio, sr = librosa.load(audio_file, sr=None)
    speech_segments = perform_vad(audio, sr)

    print("Speech segments:", speech_segments)
    return speech_segments




In [3]:
# Load the WAV audio file; sr=None asks librosa to keep the file's native sampling rate,
# which is returned as `sr` alongside the samples in `y`
audio_file = 'output_file.wav'
y, sr = librosa.load(audio_file, sr=None)


In [4]:

# Resample the audio to a common sampling rate (e.g., 16 kHz) - Preprocessing 1
# The source rate is taken from the loaded file rather than assumed, so recordings
# that are not 44.1 kHz are resampled correctly.
target_sr = 16000
y_resampled = librosa.resample(y, orig_sr=sr, target_sr=target_sr)


In [5]:

# Normalize the audio to ensure consistent amplitude levels - Preprocessing 2
# librosa.util.normalize is called with its default settings here
# (per the librosa docs this scales the signal to unit peak amplitude - confirm for your version)
y_normalized = librosa.util.normalize(y_resampled)



In [6]:

# Trim leading and trailing silence - Preprocessing 3
# top_db=40: per the librosa docs, parts more than 40 dB below the reference (the peak, by default)
# count as silence. The second return value (the kept sample interval) is not needed here.
y_trimmed, _ = librosa.effects.trim(y_normalized, top_db=40)

# print(y_trimmed.shape)


In [9]:

# Perform noise reduction with the reduce_noise helper defined in this notebook - Preprocessing 4
# (this is the local function, not a call into the imported noisereduce package)
y_denoised = reduce_noise(y_trimmed) 
print(y_denoised)



[-0.02467542 -0.00395957 -0.00713327 ... -0.00338722  0.00419936
 -0.00896311]


In [21]:

# Voice activity detection - Preprocessing 5
# The signal was resampled to target_sr earlier, so that rate (not the file's original sr)
# determines how many samples make up one VAD frame.
speech_segments = perform_vad(y_denoised, target_sr)

# Keep only the detected speech regions; if VAD finds nothing usable, keep the whole signal
# so the following steps still have data to work on.
# NOTE(review): assumes "Preprocessing 5" is meant to be the VAD-filtered signal - confirm.
speech_chunks = [y_denoised[start:end] for start, end in speech_segments if end > start]
y_preprocessed5 = np.concatenate(speech_chunks) if speech_chunks else y_denoised


In [None]:

# Dynamic range compression by attenuating peak samples - Preprocessing 6
# NOTE(review): y_preprocessed5 is not assigned in any cell visible here; presumably it is
# the signal produced by the preceding preprocessing step - TODO confirm.
compression_factor = 0.5

# Positions where the rectified signal has a local maximum of height 0.5 or more
peaks, _ = find_peaks(np.abs(y_preprocessed5), height=0.5)

# Scale only those peak samples, working on a copy so the input signal is left untouched
y_compressed = np.array(y_preprocessed5, copy=True)
y_compressed[peaks] *= compression_factor
# y_compressed now holds the signal with its peak samples halved



In [None]:

# Standardise the audio samples with StandardScaler - Preprocessing 7
scaler = StandardScaler()

# The scaler works on 2D input, so present the signal as a single column
# (assumes y_compressed is a 1D array of samples)
y_reshaped = y_compressed.reshape(-1, 1)

# Fit and apply the scaler in one step, then flatten the column back to a 1D array
scaled_audio = scaler.fit_transform(y_reshaped).ravel()

# scaled_audio now holds the audio signal scaled to zero mean and unit variance
