In [1]:
!pip install librosa pydub -q

In [2]:
from pathlib import Path
import soundfile as sf
import librosa
from pydub import AudioSegment
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from typing import Union
import torch
import numpy as np

In [3]:
# ---------------------------------------------------------------------------
# Notebook-wide configuration
# ---------------------------------------------------------------------------
MODEL_NAME = "openai/whisper-small"  # Hugging Face Hub id of the checkpoint
CACHE_DIR = "./model_dir"  # local directory used for downloaded artefacts
TARGET_SAMPLE_RATE = 16000  # Hz; audio is resampled to this rate before transcription

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")


def load_model_and_processor(
    model_name: str = MODEL_NAME,
    cache_dir: str = CACHE_DIR,
    device: Union[str, torch.device] = DEVICE,
):
    """Load the Whisper processor and model and place the model on ``device``.

    Args:
        model_name: Hugging Face Hub id (or local path) of the checkpoint.
        cache_dir: Directory in which downloaded files are cached. It is used
            for BOTH the processor and the model weights, so the whole
            checkpoint lives in one place.
        device: Target device for the model; anything accepted by
            ``torch.nn.Module.to`` (a string such as ``"cpu"`` or a
            ``torch.device``).

    Returns:
        A ``(processor, model)`` tuple.
    """
    processor = WhisperProcessor.from_pretrained(
        model_name, cache_dir=cache_dir
    )
    # Pass cache_dir here as well; previously only the processor honoured it
    # and the (much larger) weights went to the default Hugging Face cache.
    model = WhisperForConditionalGeneration.from_pretrained(
        model_name, cache_dir=cache_dir
    ).to(device)
    # Clear any decoder ids forced by the checkpoint config.
    # NOTE(review): presumably this lets generation pick language/task freely
    # -- confirm against the transformers version in use.
    model.config.forced_decoder_ids = None
    return processor, model

# Module-level processor/model pair shared by the transcription helpers below.
processor, model = load_model_and_processor(
    model_name=MODEL_NAME,
    cache_dir=CACHE_DIR,
    device=DEVICE,
)

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

In [22]:
def milliseconds_until_sound(
    sound: "AudioSegment", silence_threshold: float = -20.0, chunk_size: int = 10
) -> int:
    """Return how many milliseconds of leading silence ``sound`` contains.

    The audio is scanned from the start in windows of ``chunk_size``
    milliseconds; scanning stops at the first window whose ``dBFS`` level is
    not below ``silence_threshold``.

    Args:
        sound: Audio to inspect. It must support ``len()`` (duration in ms)
            and millisecond slicing that yields objects with a ``dBFS``
            attribute, as ``pydub.AudioSegment`` does.
        silence_threshold: Level in dBFS below which a window counts as silent.
        chunk_size: Window length in milliseconds; must be positive.

    Returns:
        Offset in milliseconds of the first non-silent window, never larger
        than ``len(sound)``. The result has ``chunk_size`` granularity.

    Raises:
        ValueError: If ``chunk_size`` is not positive (the scan could
            otherwise never advance).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    total_ms = len(sound)
    trim_ms = 0
    while (
        trim_ms < total_ms
        and sound[trim_ms : trim_ms + chunk_size].dBFS < silence_threshold
    ):
        trim_ms += chunk_size

    # The last window may be shorter than chunk_size; do not report an offset
    # past the end of the audio when everything was silent.
    return min(trim_ms, total_ms)


def trim_leading_silence(
    filepath: Union[str, Path],
    silence_threshold: float = -20.0,
    repeat: int = 5,
) -> Path:
    """Strip leading silence from a WAV file and write the result next to it.

    The output is written to ``trimmed_<original name>`` in the same directory
    as the input. The file is NOT removed afterwards; callers that treat it as
    temporary must delete it themselves.

    Args:
        filepath: Path of the WAV file to process (it is opened with
            ``format="wav"``).
        silence_threshold: Level in dBFS below which audio counts as silence.
            NOTE(review): -20 dBFS is a loud threshold; in this notebook's
            sample run the trimmed transcription appears to lose the leading
            "Hello." -- consider a lower value for quiet speech.
        repeat: Number of times the trimmed audio is concatenated with itself
            before export. The default of 5 preserves the previous hard-coded
            behaviour; pass 1 to export the trimmed audio once.

    Returns:
        Path of the exported file.

    Raises:
        ValueError: If ``repeat`` is smaller than 1.
        Exception: Any error raised while reading, trimming or exporting is
            printed and re-raised unchanged.
    """
    if repeat < 1:
        raise ValueError(f"repeat must be >= 1, got {repeat}")

    try:
        path = Path(filepath)
        audio = AudioSegment.from_file(filepath, format="wav")

        # Offset (ms) of the first non-silent window.
        start_trim = milliseconds_until_sound(audio, silence_threshold)

        # Drop the silent prefix, then repeat the remainder `repeat` times.
        trimmed_audio = audio[start_trim:] * repeat
        trimmed_filepath = path.parent / f"trimmed_{path.name}"

        trimmed_audio.export(trimmed_filepath, format="wav")
        return trimmed_filepath

    except Exception as e:
        print(f"Error trimming silence from {filepath}: {e}")
        raise  # Re-raise the exception for further handling


def reduce_noise(file_path: Union[str, Path]) -> Path:
    """Apply a pre-emphasis filter to an audio file and save the result.

    NOTE: despite the function name, no spectral gating or noise estimation is
    performed. The only processing step is ``librosa.effects.preemphasis``
    with its default settings, which librosa documents as a first-order
    differencing (high-frequency emphasising) filter.

    The output is written to ``cleaned_<original name>`` in the same directory
    as the input, at the input's native sample rate, and is not deleted.

    Args:
        file_path: Path of the audio file to process.

    Returns:
        Path of the written file.

    Raises:
        Exception: Any error raised while loading, filtering or writing is
            printed and re-raised unchanged.
    """
    try:
        path = Path(file_path)

        # sr=None keeps the file's native sample rate instead of resampling.
        # NOTE(review): librosa.load downmixes to mono by default -- confirm
        # that is acceptable for multichannel inputs.
        y, sr = librosa.load(file_path, sr=None)

        # Pre-emphasis filter (this is not spectral-gating noise reduction).
        emphasized = librosa.effects.preemphasis(y)

        # Save the filtered audio for further processing.
        cleaned_filepath = path.parent / f"cleaned_{path.name}"
        sf.write(cleaned_filepath, emphasized, sr)

        return cleaned_filepath

    except Exception as e:
        print(f"Error reducing noise for {file_path}: {e}")
        raise  # Re-raise the exception for further handling


def resample_audio(
    audio_array, orig_sr: int, target_sr: int = TARGET_SAMPLE_RATE
):
    """Return ``audio_array`` at ``target_sr``, resampling only when required.

    Args:
        audio_array: Audio samples to convert.
        orig_sr: Sample rate (Hz) the samples currently have.
        target_sr: Desired sample rate (Hz).

    Returns:
        The input object itself when the rates already match, otherwise the
        output of ``librosa.resample``.

    Raises:
        Exception: Any resampling error is printed and re-raised unchanged.
    """
    try:
        # Nothing to do when the audio is already at the requested rate.
        if orig_sr == target_sr:
            return audio_array

        return librosa.resample(
            audio_array, orig_sr=orig_sr, target_sr=target_sr
        )

    except Exception as err:
        print(f"Error resampling audio: {err}")
        raise  # propagate to the caller after logging


def transcribe_audio(audio_array, sampling_rate: int) -> str:
    """Transcribe an audio array with the module-level Whisper model.

    Uses the global ``processor`` and ``model`` created by
    ``load_model_and_processor``.

    Args:
        audio_array: Audio samples to transcribe.
            NOTE(review): presumably a 1-D mono float array -- confirm
            against the caller.
        sampling_rate: Sample rate (Hz) of ``audio_array``; forwarded to the
            processor.

    Returns:
        The decoded text of the first (only) sequence, with special tokens
        removed. The text is returned as decoded, without stripping.

    Raises:
        Exception: Any error during feature extraction, generation or
            decoding is printed and re-raised unchanged.
    """
    try:
        inputs = processor(
            audio_array, sampling_rate=sampling_rate, return_tensors="pt"
        )
        # Follow the model's actual device rather than the global DEVICE so
        # this keeps working when the model was loaded onto another device.
        input_features = inputs.input_features.to(model.device)
        # NOTE(review): the Whisper feature extractor is documented to
        # pad/truncate to a fixed window; long recordings presumably need
        # chunking -- confirm.
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(
            predicted_ids, skip_special_tokens=True
        )

        return transcription[0]

    except Exception as e:
        print(f"Error transcribing audio: {e}")
        raise  # Re-raise the exception for further handling


def transcribe_from_file(
    file_path: Union[str, Path],
    trim_silence: bool = False,
    noise_reduction: bool = False,
) -> str:
    """Run the full file -> text pipeline.

    Steps, in order: optional leading-silence trimming, optional
    ``reduce_noise`` filtering, reading the samples, downmixing to mono,
    resampling to ``TARGET_SAMPLE_RATE`` and transcription.

    The optional preprocessing steps write intermediate files next to the
    input (``trimmed_*`` / ``cleaned_*``); they are left on disk.

    Args:
        file_path: Path of the audio file to transcribe.
        trim_silence: When true, run ``trim_leading_silence`` first.
        noise_reduction: When true, run ``reduce_noise`` on the (possibly
            trimmed) file.

    Returns:
        The transcription produced by ``transcribe_audio``.

    Raises:
        Exception: Any error from a pipeline step is printed and re-raised
            unchanged.
    """
    try:
        # Process file to remove silence if specified
        if trim_silence:
            file_path = trim_leading_silence(file_path)

        # Process file to reduce noise if specified
        if noise_reduction:
            file_path = reduce_noise(file_path)

        # Load the samples.
        audio_array, sampling_rate = sf.read(file_path)

        # soundfile returns a (frames, channels) array for multichannel
        # files. Average the channels so the resampler and the model both
        # receive a 1-D mono signal instead of working along the wrong axis.
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)

        audio_array = resample_audio(audio_array, orig_sr=sampling_rate)

        # Transcribe the processed audio
        transcription = transcribe_audio(audio_array, TARGET_SAMPLE_RATE)

        return transcription

    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        raise  # Re-raise the exception for further handling

In [23]:
# Compare the transcription of the same recording under each preprocessing mode.
file_path = "test.wav"
print("Actual Transcription: Hello. This is great. Whisper is supervised by Rewan")

# (printed label, trim_silence flag, noise_reduction flag)
for label, use_trim, use_denoise in (
    ("Transcription (from raw and resampled file):", False, False),
    ("Transcription (from trimmed and resampled file):", True, False),
    ("Transcription (from noise reduced and resampled file):", False, True),
):
    transcription = transcribe_from_file(
        file_path, trim_silence=use_trim, noise_reduction=use_denoise
    )
    print(label, transcription)


Actual Transcription: Hello. This is great. Whisper is supervised by Rewan
Transcription (from raw and resampled file):  Hello. This is great. This book is supervised by Revan.
Transcription (from trimmed and resampled file):  This is great. Whisper is supervised by Revan.
Transcription (from noise reduced and resampled file):  Hello. This is great. This book is supervised by Revan.
