## \[Research\] Audio transcription with `faster-whisper`

This notebook demonstrates the audio processing pipeline, focusing on transcribing audio with `faster-whisper`.

### 1. Imports and Configurations

In [None]:
import warnings
import pandas as pd
import justsdk

from pathlib import Path
from faster_whisper import WhisperModel

# Suppress all warnings so library noise stays out of the cell output.
warnings.filterwarnings(action="ignore")

# Directory layout, anchored at the parent of the current working directory
# (presumably the notebook runs one level below the project root — TODO confirm).
ROOT = Path.cwd().parent
DATA_DIR = ROOT.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
SAMPLE_DIR = DATA_DIR.joinpath("sample")
MODEL_DIR = ROOT.joinpath("models")

# Model name used by WHISPER_CONFIG (both as the size and in the download path).
WHISPER_MODEL = "base"

# Model-loading settings (WhisperModel is imported above; the call that
# consumes this dict is not in view).
# NOTE(review): faster-whisper's WhisperModel names its first parameter
# `model_size_or_path`, so "model_size" presumably is not splatted directly
# into the constructor — confirm how the consumer reads it.
WHISPER_CONFIG = {
    "model_size": WHISPER_MODEL,
    "device": "cpu",
    "compute_type": "int8",
    "num_workers": 2,
    # One directory per model name: <MODEL_DIR>/whisper-<WHISPER_MODEL>.
    "download_root": str(MODEL_DIR.joinpath(f"whisper-{WHISPER_MODEL}")),
}

# Transcription options: decoding settings, rejection thresholds, and
# voice-activity-detection (VAD) parameters.
AUDIO_CONFIG = {
    "language": "en",
    "task": "transcribe",
    # Number of paths searched during decoding.
    "beam_size": 5,
    "best_of": 5,
    "patience": 1,
    "length_penalty": 1,
    # Temperature fallback schedule.
    "temperature": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
    # Reject output when the text is too repetitive.
    "compression_ratio_threshold": 2.4,
    # Threshold for confidence levels.
    "log_prob_threshold": -1.0,
    # Threshold for non-speech detection.
    "no_speech_threshold": 0.6,
    # Produce word-level timestamps.
    "word_timestamps": True,
    # Skip silent parts of the audio.
    "vad_filter": True,
    "vad_parameters": {
        "threshold": 0.5,
        "min_speech_duration_ms": 250,
        # No upper bound on the length of a speech chunk.
        "max_speech_duration_s": float("inf"),
        "min_silence_duration_ms": 2000,
        # NOTE(review): newer faster-whisper releases appear to have dropped
        # `window_size_samples` from VadOptions — confirm against the
        # installed version before passing this dict through.
        "window_size_samples": 1024,
        "speech_pad_ms": 400,
    },
}

# Echo the selected model and device so the active settings show in the cell output.
justsdk.print_info(
    f"WHISPER_MODEL: {WHISPER_MODEL}",
    newline_before=True,
)
justsdk.print_info(
    f"DEVICE: {WHISPER_CONFIG['device']}",
)