## 1. Installing Necessary Packages

To install the necessary packages for this project, use the following pip command:

In [7]:
%pip install numpy pyaudio faster-whisper

Note: you may need to restart the kernel to use updated packages.


## 2. Import Necessary Libraries

In this step, we import all the required libraries for audio processing, handling queues, threading, and the Whisper model.

In [8]:
# Import necessary libraries
import numpy as np
import queue
import re
import sys
import threading
import time
from typing import Dict, List, Tuple
from faster_whisper import WhisperModel
import pyaudio

# Print statement for beginners to know the step completed
print("Imported all necessary libraries.")

Imported all necessary libraries.


## 3. Create and Load the Whisper Model

### Model and Audio Settings

This section defines the settings for the Whisper model and audio configuration, including model type, processing options, and audio properties.

In [9]:
# Model settings

# MODEL_TYPE: Whisper model size handed to WhisperModel, e.g. "small", "medium", "large".
# Smaller models are faster but less accurate, while larger models are more accurate but require more resources.
MODEL_TYPE = "small"

# RUN_TYPE: Device selector read by create_whisper_model(). Must be "cpu" or "gpu"
# (compared case-insensitively); any other value makes that function raise ValueError.
RUN_TYPE = "cpu"  # Change to "gpu" if you have a GPU available

# For CPU usage:
# NUM_WORKERS: Passed as `num_workers` to WhisperModel on the CPU path.
# NOTE(review): presumably this only helps when transcribe() is called from several
# threads at once - confirm against the faster-whisper documentation.
NUM_WORKERS = 10

# CPU_THREADS: Passed as `cpu_threads` to WhisperModel on the CPU path.
# This should ideally match the number of CPU cores available.
CPU_THREADS = 4

# For GPU usage:
# GPU_DEVICE_INDICES: Passed as `device_index` to WhisperModel on the GPU path.
# For example, [0, 1] will use the first two GPUs.
# NOTE(review): the default lists four GPUs - trim it to the GPUs actually installed.
GPU_DEVICE_INDICES = [0, 1, 2, 3]

# VAD_FILTER: Voice Activity Detection flag, forwarded as `vad_filter` to model.transcribe().
# When True, the model will filter out non-speech audio segments.
VAD_FILTER = True

# Visualization (expected max number of characters for LENGTH_IN_SEC audio)
# MAX_SENTENCE_CHARACTERS: Width that each displayed transcription line is padded to
# (with spaces) before it is written to stdout.
MAX_SENTENCE_CHARACTERS = 80

# Audio settings

# STEP_IN_SEC: Number of one-second reads the recorder concatenates into each queued chunk,
# i.e. the length of one chunk in seconds.
STEP_IN_SEC: int = 1

# LENGTH_IN_SEC: Maximum number of chunks that are buffered and transcribed together.
# Also used below as the maxsize of length_queue.
LENGTH_IN_SEC: int = 6

# NB_CHANNELS: The number of audio channels requested from the input stream. 1 for mono, 2 for stereo.
# This default is overwritten by the device-selection cell later in the notebook.
NB_CHANNELS = 1

# RATE: The sample rate of the audio data (in Hz), also used as the number of frames per read,
# so one read equals one second of audio. Common rates include 16000 (16kHz) and 44100 (44.1kHz).
RATE = 16000

# CHUNK: frames_per_buffer for the input stream; equal to RATE, i.e. one second of audio per buffer.
CHUNK = RATE

# INPUT_DEVICE_ID: Default PyAudio input device index.
# It is overwritten by the device-selection cell later in the notebook.
INPUT_DEVICE_ID = 3

# Queues to handle audio data

# audio_queue: Unbounded queue of raw audio chunks; filled by record_audio() and drained by process_audio().
audio_queue = queue.Queue()

# length_queue: Buffer of the most recent chunks; process_audio() re-transcribes everything in it
# each time a new chunk arrives. maxsize is set to LENGTH_IN_SEC to limit the number of chunks held at once.
length_queue = queue.Queue(maxsize=LENGTH_IN_SEC)

### Create and Load the Whisper Model

This function initializes the Whisper model based on the specified settings. It can run on either CPU or GPU, depending on the configuration.

In [10]:
# Function to create the Whisper model
def create_whisper_model() -> WhisperModel:
    """Build a faster-whisper model for the device selected by ``RUN_TYPE``.

    Reads the module-level settings ``MODEL_TYPE``, ``RUN_TYPE``, ``NUM_WORKERS``,
    ``CPU_THREADS`` and ``GPU_DEVICE_INDICES``. Model files are stored under
    ``./models`` (the ``download_root``).

    Returns:
        WhisperModel: configured for CUDA/float16 when ``RUN_TYPE`` is "gpu",
        or for CPU/int8 when it is "cpu".

    Raises:
        ValueError: if ``RUN_TYPE`` is neither "cpu" nor "gpu"
            (case and surrounding whitespace are ignored).
    """
    run_type = RUN_TYPE.strip().lower()

    # Only the device-specific options differ between the two paths.
    if run_type == "gpu":
        device_options = dict(
            device="cuda",
            compute_type="float16",
            device_index=GPU_DEVICE_INDICES,
        )
    elif run_type == "cpu":
        device_options = dict(
            device="cpu",
            compute_type="int8",
            num_workers=NUM_WORKERS,
            cpu_threads=CPU_THREADS,
        )
    else:
        # The offending setting is RUN_TYPE, not the model type.
        raise ValueError(
            f"Invalid run type: {RUN_TYPE!r} (expected 'cpu' or 'gpu')"
        )

    whisper = WhisperModel(MODEL_TYPE, download_root="./models", **device_options)

    print("Loaded model")
    return whisper


# Load the model once at module level. process_audio() reads this global `model`
# when it calls execute_whisper_transcription().
# NOTE(review): presumably the first run downloads the weights into ./models - confirm.
model = create_whisper_model()
print("Whisper model is ready to use.")

Loaded model
Whisper model is ready to use.


## 4. Transcription, Record and Process Functions

### Transcription Function

This function uses the Whisper model to transcribe audio data into text. It processes the audio, detects speech segments, and generates text along with language information.

In [11]:
# Function to transcribe audio using the Whisper model
def execute_whisper_transcription(
    model: WhisperModel, audio_data_array: np.ndarray, language_code: str = ""
) -> Tuple[str, str, float]:
    """Transcribe one audio buffer and report the language the model used.

    Args:
        model: Loaded Whisper model whose ``transcribe`` method is called.
        audio_data_array: Audio samples passed straight to ``model.transcribe``.
        language_code: Language to force (e.g. "en"). An empty string, a
            whitespace-only string or ``None`` lets the model detect the language.

    Returns:
        A tuple ``(transcription, language, language_probability)`` where
        ``transcription`` is the text of all segments joined by single spaces and
        the other two values come from the ``info`` object returned by the model.
    """
    # Treat None like "" so an unset language does not crash on .lower().
    normalized_language = (language_code or "").lower().strip()

    segments, info = model.transcribe(
        audio_data_array,
        language=normalized_language or None,
        beam_size=5,
        vad_filter=VAD_FILTER,
        vad_parameters=dict(min_silence_duration_ms=500),
    )

    # Strip every segment before joining: joining the raw texts with " " produced
    # double spaces between sentences in the displayed transcription.
    pieces = [segment.text.strip() for segment in segments]
    transcription = " ".join(piece for piece in pieces if piece)
    return transcription, info.language, info.language_probability


print("Transcription function is set up.")

Transcription function is set up.


### Audio Recording Function

This function captures audio data from the microphone in chunks and places it into a queue for processing.

In [12]:
# Function to record audio from the microphone
def record_audio():
    """Capture microphone audio and enqueue it until ``running`` becomes False.

    Opens a 16-bit input stream on ``INPUT_DEVICE_ID`` using ``NB_CHANNELS``,
    ``RATE`` and ``CHUNK``. Each loop iteration reads ``STEP_IN_SEC`` blocks of
    ``RATE`` frames (one second each), concatenates them and puts the raw bytes
    on ``audio_queue``.

    The stream and the PyAudio instance are always released, even when opening
    the device or reading from it raises; the exception then propagates to the
    caller (the recorder thread).
    """
    audio = pyaudio.PyAudio()
    stream = None
    try:
        stream = audio.open(
            format=pyaudio.paInt16,
            channels=NB_CHANNELS,
            rate=RATE,
            input=True,
            frames_per_buffer=CHUNK,  # 1 second of audio
            input_device_index=INPUT_DEVICE_ID,  # Specify the selected input device
        )

        print("-" * 80)
        print("Microphone initialized, recording started...")
        print("-" * 80)
        print("TRANSCRIPTION")
        print("-" * 80)

        # `running` is only read here, so no `global` declaration is needed.
        while running:
            # Each read blocks for one second of audio; join once instead of
            # growing a bytes object inside the loop.
            seconds = [stream.read(RATE) for _ in range(STEP_IN_SEC)]
            audio_queue.put(b"".join(seconds))
    finally:
        # Release the device even if open() or read() failed.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        audio.terminate()
        print("Microphone recording stopped.")

### Audio Processing Function

This function processes the audio data from the queue, transcribes it, and outputs the transcribed text.

In [13]:
# Function to process audio and get transcription
def process_audio(stats):
    """Consume recorded chunks, transcribe the rolling buffer and display the text.

    Runs until ``running`` is False and ``audio_queue`` has been drained. Every new
    chunk is appended to ``length_queue`` and the whole buffer is transcribed again
    with the module-level ``model``; the result overwrites the current output line.
    Once the buffer holds ``LENGTH_IN_SEC`` chunks it is cleared and a new output
    line is started.

    Args:
        stats: Dict with the keys "overall", "transcription" and "postprocessing",
            each mapping to a list. The elapsed time in seconds of every processed
            chunk is appended to these lists.
    """
    # `running` is only read here, so no `global` declaration is needed.
    while running or not audio_queue.empty():
        # Buffer full: drop it and move the display to a fresh line.
        if length_queue.qsize() >= LENGTH_IN_SEC:
            with length_queue.mutex:
                length_queue.queue.clear()
            print()

        try:
            audio_data = audio_queue.get(timeout=1)
        except queue.Empty:
            # Nothing recorded yet; re-check the running flag.
            continue

        transcription_start_time = time.time()
        length_queue.put(audio_data)

        # Snapshot the buffered chunks without removing them from the queue.
        with length_queue.mutex:
            audio_data_to_process = b"".join(length_queue.queue)

        try:
            samples = np.frombuffer(audio_data_to_process, np.int16).astype(
                np.float32
            )
            if NB_CHANNELS > 1:
                # Multi-channel frames are interleaved; average them to mono so
                # the model does not see channels as consecutive samples.
                samples = samples.reshape(-1, NB_CHANNELS).mean(axis=1)
            # int16 PCM spans [-32768, 32767]; scale it to [-1.0, 1.0).
            audio_np = samples / 32768.0

            transcription, _language, _language_probability = (
                execute_whisper_transcription(model, audio_np)
            )
            # Drop bracketed annotations. Non-greedy so that text between two
            # separate annotations is kept.
            transcription = re.sub(r"\[.*?\]", "", transcription)
            transcription = re.sub(r"\(.*?\)", "", transcription)
        except Exception as e:
            # Best effort: show the problem and keep the loop alive.
            print(e)
            transcription = "Error"

        transcription_end_time = time.time()

        # Display transcription
        transcription_to_visualize = transcription.ljust(MAX_SENTENCE_CHARACTERS, " ")
        transcription_postprocessing_end_time = time.time()

        # "\033[K" clears the current line, "\r" returns the cursor so the next
        # update overwrites this one.
        sys.stdout.write("\033[K" + transcription_to_visualize + "\r")
        sys.stdout.flush()

        audio_queue.task_done()

        stats["overall"].append(
            transcription_postprocessing_end_time - transcription_start_time
        )
        stats["transcription"].append(
            transcription_end_time - transcription_start_time
        )
        stats["postprocessing"].append(
            transcription_postprocessing_end_time - transcription_end_time
        )

    print("Audio processing stopped.")

## Set the input device index

In [14]:
# Open a temporary PyAudio session that is only used to enumerate devices
audio = pyaudio.PyAudio()

# Collect (device ID, name, input channel count) for every device that reports
# at least one input channel, printing each one as it is found
input_devices = []
for i in range(audio.get_device_count()):
    device_info = audio.get_device_info_by_index(i)
    if device_info["maxInputChannels"] <= 0:
        # No input channels reported: not usable for recording
        continue
    input_devices += [(i, device_info["name"], device_info["maxInputChannels"])]
    print(
        f"Device ID {i}: {device_info['name']} - Channels: {device_info['maxInputChannels']}"
    )

# Close this session; the recording function creates its own PyAudio instance
audio.terminate()

Device ID 1: External Microphone - Channels: 1
Device ID 3: MacBook Pro Microphone - Channels: 1
Device ID 5: Gabriel’s iPhone Microphone - Channels: 1
Device ID 6: Microsoft Teams Audio - Channels: 2


In [15]:
# Prompt the user to select a device by entering the device ID.
# The prompt repeats until the answer matches one of the devices in `input_devices`,
# so an unknown or non-numeric ID is never stored in INPUT_DEVICE_ID.
if not input_devices:
    # Without this guard the prompt below could never be satisfied.
    raise RuntimeError("No audio input devices found.")

while True:
    raw_choice = input("Enter the Device ID of the input device you want to use: ")
    try:
        candidate_id = int(raw_choice)
    except ValueError:
        candidate_id = None  # not a number: handled as an invalid ID below

    # Retrieve the (ID, name, channels) entry for the selected device
    selected_device_info = next(
        (device for device in input_devices if device[0] == candidate_id), None
    )
    if selected_device_info:
        break
    print("Invalid Device ID")

INPUT_DEVICE_ID = candidate_id
selected_channels = selected_device_info[2]
print(f"Selected Device ID: {INPUT_DEVICE_ID}, Channels: {selected_channels}")

# Set global channel count based on selected device
NB_CHANNELS = selected_channels

Selected Device ID: 3, Channels: 1


## 5. Running the Audio Processing System

This final cell sets up the audio recording and processing threads, and handles the clean shutdown of these threads upon interruption.

In [16]:
# Flag to control the running state of threads
running = True

# Initialize statistics dictionary (per-chunk timings in seconds, filled by process_audio)
stats: Dict[str, List[float]] = {
    "overall": [],
    "transcription": [],
    "postprocessing": [],
}

# Start recording and processing threads
producer = threading.Thread(target=record_audio)
producer.start()

consumer = threading.Thread(target=process_audio, args=(stats,))
consumer.start()

print(
    "Audio recording and processing started. Press 'Stop' or interrupt the kernel to stop."
)

# This block is to ensure proper shutdown of threads
try:
    # Blocks until the kernel is interrupted. It returns on its own only if the
    # recorder thread ends early (for example because the stream failed).
    producer.join()
except KeyboardInterrupt:
    print("Stopping...")
finally:
    # Always clear the flag: otherwise the consumer would keep polling forever
    # after the recorder has died and consumer.join() would never return.
    running = False
    producer.join()
    consumer.join()
    print("Stopped.")

# Print statistics
processed_chunks = len(stats["overall"])
print("Number of processed chunks: ", processed_chunks)
if processed_chunks:
    # Skipped when nothing was processed: the mean/std of an empty list is nan
    # and triggers NumPy runtime warnings.
    for label, key in (
        ("Overall", "overall"),
        ("Transcription", "transcription"),
        ("Postprocessing", "postprocessing"),
    ):
        print(
            f"{label} time: avg: {np.mean(stats[key]):.4f}s, std: {np.std(stats[key]):.4f}s"
        )
    # A chunk is only available after STEP_IN_SEC seconds of recording
    print(f"The average latency is {np.mean(stats['overall']) + STEP_IN_SEC:.4f}s")

Audio recording and processing started. Press 'Stop' or interrupt the kernel to stop.
--------------------------------------------------------------------------------
Microphone initialized, recording started...
--------------------------------------------------------------------------------
TRANSCRIPTION
--------------------------------------------------------------------------------
[KAnd this is the end of this video.  Thank you for watching.  I hope you enjoyed this video.  I'll see you in the next video.  Bye.
[Kand talk to those at the center of the problem to find out what can be done.    
Stopping...
Microphone recording stopped.                                                      
[K                                                                                
Audio processing stopped.                                                          
Stopped.
Number of processed chunks:  20
Overall time: avg: 1.2311s, std: 0.9780s
Transcription time: avg: 1.2311s, std: 0.9780s
P