# Whisper Streaming Tests

In [None]:
import torch
import time
from whisper_online import FasterWhisperASR,OnlineASRProcessor,load_audio,load_audio_chunk
import numpy as np
import pyaudio
import librosa

# Local directory holding the model files, and the language code given to the ASR wrapper.
model_path = "C:/Users/Greg/Documents/Job/PythonProject/ressources/Model/faster_whisper_v3"
language = "en"

# Load the Whisper model from model_path and wrap it in the ASR backend object.
asr = FasterWhisperASR(language, model_dir=model_path)

# Optional backend switches, both left disabled here:
#   asr.set_translate_task()   # translate task
#   asr.use_vad()              # VAD option

# Processing object built around the backend, using its default buffer trimming option.
online = OnlineASRProcessor(asr)

## a) Using basic audio file

In [None]:
import torch
import time
from whisper_online import FasterWhisperASR,OnlineASRProcessor,load_audio,load_audio_chunk
import numpy as np
import pyaudio
import librosa

language = "en"
model_path = "C:/Users/Greg/Documents/Job/PythonProject/ressources/Model/faster_whisper_turbo"

# Load and wrap the Whisper model
asr = FasterWhisperASR(language, model_dir=model_path)

## Set Options

# Translate Task
# asr.set_translate_task()

# Vad option
# asr.use_vad()

# Create the processing object with the default buffer trimming option
online = OnlineASRProcessor(asr)

audio_path = "audio/elo_musk_podcast.wav"

SAMPLING_RATE = 16000

# Adjust as needed (especially when streaming).
# duration = len(load_audio(audio_path))/SAMPLING_RATE # Whole audio duration
duration = 60  # One minute duration

# Warm up the ASR: the very first transcribe call takes much longer than the
# following ones, so it is done once on a one-second excerpt before timing starts.
a = load_audio_chunk(audio_path, 0, 1)
asr.transcribe(a)

beg = 0
start = time.time() - beg
end = 0
min_chunk = 1  # Minimum amount of new audio (seconds) per iteration

print("Starting Audio stream !")
while True:
    # Simulate a live source: wait until at least min_chunk seconds of wall-clock
    # time have passed since the end of the previously loaded chunk.
    now = time.time() - start
    if now < end + min_chunk:
        time.sleep(min_chunk + end - now)

    # Load the audio that "arrived" between the previous chunk end and now.
    end = time.time() - start
    a = load_audio_chunk(audio_path, beg, end)
    beg = end
    online.insert_audio_chunk(a)
    o = online.process_iter()

    # The third element of the result is printed as the newly produced text.
    print(o[2], end="", flush=True)

    if end >= duration:
        break
print()
print("Finishing Audio Stream !")

# finish() hands back whatever the processor was still holding; print it instead
# of discarding it so the end of the transcript is not lost.
# NOTE(review): assumes finish() returns the same (.., .., text) shape as
# process_iter() - confirm against whisper_online.
o = online.finish()
print(o[2], flush=True)

## b) Using Streamed Audio

In [None]:
import pyaudio

# Capture settings used when the PyAudio input stream is opened
FORMAT = pyaudio.paInt16
CHANNELS = 2          # Stereo input
CHUNK = 1024          # Number of audio samples per frame
INPUT_RATE = 44100    # Typical system audio rate
OUTPUT_RATE = 16000   # Required rate for the pipeline (16kHz)

# PyAudio instance used for device discovery and for opening the stream
p = pyaudio.PyAudio()

# Print each device index with its name and input-channel count
print("Available audio devices:")
device_total = p.get_device_count()
for i in range(device_total):
    dev = p.get_device_info_by_index(i)
    device_name = dev['name']
    input_channels = dev['maxInputChannels']
    print(f"{i}: {device_name} (max input channels: {input_channels})")

## Choose yours: index (from the list printed above) of the device to capture from
index_stream_audio = 35


In [None]:
def process_audio(audio_chunk, input_rate, output_rate, channels):
    """
    Convert raw 16-bit PCM bytes into mono float32 audio at ``output_rate``.

    Steps:
    - interpret the bytes as interleaved int16 samples
    - average the channels of each frame when there is more than one channel
    - scale the int16 range to floats in [-1.0, 1.0)
    - resample from ``input_rate`` to ``output_rate`` (skipped when equal)

    Args:
        audio_chunk: Raw bytes holding interleaved int16 samples.
        input_rate: Sample rate of ``audio_chunk`` in Hz.
        output_rate: Desired sample rate in Hz.
        channels: Number of interleaved channels in ``audio_chunk``.

    Returns:
        1-D ``numpy.float32`` array of mono samples.

    Raises:
        ValueError: If the number of samples is not a multiple of ``channels``.
    """
    # Convert raw audio bytes to a NumPy array of int16 samples
    samples = np.frombuffer(audio_chunk, dtype=np.int16)

    # One row per frame, one column per channel; averaging the columns gives mono.
    # Works for any channel count, not only stereo.
    if channels > 1:
        samples = samples.reshape(-1, channels).mean(axis=1)

    # Scale from the int16 range to [-1.0, 1.0): a bare astype(float32) would keep
    # amplitudes up to +/-32768, which is not what speech models are fed.
    samples = samples.astype(np.float32) / 32768.0

    # Nothing to resample for an empty buffer or when the rates already match
    if samples.size == 0 or input_rate == output_rate:
        return samples

    return librosa.resample(samples, orig_sr=input_rate, target_sr=output_rate)


audio_path = "audio/elo_musk_podcast.wav"

SAMPLING_RATE = 16000

# Adjust as needed (especially when streaming).
duration = 60  # Capture duration in seconds

min_chunk = 1  # Seconds of audio gathered before each ASR update

# Warm up the ASR: the very first transcribe call takes much longer than the
# following ones, so it is done once on a one-second file excerpt.
a = load_audio_chunk(audio_path, 0, 1)
asr.transcribe(a)

print("Warm-up finished")

stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=INPUT_RATE,
                input=True,
                input_device_index=index_stream_audio,
                frames_per_buffer=CHUNK)

print("Listening to system audio...")

start = time.time()

# Number of CHUNK-frame reads that add up to roughly min_chunk seconds of input
reads_per_update = max(1, round(INPUT_RATE * min_chunk / CHUNK))

try:
    # Stop on its own once `duration` seconds have been captured
    while time.time() - start < duration:
        # Gather about min_chunk seconds of raw audio before processing, so the
        # resampler and the ASR work on one larger buffer per update instead of
        # many 1024-frame fragments.
        audio_chunk = b"".join(
            stream.read(CHUNK, exception_on_overflow=False)
            for _ in range(reads_per_update)
        )

        # Process audio to 16kHz mono
        processed_audio = process_audio(audio_chunk, INPUT_RATE, OUTPUT_RATE, CHANNELS)

        # Feed the captured audio to the online processor and show the text it
        # produced. (The previous code referenced an undefined name here.)
        online.insert_audio_chunk(processed_audio)
        o = online.process_iter()
        print(o[2], end="", flush=True)

    print("\n Duration Complete")

    # NOTE(review): assumes finish() returns the same (.., .., text) shape as
    # process_iter() - confirm against whisper_online.
    o = online.finish()
    print(o[2], flush=True)

except KeyboardInterrupt:
    print("\nStopped by user.")
finally:
    # Always release the audio device, whatever ended the loop
    stream.stop_stream()
    stream.close()
    p.terminate()

## c) Using Server 

In [None]:
import requests
import numpy as np
import librosa
import time
import scipy.io.wavfile as wav
import json
import soundfile as sf


# Base URL of the FastAPI application; endpoint paths are appended to it when posting requests
url = "http://127.0.0.1:8000"


# Function to read and resample audio chunk
def read_audio_chunk(file_path, start, end, sr=16000):
    """
    Read the [start, end) time window of an audio file as mono float32 at ``sr`` Hz.

    Args:
        file_path: Path of the audio file to read.
        start: Window start, in seconds from the beginning of the file.
        end: Window end, in seconds from the beginning of the file.
        sr: Sample rate of the returned audio in Hz.

    Returns:
        1-D float32 NumPy array with the requested window, resampled to ``sr``
        when the file uses a different rate.
    """
    # The start/stop arguments of sf.read are frame indices in the FILE's own
    # sample rate, so the offsets must be derived from that rate. Using the
    # target rate would read the wrong window for any file that is not already
    # at `sr`.
    file_sr = sf.info(file_path).samplerate
    start_frame = int(start * file_sr)
    stop_frame = int(end * file_sr)

    # Use soundfile to load only the required segment
    audio, orig_sr = sf.read(file_path, start=start_frame, stop=stop_frame, dtype='float32')

    # Convert to mono if it's multi-channel
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)

    # Resample only if necessary (and only if there is something to resample)
    if orig_sr != sr and audio.size:
        audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=sr)

    return audio

# Audio file whose content is sent to the server
audio_path = "audio/elo_musk_podcast.wav"

# How many seconds of the file are streamed before stopping
total_duration = 60  # 60 seconds

# Ask the server to initialize the model and show its reply
response = requests.post(f"{url}/init/")
print(response.json()["message"])


# Streaming state: wall-clock origin and the [beg, end) window of the next chunk
start_time = time.time()
beg = 0
end = 0
min_chunk = 2  # Minimum chunk duration in seconds

while end < total_duration:
    # Wait until at least min_chunk seconds have elapsed since the last chunk end
    now = time.time() - start_time
    remaining = end + min_chunk - now
    if remaining > 0:
        time.sleep(remaining)

    # The new chunk covers everything between the previous end and "now"
    end = time.time() - start_time
    audio_chunk = read_audio_chunk(audio_path, beg, end)

    # JSON body: the samples as a plain list plus their sample rate
    payload = {"audio_array": audio_chunk.tolist(), "sample_rate": 16000}

    # Upload the chunk and print the transcription returned for it
    response = requests.post(f"{url}/upload_chunk/", json=payload)
    print(response.json()["transcription"], end="", flush=True)

    beg = end

# Tell the server the stream is over and print the final transcription
response = requests.post(f"{url}/finish/")
print(response.json()["final_transcription"])

In [None]:
import socket
import sounddevice as sd
import numpy as np
from scipy.signal import resample
import logging

# Logging setup: INFO level, one logger named after this module
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Server endpoint
host, port = "localhost", 8000

# Audio format: 16 kHz as required by the server, sent in 1-second chunks
sampling_rate = 16000
chunk_duration = 1.0
# Number of samples per chunk
chunk_size = int(chunk_duration * sampling_rate)

def stream_live_audio_with_response(host, port, sampling_rate, chunk_size):
    """
    Stream live audio from the default input device to the server and log its replies.

    Audio is captured as mono float32 blocks of ``chunk_size`` frames at
    ``sampling_rate`` Hz, converted to 16-bit PCM and written to a TCP socket.
    Everything the server sends back is logged as text. Runs until the server
    closes the connection, the audio stream stops, or Ctrl+C is pressed.

    Args:
        host: Server host name or address.
        port: Server TCP port.
        sampling_rate: Capture sample rate in Hz.
        chunk_size: Number of frames per captured block.
    """
    try:
        # Connect to the server
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.connect((host, port))
            logger.info(f"Connected to server at {host}:{port}")

            # Without a timeout recv() blocks indefinitely, so the loop below
            # could neither notice a stopped audio stream nor reach its
            # socket.timeout branch.
            s.settimeout(1)

            def callback(indata, frames, time, status):
                """
                Audio callback: convert one captured block to 16-bit PCM and send it.
                """
                if status:
                    logger.warning(f"Audio status: {status}")

                # Collapse the channel axis to get a 1-D mono signal
                mono = indata.mean(axis=1)

                # The stream delivers float32 in [-1, 1]; casting that straight
                # to int16 truncates nearly every sample to 0. Scale to the
                # int16 range first (clipping guards against overshoot).
                pcm = (np.clip(mono, -1.0, 1.0) * 32767).astype(np.int16)

                try:
                    s.sendall(pcm.tobytes())
                except OSError as e:
                    logger.error(f"Connection to server lost: {e}")
                    # Documented way to stop a sounddevice stream from its callback
                    raise sd.CallbackAbort

            # Input-only stream: nothing is played back, so no output buffer has
            # to be filled. The stream is opened at the target rate, so blocks
            # need no resampling.
            with sd.InputStream(
                samplerate=sampling_rate,  # Set target sample rate
                channels=1,                # Force mono audio
                dtype="float32",           # Use float32 for processing before conversion
                callback=callback,
                blocksize=chunk_size       # Set chunk size
            ) as stream:
                logger.info("Streaming audio... Press Ctrl+C to stop.")

                # Receive server responses for as long as audio is being captured
                while stream.active:
                    try:
                        response = s.recv(1024)  # Adjust buffer size as needed
                    except socket.timeout:
                        continue  # No response yet; continue streaming
                    except OSError as e:
                        logger.error(f"Error receiving server response: {e}")
                        break

                    # An empty read means the server closed the connection
                    if not response:
                        logger.info("Server closed the connection.")
                        break

                    text = response.decode('utf-8', errors='replace')
                    logger.info(f"Received transcription: {text}")

    except KeyboardInterrupt:
        # Ctrl+C is the advertised way to stop; end quietly instead of with a traceback
        logger.info("Stopped by user.")
    except ConnectionError as e:
        logger.error(f"Failed to connect to the server: {e}")
    except Exception as e:
        logger.error(f"An error occurred: {e}")

# Start streaming with the parameters defined above
stream_live_audio_with_response(
    host=host,
    port=port,
    sampling_rate=sampling_rate,
    chunk_size=chunk_size,
)


In [None]:
import socket
import wave
import logging
from pydub import AudioSegment

# Logging setup: INFO level, one logger named after this module
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Server endpoint
host, port = "localhost", 43007

# WAV file to stream, and how many bytes of audio go into each chunk
file_path = "path/to/audio.wav"
chunk_size = 32_000

def convert_to_mono_16khz(input_path, output_path):
    """
    Make sure an audio file is 16-bit, mono, 16 kHz, converting it if necessary.

    Args:
        input_path: Path of the audio file to inspect.
        output_path: Where the converted WAV file is written when a conversion
            is needed.

    Returns:
        ``output_path`` when a converted copy was written, otherwise
        ``input_path`` unchanged.

    Raises:
        Exception: Any error raised while loading or exporting the audio is
            logged and re-raised.
    """
    logger.info(f"Processing {input_path} for conversion to mono and 16 kHz...")
    try:
        audio = AudioSegment.from_file(input_path)

        # Sample width is checked as well: the streaming code only accepts
        # 16-bit samples, so e.g. a 24-bit mono 16 kHz file must be converted too.
        already_ok = (
            audio.frame_rate == 16000
            and audio.channels == 1
            and audio.sample_width == 2
        )
        if already_ok:
            logger.info("File is already in the correct format.")
            return input_path

        audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
        audio.export(output_path, format="wav")
        logger.info(f"File converted and saved to {output_path}")
        return output_path
    except Exception as e:
        logger.error(f"Error during conversion: {e}")
        raise

# Function to stream a WAV file and receive transcriptions
def stream_wav_and_receive(host, port, file_path, chunk_size):
    """
    Send a WAV file to the server over TCP in chunks and log what comes back.

    The file is first brought to mono / 16 kHz via ``convert_to_mono_16khz``.
    After every chunk the socket is polled for a reply for up to one second.
    Once the whole file has been sent, the sending side of the socket is shut
    down and the remaining replies are read until the server closes or stays
    silent for one second.

    Args:
        host: Server host name or address.
        port: Server TCP port.
        file_path: Path of the audio file to stream.
        chunk_size: Audio chunk size in bytes.
    """
    # Ensure the file is in mono and 16 kHz
    try:
        processed_file = "processed_audio.wav"
        file_path = convert_to_mono_16khz(file_path, processed_file)
    except Exception:
        logger.error("Unable to process audio file.")
        return

    # 16-bit mono means 2 bytes per frame; never ask for 0 frames
    frames_per_chunk = max(1, chunk_size // 2)

    def receive_once(sock):
        """Return the next server message, None on timeout, b'' if the server closed."""
        try:
            return sock.recv(1024)  # Adjust buffer size as needed
        except socket.timeout:
            return None

    def log_transcription(message):
        """Log one server message as text, tolerating split multi-byte characters."""
        text = message.decode('utf-8', errors='replace')
        logger.info(f"Received transcription: {text}")

    try:
        # Open the WAV file
        with wave.open(file_path, "rb") as wf:
            # Validate WAV file properties
            if wf.getsampwidth() != 2 or wf.getnchannels() != 1 or wf.getframerate() != 16000:
                logger.error("The WAV file must be 16-bit PCM, mono, with a 16kHz sampling rate.")
                return

            # Connect to the server
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.connect((host, port))
                logger.info(f"Connected to server at {host}:{port}")

                s.settimeout(1)  # Set timeout for receiving responses
                try:
                    server_closed = False
                    data = wf.readframes(frames_per_chunk)
                    while data and not server_closed:
                        # Send audio chunk
                        s.sendall(data)
                        logger.info(f"Sent {len(data)} bytes")
                        data = wf.readframes(frames_per_chunk)

                        # Receive transcription from server (None: nothing yet)
                        response = receive_once(s)
                        if response:
                            log_transcription(response)
                        elif response is not None:
                            # Empty read: the peer closed, further sends would fail
                            server_closed = True
                            logger.warning("Server closed the connection.")

                    if not server_closed:
                        # Signal end of audio, then collect replies that are
                        # still on their way instead of dropping them.
                        s.shutdown(socket.SHUT_WR)
                        while True:
                            response = receive_once(s)
                            if not response:
                                break
                            log_transcription(response)

                    logger.info("Finished streaming the audio file.")

                except Exception as e:
                    logger.error(f"Error during streaming: {e}")

    except FileNotFoundError:
        logger.error(f"The file {file_path} was not found.")
    except ConnectionError as e:
        logger.error(f"Failed to connect to the server: {e}")
    except wave.Error as e:
        logger.error(f"Error reading WAV file: {e}")

# Stream the configured file to the configured server
stream_wav_and_receive(
    host=host,
    port=port,
    file_path=file_path,
    chunk_size=chunk_size,
)


In [1]:
import asyncio
import websockets
import wave
import logging
from pydub import AudioSegment
import nest_asyncio

# Logging setup: INFO level, one logger named after this module
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Server endpoint
host, port = "127.0.0.1", 8000

# WAV file to stream, and how many bytes of audio go into each chunk
file_path = "audio/elo_musk_podcast.wav"
chunk_size = 32_000

def convert_to_mono_16khz(input_path, output_path):
    """
    Make sure an audio file is 16-bit, mono, 16 kHz, converting it if necessary.

    Args:
        input_path: Path of the audio file to inspect.
        output_path: Where the converted WAV file is written when a conversion
            is needed.

    Returns:
        ``output_path`` when a converted copy was written, otherwise
        ``input_path`` unchanged.

    Raises:
        Exception: Any error raised while loading or exporting the audio is
            logged and re-raised.
    """
    logger.info(f"Processing {input_path} for conversion to mono and 16 kHz...")
    try:
        audio = AudioSegment.from_file(input_path)

        # Sample width is checked as well: the streaming code only accepts
        # 16-bit samples, so e.g. a 24-bit mono 16 kHz file must be converted too.
        already_ok = (
            audio.frame_rate == 16000
            and audio.channels == 1
            and audio.sample_width == 2
        )
        if already_ok:
            logger.info("File is already in the correct format.")
            return input_path

        audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
        audio.export(output_path, format="wav")
        logger.info(f"File converted and saved to {output_path}")
        return output_path
    except Exception as e:
        logger.error(f"Error during conversion: {e}")
        raise

async def stream_wav_and_receive(host, port, file_path, chunk_size, response_timeout=10.0):
    """
    Send a WAV file to the server over a WebSocket in chunks and log the replies.

    The file is first brought to mono / 16 kHz via ``convert_to_mono_16khz``.
    After each chunk the coroutine waits up to ``response_timeout`` seconds for
    one message from the server, then moves on to the next chunk.

    Args:
        host: Server host name or address.
        port: Server port.
        file_path: Path of the audio file to stream.
        chunk_size: Audio chunk size in bytes.
        response_timeout: Maximum time in seconds to wait for a reply after
            each chunk, so a server that stays silent cannot stall the stream.
    """
    # Ensure the file is in mono and 16 kHz
    try:
        processed_file = "processed_audio.wav"
        file_path = convert_to_mono_16khz(file_path, processed_file)
    except Exception:
        logger.error("Unable to process audio file.")
        return

    # 16-bit mono means 2 bytes per frame; never ask for 0 frames
    frames_per_chunk = max(1, chunk_size // 2)

    try:
        # Open the WAV file
        with wave.open(file_path, "rb") as wf:
            # Validate WAV file properties
            if wf.getsampwidth() != 2 or wf.getnchannels() != 1 or wf.getframerate() != 16000:
                logger.error("The WAV file must be 16-bit PCM, mono, with a 16kHz sampling rate.")
                return

            # Connect to the server
            async with websockets.connect(f"ws://{host}:{port}/ws") as websocket:
                logger.info(f"Connected to server at {host}:{port}")

                # Stream audio in chunks and listen for responses
                data = wf.readframes(frames_per_chunk)
                while data:
                    try:
                        # send() can also hit a closed connection, so it sits
                        # inside the same handler as recv().
                        await websocket.send(data)
                        data = wf.readframes(frames_per_chunk)

                        # Receive transcription from server, bounded in time
                        response = await asyncio.wait_for(
                            websocket.recv(), timeout=response_timeout
                        )
                    except asyncio.TimeoutError:
                        continue  # No response for this chunk; keep sending
                    except websockets.exceptions.ConnectionClosed:
                        logger.info("WebSocket connection closed")
                        break

                    if response:
                        logger.info(f"{response}")

                logger.info("Finished streaming the audio file.")

    except FileNotFoundError:
        logger.error(f"The file {file_path} was not found.")
    except ConnectionError as e:
        logger.error(f"Failed to connect to the server: {e}")
    except wave.Error as e:
        logger.error(f"Error reading WAV file: {e}")
    except websockets.exceptions.WebSocketException as e:
        # Handshake and protocol failures are not ConnectionError subclasses
        logger.error(f"WebSocket error: {e}")

# Patch asyncio via nest_asyncio before calling asyncio.run()
# (presumably needed because the notebook already runs an event loop - confirm)
nest_asyncio.apply()

# Build the streaming coroutine, then run it to completion
streaming_job = stream_wav_and_receive(host, port, file_path, chunk_size)
asyncio.run(streaming_job)

INFO:__main__:Processing audio/elo_musk_podcast.wav for conversion to mono and 16 kHz...
INFO:__main__:File converted and saved to processed_audio.wav
INFO:__main__:Connected to server at 127.0.0.1:8000
INFO:__main__: The following is a
INFO:__main__: conversation with Elon Musk.
INFO:__main__: his fourth time on this
INFO:__main__: The Lex Friedman Podcast.
INFO:__main__: I thought you were going to finish it.
INFO:__main__: It's one of the greatest
INFO:__main__: themes in all film history.
INFO:__main__: Yeah, it's great.
INFO:__main__: So I was just thinking
INFO:__main__: about the Roman Empire


KeyboardInterrupt: 