Transcription / Translation

In [2]:
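# Dependencies used by the cells below (uncomment to install into the kernel's environment):
# %pip install openai-whisper
# %pip install torch torchaudio
# %pip install sounddevice
# %pip install vosk
# %pip install SpeechRecognition pyaudio   # `speech_recognition` + microphone backend
# %pip install deep-translator             # `deep_translator.GoogleTranslator`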

Recommended approach (use this one): Whisper transcribes the speech and translates it to English in a single step

In [None]:
import whisper
import sounddevice as sd
import numpy as np
import time

def translate_audio_to_english(n_sec=3):
    """Record microphone clips in a loop and print Whisper's English translation.

    Each iteration records ``n_sec`` seconds of mono audio at 16 kHz, passes the
    samples to ``model.transcribe(..., task="translate")`` and prints the
    resulting text together with the time the model call took. The loop runs
    until it is interrupted with Ctrl+C (``KeyboardInterrupt``).

    Args:
        n_sec: Length of every recorded clip in seconds. Must be positive.

    Raises:
        ValueError: If ``n_sec`` is not greater than zero.
    """
    if n_sec <= 0:
        # A zero-length recording would make the loop spin on empty audio.
        raise ValueError(f"n_sec must be positive, got {n_sec!r}")

    # Load Whisper model
    model = whisper.load_model("tiny")  # You can use "small", "medium", or "large" for higher accuracy

    print("Adjusting microphone... Speak now!")

    # Record audio using the microphone
    def record_audio(duration=10, samplerate=16000):
        """Record ``duration`` seconds of mono float32 audio and return it as a 1-D array."""
        # Report the duration actually being recorded (the helper's own
        # argument), not the enclosing function's variable.
        print(f"Recording for {duration} seconds ...")
        audio = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype="float32")
        sd.wait()  # Wait until the recording is finished
        print("Recording finished!")
        # sd.rec returns shape (frames, channels); drop the channel axis.
        return np.squeeze(audio)

    print("Start speaking for audio translation.")
    try:
        while True:
            audio_data = record_audio(duration=n_sec)

            # Use Whisper to transcribe and translate the audio.
            # Only the model call is timed, using a monotonic clock, so the
            # reported figure is not skewed by printing or clock adjustments.
            time_start = time.perf_counter()
            result = model.transcribe(audio_data, task="translate")
            elapsed = time.perf_counter() - time_start

            translated_text = result["text"]
            print(f"Translated to English: {translated_text}")
            print(f'------------- transcribing/translating took {round(elapsed, 2)} sec -------------')
    except KeyboardInterrupt:
        print("\nTranslation stopped.")

translate_audio_to_english(n_sec=3)


Adjusting microphone... Speak now!
Start speaking for audio translation.
Recording for 3 seconds ...
Recording finished!
Translated to English: 
------------- transcribing/translating took 0.97 sec -------------
Recording for 3 seconds ...
Recording finished!
Translated to English: 
------------- transcribing/translating took 0.94 sec -------------
Recording for 3 seconds ...
Recording finished!
Translated to English:  Hello.
------------- transcribing/translating took 0.77 sec -------------
Recording for 3 seconds ...
Recording finished!
Translated to English:  3, 4, 5, 6
------------- transcribing/translating took 2.34 sec -------------
Recording for 3 seconds ...
Recording finished!
Translated to English: 
------------- transcribing/translating took 0.9 sec -------------
Recording for 3 seconds ...
Recording finished!
Translated to English:  Thank you.
------------- transcribing/translating took 0.8 sec -------------
Recording for 3 seconds ...
Recording finished!
Translated to Engl

Transcription only (no translation), using the `speech_recognition` library with Google's recognizer

In [None]:
import speech_recognition as sr
from collections import deque

def synchronous_transcription():
    """Continuously transcribe short microphone phrases and print them.

    The microphone is calibrated against ambient noise for one second, then
    phrases of at most two seconds are captured and sent to
    ``recognize_google``. Recognised text is printed; ``"..."`` is printed when
    nothing was understood or when no speech started within the listen
    timeout. The loop runs until interrupted with Ctrl+C.
    """
    recognizer = sr.Recognizer()
    mic = sr.Microphone()

    print("Adjusting microphone... Speak now!")
    try:
        # Keep the microphone open for the whole session. Re-entering
        # `with mic` on every iteration closes and reopens the audio stream,
        # which drops whatever is said in between.
        with mic as source:
            recognizer.adjust_for_ambient_noise(source, duration=1)

            print("Start speaking for synchronous transcription. Press Ctrl+C to stop.")
            while True:
                try:
                    # Capture small chunks of speech continuously
                    audio = recognizer.listen(source, timeout=100, phrase_time_limit=2)
                except sr.WaitTimeoutError:
                    # No phrase started within the timeout: treat it as
                    # silence instead of letting the exception end the session.
                    print("...")
                    continue

                try:
                    # Transcribe the audio chunk immediately
                    text = recognizer.recognize_google(audio)
                    print(text)
                except sr.UnknownValueError:
                    print("...")  # Display silence if nothing is understood
                except sr.RequestError as e:
                    print(f"API error: {e}")
    except KeyboardInterrupt:
        print("\nTranscription stopped.")

synchronous_transcription()


Adjusting microphone... Speak now!
Start speaking for synchronous transcription. Press Ctrl+C to stop.
hey can you understand what I'm saying
...
all right
not capturing
...
are you good at this
...
...
why are you missing
open Netflix
play record my voice
dim the last
...
...
private please Brand
what record
...
...
can transcribe it please
...

Transcription stopped.


Transcription with Vosk (expected to be faster), followed by translation of the transcribed text

In [None]:
from vosk import Model, KaldiRecognizer
import sounddevice as sd
from deep_translator import GoogleTranslator
import json

def translate_audio_vosk(model_path="model", chunk_sec=3, samplerate=16000):
    """Record fixed-length clips, transcribe them with Vosk and translate the text to English.

    Each iteration records ``chunk_sec`` seconds of 16-bit mono audio, feeds it
    to a ``KaldiRecognizer`` and prints the transcription. Non-empty
    transcriptions are additionally translated with ``GoogleTranslator`` and
    printed. The loop runs until interrupted with Ctrl+C.

    Args:
        model_path: Path passed to ``vosk.Model`` (a downloaded Vosk model directory).
        chunk_sec: Length of every recorded clip in seconds.
        samplerate: Sample rate used both for recording and for the recognizer.
    """
    model = Model(model_path)  # Download Vosk model and specify its path
    recognizer = KaldiRecognizer(model, samplerate)
    # Built once; the language settings never change between clips.
    translator = GoogleTranslator(source="auto", target="en")

    def record_audio(duration=chunk_sec, samplerate=samplerate):
        """Record ``duration`` seconds of int16 mono audio and return the raw bytes."""
        print("Recording...")
        audio = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype="int16")
        sd.wait()
        print("Recording finished!")
        return audio.tobytes()

    print("Start speaking for real-time translation. Press Ctrl+C to stop.")
    try:
        while True:
            # Record audio
            audio_data = record_audio(duration=chunk_sec)

            # Every recording is an independent clip, so finalise it explicitly.
            # Reading the result only when AcceptWaveform() returns True skips
            # clips that end while the speaker is still talking, and their
            # text then carries over into later clips.
            recognizer.AcceptWaveform(audio_data)
            result = json.loads(recognizer.FinalResult())
            text = result.get("text", "")
            print(f"Transcribed Text: {text}")

            # Translate Text to English
            if text:
                translated_text = translator.translate(text)
                print(f"Translated Text: {translated_text}")
    except KeyboardInterrupt:
        print("\nTranslation stopped.")

translate_audio_vosk()


Transcription followed by translation of the transcribed text (the text is translated, not the audio)

In [11]:
import speech_recognition as sr
from deep_translator import GoogleTranslator

def synchronous_transcription_with_translation():
    """Continuously transcribe short microphone phrases and translate them to English.

    The microphone is calibrated against ambient noise for one second, then
    phrases of at most two seconds are captured and sent to
    ``recognize_google``. The recognised text is translated with
    ``GoogleTranslator(source="auto", target="en")``. When the translation
    differs from the transcription (case-insensitively) both are printed;
    otherwise the text is printed as English. ``"..."`` is printed when nothing
    was understood or no speech started within the listen timeout. The loop
    runs until interrupted with Ctrl+C.
    """
    recognizer = sr.Recognizer()
    mic = sr.Microphone()
    # Built once; the language settings never change between phrases.
    translator = GoogleTranslator(source="auto", target="en")

    print("Connected to microphone... Speak now!")
    try:
        # Keep the microphone open for the whole session. Re-entering
        # `with mic` on every iteration closes and reopens the audio stream,
        # which drops whatever is said in between.
        with mic as source:
            recognizer.adjust_for_ambient_noise(source, duration=1)

            print("Start speaking for synchronous transcription. Press Ctrl+C to stop.")
            while True:
                try:
                    # Capture small chunks of speech continuously
                    audio = recognizer.listen(source, timeout=100, phrase_time_limit=2)
                except sr.WaitTimeoutError:
                    # No phrase started within the timeout: treat it as
                    # silence instead of letting the exception end the session.
                    print("...")
                    continue

                try:
                    # Transcribe the audio chunk immediately
                    text = recognizer.recognize_google(audio)

                    # Translate to English if needed (auto-detect source language)
                    translated_text = translator.translate(text)

                    # Display the original and translated text. An empty or
                    # missing translation is treated as "no change" so that
                    # `.lower()` is never called on a non-string.
                    if translated_text and translated_text.lower() != text.lower():
                        print(f"Original: {text}")
                        print(f"Translated to English: {translated_text}")
                    else:
                        print(f"English: {text}")
                except sr.UnknownValueError:
                    print("...")  # Display silence if nothing is understood
                except sr.RequestError as e:
                    print(f"API error: {e}")
    except KeyboardInterrupt:
        print("\nTranscription stopped.")

synchronous_transcription_with_translation()


Connected to microphone... Speak now!
Start speaking for synchronous transcription. Press Ctrl+C to stop.
...
English: can you turn the alarm
English: my name is Esther
...
...
...
...
...
...
English: drive please
...

Transcription stopped.


In [None]:
import speech_recognition as sr
from collections import deque

def live_transcription_rolling_window():
    """Transcribe microphone speech and print a rolling window of the last six words.

    The microphone is calibrated against ambient noise for one second, then
    phrases of at most five seconds are captured and sent to
    ``recognize_google``. Recognised words are pushed into a bounded deque
    (``maxlen=6``) and the deque's current contents are printed after every
    phrase. The loop runs until interrupted with Ctrl+C.
    """
    recognizer = sr.Recognizer()
    mic = sr.Microphone()
    word_queue = deque(maxlen=6)  # Store up to 6 words

    print("Adjusting microphone... Speak now!")
    try:
        # Keep the microphone open for the whole session. Re-entering
        # `with mic` on every iteration closes and reopens the audio stream,
        # which drops whatever is said in between.
        with mic as source:
            recognizer.adjust_for_ambient_noise(source, duration=1)

            print("Start speaking! Press Ctrl+C to stop.")
            while True:
                try:
                    # Listen for speech with a timeout and phrase limit
                    audio = recognizer.listen(source, timeout=20, phrase_time_limit=5)
                except sr.WaitTimeoutError:
                    # No phrase started within the timeout: keep listening
                    # instead of letting the exception end the session.
                    continue

                try:
                    # Recognize speech using Google Web Speech API
                    text = recognizer.recognize_google(audio)

                    # The bounded deque discards the oldest words by itself,
                    # so extending it keeps exactly the last 6 words.
                    word_queue.extend(text.split())

                    # Display the rolling window of the last 6 words
                    print(" ".join(word_queue))
                except sr.UnknownValueError:
                    print("Could not understand the audio.")
                except sr.RequestError as e:
                    print(f"API error: {e}")
    except KeyboardInterrupt:
        print("\nTranscription stopped.")

live_transcription_rolling_window()