<a href="https://colab.research.google.com/github/eduardodarocha/text-to-speech-and-speech-to-text_action-Alexa/blob/main/Speech_to_text_ALEXA_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [73]:
# Instalar dependências
!pip install SpeechRecognition gTTS pyjokes wikipedia playsound==1.2.2 pydub ffmpeg-python
!apt-get install -y ffmpeg





Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [74]:
# Importar bibliotecas
import speech_recognition as sr
from gtts import gTTS
import os
import pyjokes
import wikipedia
from IPython.display import display, Javascript, Audio
import base64
from pydub import AudioSegment
from google.colab import output
from ipywidgets import Button
import asyncio
import nest_asyncio
from yt_finder import YoutubeSearch

# NOTE(review): presumably required so that asyncio.run(), which
# process_audio_async calls, can be used while the notebook kernel's own event
# loop is already running — confirm.
nest_asyncio.apply()

In [75]:
# ===========================
# Variáveis de controle
# ===========================
# Shared flags for the record -> upload -> process cycle. All start cleared.
#   is_recording    - set by assistant_action when a recording starts,
#                     cleared again at the end of process_audio_async.
#   file_saved      - set by save_audio once the uploaded audio is on disk.
#   audio_processed - set by process_audio_async after a recognition attempt.
is_recording = file_saved = audio_processed = False

# Placeholder for the single control widget; the real Button is assigned in a
# later cell.
button = None

In [76]:
# ===========================
# Função para remover arquivos antigos
# ===========================
def cleanup_audio_files():
    """Delete audio artifacts left in the working directory by a previous run.

    Removes ``recorded.webm``, ``recorded.wav`` and ``voice.mp3`` when they
    exist; files that are absent are skipped silently.
    """
    stale_files = ("recorded.webm", "recorded.wav", "voice.mp3")
    # filter() is lazy, so each existence check happens right before its removal.
    for stale_path in filter(os.path.exists, stale_files):
        os.remove(stale_path)

# ===========================
# Callback para salvar áudio
# ===========================
def save_audio(b64_audio):
    """Persist audio sent from the browser and start processing it.

    Args:
        b64_audio: Base64-encoded bytes of the recording.

    Side effects:
        Writes the decoded bytes to ``recorded.webm``, sets the module-level
        ``file_saved`` flag, then calls ``process_audio_async()``.
    """
    global file_saved
    # Decode before opening the file so a malformed payload leaves no file behind.
    payload = base64.b64decode(b64_audio)
    with open("recorded.webm", "wb") as webm_file:
        webm_file.write(payload)
    file_saved = True
    # Process right away, as soon as the audio has been received.
    process_audio_async()

# Expose save_audio to the browser under the name 'notebook.save_audio'; the
# JavaScript emitted by stop_recording calls it through
# google.colab.kernel.invokeFunction with the base64-encoded recording.
output.register_callback('notebook.save_audio', save_audio)

# ===========================
# Função para iniciar gravação
# ===========================
def start_recording():
    """Clear old audio files and start a microphone recording in the browser.

    The injected script requests an audio stream, starts a ``MediaRecorder``
    and stores both the recorder and its list of collected chunks on ``window``
    (``window.recorder`` / ``window.audioData``) for later scripts to read.
    """
    # Browser-side script; it runs when the Javascript object is displayed.
    recorder_script = """
    async function recordAudio() {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      const recorder = new MediaRecorder(stream);
      const data = [];
      recorder.ondataavailable = event => data.push(event.data);
      recorder.start();
      window.recorder = recorder;
      window.audioData = data;
    }
    recordAudio();
    """
    # Remove files from any previous recording before a new one starts.
    cleanup_audio_files()
    display(Javascript(recorder_script))

# ===========================
# Função para parar gravação e enviar áudio
# ===========================
def stop_recording():
    """Stop the browser recorder and send the captured audio to the kernel.

    The injected script reads ``window.recorder`` and ``window.audioData`` (set
    by the script from ``start_recording``). Once the recorder has stopped it
    base64-encodes the collected chunks and invokes the registered
    ``notebook.save_audio`` callback with the result.

    Compared with a plain ``stop()`` call, the script also:
      * does nothing when there is no recorder or it is already inactive, since
        calling ``stop()`` on an inactive MediaRecorder throws;
      * installs the ``onstop`` handler before calling ``stop()`` so the
        upload does not rely on event-dispatch timing;
      * stops the stream's tracks so the browser releases the microphone.
    """
    STOP_JS = """
    (() => {
        const recorder = window.recorder;
        // No recorder, or it already finished: nothing to stop or upload.
        if (!recorder || recorder.state === 'inactive') {
            return;
        }
        // Install the handler first so it is in place before the stop event fires.
        recorder.onstop = async () => {
            // Release the microphone; otherwise the capture stays active.
            recorder.stream.getTracks().forEach(track => track.stop());
            const blob = new Blob(window.audioData);
            const arrayBuffer = await blob.arrayBuffer();
            const base64String = btoa(
                new Uint8Array(arrayBuffer)
                  .reduce((data, byte) => data + String.fromCharCode(byte), '')
            );
            google.colab.kernel.invokeFunction('notebook.save_audio', [base64String], {});
        };
        recorder.stop();
    })();
    """
    display(Javascript(STOP_JS))

# ===========================
# Função para processar áudio
# ===========================
def process_audio_async():
    """Transcribe ``recorded.webm`` and run the spoken command.

    Despite its name this function is synchronous. Steps:
      1. Convert ``recorded.webm`` to ``recorded.wav`` with pydub.
      2. Transcribe the WAV with the Google recognizer (``en-US``).
      3. Pass the lower-cased transcript to ``handle_command``.
      4. Reset the recording state and restore the button label.

    Step 4 runs in a ``finally`` block, so a failure in any earlier step can no
    longer leave ``is_recording`` set and the button stuck on "Processando...".
    Conversion or recognition errors are reported and answered with a spoken
    apology; errors raised while executing the command are reported separately
    instead of being labelled as recognition errors.
    """
    global audio_processed, is_recording, file_saved, button

    button.description = "Processando..."
    print("⚙️ Processando áudio...")

    try:
        text = None
        try:
            # Convert webm -> wav, the format sr.AudioFile is given below.
            sound = AudioSegment.from_file("recorded.webm", format="webm")
            sound.export("recorded.wav", format="wav")

            # Speech recognition. The WAV is only needed while reading the
            # samples, so the network call happens after the file is closed.
            r = sr.Recognizer()
            with sr.AudioFile("recorded.wav") as source:
                audio = r.record(source)
            text = r.recognize_google(audio, language="en-US")
        except Exception as e:
            print("Erro no reconhecimento:", str(e))
            audio_processed = True
            speak("Sorry, I did not get that.")

        if text is not None:
            print("✅ Você disse:", text)
            audio_processed = True
            try:
                # handle_command is a coroutine, so drive it to completion here.
                asyncio.run(handle_command(text.lower()))
            except Exception as e:
                print("Erro ao executar o comando:", str(e))
                speak("Sorry, something went wrong while running that command.")
    finally:
        # Always reset state so the button can start a new recording.
        is_recording = False
        file_saved = False
        audio_processed = False
        button.description = "Gravar"

# ===========================
# Função para falar
# ===========================
def speak(text):
    """Synthesize ``text`` as English speech and play it in the notebook.

    The audio is written to ``voice.mp3`` (overwriting any earlier file) and
    shown as an autoplaying audio widget.
    """
    filename = "voice.mp3"
    gTTS(text=text, lang='en').save(filename)
    display(Audio(filename, autoplay=True))

# ===========================
# Função para realizar busca no YouTube
# ===========================
async def perform_youtube_search(query):
    """Search YouTube for ``query`` and report up to three results.

    When results come back, an introduction is spoken and each video's title
    and URL are printed. When there are none, a "could not find" message is
    spoken. Any exception is printed and answered with a spoken error message,
    so the coroutine itself does not raise for search failures.
    """
    try:
        videos = await YoutubeSearch(
            query, max_results=3, language="en", region="US"
        ).search()
        if not videos:
            speak(f"Sorry, I could not find any videos for {query} on YouTube.")
            return
        speak(f"Here are the top results for {query} on YouTube:")
        for hit in videos:
            print(f"Title: {hit.title}")
            print(f"URL: {hit.yt_url}")
    except Exception as err:
        print(f"Error during YouTube search: {err}")
        speak("Sorry, I encountered an error while searching YouTube.")


# ===========================
# Função para executar comandos
# ===========================
async def handle_command(text):
    """Run the assistant action that matches a transcribed utterance.

    Args:
        text: The recognised utterance. Matching is done against lower-case
            phrases, so callers are expected to pass lower-cased text.

    Commands, checked in this order:
        * ``search youtube channel <name>`` - YouTube search for ``<name>``.
        * ``search wikipedia <topic>`` - speak and print a two-sentence summary.
        * anything containing ``joke`` - speak and print a joke.
        * anything containing ``exit`` - say goodbye.
        * otherwise - repeat the utterance back.
    """
    if 'search youtube channel' in text:
        desired_channel = text.split("search youtube channel", 1)[-1].strip()
        print(f"Searching YouTube for channel: {desired_channel}")
        await perform_youtube_search(desired_channel)

    elif 'search wikipedia' in text:
        # Keep only what follows the trigger phrase. Removing just the word
        # "search" would leave "wikipedia" in the query and would also mangle
        # topics that themselves contain "search".
        query = text.split("search wikipedia", 1)[-1].strip()
        if not query:
            speak("Please tell me what to search for on Wikipedia.")
            return
        try:
            try:
                result = wikipedia.summary(query, sentences=2)
            except wikipedia.exceptions.DisambiguationError as ambiguous:
                # Ambiguous title: fall back to the first suggested article.
                result = wikipedia.summary(ambiguous.options[0], sentences=2)
        except Exception as e:
            # "except Exception" rather than a bare except, so task
            # cancellation and KeyboardInterrupt are not swallowed.
            print("Wikipedia lookup failed:", e)
            speak("I could not find anything on Wikipedia")
            return
        # Separator added so the phrase and the summary are not run together.
        speak("According to Wikipedia, " + result)
        print(result)
    elif 'joke' in text:
        joke = pyjokes.get_joke()
        speak(joke)
        print(joke)
    elif 'exit' in text:
        speak("Goodbye, till next time")
        print("🛑 Sessão encerrada.")
    else:
        speak("You said: " + text)

# ===========================
# Função principal do assistente
# ===========================
def assistant_action(btn):
    """Button click handler that toggles between starting and stopping a recording.

    Args:
        btn: The widget that fired the click; unused, the module-level
            ``button`` is updated instead.

    While ``is_recording`` is set, a click stops the recording and triggers the
    upload. Otherwise a click sets the flag, starts a new recording and changes
    the button label.
    """
    global is_recording, button

    if is_recording:
        # Second click: stop the recording and send the audio.
        stop_recording()
        print("⏳ Enviando áudio...")
        return

    # First click: start recording.
    is_recording = True
    start_recording()
    button.description = "Waiting audio..."
    print("🎤 Gravando... pressione o botão novamente para finalizar.")

In [77]:
# ===========================
# Criar botão único
# ===========================
# The one control widget: each click is routed to assistant_action, which
# starts a recording or stops the current one.
button = Button(
    description="Gravar",
    layout=dict(width='200px', height='50px'),
    style=dict(font_weight='bold'),
)
button.on_click(assistant_action)
display(button)

Button(description='Gravar', layout=Layout(height='50px', width='200px'), style=ButtonStyle(font_weight='bold'…

<IPython.core.display.Javascript object>

🎤 Gravando... pressione o botão novamente para finalizar.


<IPython.core.display.Javascript object>

⏳ Enviando áudio...
⚙️ Processando áudio...
✅ Você disse: search Wikipedia black diamond


Black Diamond is a city in King County, Washington, United States. The population was 4,697 at the 2020 census.


<IPython.core.display.Javascript object>

🎤 Gravando... pressione o botão novamente para finalizar.


<IPython.core.display.Javascript object>

⏳ Enviando áudio...
⚙️ Processando áudio...
✅ Você disse: tell me a joke


Windows is NOT a virus. Viruses DO something.


<IPython.core.display.Javascript object>

🎤 Gravando... pressione o botão novamente para finalizar.


<IPython.core.display.Javascript object>

⏳ Enviando áudio...
⚙️ Processando áudio...
✅ Você disse: search YouTube channel CNN
Searching YouTube for channel: cnn


Title: Source: Conservative activist Charlie Kirk shot during event in Utah
URL: https://youtube.com/watch?v=WieCe53rh-s&pp=ygUDY25u
Title: Democrats, socialism and capitalism: Harry Enten runs the numbers
URL: https://youtube.com/watch?v=GijMLdb2CP0&pp=ygUDY25u
Title: CNN reports on aftermath of Charlie Kirk shooting
URL: https://youtube.com/shorts/NZbuexg2eFM


In [78]:
# yt_finder is imported by the imports cell near the top of the notebook, so
# this package has to be installed already when that import runs.
!pip install yt-finder



In [79]:
# This cell is no longer needed as the YouTube search functionality has been integrated into the voice assistant code.
# from yt_finder import YoutubeSearch
# import asyncio

# async def main():
#     search = YoutubeSearch("python", max_results=5, language="en", region="US")
#     videos = await search.search()
#     for video in videos:
#         print("=" * 20)
#         print(f"Title: {video.title}")
#         print(f"URL: {video.yt_url}")
#         print("=" * 20)

# Directly await the main function in Colab
# await main()