<a href="https://colab.research.google.com/github/fguidotti-git/PTEN_Voice_Assistant/blob/main/PTEN_Voice_Assistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install and basic setup
!pip -q install gTTS

from gtts import gTTS
from IPython.display import Audio, display
from pathlib import Path
import time

# Create an output folder.
# `out_dir` is a relative path, so it is created under the current working
# directory; tts_gtts() reads this module-level name when it builds file paths.
out_dir = Path("tts_outputs")
out_dir.mkdir(exist_ok=True)  # exist_ok=True makes re-running this cell harmless

# Print the absolute location so it is clear where the MP3 files will land.
print("gTTS ready. Output folder:", out_dir.resolve())


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/98.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hgTTS ready. Output folder: /content/tts_outputs


In [3]:
# Simple, reusable TTS function for Colab
def tts_gtts(text: str, lang: str = "pt", filename_prefix: str = "tts") -> Path:
    """
    Generate speech from text using gTTS and return the saved file path.

    - text: text to synthesize; must contain at least one non-whitespace character
    - lang: language code (e.g., "pt", "en", "es"); see gTTS docs for options
    - filename_prefix: start of the output file name

    Returns the path of the MP3 written inside the module-level ``out_dir``.
    Raises ValueError when ``text`` is empty or whitespace-only.
    """
    import uuid  # local import: the cell that defines this function does not import uuid

    if not text or not text.strip():
        raise ValueError("Text must be a non-empty string.")

    # A whole-second timestamp alone collides when two clips are generated within
    # the same second, so a short random suffix keeps every file name unique.
    ts = int(time.time())
    unique = uuid.uuid4().hex[:8]
    out_dir.mkdir(parents=True, exist_ok=True)  # recreate the folder if it was removed
    out_path = out_dir / f"{filename_prefix}_{lang}_{ts}_{unique}.mp3"

    # Create and save audio
    tts = gTTS(text=text, lang=lang, slow=False)
    tts.save(out_path.as_posix())
    return out_path

def play_audio(path: Path, autoplay: bool = True):
    """
    Show an inline audio player for an existing audio file.

    - path: location of the audio file to play
    - autoplay: forwarded to the player; browsers may still block automatic playback

    Raises FileNotFoundError when ``path`` does not exist.
    """
    if path.exists():
        # Even when autoplay is blocked by the browser, the player UI is rendered.
        player = Audio(filename=path.as_posix(), autoplay=autoplay)
        display(player)
        return
    raise FileNotFoundError(f"File does not exist: {path}")


# ---- Quick test (English) ----
# Smoke test for the two helpers above: synthesize one English sentence,
# report where the MP3 was written, then render a player for it.
demo_en = "Hello! This is your virtual assistant. Text-to-speech is working."
# lang="en" overrides the function's default of "pt"; the prefix becomes the
# first part of the generated file name.
mp3_path_en = tts_gtts(demo_en, lang="en", filename_prefix="demo")
print("Saved:", mp3_path_en)
# play_audio() defaults to autoplay=True; the player is shown either way.
play_audio(mp3_path_en)


Saved: tts_outputs/demo_en_1758421116.mp3


In [4]:
# Install core libraries for STT and audio conversion
!apt -y -qq install ffmpeg
!pip -q install SpeechRecognition pydub

import speech_recognition as sr
from pydub import AudioSegment
from IPython.display import Audio, display
from google.colab import files
from pathlib import Path
import uuid
import os

print("Installed: SpeechRecognition + pydub + ffmpeg")


ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalled: SpeechRecognition + pydub + ffmpeg


  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):


In [5]:
# Create I/O folders (relative to the current working directory).
# in_dir: upload_audio() moves the uploaded file into this folder.
in_dir = Path("stt_inputs"); in_dir.mkdir(exist_ok=True)
# proc_dir: converted WAV files and transcription chunk files are written here.
proc_dir = Path("stt_processed"); proc_dir.mkdir(exist_ok=True)

def upload_audio():
    """
    Prompt for an audio upload (mp3/wav/m4a/ogg) through the Colab file picker.

    Only the first uploaded entry is used. Returns its new path inside ``in_dir``.
    Raises RuntimeError when the picker returns nothing.
    """
    received = files.upload()
    if not received:
        raise RuntimeError("No file uploaded.")

    first_name = next(iter(received))
    destination = in_dir / first_name
    # The uploaded file is expected in the current working directory under the
    # same name; relocate it into the inputs folder.
    os.replace(first_name, destination.as_posix())
    return destination

def to_wav_mono_16k(src_path: Path) -> Path:
    """
    Re-encode an audio file (mp3/m4a/ogg/wav) as a single-channel 16 kHz WAV.

    The output is written to ``proc_dir`` under the source file's stem plus a
    random hex suffix, so repeated conversions never overwrite each other.
    Returns the path of the new WAV file.
    """
    decoded = AudioSegment.from_file(src_path.as_posix())
    mono = decoded.set_channels(1)
    resampled = mono.set_frame_rate(16000)

    suffix = uuid.uuid4().hex
    target = proc_dir / f"{src_path.stem}_{suffix}.wav"
    resampled.export(target.as_posix(), format="wav")
    return target


In [6]:
def transcribe_google_sr(wav_path: Path, language: str = "pt-BR", show_all: bool = False) -> str:
    """
    Transcribe a short WAV file (about 60 s or less is recommended) with
    SpeechRecognition's Google recognizer.

    - wav_path: WAV file to read in full
    - language: recognition language, e.g. "pt-BR", "en-US", "es-ES"
    - show_all: forwarded to ``recognize_google``; when True, whatever raw result
      the recognizer produces is returned unchanged (useful for debugging)

    Failures are not raised: unrecognized speech and API request errors are
    reported as bracketed placeholder strings in the return value.
    """
    engine = sr.Recognizer()
    # Optional tuning values kept from the original setup
    engine.pause_threshold = 0.8
    engine.non_speaking_duration = 0.5

    with sr.AudioFile(wav_path.as_posix()) as clip:
        captured = engine.record(clip)  # no duration given: the whole file is read

    try:
        # The recognizer's result is returned as-is for both show_all settings.
        return engine.recognize_google(captured, language=language, show_all=show_all)
    except sr.UnknownValueError:
        return "[Unrecognized speech — try clearer audio or another sample.]"
    except sr.RequestError as err:
        return f"[API Request error: {err}]"


In [8]:
# End-to-end STT demo: upload -> convert -> listen -> transcribe.
# The names `wav` and `transcript` stay in the notebook namespace and are
# referenced by later cells.

# 1) Upload an audio file (e.g., MP3/WAV with your voice)
print("Upload an audio file (mp3/wav/m4a/ogg) with your speech...")
src = upload_audio()

# 2) Convert to 16k mono WAV for best results
wav = to_wav_mono_16k(src)
print("Converted to WAV:", wav)

# 3) Play audio back (so you can confirm it's the right file)
display(Audio(wav.as_posix(), rate=16000))

# 4) Transcribe (change language if needed: "pt-BR" or "en-US")
# Note: on failure transcribe_google_sr() returns a bracketed placeholder
# string rather than raising, so `transcript` is always bound after this cell.
lang_code = "pt-BR"  # or "en-US"
transcript = transcribe_google_sr(wav, language=lang_code)
print("\n--- TRANSCRIPT ---\n", transcript)


Upload an audio file (mp3/wav/m4a/ogg) with your speech...


Saving Gravando.m4a to Gravando.m4a
Converted to WAV: stt_processed/Gravando_28fb52bbfaa94033b9467b29720384e4.wav



--- TRANSCRIPT ---
 teste de gravação de áudio para texto


**(Opcional) Célula 5 — Transcrever áudios mais longos (em “pedaços”)**

O serviço usado por `recognize_google` funciona melhor com clipes curtos. Para áudios mais longos, divida o arquivo em pedaços (chunks) de ~25–30 s, transcreva cada pedaço e concatene os resultados.

In [None]:
from typing import List

def chunked_transcribe(wav_path: Path, language: str = "pt-BR", chunk_ms: int = 25_000) -> str:
    """
    Split a long WAV into ~chunk_ms pieces, transcribe each, then join the text.

    - wav_path: WAV file to transcribe
    - language: recognition language passed to ``recognize_google``
    - chunk_ms: chunk length in milliseconds; must be positive

    Returns the non-empty chunk transcripts joined by single spaces. Chunks the
    recognizer cannot understand are skipped; an API request failure is embedded
    in the text as "[API error: ...]" rather than raised.
    Raises ValueError when ``chunk_ms`` is not positive.
    """
    # Validate before any audio is loaded: a zero step makes range() fail with
    # an unrelated message and a negative step would silently yield no chunks.
    if chunk_ms <= 0:
        raise ValueError("chunk_ms must be a positive number of milliseconds.")

    audio = AudioSegment.from_wav(wav_path.as_posix())
    recognizer = sr.Recognizer()
    transcripts: list[str] = []
    total_ms = len(audio)

    for start in range(0, total_ms, chunk_ms):
        end = min(start + chunk_ms, total_ms)
        tmp_chunk = proc_dir / f"chunk_{start}_{end}.wav"
        try:
            audio[start:end].export(tmp_chunk.as_posix(), format="wav")
            with sr.AudioFile(tmp_chunk.as_posix()) as source:
                audio_data = recognizer.record(source)
        finally:
            # The chunk is only needed until it has been read back; always remove
            # it so long recordings do not leave temp files behind in proc_dir.
            tmp_chunk.unlink(missing_ok=True)

        try:
            text = recognizer.recognize_google(audio_data, language=language)
        except sr.UnknownValueError:
            text = ""  # skip unreadable parts
        except sr.RequestError as e:
            text = f"[API error: {e}]"
        transcripts.append(text)

    return " ".join(t for t in transcripts if t)

# Example usage:
# long_text = chunked_transcribe(wav, language="pt-BR", chunk_ms=25_000)
# print(long_text)


In [9]:
# Install the Wikipedia client
!pip -q install wikipedia


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone


In [10]:
from pathlib import Path
from IPython.display import display, HTML, Audio
from urllib.parse import quote_plus
from gtts import gTTS
import time
import wikipedia

# Prepare an output directory for audio responses.
# It gets its own name on purpose: rebinding the earlier `out_dir` here would
# silently redirect tts_gtts() output depending on which cells have been run.
assistant_out_dir = Path("assistant_outputs")
assistant_out_dir.mkdir(exist_ok=True)

def speak_gtts(text: str, lang: str = "pt"):
    """
    Generate a short spoken response using gTTS and play it in the notebook.

    - text: sentence to synthesize
    - lang: gTTS language code (default "pt")

    The MP3 is written to ``assistant_out_dir`` and shown as an audio player
    with autoplay disabled. Returns the saved file path.
    """
    import uuid  # local import: this cell's import list does not include uuid

    # Several responses are often generated within the same second; a random
    # suffix stops a later response from overwriting an earlier file.
    ts = int(time.time())
    assistant_out_dir.mkdir(parents=True, exist_ok=True)
    path = assistant_out_dir / f"resp_{lang}_{ts}_{uuid.uuid4().hex[:8]}.mp3"
    gTTS(text=text, lang=lang).save(path.as_posix())
    display(Audio(filename=path.as_posix(), autoplay=False))
    return path

def clickable_link(url: str, label: str | None = None):
    """
    Display a clickable link inside the notebook.
    """
    label = label or url
    display(HTML(f'<a href="{url}" target="_blank" rel="noopener noreferrer">{label}</a>'))

def guess_lang_from_text(t: str) -> str:
    """
    Ultra-light heuristic to prefer Portuguese or English based on keywords.
    Not a full language detector—keeps things simple for this project.

    Returns "pt" when any Portuguese-specific marker occurs in the lowercased
    text, otherwise "en".
    """
    # Only markers that are specific to Portuguese belong here. The bare word
    # "wikipedia" is shared by both languages and would label English commands
    # such as "Search Wikipedia for ..." as Portuguese, so it is only counted
    # together with a Portuguese preposition or in its accented spelling.
    pt_markers = (
        "o que é",
        "pesquise", "pesquisar", "buscar", "abrir",
        "na wikipedia", "na wikipédia", "wikipédia", "no youtube",
        "perto de mim", "farmácia", "farmacia", "mais próxima", "mais proxima",
    )
    lowered = t.lower()
    if any(marker in lowered for marker in pt_markers):
        return "pt"
    return "en"


In [11]:
import re
from dataclasses import dataclass

@dataclass
class Intent:
    name: str
    query: str | None = None
    place: str | None = None
    location: str | None = None
    lang: str = "pt"

def parse_intent(text: str, prefer_lang: str | None = None) -> Intent:
    """
    Parse a free-form command in Portuguese or English and return an Intent.
    Supported: Wikipedia search, YouTube search, Nearby pharmacy (maps).
    Examples (PT):
      - "Pesquisar inteligência artificial na Wikipedia"
      - "Abrir YouTube para Lo-fi hip hop"
      - "Farmácia mais próxima em Orlando"
      - "Farmácia mais próxima"
    Examples (EN):
      - "Search Wikipedia for Alan Turing"
      - "Open YouTube for python tutorials"
      - "nearest pharmacy in Orlando"
    """
    text_norm = text.strip()
    lang = prefer_lang or guess_lang_from_text(text_norm)

    t = text_norm.lower()

    # --- Wikipedia patterns ---
    # PT examples
    m = re.search(r"(wikipedia|wikipédia).*(?:de|sobre|for)?\s*(.+)", t)
    if m:
        query = m.group(2).strip()
        return Intent(name="WIKI_SEARCH", query=query, lang=lang)

    m = re.search(r"(pesquisar|pesquise|buscar)\s+(.+?)\s+(?:na|no)\s+wikipedia", t)
    if m:
        return Intent(name="WIKI_SEARCH", query=m.group(2).strip(), lang=lang)

    # EN examples
    m = re.search(r"(search|look up)\s+wikipedia\s+(?:for\s+)?(.+)", t)
    if m:
        return Intent(name="WIKI_SEARCH", query=m.group(2).strip(), lang=lang)

    # Generic "what is X"
    m = re.search(r"(o que é|what is)\s+(.+)", t)
    if m:
        return Intent(name="WIKI_SEARCH", query=m.group(2).strip(), lang=lang)

    # --- YouTube patterns ---
    # PT: "abrir youtube para X" / "buscar no youtube X"
    m = re.search(r"(abrir|open)\s+(?:o\s+)?youtube(?:\s+(?:para|for)\s+(.+))?", t)
    if m:
        q = (m.group(2) or "").strip() or None
        return Intent(name="YT_SEARCH", query=q, lang=lang)

    m = re.search(r"(buscar|pesquisar)\s+(?:no|on)\s+youtube\s+(.+)", t)
    if m:
        return Intent(name="YT_SEARCH", query=m.group(2).strip(), lang=lang)

    # EN: "search youtube for X"
    m = re.search(r"(search)\s+youtube\s+(?:for\s+)?(.+)", t)
    if m:
        return Intent(name="YT_SEARCH", query=m.group(2).strip(), lang=lang)

    # --- Maps / Pharmacy patterns ---
    # PT: "farmácia mais próxima", optional "em <cidade>"
    m = re.search(r"farm[aá]cia\s+(?:mais\s+pr[oó]xima|perto\s+de\s+mim)(?:\s+em\s+(.+))?", t)
    if m:
        loc = (m.group(1) or "").strip() or None
        return Intent(name="MAPS_NEARBY", place="farmácia", location=loc, lang=lang)

    # EN: "nearest pharmacy", optional "in <city>"
    m = re.search(r"(nearest|closest)\s+pharmacy(?:\s+in\s+(.+))?", t)
    if m:
        loc = (m.group(2) or "").strip() or None
        return Intent(name="MAPS_NEARBY", place="pharmacy", location=loc, lang=lang)

    # Fallback: try maps for generic "near me" queries like "supermarket near me"
    m = re.search(r"(.+?)\s+(?:perto\s+de\s+mim|near\s+me)", t)
    if m:
        return Intent(name="MAPS_NEARBY", place=m.group(1).strip(), location=None, lang=lang)

    # If nothing matched
    return Intent(name="UNKNOWN", lang=lang)


In [12]:
from typing import Optional

def handle_wikipedia(query: str, lang: str = "pt") -> dict:
    """
    Fetch a short summary and page URL from Wikipedia.

    - query: topic to look up
    - lang: reply language; anything starting with "pt" uses the Portuguese
      Wikipedia, everything else the English one

    Returns a dict with 'summary' and 'url'. Lookup problems are never raised:
    they are reported in 'summary' with an empty 'url'.
    """
    # Nothing to look up: answer locally instead of sending a blank query.
    if not query or not query.strip():
        return {"summary": f"No Wikipedia page found for '{query}'.", "url": ""}

    wikipedia.set_lang("pt" if lang.startswith("pt") else "en")

    def _lookup(title: str, auto_suggest: bool) -> dict:
        """Resolve ``title`` to a page, then summarize that same page."""
        page = wikipedia.page(title, auto_suggest=auto_suggest, redirect=True)
        # Summarize the title that was actually resolved so the text and the URL
        # always describe the same page, without a second suggestion search.
        summary = wikipedia.summary(page.title, sentences=3, auto_suggest=False, redirect=True)
        return {"summary": summary, "url": page.url}

    try:
        return _lookup(query, auto_suggest=True)
    except wikipedia.DisambiguationError as e:
        # Pick the first option as a simple fallback
        opt = e.options[0] if e.options else query
        try:
            return _lookup(opt, auto_suggest=False)
        except Exception:
            return {"summary": f"Could not resolve disambiguation for '{query}'. Try being more specific.",
                    "url": ""}
    except wikipedia.PageError:
        return {"summary": f"No Wikipedia page found for '{query}'.", "url": ""}
    except Exception as e:
        return {"summary": f"Error accessing Wikipedia: {e}", "url": ""}

def handle_youtube(query: Optional[str]) -> str:
    """
    Return the YouTube URL for a search.

    With a non-empty query the result is a search-results URL containing the
    URL-encoded query; without one it is the YouTube homepage.
    """
    if not query:
        return "https://www.youtube.com/"
    encoded = quote_plus(query)
    return f"https://www.youtube.com/results?search_query={encoded}"

def handle_maps(place: str = "farmácia", location: Optional[str] = None, lang: str = "pt") -> str:
    """
    Return a Google Maps search URL for ``place``.

    - place: what to search for
    - location: optional area; when missing, a "near me" phrase is used, which
      depends on the browser/device location once the link is opened
    - lang: the search phrase is written in Portuguese when this starts with
      "pt", otherwise in English
    """
    portuguese = lang.startswith("pt")
    if location:
        if portuguese:
            q = f"{place} perto de {location}"
        else:
            q = f"{place} near {location}"
    elif portuguese:
        q = f"{place} perto de mim"
    else:
        q = f"{place} near me"
    encoded = quote_plus(q)
    return f"https://www.google.com/maps/search/{encoded}"


In [13]:
def run_command(command_text: str, prefer_lang: str | None = None) -> dict:
    """
    Parse the user's command, run the appropriate action, and present outputs.
    Returns a dict with intent, message, url, and audio path (if any).
    """
    intent = parse_intent(command_text, prefer_lang=prefer_lang)
    result = {"intent": intent.name, "message": "", "url": "", "audio_path": None}

    if intent.name == "WIKI_SEARCH" and intent.query:
        info = handle_wikipedia(intent.query, lang=intent.lang)
        msg = info["summary"]
        result["message"] = msg
        result["url"] = info["url"] or ""
        # Speak a short response
        spoken = msg if len(msg) < 300 else msg[:280] + "..."
        result["audio_path"] = speak_gtts(spoken, lang=("pt" if intent.lang.startswith("pt") else "en"))
        if info["url"]:
            clickable_link(info["url"], label="🔗 Open on Wikipedia")

    elif intent.name == "YT_SEARCH":
        url = handle_youtube(intent.query)
        result["message"] = ("Abrindo YouTube…" if intent.lang.startswith("pt") else "Opening YouTube…")
        result["url"] = url
        result["audio_path"] = speak_gtts(result["message"], lang=("pt" if intent.lang.startswith("pt") else "en"))
        clickable_link(url, label="▶️ Open YouTube search")

    elif intent.name == "MAPS_NEARBY":
        url = handle_maps(place=intent.place or ("farmácia" if intent.lang.startswith("pt") else "pharmacy"),
                          location=intent.location, lang=intent.lang)
        result["message"] = ("Mostrando no mapa…" if intent.lang.startswith("pt") else "Showing on the map…")
        result["url"] = url
        result["audio_path"] = speak_gtts(result["message"], lang=("pt" if intent.lang.startswith("pt") else "en"))
        clickable_link(url, label="🗺️ Open in Google Maps")

    else:
        result["message"] = ("Desculpe, não entendi. Tente: 'Pesquisar Alan Turing na Wikipedia' "
                             "ou 'Abrir YouTube para música relaxante' ou 'Farmácia mais próxima em Orlando'.")
        result["audio_path"] = speak_gtts(result["message"], lang="pt")
    print(f"[Intent] {result['intent']}")
    print(f"[Msg] {result['message']}")
    if result["url"]:
        print(f"[URL] {result['url']}")
    return result


In [14]:
# >>> Edit the examples below and run this cell to test the pipeline <<<
# Each call prints the detected intent, the message and (when present) the URL,
# and renders an audio player plus a clickable link. The returned dict is
# assigned to `_` because only those displayed outputs matter here.

# Portuguese examples (language is guessed from the text)
_ = run_command("Pesquisar inteligência artificial na Wikipedia")  # PT wiki
_ = run_command("Abrir YouTube para lo-fi hip hop")                # PT youtube
_ = run_command("Farmácia mais próxima em Orlando")                # PT maps

# English examples (prefer_lang="en" skips the guess and forces English replies)
_ = run_command("Search Wikipedia for Alan Turing", prefer_lang="en")
_ = run_command("Open YouTube for python tutorials", prefer_lang="en")
_ = run_command("nearest pharmacy in Boston", prefer_lang="en")


[Intent] WIKI_SEARCH
[Msg] Na informática, a Inteligência Artificial (abreviado IA) genericamente é a inteligência, o raciocínio e o aprendizado exibida por máquinas semelhante ao raciocino humano; busca desenvolver máquinas autônomas ou sistemas especialistas capazes de simular o pensamento humano e realizar várias tarefas complexas de forma independente. É o sistema que permite aos computadores executar funções avançadas, como a capacidade de analisar dados em grande escala e fazer previsões/recomendações; É um campo de pesquisa em ciência da computação que desenvolve e estuda métodos e softwares que permitem que as máquinas percebam seu ambiente e usem o aprendizado e a inteligência para tomar ações que maximizem suas chances de atingir objetivos definidos. A IA iniciou na década de 1950 com os pesquisadores Alan Turing e Herbert Simon baseado no conceito do filósofo grego Aristóteles.
[URL] https://pt.wikipedia.org/wiki/Intelig%C3%AAncia_artificial


[Intent] YT_SEARCH
[Msg] Abrindo YouTube…
[URL] https://www.youtube.com/results?search_query=lo-fi+hip+hop


[Intent] MAPS_NEARBY
[Msg] Mostrando no mapa…
[URL] https://www.google.com/maps/search/farm%C3%A1cia+perto+de+orlando


[Intent] WIKI_SEARCH
[Msg] G, or g, is the seventh letter of the Latin alphabet, used in the modern English alphabet, the alphabets of other western European languages, and others worldwide. Its name in English is gee (pronounced  ), plural gees.
The lowercase version can be written in two forms: the single-storey (sometimes "opentail")  and the double-storey (sometimes "looptail") .
[URL] https://en.wikipedia.org/wiki/G


[Intent] YT_SEARCH
[Msg] Opening YouTube…
[URL] https://www.youtube.com/results?search_query=python+tutorials


[Intent] MAPS_NEARBY
[Msg] Showing on the map…
[URL] https://www.google.com/maps/search/pharmacy+near+boston


In [15]:
# If you still have a variable named `transcript` from Passo 2, you can run this
# cell to feed it to the assistant.
# The variable is looked up explicitly instead of catching NameError: a NameError
# raised inside run_command() (for example because an earlier cell was not run)
# must surface as-is rather than be reported as a missing transcript.
if "transcript" in globals():
    print("Using 'transcript' from the STT step:", transcript)
    _ = run_command(transcript)
else:
    print("No 'transcript' variable found. Run Passo 2 or set one manually, e.g.:")
    print("transcript = 'Pesquisar Python na Wikipedia'")


Using 'transcript' from the STT step: teste de gravação de áudio para texto


[Intent] UNKNOWN
[Msg] Desculpe, não entendi. Tente: 'Pesquisar Alan Turing na Wikipedia' ou 'Abrir YouTube para música relaxante' ou 'Farmácia mais próxima em Orlando'.
