# Bibliotecas

In [1]:
import cv2
import time
import threading
import queue
from ultralytics import YOLO
import torch
import pyttsx3

# Configurações

In [2]:
# ==============================
# Configurações iniciais
# ==============================

video_path = r"C:/codes/unicamp/MC949/T3/videos/VID_20251015_183037787.mp4"

# Carrega modelo YOLOv8 nano (mais rápido)
model = YOLO("yolov8n.pt")

# Usa GPU se disponível
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Dispositivo ativo: {device}")
model.to(device)

Dispositivo ativo: cpu


YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C2f(
        (cv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_s

## Text-to-Speech

In [3]:
engine = pyttsx3.init()
voices = engine.getProperty('voices')

for v in voices:
    print(f"ID: {v.id} | Name: {v.name} | Lang: {v.languages}")

ID: HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_PT-BR_MARIA_11.0 | Name: Microsoft Maria Desktop - Portuguese(Brazil) | Lang: ['pt-BR']
ID: HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_ZIRA_11.0 | Name: Microsoft Zira Desktop - English (United States) | Lang: ['en-US']


In [9]:
# ==============================
# Configuração do TTS (PT-BR, fila única)
# ==============================

speech_queue = queue.Queue()

def tts_worker():
    tts = pyttsx3.init()
    tts.setProperty("rate", 175)
    tts.setProperty("volume", 1.0)
    for v in tts.getProperty("voices"):
        if "brazil" in v.name.lower() or "portugu" in v.name.lower():
            print("Voz selecionada:", v.name)
            tts.setProperty("voice", v.id)
            break

    while True:
        text = speech_queue.get()
        if text is None:
            break
        try:
            print(f"Narrado: {text}")
            tts.stop()
            tts.say(text)
            # tts.runAndWait()
            # tts.runAndWait()  # agora seguro, pois engine é exclusiva da thread
        except Exception as e:
            print(f"Erro no TTS: {e}")
        finally:
            speech_queue.task_done()

threading.Thread(target=tts_worker, daemon=True).start()

def speak(text: str):
    """Adiciona texto à fila de fala"""
    if text.strip():
        speech_queue.put(text)

Voz selecionada: Microsoft Maria Desktop - Portuguese(Brazil)
Narrado: Microwave
Narrado: Refrigerator
Narrado: Person
Narrado: Train
Narrado: Potted plant
Narrado: Tv
Narrado: Potted plant
Narrado: Tv
Narrado: Potted plant
Narrado: Vase
Narrado: Refrigerator
Narrado: Potted plant
Narrado: Person
Narrado: Person
Narrado: Train
Narrado: Backpack
Narrado: Potted plant
Narrado: Person
Narrado: Potted plant
Narrado: Potted plant
Narrado: Potted plant
Narrado: Train
Narrado: Bus
Narrado: Car
Narrado: Traffic light
Narrado: Car
Narrado: Chair
Narrado: Person
Narrado: Truck
Narrado: Umbrella
Narrado: Refrigerator
Narrado: Train


# Modelo YOLO

In [None]:
# ==============================
# Processamento do vídeo
# ==============================

cap = cv2.VideoCapture(video_path)

if not cap.isOpened():
    raise RuntimeError(f"Erro: não foi possível abrir o vídeo em {video_path}")

last_narration = {}
NARRATION_COOLDOWN = 5  # segundos
prev_time = time.time()


while True:
    ret, frame = cap.read()
    if not ret:
        print("Fim do vídeo.")
        break
    
    frame_height, frame_width = frame.shape[:2]

    # Define ROI central (50% largura x 50% altura)
    roi_x1 = frame_width // 4
    roi_y1 = frame_height // 4
    roi_x2 = frame_width * 3 // 4
    roi_y2 = frame_height * 3 // 4

    # Para desenhar a ROI
    cv2.rectangle(frame, (roi_x1, roi_y1), (roi_x2, roi_y2), (255, 0, 0), 2)

    # Inferência YOLO
    results = model.predict(frame, imgsz=480, conf=0.35, verbose=False, device=device)[0]

    for box in results.boxes:
        xyxy = box.xyxy[0].cpu().numpy()
        conf = float(box.conf[0])
        cls_id = int(box.cls[0])
        cls_name = model.names[cls_id]
        x1, y1, x2, y2 = map(int, xyxy)

        # Desenha caixa e label
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        label = f"{cls_name} {conf:.2f}"
        cv2.putText(frame, label, (x1, y1 - 6), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

            # Narração controlada por cooldown
        box_cx = (x1 + x2) // 2
        box_cy = (y1 + y2) // 2
        if roi_x1 <= box_cx <= roi_x2 and roi_y1 <= box_cy <= roi_y2:
            now = time.time()
            # Cooldown curto por classe
            if now - last_narration.get(cls_name, 0) > NARRATION_COOLDOWN:
                msg = cls_name.capitalize()  # ou traduza se quiser
                speak(msg)
                last_narration[cls_name] = now

    # Calcular e exibir FPS
    curr_time = time.time()
    fps_display = 1.0 / (curr_time - prev_time)
    prev_time = curr_time
    cv2.putText(frame, f"FPS: {fps_display:.1f}", (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)

    # Mostrar vídeo em tempo real
    cv2.imshow("YOLO Navegacao Assistida", frame)

    # Pressione 'q' para sair
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# ==============================
# Finalização
# ==============================

cap.release()
cv2.destroyAllWindows()
speech_queue.put(None)  # encerra thread TTS
print("Processamento concluído com sucesso.")

Fim do vídeo.
Processamento concluído com sucesso.


In [33]:
import pyttsx3
engine = pyttsx3.init() # object creation

# RATE
rate = engine.getProperty('rate')   # getting details of current speaking rate
print (rate)                        # printing current voice rate
engine.setProperty('rate', 125)     # setting up new voice rate

# VOLUME
volume = engine.getProperty('volume')   # getting to know current volume level (min=0 and max=1)
print (volume)                          # printing current volume level
engine.setProperty('volume',1.0)        # setting up volume level  between 0 and 1

# VOICE
voices = engine.getProperty('voices')       # getting details of current voice
#engine.setProperty('voice', voices[0].id)  # changing index, changes voices. o for male
engine.setProperty('voice', voices[1].id)   # changing index, changes voices. 1 for female

engine.say("Hello World!")
engine.say('My current speaking rate is ' + str(rate))
engine.runAndWait()
engine.stop()

engine = pyttsx3.init()
engine.say("Hello World!")
engine.runAndWait()
engine.stop()

175
1.0


In [38]:
import cv2, time, torch
import numpy as np

# ==============================
# Configuração do TTS (PT-BR, fila única)
# ==============================

speech_queue = queue.Queue()

def tts_worker():
    engine = pyttsx3.init() # object creation

    # RATE
    engine.setProperty('rate', 175)     # setting up new voice rate
    # VOLUME
    engine.setProperty('volume',1.0)        # setting up volume level  between 0 and 1

    for v in engine.getProperty("voices"):
        if "brazil" in v.name.lower() or "portugu" in v.name.lower():
            print("Voz selecionada:", v.name)
            engine.setProperty("voice", v.id)
            break

    while True:
        text = speech_queue.get()
        if text is None:
            break
        try:
            print(f"Narrado: {text}")
            engine = pyttsx3.init()
            engine.say(f"{text}")
            engine.say(".")
            engine.runAndWait()
            engine.stop()
        except Exception as e:
            print(f"Erro no TTS: {e}")
        finally:
            speech_queue.task_done()

threading.Thread(target=tts_worker, daemon=True).start()

def speak(text: str):
    """Adiciona texto à fila de fala"""
    speech_queue.put(text.strip())

# Profundidade (MiDaS small)
midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small").to(device)
midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms").small_transform

# ---- Parâmetros ----
CLASSES_INTERESSE = ["person", "door", "trash", "bench", "bicycle", "car", "stairs", "cone"]
NARRATION_COOLDOWN = 5
last_narration = {}
scale_factor = 0.5  # ajuste empírico

cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise RuntimeError(f"Erro: não foi possível abrir {video_path}")

prev_time = time.time()

while True:
    ret, frame = cap.read()
    if not ret:
        break

    frame_height, frame_width = frame.shape[:2]

    roi_x1, roi_y1 = frame_width // 4, frame_height // 4
    roi_x2, roi_y2 = frame_width * 3 // 4, frame_height * 3 // 4
    cv2.rectangle(frame, (roi_x1, roi_y1), (roi_x2, roi_y2), (255, 0, 0), 2)

    # ---- Profundidade ----
    input_batch = midas_transforms(frame).to(device)
    with torch.no_grad():
        depth = midas(input_batch)
        depth = torch.nn.functional.interpolate(
            depth.unsqueeze(1),
            size=frame.shape[:2],
            mode="bilinear",
            align_corners=False
        ).squeeze().cpu().numpy()

    frame_resized = cv2.resize(frame, (480, 270))
    # ---- Detecção ----
    results = model.predict(frame, imgsz=480, conf=0.35, verbose=False, device=device)[0]

    for box in results.boxes:
        xyxy = box.xyxy[0].cpu().numpy()
        conf = float(box.conf[0])
        cls_id = int(box.cls[0])
        cls_name = model.names[cls_id].lower()

        if cls_name not in CLASSES_INTERESSE:
            continue

        x1, y1, x2, y2 = map(int, xyxy)
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # ---- Distância estimada ----
        depth_crop = depth[y1:y2, x1:x2]
        if depth_crop.size > 0:
            dist_rel = np.median(depth_crop)
            distance_m = dist_rel * scale_factor
        else:
            distance_m = None

        label = f"{cls_name} ({distance_m:.1f}m)" if distance_m else cls_name
        cv2.putText(frame, label, (x1, y1 - 6),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

        # ---- Risco ----
        box_cx, box_cy = (x1 + x2)//2, (y1 + y2)//2
        in_roi = roi_x1 <= box_cx <= roi_x2 and roi_y1 <= box_cy <= roi_y2
        now = time.time()

        if in_roi and distance_m and now - last_narration.get(cls_name, 0) > NARRATION_COOLDOWN:
            if distance_m < 1.0:
                msg = f"{cls_name.capitalize()} a {distance_m:.1f} metros. Cuidado!"
            elif distance_m < 2.0:
                msg = f"{cls_name.capitalize()} a {distance_m:.1f} metros. Siga com atenção."
            else:
                msg = f"{cls_name.capitalize()} a {distance_m:.1f} metros. Siga em frente."
            speak(msg)
            last_narration[cls_name] = now

    # ---- FPS ----
    curr_time = time.time()
    fps_display = 1.0 / (curr_time - prev_time)
    prev_time = curr_time
    cv2.putText(frame, f"FPS: {fps_display:.1f}", (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)

    cv2.imshow("YOLO Navegação Assistida", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()
speech_queue.put(None)


Voz selecionada: Microsoft Maria Desktop - Portuguese(Brazil)


Using cache found in C:\Users\gabrielgomes/.cache\torch\hub\intel-isl_MiDaS_master


Loading weights:  None


Using cache found in C:\Users\gabrielgomes/.cache\torch\hub\rwightman_gen-efficientnet-pytorch_master
Using cache found in C:\Users\gabrielgomes/.cache\torch\hub\intel-isl_MiDaS_master


Narrado: Person a 64.4 metros. Siga em frente.
Narrado: Person a 49.8 metros. Siga em frente.
Narrado: Person a 40.5 metros. Siga em frente.


KeyboardInterrupt: 