In [None]:
!pip install ultralytics pytesseract pyttsx3
import cv2
from ultralytics import YOLO
from PIL import Image
import numpy as np
import torch
import clip
import pytesseract

# Model setup: run CLIP on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load returns the model together with the image transform used below
# to prepare frames for it.
clip_model, preprocess = clip.load("ViT-B/32", device=device)
yolo_model = YOLO("yolov8n.pt")

# Text prompts CLIP chooses between when captioning a frame; extend as needed.
candidates = [
    "a man",
    "a woman",
    "a car",
    "a dog",
    "a group of people",
    "a child",
    "a building",
    "a bike",
]

# OCR
def extract_ocr(frame):
    """Return the text Tesseract reads from a BGR frame.

    The frame is converted to grayscale, passed to
    ``pytesseract.image_to_string``, and the result is stripped of leading
    and trailing whitespace.
    """
    grayscale = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    recognized = pytesseract.image_to_string(grayscale)
    return recognized.strip()

# Caption generator
def generate_clip_caption(frame):
    """Return the entry of ``candidates`` that CLIP scores highest for ``frame``.

    Args:
        frame: Image in OpenCV BGR channel order; it is converted to RGB
            before being handed to the CLIP preprocessing transform.

    Returns:
        The best-matching caption string from the module-level ``candidates``.
    """
    image_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    image_input = preprocess(image_pil).unsqueeze(0).to(device)
    text_tokens = clip.tokenize(candidates).to(device)

    with torch.no_grad():
        # A single joint forward pass yields the image-vs-text logits.
        # The previous separate encode_image/encode_text calls produced
        # values that were never used and only repeated this work.
        logits_per_image, _ = clip_model(image_input, text_tokens)
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()

    # probs has one row (a single image was batched via unsqueeze(0)), so the
    # flattened argmax is directly the index into candidates.
    best_idx = int(np.argmax(probs))
    return candidates[best_idx]

# OpenCV camera loop: grab webcam frames, annotate them with YOLO detections,
# a CLIP caption and any OCR text, and display them until "q" is pressed or
# the camera stops delivering frames.
cap = cv2.VideoCapture(0)  # 0 for webcam

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # No frame available (camera missing, unplugged or stream ended).
            break

        # YOLO Detection
        yolo_results = yolo_model.predict(frame, verbose=False)[0]
        annotated_frame = yolo_results.plot()

        # CLIP Caption
        caption = generate_clip_caption(frame)

        # OCR: collapse newlines/tabs/runs of spaces into single spaces.
        # cv2.putText draws one line only and cannot render line breaks, so
        # raw multi-line OCR output would show up garbled and waste the
        # 50-character budget below.
        ocr_text = " ".join(extract_ocr(frame).split())

        # Overlay info
        cv2.putText(annotated_frame, f"Caption: {caption}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
        if ocr_text:
            cv2.putText(annotated_frame, f"OCR: {ocr_text[:50]}", (10, 60),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 1)

        # Show the frame
        cv2.imshow("VisionTextAI Live", annotated_frame)

        # Mask to the low byte so the key comparison works across platforms.
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Always free the camera and close the window, even if a model call or
    # OCR raises (or the user interrupts the cell) mid-loop.
    cap.release()
    cv2.destroyAllWindows()


