In [1]:
import os
from PIL import Image
import pytesseract
import pyttsx3
from transformers import BlipProcessor, BlipForConditionalGeneration
from IPython.display import Audio, display



In [2]:
# Initialize pyttsx3 TTS engine once
engine = pyttsx3.init()
engine.setProperty('rate', 150)  # Optional: control speed



In [3]:
# Load BLIP model and processor (make sure you already have this loaded)
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base", use_fast=False)
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")



In [4]:
# Set path for tesseract
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Image folder path
image_dir = r"C:\Users\Admin\Desktop\VisionVoice\data\\"
image_files = [f for f in os.listdir(image_dir) if f.lower().endswith((".jpg", ".jpeg", ".png"))]



In [5]:
# Your text cleaning function (assumed to be defined earlier)
def clean_text(text):
    return text.strip().replace("\n", " ")



In [6]:
# Process each image
for i, file in enumerate(image_files, 1):
    print(f"🔍 Processing Image {i}: {file}")
    img_path = os.path.join(image_dir, file)

    # Open and convert image
    img = Image.open(img_path).convert("RGB")

    # OCR
    ocr_text = pytesseract.image_to_string(img)

    # Captioning
    inputs = processor(img, return_tensors="pt")
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    # Clean text
    cleaned_ocr = clean_text(ocr_text)
    cleaned_caption = clean_text(caption)
    final_description = f"{cleaned_caption}. The text on the image reads: {cleaned_ocr}."

    # Save and speak using pyttsx3
    audio_path = f"../outputs/image_{i}_description.mp3"
    engine.save_to_file(final_description, audio_path)
    engine.runAndWait()

    # Playback (optional in notebook)
    print("🧠 Final Description:\n", final_description)
    display(Audio(audio_path))
    print("-" * 50)


🔍 Processing Image 1: flag.jpeg
🧠 Final Description:
 the indian flag flying in the sky. The text on the image reads: .


--------------------------------------------------
🔍 Processing Image 2: photo.JPG
🧠 Final Description:
 a man in a red and white shirt. The text on the image reads: .


--------------------------------------------------
🔍 Processing Image 3: sample.png
🧠 Final Description:
 a quote from the book, the best of times was worst of times. The text on the image reads: It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness....


--------------------------------------------------
🔍 Processing Image 4: sample1.jpg
🧠 Final Description:
 a group of people skate down a street. The text on the image reads: Li ee LLL Ts  “ew.


--------------------------------------------------
