In [None]:
import cv2
import numpy as np
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from gtts import gTTS
import os
from playsound import playsound
import pyttsx3


def read_aloud(text):
    """Speak ``text`` through the pyttsx3 text-to-speech engine.

    The engine is configured with a speech rate of 150 and full volume
    (1.0), the text is queued, and the call blocks until playback ends.

    Args:
        text: The string to be spoken.

    Returns:
        None.
    """
    speaker = pyttsx3.init()

    # Apply the speech settings in a fixed order: rate first, then volume.
    for option, value in (('rate', 150), ('volume', 1.0)):
        speaker.setProperty(option, value)

    # Queue the utterance, then block until the engine has finished speaking.
    speaker.say(text)
    speaker.runAndWait()
    
    
# Ensure PyTorch is installed
try:
    import torch
except ImportError as e:
    print("PyTorch is not installed. Please install it using 'pip install torch'")
    # Raise SystemExit rather than calling exit(): inside a notebook, exit()
    # asks the kernel to shut down instead of just stopping this cell.
    raise SystemExit(1) from e

# Load the BLIP model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Open the default webcam (device index 0).
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    print("Error: Could not open webcam.")
    cap.release()
    raise SystemExit(1)

# Caption frames until the stream ends, 'q' is pressed, or the kernel is
# interrupted. The try/finally guarantees the camera is released in all cases.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("Error: Could not read frame.")
            break

        # Convert the frame to a PIL image (OpenCV delivers BGR, PIL expects RGB)
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Process the image and generate a caption
        inputs = processor(images=image, return_tensors="pt")
        out = model.generate(**inputs, max_length=150, num_beams=5, early_stopping=True)
        caption = processor.decode(out[0], skip_special_tokens=True)
        print(caption)

        # Display the frame with the caption (optional preview window)
        # cv2.putText(frame, caption, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        # cv2.imshow('BLIP Real-Time Captioning', frame)

        # Speak the caption; this blocks until speech has finished.
        read_aloud(caption)

        # Break loop on 'q' key press.
        # NOTE: cv2.waitKey only receives keys while an OpenCV window is active,
        # so with the preview above disabled, interrupt the kernel to stop.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
except KeyboardInterrupt:
    print("Interrupted; stopping capture.")
finally:
    # Release resources
    cap.release()
    cv2.destroyAllWindows()

In [None]:
!pip install googletrans

In [None]:
import cv2
import numpy as np
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import pyttsx3
from googletrans import Translator

def read_aloud(text):
    """Speak ``text`` through the pyttsx3 text-to-speech engine.

    The engine is configured with a speech rate of 150 and full volume
    (1.0), the text is queued, and the call blocks until playback ends.

    Args:
        text: The string to be spoken.

    Returns:
        None.
    """
    speaker = pyttsx3.init()

    # Apply the speech settings in a fixed order: rate first, then volume.
    for option, value in (('rate', 150), ('volume', 1.0)):
        speaker.setProperty(option, value)

    # Queue the utterance, then block until the engine has finished speaking.
    speaker.say(text)
    speaker.runAndWait()

def translate_text(text, target_language="hi"):
    """Translate ``text`` with googletrans and return the translated string.

    Args:
        text: The text to translate.
        target_language: Destination language code passed to the translator
            as ``dest``; defaults to ``"hi"``.

    Returns:
        The ``text`` attribute of the translation result.
    """
    result = Translator().translate(text, dest=target_language)
    return result.text

# Ensure PyTorch is installed
try:
    import torch
except ImportError as e:
    print("PyTorch is not installed. Please install it using 'pip install torch'")
    # Raise SystemExit rather than calling exit(): inside a notebook, exit()
    # asks the kernel to shut down instead of just stopping this cell.
    raise SystemExit(1) from e

# Load the BLIP model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Open the default webcam (device index 0).
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    print("Error: Could not open webcam.")
    cap.release()
    raise SystemExit(1)

# Caption, translate and speak frames until the stream ends, 'q' is pressed,
# or the kernel is interrupted. The try/finally guarantees the camera is
# released in all cases.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("Error: Could not read frame.")
            break

        # Convert the frame to a PIL image (OpenCV delivers BGR, PIL expects RGB)
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Process the image and generate a caption
        inputs = processor(images=image, return_tensors="pt")
        out = model.generate(**inputs, max_length=150, num_beams=5, early_stopping=True)
        caption = processor.decode(out[0], skip_special_tokens=True)
        print("Original Caption: ", caption)

        # Translate the caption to Hindi
        hindi_caption = translate_text(caption, "hi")
        print("Translated Caption: ", hindi_caption)

        # Display the frame with the caption (optional preview window)
        # cv2.putText(frame, hindi_caption, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        # cv2.imshow('BLIP Real-Time Captioning', frame)

        # Read aloud the translated caption; this blocks until speech has finished.
        read_aloud(hindi_caption)

        # Break loop on 'q' key press.
        # NOTE: cv2.waitKey only receives keys while an OpenCV window is active,
        # so with the preview above disabled, interrupt the kernel to stop.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
except KeyboardInterrupt:
    print("Interrupted; stopping capture.")
finally:
    # Release resources
    cap.release()
    cv2.destroyAllWindows()


In [None]:
!pip install deep-translator


In [None]:
import cv2
import numpy as np
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import pyttsx3
from deep_translator import GoogleTranslator
import requests
from gtts import gTTS
import os
from playsound import playsound


def read_aloud(text):
    """Speak ``text`` through the pyttsx3 text-to-speech engine.

    The engine is configured with a speech rate of 150 and full volume
    (1.0), the text is queued, and the call blocks until playback ends.

    Args:
        text: The string to be spoken.

    Returns:
        None.
    """
    speaker = pyttsx3.init()

    # Apply the speech settings in a fixed order: rate first, then volume.
    for option, value in (('rate', 150), ('volume', 1.0)):
        speaker.setProperty(option, value)

    # Queue the utterance, then block until the engine has finished speaking.
    speaker.say(text)
    speaker.runAndWait()
    
def read_aloud_hindi(text):
    """Speak ``text`` with pyttsx3, preferring an installed Hindi voice.

    All available voices are printed first. The first voice whose name
    contains "Hindi" (case-insensitive), or whose language list contains a
    Hindi tag such as ``hi``, ``hi-IN`` or ``hi_IN`` (plain or byte string),
    is selected. If no such voice exists, a notice is printed and the
    engine's default voice is used.

    Args:
        text: The string to be spoken.

    Returns:
        None.
    """

    def _is_hindi(voice):
        # Match on the display name first; tolerate a missing name.
        if 'hindi' in (voice.name or '').lower():
            return True
        for lang in (voice.languages or ()):
            # Language entries may be byte strings (possibly with a leading
            # control byte) or locale tags, so normalise before comparing.
            if isinstance(lang, bytes):
                lang = lang.decode('utf-8', errors='ignore')
            tag = ''.join(ch for ch in str(lang) if ch.isalnum() or ch in '-_')
            tag = tag.lower().replace('_', '-')
            if tag == 'hi' or tag.startswith('hi-'):
                return True
        return False

    # Initialize the TTS engine
    engine = pyttsx3.init()

    # Set properties (optional)
    engine.setProperty('rate', 150)    # Speed of speech
    engine.setProperty('volume', 1.0)  # Volume (0.0 to 1.0)

    # Get available voices
    voices = engine.getProperty('voices')

    # Print all available voices
    for voice in voices:
        print(f"Voice: {voice.name}, ID: {voice.id}, Languages: {voice.languages}")

    # Pick the first Hindi-capable voice, if any
    hindi_voice = next((voice.id for voice in voices if _is_hindi(voice)), None)

    if hindi_voice:
        engine.setProperty('voice', hindi_voice)
    else:
        print("Hindi voice not found. Using default voice.")

    # Use the engine to say the text
    engine.say(text)

    # Wait for the speech to finish
    engine.runAndWait()

def translate_text(text, target_language="hi"):
    """Translate ``text`` with deep_translator's GoogleTranslator.

    Args:
        text: The text to translate.
        target_language: Target language code; defaults to ``"hi"``. The
            source language is left on ``'auto'``.

    Returns:
        Whatever ``GoogleTranslator.translate`` returns for ``text``.
    """
    return GoogleTranslator(source='auto', target=target_language).translate(text)

# Ensure PyTorch is installed
try:
    import torch
except ImportError as e:
    print("PyTorch is not installed. Please install it using 'pip install torch'")
    # Raise SystemExit rather than calling exit(): inside a notebook, exit()
    # asks the kernel to shut down instead of just stopping this cell.
    raise SystemExit(1) from e

# Load the BLIP model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# ElevenLabs text-to-speech settings (loop-invariant, so defined once).
CHUNK_SIZE = 1024
url = "https://api.elevenlabs.io/v1/text-to-speech/EXAVITQu4vr4xnSDxMaL"

# Never hardcode the API key in the notebook; read it from the environment.
elevenlabs_api_key = os.environ.get("ELEVENLABS_API_KEY")
if not elevenlabs_api_key:
    print("ELEVENLABS_API_KEY is not set; skipping ElevenLabs audio generation.")

# Open the default webcam (device index 0).
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    print("Error: Could not open webcam.")
    cap.release()
    raise SystemExit(1)

# Caption, translate, synthesise and speak frames until the stream ends, 'q'
# is pressed, or the kernel is interrupted. The try/finally guarantees the
# camera is released in all cases.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("Error: Could not read frame.")
            break

        # Convert the frame to a PIL image (OpenCV delivers BGR, PIL expects RGB)
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Process the image and generate a caption
        inputs = processor(images=image, return_tensors="pt")
        out = model.generate(**inputs, max_length=150, num_beams=5, early_stopping=True)
        caption = processor.decode(out[0], skip_special_tokens=True)
        print("Original Caption: ", caption)

        # Translate the caption to Hindi
        hindi_caption = translate_text(caption, "hi")
        print("Translated Caption: ", hindi_caption)

        # Ask ElevenLabs to synthesise the Hindi caption and save it as an MP3.
        if elevenlabs_api_key:
            headers = {
                "Accept": "audio/mpeg",
                "Content-Type": "application/json",
                "xi-api-key": elevenlabs_api_key
            }
            data = {
                "text": hindi_caption,
                # The monolingual model is English-only; Hindi text needs a
                # multilingual model.
                "model_id": "eleven_multilingual_v2",
                "voice_settings": {
                    "stability": 0.5,
                    "similarity_boost": 0.5
                }
            }

            try:
                # A timeout keeps a stalled request from hanging the loop forever.
                response = requests.post(url, json=data, headers=headers, timeout=30)
            except requests.RequestException as exc:
                print(f"Error: ElevenLabs request failed - {exc}")
            else:
                if response.status_code == 200:
                    with open('output.mp3', 'wb') as f:
                        for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                            if chunk:
                                f.write(chunk)
                    print("Audio file saved as 'output.mp3'")
                else:
                    print(f"Error: {response.status_code} - {response.text}")

        # Display the frame with the caption (optional preview window)
        # cv2.putText(frame, hindi_caption, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        # cv2.imshow('BLIP Real-Time Captioning', frame)

        # Read aloud the original caption, then the translated one; both calls block.
        read_aloud(caption)
        read_aloud_hindi(hindi_caption)

        # Break loop on 'q' key press.
        # NOTE: cv2.waitKey only receives keys while an OpenCV window is active,
        # so with the preview above disabled, interrupt the kernel to stop.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
except KeyboardInterrupt:
    print("Interrupted; stopping capture.")
finally:
    # Release resources
    cap.release()
    cv2.destroyAllWindows()


In [1]:
import pyttsx3

def read_aloud(text):
    # Initialize the TTS engine
    engine = pyttsx3.init()

    # Set properties (optional)
    engine.setProperty('rate', 150)    # Speed of speech
    engine.setProperty('volume', 1.0)  # Volume (0.0 to 1.0)

    # Get available voices
    voices = engine.getProperty('voices')

    # Print all available voices
    for voice in voices:
        print(f"Voice: {voice.name}, ID: {voice.id}, Languages: {voice.languages}")

    # Attempt to find a Hindi voice
    hindi_voice = None
    for voice in voices:
        if 'hi' in voice.languages or 'Hindi' in voice.name:
            hindi_voice = voice.id
            break
    
    if hindi_voice:
        engine.setProperty('voice', hindi_voice)
    else:
        print("Hindi voice not found. Using default voice.")

    # Use the engine to say the text
    engine.say(text)
    
    # Wait for the speech to finish
    engine.runAndWait()

# Example usage: speak a short Hindi greeting ("Hello, how are you?") to
# check whether a Hindi voice is found and used by read_aloud.
text = "नमस्ते, आप कैसे हैं?"
read_aloud(text)


Voice: Microsoft David Desktop - English (United States), ID: HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_DAVID_11.0, Languages: []
Voice: Microsoft Zira Desktop - English (United States), ID: HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_ZIRA_11.0, Languages: []
Hindi voice not found. Using default voice.
