Project Name - VisionGen


🎯 This project generates captions for images and videos using a vision-language model (BLIP), and converts them into speech using gTTS — creating a multi-sensory AI experience.


In [None]:
!pip install transformers accelerate torchvision opencv-python pyttsx3 sentencepiece


In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
from google.colab import files
import cv2
import matplotlib.pyplot as plt

# Load the BLIP captioning checkpoint once; the model goes to the GPU when CUDA is available.
checkpoint = "Salesforce/blip-image-captioning-base"
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = BlipProcessor.from_pretrained(checkpoint)
model = BlipForConditionalGeneration.from_pretrained(checkpoint).to(device)


In [None]:
# Image upload - Captioning
print("📷 Upload an image file")
uploaded_img = files.upload()

# If nothing was uploaded, next() below would raise a bare StopIteration;
# fail with an actionable message instead.
if not uploaded_img:
    raise ValueError("No image was uploaded; re-run this cell and choose an image file.")

# Only the first uploaded file is captioned.
img_path = next(iter(uploaded_img))
image = Image.open(img_path).convert('RGB')

# Preprocess the image into tensors on the same device as the model.
inputs = processor(images=image, return_tensors="pt").to(model.device)

# Generate token ids with the default decoding settings, then decode them to text.
out = model.generate(**inputs)
caption = processor.decode(out[0], skip_special_tokens=True)

print(f"\n🖼️ Image Caption: {caption}")


In [None]:
# Video uploading
print("\n🎬 Upload a video file")
uploaded_vid = files.upload()

# If nothing was uploaded, next() below would raise a bare StopIteration;
# fail with an actionable message instead.
if not uploaded_vid:
    raise ValueError("No video was uploaded; re-run this cell and choose a video file.")

# Only the first uploaded file is used.
vid_path = next(iter(uploaded_vid))

cap = cv2.VideoCapture(vid_path)

# VideoCapture does not raise when it cannot open the file, so check explicitly;
# otherwise the failure only shows up later as "Extracted 0 frames".
if not cap.isOpened():
    raise IOError(f"Could not open video file: {vid_path}")

In [None]:
# Extract Frames
# Re-open the video when `cap` is no longer an open VideoCapture (for example this
# cell is being re-run after the handle was released, or the name was rebound
# elsewhere in the notebook); otherwise a re-run would silently report 0 frames.
if not (isinstance(cap, cv2.VideoCapture) and cap.isOpened()):
    cap = cv2.VideoCapture(vid_path)

# NOTE: every decoded frame is kept in memory, so long videos can use a lot of RAM.
frames = []
try:
    while True:
        ret, frame = cap.read()
        # read() reports ret == False once no further frame can be decoded.
        if not ret:
            break
        frames.append(frame)
finally:
    # Always free the capture handle, even if reading is interrupted.
    cap.release()
print(f"\n📹 Extracted {len(frames)} frames from the video.")

In [None]:
# Collected as (frame_index, caption) pairs
captions = []

# Beam-search settings shared by every generate() call below
generation_options = {"num_beams": 5, "max_length": 50, "early_stopping": True}

# Walk the extracted frames, captioning every 5th one
for i in range(0, len(frames), 5):
    # Convert BGR -> RGB before handing the array to PIL
    rgb_frame = cv2.cvtColor(frames[i], cv2.COLOR_BGR2RGB)
    img = Image.fromarray(rgb_frame)
    inputs = processor(images=img, return_tensors="pt").to(model.device)

    out = model.generate(**inputs, **generation_options)
    caption = processor.decode(out[0], skip_special_tokens=True)

    captions.append((i, caption))
    print(f"🖼️ Frame {i}: {caption}")


In [None]:
# Write one "Frame <index>: <caption>" line per captioned frame.
# The loop variable is deliberately not called `cap`: that name holds the
# cv2.VideoCapture handle, and rebinding it to a string breaks any re-run of the
# frame-extraction cell.
with open("video_frame_captions.txt", "w", encoding="utf-8") as f:
    for idx, frame_caption in captions:
        f.write(f"Frame {idx}: {frame_caption}\n")

print("\n✅ Captions saved to video_frame_captions.txt")


In [None]:
# Download the saved caption file through Colab's files helper.
captions_file = "video_frame_captions.txt"
files.download(captions_file)


In [None]:
!pip install gTTS


In [None]:
from gtts import gTTS
from IPython.display import Audio

# Demo: synthesize one hard-coded sentence to an MP3 file and show a player for it.
caption_text = "A dog playing with a ball in the garden."  # Replace with your caption
audio_path = "caption_audio.mp3"

tts = gTTS(text=caption_text, lang='en')
tts.save(audio_path)

# Leaving the Audio object as the cell's last expression renders the player.
Audio(audio_path)


In [None]:
# Speak every frame caption: print it, synthesize an MP3 for it, and show a player.
# The loop variable is deliberately not called `cap`: that name holds the
# cv2.VideoCapture handle, and rebinding it to a string breaks any re-run of the
# frame-extraction cell.
for i, frame_caption in captions:
    print(f"🖼️ Frame {i}: {frame_caption}")

    # gTTS refuses to synthesize empty text, so skip frames whose caption is blank
    # instead of aborting the whole loop.
    if not frame_caption.strip():
        continue

    tts = gTTS(text=frame_caption, lang='en')
    filename = f"frame_{i}_audio.mp3"
    tts.save(filename)

    # display() is needed here because only a cell's last expression renders on its own.
    display(Audio(filename))


In [None]:
# Replay the demo clip, this time asking the player to start automatically.
demo_audio = "caption_audio.mp3"
Audio(demo_audio, autoplay=True)


In [None]:
# Save every frame caption as a "Frame <index>: <caption>" line and download the file.
# The loop variable is deliberately not called `cap`: that name holds the
# cv2.VideoCapture handle, and rebinding it to a string breaks any re-run of the
# frame-extraction cell.
with open("all_captions.txt", "w", encoding="utf-8") as f:
    for i, frame_caption in captions:
        f.write(f"Frame {i}: {frame_caption}\n")

# Imported again here so this cell can be run without the setup cell's import.
from google.colab import files
files.download("all_captions.txt")
