<a href="https://colab.research.google.com/github/farmountain/SmartGlass-AI-Agent/blob/main/colab_notebooks/Session5_Meta_RayBan_SDK_Simulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🕶️ Session 05: Meta Ray-Ban SDK Simulation
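This notebook simulates how Meta Ray-Ban smart glasses capture voice and vision, then processes that data with AI models: Whisper for speech transcription, CLIP for scene recognition, and GPT-2 for generating a response.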

In [None]:
# ✅ Install dependencies
!pip install -q openai-whisper transformers torch torchvision torchaudio pydub gTTS opencv-python Pillow


## 🎥 Simulate Image & Audio Input (like Meta Ray-Ban smart glasses)

In [None]:
from gtts import gTTS
from pydub import AudioSegment
from IPython.display import Audio, Image, display
import whisper
from PIL import Image as PILImage
from google.colab import files

# Generate a simulated voice input: synthesize the spoken question with gTTS
# and save it as an MP3 file.
tts = gTTS("Where am I and what do you see?", lang='en')
tts.save("input_audio.mp3")

# Convert to WAV for Whisper (the transcription cell reads "input_audio.wav").
sound = AudioSegment.from_file("input_audio.mp3")
sound.export("input_audio.wav", format="wav")

# Upload image file. `uploaded` is a mapping whose keys are used as the
# uploaded file names.
uploaded = files.upload()
if not uploaded:
  # Without this guard `img` would never be assigned, and the display call
  # below (as well as the later CLIP cell) would fail with a NameError.
  raise ValueError("No image was uploaded. Re-run this cell and choose an image file.")

# Open each uploaded file; if several were uploaded, `img` ends up referring
# to the last one.
for filename in uploaded.keys():
  img = PILImage.open(filename)
  print(f"User uploaded file '{filename}'")


# Display the uploaded image (simulating first-person vision) and an audio
# player for the simulated voice input.
display(img)
display(Audio("input_audio.wav"))

## 🗣️ Transcribe Voice Input (Simulated Mic)

In [None]:
# Load Whisper's "base" checkpoint and transcribe the simulated microphone audio.
model = whisper.load_model("base")
result = model.transcribe("input_audio.wav")

# Whisper's decoded text usually starts with a space; strip surrounding
# whitespace so the transcript embeds cleanly inside the quoted prompt that
# the response-generation cell builds from `transcribed_text`.
transcribed_text = result["text"].strip()
print("🎧 Transcribed:", transcribed_text)


## 👁️ Describe Visual Scene Using CLIP

In [None]:
from transformers import CLIPProcessor, CLIPModel
import torch

# Load the pretrained CLIP model and its matching processor from the same checkpoint.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Candidate scene descriptions; the image is scored against each of them.
texts = ["a person walking", "a street", "a store front", "a building", "a crowd of people"]

# Use the 'img' variable which holds the uploaded image
inputs = clip_processor(text=texts, images=img, return_tensors="pt", padding=True)

# This is inference only: disable autograd so no computation graph is built
# and no gradient buffers are kept alive for the forward pass.
with torch.no_grad():
    outputs = clip_model(**inputs)
    # Softmax over the candidate texts; [0] selects the single input image.
    probs = outputs.logits_per_image.softmax(dim=1)[0]

# Report the probability assigned to every candidate description.
for text, prob in zip(texts, probs):
    print(f"🔍 {text}: {prob.item() * 100:.2f}%")

## 🤖 Generate Context-Aware Response (GPT-2)

In [None]:
from transformers import pipeline

# Build a GPT-2 text-generation pipeline.
generator = pipeline("text-generation", model="gpt2")

# The candidate text with the highest CLIP probability is used as the scene description.
best_label_index = probs.argmax()
vision_desc = texts[best_label_index]

# Combine the transcribed question and the scene description into one prompt.
prompt = f"The user asked: '{transcribed_text}' and the smart glasses saw: '{vision_desc}'. How should I respond?"

# Sample up to 100 new tokens and return only the continuation, not the prompt itself.
generation_options = dict(max_new_tokens=100, do_sample=True, return_full_text=False)
candidates = generator(prompt, **generation_options)
response = candidates[0]['generated_text']
print("🧠 GPT-2 Response:", response.strip())