<a href="https://colab.research.google.com/github/farmountain/SmartGlass-AI-Agent/blob/main/Session1_Multimodal_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📘 Session 01: Multimodal Basics
**Goal:** Build a basic pipeline using Whisper (speech-to-text), CLIP (vision embedding), and GPT-2 (language response).

This is the foundation for an AI agent that can hear, see, and speak on smart glasses such as the Meta Ray-Ban Wayfarer.

In [1]:
# ✅ Install required libraries
# The leading "!" makes the notebook run this line as a shell command rather
# than as Python; "-q" asks pip for quiet output.
!pip install -q openai-whisper transformers torchaudio pydub Pillow

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/803.2 kB[0m [31m23.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone


In [None]:
# ✅ Load Whisper for audio-to-text
import whisper
from google.colab import files

# 'base' is the Whisper checkpoint name passed to load_model.
model = whisper.load_model('base')

# Upload audio file (simulate smart glasses mic)
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty/cancelled upload surfaces as a bare
    # StopIteration from next(iter(...)), which gives no hint of the cause.
    raise ValueError("No audio file was uploaded; re-run this cell and choose an audio file.")

# Only the first uploaded file is transcribed; its name is passed to Whisper
# as a path (presumably Colab saves uploads into the working directory).
filename = next(iter(uploaded))

# `result` is reused by the GPT-2 cell, which reads result['text'].
result = model.transcribe(filename)
print('🗣️ Transcription:', result['text'])

100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 141MiB/s]


In [None]:
# ✅ Load CLIP to describe uploaded image
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import requests
# Imported here as well so this cell runs even if the Whisper cell was skipped.
from google.colab import files

clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

# Upload an image; despite its name, `image_path` holds whatever
# files.upload() returns, and only its first key is used as the file to open.
image_path = files.upload()
if not image_path:
    # Without this guard an empty/cancelled upload surfaces as a bare
    # StopIteration from next(iter(...)), which gives no hint of the cause.
    raise ValueError("No image was uploaded; re-run this cell and choose an image file.")
image = Image.open(next(iter(image_path)))

# Candidate captions the image is scored against; the GPT-2 cell reuses
# `texts` and `probs` to pick the best-matching caption.
texts = ["a photo of a city street", "a photo of a dog", "a store front", "a person", "a mountain"]
inputs = clip_processor(text=texts, images=image, return_tensors="pt", padding=True)

# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = clip_model(**inputs)

# Softmax over the caption dimension turns the image-to-text logits into
# scores that sum to 1 across `texts`.
probs = outputs.logits_per_image.softmax(dim=1)
for text, prob in zip(texts, probs[0]):
    print(f"🔍 {text}: {prob.item()*100:.2f}%")

In [None]:
# ✅ GPT-2 generates a reply based on what it saw and heard
from transformers import pipeline

generator = pipeline("text-generation", model="gpt2")

# Combine the highest-scoring caption from the CLIP cell (`texts`, `probs`)
# with the transcript from the Whisper cell (`result`).
best_caption = texts[probs[0].argmax().item()]
prompt = f"I saw: {best_caption}. I heard: {result['text']}. What should I say?"

# max_new_tokens limits only the generated continuation. The previous
# max_length=50 also counted the prompt tokens, so a longer transcript left
# little or no room for the model to produce a reply.
response = generator(prompt, max_new_tokens=50, do_sample=True)[0]['generated_text']
print("🤖 GPT-2 Response:\n", response)