# Tiny-MultiModal-Larimar Example Usage

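This notebook demonstrates how to use the Tiny-MultiModal-Larimar model for various multimodal tasks. The examples use randomly generated inputs and a model built directly from the config file (no trained checkpoint is loaded), so the printed values illustrate tensor shapes and API usage rather than meaningful predictions.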

In [None]:
# Third-party libraries
import torch
import numpy as np
import yaml
import pytorch_lightning as pl
import matplotlib.pyplot as plt
from PIL import Image
from transformers import AutoTokenizer

# Import our model components
from src.modules.multimodal_vae import TinyMultiModalVAE
from src.modules.data import MultiModalDataset
from src.modules.data import MultiModalDataModule
from src.modules.lightning_model import TinyMultiModalLitModel

## 1. Load Configuration and Model

In [None]:
# Seed PyTorch's RNG before anything random happens, so that any randomly
# initialised weights and the random demo inputs created in later cells are
# identical on every "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# Load configuration (explicit encoding: the platform default is not always UTF-8)
with open('configs/config_tiny_multimodal.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Initialize the model from the 'model' section of the config; eval() because
# this notebook only runs forward passes.
model = TinyMultiModalVAE(config['model'])
model.eval()

print(f"Model loaded with {sum(p.numel() for p in model.parameters())} parameters")

## 2. Vision Encoding Example

In [None]:
# Stand-in for a real photo: a single random RGB image, 224x224 pixels
# (batch of 1). Swap in a real image tensor of the same layout if you have one.
dummy_image = torch.randn(1, 3, 224, 224)

# Forward pass only, so gradient tracking is switched off.
with torch.no_grad():
    vision_features = model.vision_encoder(dummy_image)

print(f"Vision features shape: {vision_features.shape}")
vision_mean = vision_features.mean().item()
print(f"Vision features mean: {vision_mean:.4f}")

## 3. Text Encoding Example

In [None]:
# Example text input
text_input = "A beautiful sunset over the ocean"

# Tokenize with the pretrained DistilBERT tokenizer; with return_tensors='pt'
# the result holds PyTorch tensors under 'input_ids' and 'attention_mask'.
# NOTE(review): assumes the model's text encoder expects the
# 'distilbert-base-uncased' vocabulary - confirm against the model config.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
tokens = tokenizer(text_input, return_tensors='pt', padding=True, truncation=True)

# Encode the text (forward pass only, so no gradients are tracked)
with torch.no_grad():
    text_features = model.text_encoder(tokens['input_ids'], tokens['attention_mask'])

print(f"Text features shape: {text_features.shape}")
print(f"Text features mean: {text_features.mean().item():.4f}")

## 4. Multimodal Fusion Example

In [None]:
# Combine the image and text features computed in the two previous sections.
with torch.no_grad():
    fused_features = model.multimodal_fusion(vision_features, text_features)

print(f"Fused features shape: {fused_features.shape}")
fused_mean = fused_features.mean().item()
print(f"Fused features mean: {fused_mean:.4f}")

## 5. Memory System Example

In [None]:
# Push a random sequence through the memory module; its last dimension is
# taken from the module's own memory_dim attribute.
batch_size, seq_len = 1, 10
memory_dim = model.memory.memory_dim
dummy_input = torch.randn(batch_size, seq_len, memory_dim)

with torch.no_grad():
    memory_output = model.memory(dummy_input)

print(f"Memory output shape: {memory_output.shape}")
print(f"Memory slots used: {model.memory.memory_slots}")

## 6. End-to-End VAE Example

In [None]:
# Create dummy multimodal input
batch_size = 2
num_tokens = 20      # length of each dummy token sequence
vocab_limit = 1000   # dummy ids are drawn from [0, vocab_limit)

images = torch.randn(batch_size, 3, 224, 224)
text_ids = torch.randint(0, vocab_limit, (batch_size, num_tokens))
# All-ones mask (no padding) with the same shape and integer dtype as the ids,
# matching the integer attention mask the tokenizer produces in section 3.
text_mask = torch.ones_like(text_ids)

# Forward pass through the full model
with torch.no_grad():
    outputs = model(images, text_ids, text_mask)

# Summarise the returned mapping: shapes for tensors, raw values otherwise
print("Full model outputs:")
for key, value in outputs.items():
    if isinstance(value, torch.Tensor):
        print(f"  {key}: {value.shape}")
    else:
        print(f"  {key}: {value}")

## 7. Training Setup Example

In [None]:
# Show how to set up training. Nothing is fitted here: the objects are only
# constructed so you can see how the pieces connect.

# Create data module (batch size comes from the 'training' section of the config)
data_module = MultiModalDataModule(
    data_dir='data',
    batch_size=config['training']['batch_size'],
    num_workers=2
)

# Create lightning model
lit_model = TinyMultiModalLitModel(config)

# Create trainer (don't actually train in this example).
# Logging and checkpointing are disabled so this demo writes no run artifacts.
trainer = pl.Trainer(
    max_epochs=1,
    accelerator='auto',
    devices=1,
    logger=False,
    enable_checkpointing=False
)

print("Training setup complete!")
print(f"Lightning model: {type(lit_model).__name__}")
print(f"Data module: {type(data_module).__name__}")

## 8. Inference Example

In [None]:
# Example of how to use the model for inference
def generate_caption(model, image_tensor, max_length=50):
    """
    Placeholder caption generator: encodes the image, returns a fixed string.

    No decoding is implemented yet. The function only shows where the vision
    encoder is called; the returned text does not depend on the image.

    Args:
        model: Object providing ``eval()`` and a callable ``vision_encoder``.
        image_tensor: A single, unbatched image tensor. A leading batch
            dimension of size 1 is added before it is passed to the encoder.
        max_length: Intended cap on the number of generated tokens. Currently
            unused because no decoding loop is implemented.

    Returns:
        str: The fixed text "Generated caption would appear here".
    """
    # Note: this leaves the model in eval mode after the call.
    model.eval()
    with torch.no_grad():
        # Encode image (unsqueeze adds the batch dimension)
        vision_features = model.vision_encoder(image_tensor.unsqueeze(0))

        # TODO: decode up to `max_length` tokens conditioned on
        # `vision_features` (e.g. greedy or beam search). Until that exists,
        # the encoder output is computed but not consumed.

    return "Generated caption would appear here"

# Try the helper on a random, unbatched 3x224x224 tensor standing in for a real image
example_image = torch.randn(3, 224, 224)
caption = generate_caption(model, image_tensor=example_image)
print(f"Generated caption: {caption}")

## 9. Model Analysis

In [None]:
# Analyze model structure
def count_parameters(model, trainable_only=True):
    """Return the number of scalar parameters in ``model``.

    Args:
        model: Object exposing ``parameters()`` that yields tensors
            (e.g. a ``torch.nn.Module``).
        trainable_only: When True (the default, which keeps the original
            behaviour) only parameters with ``requires_grad`` set are counted.
            When False every parameter is counted, frozen ones included.

    Returns:
        int: Sum of ``numel()`` over the selected parameters.
    """
    return sum(
        p.numel()
        for p in model.parameters()
        if p.requires_grad or not trainable_only
    )

print("Model Analysis:")

# Each count is computed immediately before it is printed, so the report is
# emitted line by line in the same order as the components are inspected.
total_count = count_parameters(model)
print(f"Total parameters: {total_count:,}")
vision_count = count_parameters(model.vision_encoder)
print(f"Vision encoder parameters: {vision_count:,}")
text_count = count_parameters(model.text_encoder)
print(f"Text encoder parameters: {text_count:,}")
memory_count = count_parameters(model.memory)
print(f"Memory parameters: {memory_count:,}")
decoder_count = count_parameters(model.decoder)
print(f"Decoder parameters: {decoder_count:,}")

# Memory usage: bytes taken by the parameter tensors (element count x bytes per element)
param_bytes = (p.numel() * p.element_size() for p in model.parameters())
model_size = sum(param_bytes)
print(f"Model size: {model_size / 1024**2:.2f} MB")

## 10. Next Steps

This notebook provides a basic overview of the Tiny-MultiModal-Larimar model. For actual usage:

1. **Training**: Use `train.py` with your dataset
2. **Inference**: Use `inference.py` for generating captions or analyzing images
3. **Evaluation**: Use `scripts/evaluate.py` to assess model performance
4. **Data Preparation**: Use `scripts/prepare_data.py` to download and prepare datasets

The model is designed to be lightweight while maintaining the cognitive and episodic memory aspects of the original Larimar architecture.