# CyborgDB + Vec2Text Vulnerability Demonstration

This notebook shows that sensitive data stored in CyborgDB **cannot** be reconstructed from its stored embeddings using vec2text.
 
### Attack Chain:
1. Sensitive texts → OpenAI embeddings → CyborgDB storage
2. Extract encrypted embeddings from PostgreSQL database
3. Use vec2text to *attempt* to reconstruct the original sensitive text

In [None]:
from openai import OpenAI
import secrets
import time
from pathlib import Path

import numpy as np
import vec2text
import torch

# OpenAI setup
# `embedding_model` is the single source of truth for the model name: it is used
# for the embedding requests made later in the notebook and for selecting the
# matching vec2text corrector below.
embedding_model = "text-embedding-ada-002"
openai_client = OpenAI()  # no credentials are passed explicitly here

# Load vec2text corrector for inversion.
# Pass `embedding_model` instead of repeating the literal so the corrector can
# never silently drift from the model that produced the embeddings.
print(f"Loading vec2text corrector for OpenAI {embedding_model}...")
corrector = vec2text.load_pretrained_corrector(embedding_model)


In [None]:
from cyborgdb_lite import Client, DBConfig, IndexIVFFlat

# CyborgDB setup
# The three backing stores use the same local PostgreSQL connection and differ
# only in table name, so the connection string is written once.
_pg_connection_string = "host=localhost port=5432 dbname=postgres"

# Positional order is preserved: the DBConfig for table "index" first,
# then "items", then "config".
cyborgdb_client = Client(
    *(
        DBConfig(
            location="postgres",
            connection_string=_pg_connection_string,
            table_name=table_name,
        )
        for table_name in ("index", "items", "config")
    )
)

# Every run creates an index with a fresh random name suffix and a fresh random
# 32-byte key; the key is not stored anywhere in this cell.
# NOTE(review): dimension=1536 presumably matches the size of the embeddings
# produced by `embedding_model` - confirm if the model is ever changed.
ivf_flat_config = IndexIVFFlat(dimension=1536, n_lists=128)
encrypted_index = cyborgdb_client.create_index(
    index_name=f"cyborgdb-vec2text-demo-{secrets.token_hex(8)}",
    index_key=secrets.token_bytes(32),
    index_config=ivf_flat_config,
)

In [None]:
# 2. Sensitive Test Corpus (same as vec2text demo)

# Test strings grouped by category. Dict insertion order is preserved, so
# flattening the groups yields the documents in exactly the order written here.
_corpus_by_category = {
    "Financial/Identity Information": (
        "My social security number is 818-88-8118",
        "Password for admin account is SecurePass123!",
        "Credit card number 4532-1234-5678-9012 expires next month",
        "API key for production: sk-proj-1234567890abcdef",
        "Bank account routing number 021000021 checking 1234567890",
    ),
    "Medical Information": (
        "Patient diagnosed with Type 2 diabetes, prescribed metformin",
        "Blood test results show cholesterol level of 277 mg/dL",
    ),
    "Personal Information": (
        "John Doe lives at 1 World Trade Center, New York, NY 10007",
        "Phone number +1(212) 653-0688; email info@cyborg.co",
    ),
}

# Flat list consumed by the embedding and storage cells.
sensitive_documents = [
    document
    for category_documents in _corpus_by_category.values()
    for document in category_documents
]

print(f"Testing with {len(sensitive_documents)} sensitive documents")

In [None]:
# 3. Generate OpenAI Embeddings and Store in CyborgDB

def get_embeddings_openai(text_list):
    """Get embeddings for a batch of texts from the OpenAI API.

    All texts are sent in a single request using the module-level
    ``openai_client`` and ``embedding_model``.

    Args:
        text_list: Sized sequence of strings to embed.

    Returns:
        A list with one entry per item in ``response.data``, each being that
        item's ``embedding`` attribute. An empty list is returned for empty
        input without contacting the API.
    """
    print(f"Generating embeddings for {len(text_list)} texts...")
    if len(text_list) == 0:
        # Nothing to embed: avoid sending an empty request to the API.
        return []
    response = openai_client.embeddings.create(
        input=text_list,
        model=embedding_model,
        encoding_format="float",
    )
    return [e.embedding for e in response.data]

# Embed the whole corpus with a single call to the helper
embeddings = get_embeddings_openai(sensitive_documents)
print(f"Generated {len(embeddings)} embeddings of dimension {len(embeddings[0])}")

# Build one record per document: a positional id, its embedding and the raw text.
# zip() pairs documents with embeddings by position and stops at the shorter one.
items = [
    dict(id=f"sensitive_doc_{position}", embedding=vector, contents=text)
    for position, (text, vector) in enumerate(zip(sensitive_documents, embeddings))
]

# Store in the CyborgDB encrypted index
encrypted_index.upsert(items)

In [None]:
# 4. Extract Embeddings from CyborgDB PostgreSQL Database
#
# Connects directly to the PostgreSQL backend (not through the CyborgDB client)
# and lists the tables in the `public` schema. This cell only discovers
# candidate table names in `index_tables`; it does not read any rows.

print("\n=== EXTRACTING EMBEDDINGS FROM CYBORGDB POSTGRESQL BACKEND ===")

import os

import psycopg2
from psycopg2.extras import RealDictCursor

# Connection settings are taken from the standard libpq environment variables
# so real credentials do not have to be written into the notebook; the
# fallbacks are placeholders for a local development server.
# The default database is "postgres" because that is the dbname the CyborgDB
# client in this notebook is configured with ("... dbname=postgres"); the
# embeddings can only be found in the database they were written to.
pg_settings = {
    "host": os.environ.get("PGHOST", "localhost"),
    "database": os.environ.get("PGDATABASE", "postgres"),
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "password"),
    "port": int(os.environ.get("PGPORT", "5432")),
}

# Connect directly to PostgreSQL
try:
    pg_client = psycopg2.connect(**pg_settings)
    # RealDictCursor returns each row as a dict keyed by column name.
    cursor = pg_client.cursor(cursor_factory=RealDictCursor)
    print("✅ Connected to PostgreSQL server")
except Exception as e:
    print(f"❌ Failed to connect to PostgreSQL: {e}")
    print("Please ensure PostgreSQL is running and adjust connection parameters")
    raise

# Explore database schema
cursor.execute("""
    SELECT table_name 
    FROM information_schema.tables 
    WHERE table_schema = 'public' 
    ORDER BY table_name;
""")
tables = cursor.fetchall()
print(f"Found {len(tables)} tables in database:")
for table in tables:
    print(f"  - {table['table_name']}")

# Look for tables related to our index (case-insensitive substring match)
index_keywords = ('cyborgdb', 'vec2text', 'embedding')
index_tables = [
    t['table_name']
    for t in tables
    if any(keyword in t['table_name'].lower() for keyword in index_keywords)
]
print(f"\nTables related to our index: {index_tables}")

# If no specific tables found, look for vector/embedding related tables.
# This broader keyword list covers the "index" and "items" table names used in
# the CyborgDB setup cell.
if not index_tables:
    vector_keywords = ('vector', 'embedding', 'index', 'item', 'metadata')
    vector_tables = [
        t['table_name']
        for t in tables
        if any(keyword in t['table_name'].lower() for keyword in vector_keywords)
    ]
    print(f"Vector-related tables: {vector_tables}")
    index_tables = vector_tables

In [None]:
# 5. Convert to PyTorch & Invert Each Embedding
#
# For every extracted embedding: run vec2text inversion, re-embed the
# reconstructed text with OpenAI, and compare it to the extracted vector.
#
# NOTE(review): `extracted_embeddings` and `extracted_documents` are not
# assigned in any cell visible in this notebook. On a fresh kernel this cell
# raises NameError unless an extraction step defines them first - confirm.

print("\n=== RUNNING VEC2TEXT INVERSION ON EXTRACTED EMBEDDINGS ===")

# Convert to PyTorch tensor
embeddings_tensor = torch.tensor(extracted_embeddings, dtype=torch.float32)

# Move to appropriate device: MPS if available, else CUDA, else stay on CPU
if torch.backends.mps.is_available():
    embeddings_tensor = embeddings_tensor.to('mps')
elif torch.cuda.is_available():
    embeddings_tensor = embeddings_tensor.cuda()

print(f"Embeddings tensor shape: {embeddings_tensor.shape}")
print(f"Device: {embeddings_tensor.device}")

# One dict per document; read by the summary cell.
results = []

print("\n" + "="*80)
print("INVERSION RESULTS")
print("="*80)

for i, (original_doc, embedding) in enumerate(zip(extracted_documents, embeddings_tensor)):
    print(f"\nDocument #{i+1}:")
    print(f"\nOriginal (from database): {original_doc}")
    
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    
    # Invert embedding
    reconstructed_list = vec2text.invert_embeddings(
        embeddings=embedding.unsqueeze(0),  # Add batch dimension
        corrector=corrector,
        num_steps=20,
        sequence_beam_width=1,
    )
    reconstructed = reconstructed_list[0]
    
    inversion_time = time.perf_counter() - start_time
    
    print(f"Reconstructed: {reconstructed}")
    
    # Calculate similarity between the extracted vector and the embedding of
    # the reconstructed text; both tensors are on CPU with an explicit float32
    # dtype so the comparison does not depend on torch's default dtype.
    orig_emb_cpu = embedding.cpu()
    new_emb = get_embeddings_openai([reconstructed])[0]
    new_emb_tensor = torch.tensor(new_emb, dtype=torch.float32)
    similarity = torch.nn.functional.cosine_similarity(orig_emb_cpu, new_emb_tensor, dim=0).item()
    
    # Check exact match (case-insensitive, ignoring surrounding whitespace)
    exact_match = original_doc.lower().strip() == reconstructed.lower().strip()
    
    print(f"Similarity: {similarity:.4f}")
    print(f"Inversion time: {inversion_time:.2f}s")
    print(f"Exact match: {exact_match}")
    
    results.append({
        'original': original_doc,
        'reconstructed': reconstructed,
        'similarity': similarity,
        'time': inversion_time,
        'exact_match': exact_match,
    })
    
    print("-" * 80)

In [None]:
# 6. Summary Analysis
#
# Aggregates the per-document `results` produced by the inversion cell.

print("\n" + "="*80)
print("VULNERABILITY SUMMARY")
print("="*80)

total_docs = len(results)
exact_matches = sum(1 for r in results if r['exact_match'])
high_similarity = sum(1 for r in results if r['similarity'] > 0.95)
# The inversion loop does not store a 'sensitive_recovered' field, so indexing
# it directly raised KeyError. Use the field when a result provides it and
# otherwise fall back to the exact-match flag.
sensitive_recovered = sum(
    1 for r in results if r.get('sensitive_recovered', r['exact_match'])
)

print(f"Total documents processed: {total_docs}")

if total_docs:
    print(f"Exact reconstructions: {exact_matches} ({exact_matches/total_docs*100:.1f}%)")
    print(f"High similarity (>95%): {high_similarity} ({high_similarity/total_docs*100:.1f}%)")

    avg_similarity = np.mean([r['similarity'] for r in results])
    avg_time = np.mean([r['time'] for r in results])

    print(f"Average similarity: {avg_similarity:.4f}")
    print(f"Average inversion time: {avg_time:.2f}s")
else:
    # Percentages and means are undefined with no results; avoid the
    # ZeroDivisionError and keep the average names defined.
    avg_similarity = avg_time = float("nan")
    print("No inversion results to summarize.")

# The backend names below match what this notebook actually uses
# (CyborgDB on PostgreSQL).
print("\nATTACK CHAIN COMPLETE:")
print("   1. Stored sensitive data in CyborgDB")
print("   2. Extracted embeddings from PostgreSQL database")
print(f"   3. Reconstructed {sensitive_recovered}/{total_docs} sensitive documents")