# CyborgDB + Vec2Text Vulnerability Demonstration

This notebook shows that sensitive data stored in CyborgDB **cannot** be reconstructed using vec2text.
 
### Attack Chain:
1. Sensitive texts → OpenAI embeddings → CyborgDB storage
2. Extract the encrypted embeddings from the PostgreSQL database
3. Use vec2text to *attempt* to reconstruct the original sensitive text

In [None]:
# 1. Set up OpenAI embedding & vec2text corrector models

import os
import vec2text
from openai import OpenAI

# Environment variable setup
def setup_env():
    """Write the demo's environment variables into ``os.environ``.

    Every variable is assigned unconditionally, replacing any value that is
    already present in the process environment.
    """
    # Thread-count variables, each set to '1'.
    single_threaded = (
        'OMP_NUM_THREADS',
        'OPENBLAS_NUM_THREADS',
        'MKL_NUM_THREADS',
        'VECLIB_MAXIMUM_THREADS',
        'NUMEXPR_NUM_THREADS',
    )
    settings = dict.fromkeys(single_threaded, '1')

    # PyTorch MPS settings.
    settings['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
    settings['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

    # Fixed demo key, stored as the hex encoding of the ASCII bytes below.
    settings['DEMO_INDEX_KEY'] = b'00000000000000000000000000000000'.hex()

    os.environ.update(settings)


setup_env()

# ANSI color codes for live demo
class Colors:
    """ANSI escape sequences used to style the demo's terminal output."""

    # Reset sequence, appended after styled text.
    END = '\033[0m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colors.
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
def print_colored(text, color="", bold=False):
    """Print ``text`` wrapped in ANSI styling for the demo.

    ``color`` is upper-cased and looked up as an attribute of ``Colors``; a
    name with no matching attribute contributes no color code. When ``bold``
    is true the bold code comes first. The reset code is always appended.
    """
    bold_code = Colors.BOLD if bold else ""
    color_code = getattr(Colors, color.upper(), "")
    print(f"{bold_code}{color_code}{text}{Colors.END}")

# OpenAI setup
# The model name is defined once here and reused below, so the embedding model
# and the vec2text corrector cannot drift apart.
embedding_model = "text-embedding-ada-002"
openai_client = OpenAI()  # constructed with no explicit arguments

# Load vec2text corrector for inversion
print(f"Loading vec2text corrector for OpenAI {embedding_model}...")
corrector = vec2text.load_pretrained_corrector(embedding_model)


In [None]:
# 2. Set up CyborgDB

from cyborgdb_lite import Client, DBConfig, IndexIVFFlat

# Set up PostgreSQL connection parameters
# Each value can be overridden with an environment variable of the same name,
# so real credentials do not have to be written into the notebook. The
# fallbacks are demo defaults; change them (or set the variables) to match
# your actual PostgreSQL credentials.
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "localhost")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "postgres")
POSTGRES_USER = os.getenv("POSTGRES_USER", "nicolas")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "password")

# Space-separated key=value connection string built from the values above.
postgres_connection_string = (
    f"host={POSTGRES_HOST} port={POSTGRES_PORT} dbname={POSTGRES_DB} "
    f"user={POSTGRES_USER} password={POSTGRES_PASSWORD}"
)

# CyborgDB setup
# All three tables live in the same PostgreSQL database; one DBConfig is built
# per table and they are handed to Client positionally in the order listed.
_table_names = ("index_cc_demo", "items_cc_demo", "config_cc_demo")
cyborgdb_client = Client(*[
    DBConfig(location="postgres", connection_string=postgres_connection_string, table_name=table)
    for table in _table_names
])

# The index name and key are shared by the cleanup step and the creation step,
# so they are defined once. Indexing os.environ (rather than os.getenv) makes a
# missing DEMO_INDEX_KEY fail with a KeyError naming the variable instead of
# passing None to bytes.fromhex.
index_name = "cyborgdb-vec2text-demo"
index_key = bytes.fromhex(os.environ['DEMO_INDEX_KEY'])

# Load and delete existing index if there's a conflict
try:
    old_index = cyborgdb_client.load_index(
        index_name=index_name,
        index_key=index_key,
    )
    old_index.delete_index()
except Exception as exc:
    # Best-effort cleanup: a failure here is reported and skipped. Catching
    # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
    # propagate.
    print(f"Skipping cleanup of existing index ({type(exc).__name__}: {exc})")

# Create new index
encrypted_index = cyborgdb_client.create_index(
    index_name=index_name,
    index_key=index_key,
    index_config=IndexIVFFlat(
        dimension=1536,
        n_lists=128
    )
)

In [None]:
# 3. Define sensitive documents for demo

# Demo records used as inversion targets. Each string is embedded and stored
# as one item in the index.
sensitive_documents = [
    # Identity and credential information
    "My social security number is 818-88-8118",
    "Password for admin account is SecurePass123!",
    
    # Medical information
    "Patient diagnosed with Type 2 diabetes, prescribed metformin",
    "Blood test results show cholesterol level of 277 mg/dL",
    
    # Personal contact information
    "John Doe lives at 1 World Trade Center, New York, NY 10007",
    "Phone number +1(212) 653-0688; email info@cyborg.co"
]

In [None]:
# 4. Generate embeddings & store in CyborgDB

# Section banner for the demo output: an 80-character rule above and below.
rule = "=" * 80
print(rule)
print_colored("GENERATING EMBEDDINGS FROM SENSITIVE DOCUMENTS", bold=True)
print(f"{rule}\n")

def get_embeddings_openai(text_list):
    """Get embeddings from the OpenAI API for a list of texts.

    Args:
        text_list: Texts to embed; all of them are sent in a single request
            using the module-level ``embedding_model``.

    Returns:
        A list holding the ``embedding`` of each entry in ``response.data``,
        in the order the API returned them. An empty ``text_list`` yields an
        empty list without contacting the API.
    """
    if not text_list:
        # Nothing to embed: skip the request entirely.
        return []

    response = openai_client.embeddings.create(
        input=text_list,
        model=embedding_model,
        encoding_format="float",
    )
    return [e.embedding for e in response.data]

# Embed every document with a single call
embeddings = get_embeddings_openai(sensitive_documents)
print(f"Generated {len(embeddings)} embeddings of dimension {len(embeddings[0])}")

# Build one item per document, pairing each text with its embedding by position
items = []
for position, (text, vector) in enumerate(zip(sensitive_documents, embeddings)):
    items.append({
        "id": f"sensitive_doc_{position}",
        "vector": vector,
        "contents": text,
    })

# Write the items into the CyborgDB index
encrypted_index.upsert(items)

In [None]:
# 5. Extract embeddings from CyborgDB Postgres backend

import numpy as np
import psycopg2
from psycopg2.extras import RealDictCursor

print("="*80)
print_colored("EXTRACTING EMBEDDINGS FROM CYBORGDB POSTGRESQL BACKEND", bold=True)
print("="*80 + "\n")

# Connect to CyborgDB's PostgreSQL database
pg_client = psycopg2.connect(
    host=POSTGRES_HOST,
    database=POSTGRES_DB,
    user=POSTGRES_USER,
    password=POSTGRES_PASSWORD,
    port=POSTGRES_PORT
)
try:
    # RealDictCursor lets each row be indexed by column name. Using the cursor
    # as a context manager closes it when the block exits.
    with pg_client.cursor(cursor_factory=RealDictCursor) as cursor:
        # Fetch every row of the index table so the value column can be examined
        cursor.execute("SELECT key, value, index_name FROM index_cc_demo;")
        rows = cursor.fetchall()
finally:
    # Release the connection even if the query fails; the fetched rows are
    # already held in memory.
    pg_client.close()

# Size of one stored vector: 1536 float32 values of 4 bytes each.
EMBEDDING_DIM = 1536
EXPECTED_LENGTH = EMBEDDING_DIM * 4


def _byte_entropy(data):
    """Return the Shannon entropy of ``data`` in bits per byte.

    ``data`` must be a non-empty bytes-like object. Byte frequencies are
    counted in one vectorized pass instead of one ``bytes.count`` scan per
    distinct byte value.
    """
    counts = np.bincount(np.frombuffer(data, dtype=np.uint8), minlength=256)
    probabilities = counts[counts > 0] / len(data)
    return float(-np.sum(probabilities * np.log2(probabilities)))


extracted_embeddings = []
n = 0
for row in rows:
    value = row['value']
    key = row['key']

    if value is None:
        continue

    # Convert memory buffer to bytes
    bytes_data = value.tobytes() if hasattr(value, 'tobytes') else bytes(value)

    # Ignore rows too small to hold a full vector (< dim * 4 bytes)
    if len(bytes_data) < EXPECTED_LENGTH:
        continue

    # Increment counter
    n += 1

    # Heuristic used by this demo to decide whether the value is encrypted:
    # high byte entropy, and a size that is not exactly one raw vector.
    entropy = _byte_entropy(bytes_data)
    is_encrypted = entropy > 7.8 and len(bytes_data) != EXPECTED_LENGTH

    # Trim to the expected size in both cases. This keeps every extracted
    # vector at EMBEDDING_DIM values (so they stack into one tensor later) and
    # guarantees the buffer length is a multiple of 4 for np.frombuffer.
    bytes_data = bytes_data[:EXPECTED_LENGTH]

    # Interpret the bytes as float32 regardless of whether they look encrypted
    embedding_values = np.frombuffer(bytes_data, dtype=np.float32)
    extracted_embeddings.append(embedding_values.tolist())

    if is_encrypted:
        print_colored(f"Extracted encrypted embedding {n} corresponding to file {key}; length {len(bytes_data)} bytes", "GREEN")
    else:
        print_colored(f"Extracted embedding {n} corresponding to file {key}; {len(embedding_values)} dimensions", "RED")

# Print the first embedding as an example
if extracted_embeddings:
    print_colored("\nFirst extracted embedding:", bold=True)
    print(extracted_embeddings[0])

In [None]:
# 6. Convert to PyTorch & invert each embedding

import time
import torch

# Build the float32 tensor, then move it to an accelerator when one is
# available: MPS is checked first, then CUDA; otherwise it stays where it is.
embeddings_tensor = torch.tensor(extracted_embeddings, dtype=torch.float32)
if torch.backends.mps.is_available():
    target_device = 'mps'
elif torch.cuda.is_available():
    target_device = 'cuda'
else:
    target_device = None
if target_device is not None:
    embeddings_tensor = embeddings_tensor.to(target_device)

# One dict per processed document, consumed by the summary step.
results = []

print("="*80)
print_colored("RUNNING EMBEDDING INVERSION", bold=True)
print("="*80)

# NOTE(review): documents and extracted embeddings are paired by position.
# This assumes the database rows came back in the same order as
# `sensitive_documents` — confirm. zip() stops at the shorter of the two.
for i, (original_doc, embedding) in enumerate(zip(sensitive_documents, embeddings_tensor)):
    print_colored(f"\nDocument #{i+1}:", bold=True)
    print(f"\nOriginal:      \"{original_doc}\"")

    # perf_counter is the clock intended for measuring elapsed intervals;
    # unlike time.time() it is not affected by system clock adjustments.
    start_time = time.perf_counter()

    # Invert embedding
    reconstructed_list = vec2text.invert_embeddings(
        embeddings=embedding.unsqueeze(0),  # Add batch dimension
        corrector=corrector,
        num_steps=4,
        sequence_beam_width=1,
    )
    reconstructed = reconstructed_list[0]

    inversion_time = time.perf_counter() - start_time

    print(f"Reconstructed: \"{reconstructed}\"")

    # Calculate similarity: re-embed the reconstructed text and compare it with
    # the vector that was inverted. An empty reconstruction scores 0.
    if reconstructed:
        orig_emb_cpu = embedding.cpu()
        new_emb = get_embeddings_openai([reconstructed])[0]
        new_emb_tensor = torch.tensor(new_emb)
        similarity = torch.nn.functional.cosine_similarity(orig_emb_cpu, new_emb_tensor, dim=0).item()
    else:
        similarity = 0.0

    # Check exact match (case-insensitive, ignoring surrounding whitespace)
    exact_match = original_doc.lower().strip() == reconstructed.lower().strip()

    if exact_match:
        print_colored("Exact match", "RED", bold=True)

    # Higher similarity means a more successful inversion, hence the "worse" color
    sim_color = "RED" if similarity > 0.99 else "YELLOW" if similarity > 0.95 else "GREEN"
    print_colored(f"Similarity: {similarity:.4f}", sim_color, bold=True)
    print(f"Inversion time: {inversion_time:.2f}s")

    results.append({
        'original': original_doc,
        'reconstructed': reconstructed,
        'similarity': similarity,
        'time': inversion_time,
        'exact_match': exact_match,
    })

In [None]:
# 7. Summary Analysis

import numpy as np

print("\n" + "="*80)
print_colored("VULNERABILITY SUMMARY", bold=True)
print("="*80 + "\n")

total_docs = len(results)
print(f"Total documents processed: {total_docs}")

if total_docs == 0:
    # With no results the percentages below would divide by zero and the
    # averages would be undefined, so report that and stop.
    print_colored("No inversion results to summarize.", "YELLOW", bold=True)
else:
    exact_matches = sum(1 for r in results if r['exact_match'])
    high_similarity = sum(1 for r in results if r['similarity'] > 0.95)

    print(f"Exact reconstructions: {exact_matches} ({exact_matches/total_docs*100:.1f}%)")
    print(f"High similarity (>95%): {high_similarity} ({high_similarity/total_docs*100:.1f}%)")

    avg_similarity = np.mean([r['similarity'] for r in results])
    avg_time = np.mean([r['time'] for r in results])
    # Same thresholds as the per-document coloring in the inversion step
    sim_color = "RED" if avg_similarity > 0.99 else "YELLOW" if avg_similarity > 0.95 else "GREEN"

    print_colored(f"Average similarity: {avg_similarity*100:.2f}%", sim_color, bold=True)
    print(f"Average inversion time: {avg_time:.2f}s")