# ChromaDB Plaintext Storage Demo

This notebook demonstrates that ChromaDB persists documents and their embeddings to disk unencrypted (in plaintext).

In [None]:
# 1. Setup ChromaDB

import chromadb
import numpy as np
from sentence_transformers import SentenceTransformer
import os
import sqlite3
import json
import pickle
import pandas as pd
from pathlib import Path

# Initialize embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Create ChromaDB client with persistent storage; later cells open the files
# written under persist_directory directly.
persist_directory = "./chromadb_demo"
client = chromadb.PersistentClient(path=persist_directory)

# Start from a fresh collection on every run: if creation fails (typically
# because an earlier run left the collection behind), drop it and recreate it.
# Catch Exception instead of using a bare `except:` so that KeyboardInterrupt
# and SystemExit are not swallowed by the retry path.
# NOTE(review): narrow this to ChromaDB's "collection already exists" error
# once the installed ChromaDB version is pinned.
try:
    collection = client.create_collection("sensitive_documents")
except Exception:
    client.delete_collection("sensitive_documents")
    collection = client.create_collection("sensitive_documents")

print(f"ChromaDB storage location: {persist_directory}")

In [None]:
# 2. Add Sensitive Documents

# Sample records standing in for sensitive data (demo values only)
documents = [
    "Patient John Doe, SSN 123-45-6789, diagnosed with diabetes",
    "Credit card number 4532-1234-5678-9012 belongs to Jane Smith",
    "API key: sk-1234567890abcdef, expires 2024-12-31",
    "Database password: MyS3cr3tP@ssw0rd! for production server",
    "Employee ID 12345 salary: $120,000 annual compensation",
]

# Encode every document with the model created in the setup cell
embeddings = model.encode(documents)

# Build the ids and per-document metadata up front, one entry per document
doc_count = len(documents)
doc_ids = [f"doc_{n}" for n in range(doc_count)]
doc_metadatas = [{"sensitive": True, "doc_num": n} for n in range(doc_count)]

# Store texts, vectors, ids and metadata in the collection
collection.add(
    ids=doc_ids,
    documents=documents,
    embeddings=embeddings.tolist(),
    metadatas=doc_metadatas,
)

print(f"Added {doc_count} sensitive documents to ChromaDB")

In [None]:
# 3. Run a Normal Query on ChromaDB

# Encode the search text with the same model used for the documents
query = "financial information"
query_embedding = model.encode([query])

# Ask the collection for its top three matches
results = collection.query(
    n_results=3,
    query_embeddings=query_embedding.tolist(),
)

# Only one query was sent, so its matches are the first entry of 'documents'
top_documents = results['documents'][0]

print("Query results:")
for rank, doc in enumerate(top_documents, start=1):
    print(f"{rank}. {doc}")

In [None]:
# 4. Dump ChromaDB Storage - SQLite Database

# Open the SQLite file inside the persistence directory and list its tables.
# NOTE(review): this cell originally stated that ChromaDB keeps metadata in
# SQLite and embeddings in Parquet; that split looks specific to older
# ChromaDB releases — confirm against the installed version.

# Location of the SQLite file within the persistence directory
db_path = os.path.join(persist_directory, "chroma.sqlite3")
print(f"SQLite database path: {db_path}")

# conn and cursor are reused by the later inspection cells
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Ask the schema catalog for every table name
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

print("\nTables in ChromaDB SQLite:")
for (table_name,) in tables:
    print(f"  - {table_name}")

In [None]:
# # 5. Examine Raw Embedding Storage

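# # ChromaDB releases that use the DuckDB + Parquet backend store embeddings in Parquet files
# # (NOTE(review): this cell is commented out; confirm the installed version writes Parquet before enabling it)
# # Let's find and examine them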

# import pyarrow.parquet as pq
# import glob

# # Find parquet files
# parquet_files = glob.glob(os.path.join(persist_directory, "**/*.parquet"), recursive=True)
# print(f"Found {len(parquet_files)} Parquet files")

# if parquet_files:
#     # Read the first parquet file
#     parquet_path = parquet_files[0]
#     print(f"\nReading: {parquet_path}")
    
#     # Read parquet file
#     table = pq.read_table(parquet_path)
#     df = table.to_pandas()
    
#     print(f"\nParquet file shape: {df.shape}")
#     print(f"Columns: {df.columns.tolist()}")
    
#     # Display first few rows
#     print("\nFirst few rows of embeddings:")
#     print(df.head())

In [None]:
# 6. Direct Memory Inspection

import gc
import sys

# Run the query again so the collection has served vectors in this process
results = collection.query(
    n_results=5,
    query_embeddings=query_embedding.tolist(),
)

# Scan the objects reported by the garbage collector for 1-D arrays whose
# length matches the embedding size.
# NOTE(review): gc.get_objects() only lists GC-tracked objects — confirm that
# numpy arrays actually appear in it, otherwise this scan always finds nothing.
print("Searching for embedding arrays in memory...")
embedding_shape = (384,)  # all-MiniLM-L6-v2 produces 384-dim embeddings
embedding_arrays = [
    candidate
    for candidate in gc.get_objects()
    if isinstance(candidate, np.ndarray) and candidate.shape == embedding_shape
]

print(f"Found {len(embedding_arrays)} potential embedding vectors in memory")

if embedding_arrays:
    first_vector = embedding_arrays[0]
    print("\nSample embedding vector (first 10 dimensions):")
    print(first_vector[:10])

In [None]:
# 7. Demonstrate Plaintext Storage

# Read the stored data back directly, bypassing the ChromaDB API.
# Relies on `cursor` (open SQLite cursor) and `persist_directory` from the
# earlier cells.
print("=== PLAINTEXT STORAGE DEMONSTRATION ===\n")

# 1. Documents are stored in plaintext
cursor.execute("SELECT * FROM embeddings_queue")
queue_data = cursor.fetchall()
if queue_data:
    print("Documents in embeddings_queue:")
    for row in queue_data:
        print(row)

# 2. Embeddings are stored as plain floating point numbers
# The Parquet discovery cell is commented out, so `parquet_files` and `df`
# were never defined and this section raised NameError. Locate and load the
# Parquet data here so the cell runs on its own; when the persistence
# directory contains no Parquet file the section is simply skipped.
parquet_files = sorted(
    str(path) for path in Path(persist_directory).glob("**/*.parquet")
)
if parquet_files:
    print("\n\nRaw embedding values from Parquet file:")
    # pandas needs a Parquet engine (pyarrow or fastparquet) for this call
    df = pd.read_parquet(parquet_files[0])
    # Get embedding column
    if 'embedding' in df.columns:
        first_embedding = df['embedding'].iloc[0]
        if isinstance(first_embedding, np.ndarray):
            print(f"First embedding vector (shape: {first_embedding.shape}):")
            print(f"First 20 values: {first_embedding[:20]}")
            print(f"Min: {first_embedding.min():.6f}, Max: {first_embedding.max():.6f}")
            print(f"Mean: {first_embedding.mean():.6f}, Std: {first_embedding.std():.6f}")

In [None]:
import struct

# Decode the raw embedding blobs stored in the embeddings_queue table.
# Relies on `cursor` (open SQLite cursor) from the SQLite cell.

# First, check the column names
cursor.execute("PRAGMA table_info(embeddings_queue)")
columns = cursor.fetchall()
print("Embeddings_queue columns:")
for col in columns:
    print(f"  {col[1]} ({col[2]})")

# Get a small sample of rows from embeddings_queue
cursor.execute("SELECT * FROM embeddings_queue LIMIT 5")
rows = cursor.fetchall()

print(f"\nFound {len(rows)} embedding records")

# Columns are addressed by position.
# NOTE(review): positions 4-7 are assumed to hold the document id, the
# embedding blob, the encoding name and the JSON metadata — check them against
# the PRAGMA output printed above.
float32_size = struct.calcsize('f')  # bytes per value in the blob
for i, row in enumerate(rows):
    doc_id = row[4]  # e.g., 'doc_0', 'doc_1'
    embedding_blob = row[5]  # The binary embedding data
    data_type = row[6]  # Should be 'FLOAT32'
    metadata = row[7]  # JSON metadata

    print(f"\n--- Document {i} (ID: {doc_id}) ---")
    print(f"Data type: {data_type}")
    # The metadata column can be NULL; slicing None would raise TypeError
    if metadata is None:
        print("Metadata: <none>")
    else:
        print(f"Metadata: {metadata[:100]}...")  # First 100 chars of metadata

    # The blob can be NULL or too short to hold a single value; there is
    # nothing to decode (and no statistics to compute) in that case.
    num_floats = len(embedding_blob) // float32_size if embedding_blob else 0
    if num_floats == 0:
        print("No embedding values stored for this record")
        continue

    # Unpack the binary data as float32 values (native byte order, as before).
    # unpack_from tolerates trailing bytes that do not form a whole value,
    # where unpack would raise struct.error.
    embedding_values = struct.unpack_from(f'{num_floats}f', embedding_blob)

    print(f"Embedding dimension: {num_floats}")
    print("First 20 embedding values:")
    for j, value in enumerate(embedding_values[:20]):
        print(f"  [{j}]: {value:.6f}")

    # Show statistics
    embedding_array = np.array(embedding_values)
    print("\nEmbedding statistics:")
    print(f"  Min: {embedding_array.min():.6f}")
    print(f"  Max: {embedding_array.max():.6f}")
    print(f"  Mean: {embedding_array.mean():.6f}")
    print(f"  Std: {embedding_array.std():.6f}")

# %% [markdown]
# ## 9. Demonstrate Direct Access to Sensitive Data

# %%
print("\n=== DIRECT ACCESS TO SENSITIVE DATA ===\n")

# Pull every queued record straight from SQLite (relies on `cursor` from the
# SQLite cell) and print the document text found in each record's metadata.
cursor.execute("SELECT * FROM embeddings_queue")
all_rows = cursor.fetchall()

print("All documents stored in PLAINTEXT:")
for row in all_rows:
    metadata = row[7]  # Metadata is in the 8th column
    # A record with no metadata carries no document text, and
    # json.loads(None) would raise TypeError — skip it.
    if not metadata:
        continue
    # Parse the JSON metadata (json is imported at the top of the file, so it
    # is no longer re-imported on every iteration)
    meta_dict = json.loads(metadata)
    if 'chroma:document' in meta_dict:
        print(f"\n• {meta_dict['chroma:document']}")

print("\n⚠️  Anyone with database access can read all sensitive information!")


In [None]:
# 8. Cleanup

# Release the SQLite cursor and connection opened by the SQLite cell so the
# database file is not held open after the demo. Re-run that cell to reconnect
# before re-running any cell that uses `cursor`.
cursor.close()
conn.close()