In [1]:
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from fastlite import database

In [2]:
MODEL_NAME = "BAAI/bge-small-en-v1.5"

In [3]:
db = database('scraper.db')

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(MODEL_NAME, device=device)
model.max_seq_length = 512

In [5]:
def generate_embeddings_for_chunks(db, batch_size=64):
    chunks = list(db.t.chunks())
    chunks_without_embeddings = [
        chunk for chunk in chunks
        if not list(db.t.embeddings.rows_where('chunk_id=?', [chunk['id']]))
    ]
    
    print(f"Total chunks: {len(chunks)}")
    print(f"Chunks without embeddings: {len(chunks_without_embeddings)}")
    
    if not chunks_without_embeddings:
        print("All chunks already have embeddings")
        return
    
    texts = [chunk['text'] for chunk in chunks_without_embeddings]
    
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        batch_chunks = chunks_without_embeddings[i:i+batch_size]
        
        embeddings = model.encode(
            batch_texts,
            batch_size=batch_size,
            normalize_embeddings=True,
            show_progress_bar=True,
        )
        
        for chunk, embedding in zip(batch_chunks, embeddings):
            # Store as binary (bytes) for efficiency:
            # - BGE-small produces 384-dimensional float32 vectors
            # - Binary: 384 * 4 bytes = 1,536 bytes per embedding
            # - Text: would be ~10-15 bytes per number = ~4,000-6,000 bytes
            db.t.embeddings.insert(
                chunk_id=chunk['id'],
                embedding=embedding.tobytes(),
            )
        
        print(f"Processed {min(i+batch_size, len(texts))}/{len(texts)} chunks")

In [6]:
generate_embeddings_for_chunks(db)

Total chunks: 1456
Chunks without embeddings: 1456


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 64/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 128/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 192/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 256/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 320/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 384/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 448/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 512/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 576/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 640/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 704/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 768/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 832/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 896/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 960/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 1024/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 1088/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 1152/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 1216/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 1280/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 1344/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 1408/1456 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 1456/1456 chunks


In [7]:
def show_sample_embedding(db, chunk_id=None):
    """Display a sample embedding in human-readable format"""
    if chunk_id is None:
        # Get first chunk with an embedding
        embeddings = list(db.t.embeddings(limit=1))
        if not embeddings:
            print("No embeddings found in database")
            return
        embedding_row = embeddings[0]
        chunk_id = embedding_row['chunk_id']
    else:
        embedding_row = list(db.t.embeddings.rows_where('chunk_id=?', [chunk_id]))
        if not embedding_row:
            print(f"No embedding found for chunk_id {chunk_id}")
            return
        embedding_row = embedding_row[0]
    
    # Get the chunk text
    chunk = db.t.chunks[embedding_row['chunk_id']]
    
    # Convert binary back to numpy array
    embedding = np.frombuffer(embedding_row['embedding'], dtype=np.float32)
    
    print(f"Chunk ID: {chunk_id}")
    print(f"Chunk text (first 200 chars): {chunk['text'][:200]}...")
    print(f"\nEmbedding shape: {embedding.shape}")
    print(f"Embedding dtype: {embedding.dtype}")
    print(f"Embedding size: {len(embedding_row['embedding'])} bytes (binary)")
    print(f"\nFirst 20 values:")
    print(embedding[:20])
    print(f"\nLast 20 values:")
    print(embedding[-20:])
    print(f"\nStats:")
    print(f"  Min: {embedding.min():.6f}")
    print(f"  Max: {embedding.max():.6f}")
    print(f"  Mean: {embedding.mean():.6f}")
    print(f"  Std: {embedding.std():.6f}")
    print(f"\nFull embedding (all {len(embedding)} values):")
    # print(embedding.tolist())
    
    return embedding

In [8]:
show_sample_embedding(db)

Chunk ID: 1
Chunk text (first 200 chars): Context: Home > Residential > Customers & Accounts

#### Articles

  * [Bankruptcy (A)](https://connections/?docs=residential/customers-accounts/bankruptcy-a)
    * [RCS Bankruptcy Processing Procedur...

Embedding shape: (384,)
Embedding dtype: float32
Embedding size: 1536 bytes (binary)

First 20 values:
[-3.1617600e-02 -3.6749464e-02 -1.6078871e-02 -1.8794421e-02
  1.4254829e-03 -2.8504124e-02  1.5606851e-03  3.0569160e-02
  5.3110600e-02 -9.9264374e-03 -2.9262526e-02 -8.9799505e-06
  1.7035879e-02 -1.1348437e-02  5.9134230e-02 -2.2345078e-03
 -1.6351186e-02  3.6863990e-02  1.3559079e-02  1.0617254e-01]

Last 20 values:
[-0.00368802  0.00064237 -0.09047387 -0.03361855  0.01928963 -0.02809929
 -0.03500346 -0.00032537 -0.016559    0.00218545  0.08518906  0.0030776
 -0.02593457  0.04122056 -0.02109344 -0.00365679  0.04378175  0.04363751
 -0.01606809  0.01370035]

Stats:
  Min: -0.335114
  Max: 0.443193
  Mean: 0.000338
  Std: 0.051030

Full emb

array([-3.16176005e-02, -3.67494635e-02, -1.60788707e-02, -1.87944211e-02,
        1.42548291e-03, -2.85041239e-02,  1.56068511e-03,  3.05691604e-02,
        5.31105995e-02, -9.92643740e-03, -2.92625260e-02, -8.97995051e-06,
        1.70358792e-02, -1.13484366e-02,  5.91342300e-02, -2.23450782e-03,
       -1.63511857e-02,  3.68639901e-02,  1.35590788e-02,  1.06172539e-01,
        2.25303210e-02, -1.78234577e-02,  7.54028326e-03, -1.18127486e-04,
        7.32959509e-02,  4.81274463e-02, -1.04639204e-02, -1.03574591e-02,
       -4.74117734e-02, -1.30073264e-01, -4.77376096e-02,  3.51441875e-02,
        4.21837121e-02,  1.41512174e-02,  6.25263378e-02,  8.59099068e-03,
       -8.64969287e-03,  4.87454198e-02,  2.20542029e-02,  6.20149337e-02,
        3.28427106e-02, -9.43905208e-03, -3.18674296e-02, -5.12021640e-03,
        6.94505032e-03, -1.86656869e-03,  4.06883880e-02,  1.50584821e-02,
        6.96886852e-02,  1.11330356e-02, -1.29197789e-02, -1.87287908e-02,
        2.19886247e-02,  

In [9]:
def search_embeddings(db, query, top_k=5):
    """Search stored embeddings with a text query and return top matches."""
    embeddings = list(db.t.embeddings())
    if not embeddings:
        print("No embeddings found in database")
        return []

    query_embedding = model.encode(
        [query],
        normalize_embeddings=True,
        show_progress_bar=False,
    )[0]

    scored = []
    for row in embeddings:
        embedding = np.frombuffer(row["embedding"], dtype=np.float32)
        score = float(np.dot(query_embedding, embedding))
        chunk = db.t.chunks[row["chunk_id"]]
        scored.append((score, row["chunk_id"], chunk["text"]))

    scored.sort(key=lambda x: x[0], reverse=True)

    print(f'Query: "{query}"')
    for score, chunk_id, text in scored[:top_k]:
        preview = text.replace("\n", " ").strip()[:200]
        print(f"score={score:.4f} chunk_id={chunk_id} text={preview}...")

    return scored[:top_k]

In [10]:
def show_parent_extracts(db, scored_results, max_chars=None):
    """Display parent extracts for scored chunk results, de-duplicated by extract."""
    if not scored_results:
        print("No results to display")
        return

    seen_extract_ids = set()
    for score, chunk_id, _ in scored_results:
        chunk = db.t.chunks[chunk_id]
        extract_id = chunk["extract_id"]
        if extract_id in seen_extract_ids:
            continue
        seen_extract_ids.add(extract_id)

        extract = db.t.extracts[extract_id]
        text = extract["text"].strip()
        if max_chars is not None:
            text = text[:max_chars]
        print(f"\nscore={score:.4f} chunk_id={chunk_id} extract_id={extract_id}")
        print(text)

In [11]:
results = search_embeddings(db, "residential electric rate", top_k=5)
show_parent_extracts(db, results)

Query: "residential electric rate"
score=0.7856 chunk_id=833 text=Context: Home > Residential > Billing, Payments, & Refunds > Rates, Fees & The Customer’s Bill (B) > Rates & Fees  # __Electric Rate Overview  The rate in which electric is consumed is measured in kil...
score=0.7647 chunk_id=835 text=Context: Home > Residential > Billing, Payments, & Refunds > Rates, Fees & The Customer’s Bill (B) > Rates & Fees  han 75 kW but less than 1,000 kW per billing period.   * **General Service Large Dema...
score=0.7516 chunk_id=996 text=Context: Home > Residential > Outage & Distribution > Electric Generation Overview (A) > Electric Meter  [![](https://connections/wp-content/uploads/2023/02/20250828_142507-300x139.jpg)](https://conne...
score=0.7465 chunk_id=24 text=Context: Home > Residential > Solar > JEA SolarSmart (B)  # __Additional Monthly Cost Estimate  kWh Consumed per Month| 10%    SolarSmart| 25%    SolarSmart| 50%    SolarSmart| 75%    Solar Smart| 100...
score=0.7465 chunk_id=1448