In [1]:
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from fastlite import database

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Name of the embedding model handed to SentenceTransformer when the model is
# loaded. A comment in the embedding-generation cell notes that BGE-small
# yields 384-dimensional float32 vectors.
MODEL_NAME = "BAAI/bge-small-en-v1.5"

In [3]:
# Database handle used by every cell below; the path is relative, so it is
# resolved against the notebook's working directory.
db = database('scraper.db')

In [4]:
# Use the GPU when torch reports CUDA as available; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the model named by MODEL_NAME onto the selected device. The functions
# below read this module-level `model` directly.
model = SentenceTransformer(MODEL_NAME, device=device)
# NOTE(review): presumably caps tokenized input at 512 tokens, with longer
# chunk texts truncated — confirm against the sentence-transformers docs.
model.max_seq_length = 512

In [5]:
def generate_embeddings_for_chunks(db, batch_size=64):
    """Embed every chunk that has no stored embedding yet and save the vectors.

    Chunks are read from ``db.t.chunks``. A chunk counts as already done when
    ``db.t.embeddings`` holds a row whose ``chunk_id`` equals the chunk's
    ``id``. The remaining chunk texts are encoded with the module-level
    ``model`` in slices of ``batch_size``, and one row per chunk is inserted
    into ``db.t.embeddings``.

    Args:
        db: Database handle exposing the ``chunks`` and ``embeddings`` tables
            through ``db.t``.
        batch_size: Number of chunk texts passed to each ``model.encode`` call.

    Returns:
        None. Totals and per-batch progress are reported with ``print``.
    """
    chunks = list(db.t.chunks())

    # Fetch the ids of already-embedded chunks in ONE query (selecting only the
    # id column, not the vector blobs) instead of issuing a query per chunk.
    embedded_chunk_ids = {
        row['chunk_id']
        for row in db.t.embeddings.rows_where(select='chunk_id')
    }
    chunks_without_embeddings = [
        chunk for chunk in chunks
        if chunk['id'] not in embedded_chunk_ids
    ]

    print(f"Total chunks: {len(chunks)}")
    print(f"Chunks without embeddings: {len(chunks_without_embeddings)}")

    if not chunks_without_embeddings:
        print("All chunks already have embeddings")
        return

    texts = [chunk['text'] for chunk in chunks_without_embeddings]

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        batch_chunks = chunks_without_embeddings[i:i+batch_size]

        embeddings = model.encode(
            batch_texts,
            batch_size=batch_size,
            normalize_embeddings=True,
            show_progress_bar=True,
        )
        # The readers decode the stored blob with np.frombuffer(...,
        # dtype=np.float32), so pin the dtype on the write side as well. This
        # is a no-op when the encoder already returns float32.
        embeddings = np.asarray(embeddings, dtype=np.float32)

        for chunk, embedding in zip(batch_chunks, embeddings):
            # Store as binary (bytes) for efficiency:
            # - BGE-small produces 384-dimensional float32 vectors
            # - Binary: 384 * 4 bytes = 1,536 bytes per embedding
            # - Text: would be ~10-15 bytes per number = ~4,000-6,000 bytes
            db.t.embeddings.insert(
                chunk_id=chunk['id'],
                embedding=embedding.tobytes(),
            )

        print(f"Processed {min(i+batch_size, len(texts))}/{len(texts)} chunks")

In [6]:
# Embed every chunk that has no row in the embeddings table yet. This writes
# to the database; when nothing is missing it only prints a message.
generate_embeddings_for_chunks(db)

Total chunks: 271
Chunks without embeddings: 271


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.35it/s]


Processed 64/271 chunks


Batches: 100%|██████████| 1/1 [00:00<00:00,  6.46it/s]


Processed 128/271 chunks


Batches: 100%|██████████| 1/1 [00:00<00:00,  8.48it/s]


Processed 192/271 chunks


Batches: 100%|██████████| 1/1 [00:00<00:00,  9.04it/s]


Processed 256/271 chunks


Batches: 100%|██████████| 1/1 [00:00<00:00, 38.12it/s]

Processed 271/271 chunks





In [7]:
def show_sample_embedding(db, chunk_id=None):
    """Print a readable summary of one stored embedding and return the vector.

    Args:
        db: Database handle exposing the ``embeddings`` and ``chunks`` tables
            through ``db.t``.
        chunk_id: Chunk whose embedding should be shown. When omitted, the
            first row returned by the ``embeddings`` table is used.

    Returns:
        The embedding decoded from its stored bytes as a float32 numpy array,
        or ``None`` (after printing a message) when no matching row exists.
    """
    if chunk_id is not None:
        # Explicit request: look the row up by its chunk id.
        matches = list(db.t.embeddings.rows_where('chunk_id=?', [chunk_id]))
        if not matches:
            print(f"No embedding found for chunk_id {chunk_id}")
            return
        row = matches[0]
    else:
        # No id given: fall back to the first stored embedding.
        first_rows = list(db.t.embeddings(limit=1))
        if not first_rows:
            print("No embeddings found in database")
            return
        row = first_rows[0]
        chunk_id = row['chunk_id']

    # The chunk row supplies the text preview printed below.
    source_chunk = db.t.chunks[row['chunk_id']]

    # The blob holds raw float32 values; reinterpret the bytes as a vector.
    vector = np.frombuffer(row['embedding'], dtype=np.float32)

    print(f"Chunk ID: {chunk_id}")
    print(f"Chunk text (first 200 chars): {source_chunk['text'][:200]}...")
    print(f"\nEmbedding shape: {vector.shape}")
    print(f"Embedding dtype: {vector.dtype}")
    print(f"Embedding size: {len(row['embedding'])} bytes (binary)")
    print(f"\nFirst 20 values:")
    print(vector[:20])
    print(f"\nLast 20 values:")
    print(vector[-20:])
    print(f"\nStats:")
    print(f"  Min: {vector.min():.6f}")
    print(f"  Max: {vector.max():.6f}")
    print(f"  Mean: {vector.mean():.6f}")
    print(f"  Std: {vector.std():.6f}")
    # Only the header is printed here; the full vector is available to the
    # caller through the return value.
    print(f"\nFull embedding (all {len(vector)} values):")

    return vector

In [8]:
# Inspect the first stored embedding. The function returns the decoded array,
# so as the last expression of the cell it is also echoed as the cell result.
show_sample_embedding(db)

Chunk ID: 1
Chunk text (first 200 chars): Context: Home > My Account > Rates > SolarMax

As clean energy sources become increasingly popular, many large companies want to demonstrate their commitment to sustainability by “going green” and red...

Embedding shape: (384,)
Embedding dtype: float32
Embedding size: 1536 bytes (binary)

First 20 values:
[-0.02648777  0.00028695  0.00152587  0.02085175  0.00298518  0.01324802
  0.01688763 -0.00172297  0.04616194  0.03232403  0.03161076  0.01727285
 -0.02860877  0.03264195  0.03865863  0.01524653  0.0061617  -0.03522492
  0.00859361  0.0273168 ]

Last 20 values:
[-0.05804415 -0.04378345 -0.12754838 -0.03412132 -0.01192081  0.0071623
 -0.00431318 -0.00424956  0.00515858 -0.05683081 -0.01646976  0.02984179
 -0.0421149  -0.01165898 -0.06974824  0.00445326  0.04026973 -0.01999755
 -0.03151636  0.00126042]

Stats:
  Min: -0.330158
  Max: 0.328519
  Mean: 0.000285
  Std: 0.051030

Full embedding (all 384 values):


array([-2.64877733e-02,  2.86945491e-04,  1.52586761e-03,  2.08517481e-02,
        2.98518152e-03,  1.32480180e-02,  1.68876313e-02, -1.72296981e-03,
        4.61619385e-02,  3.23240310e-02,  3.16107608e-02,  1.72728486e-02,
       -2.86087748e-02,  3.26419473e-02,  3.86586338e-02,  1.52465263e-02,
        6.16169954e-03, -3.52249183e-02,  8.59361235e-03,  2.73167975e-02,
        7.93066695e-02, -5.59270307e-02,  3.83231253e-03, -1.70859434e-02,
        6.22737855e-02, -2.57952064e-02, -4.00393121e-02, -2.88361497e-02,
       -7.99865574e-02, -1.48153841e-01, -2.40339916e-02, -1.95714887e-02,
        7.06236064e-03,  3.27154133e-03,  2.94442661e-03,  5.14248274e-02,
       -2.06040628e-02, -7.47305080e-02, -2.14331932e-02,  7.91432485e-02,
       -1.47514567e-02, -1.88269233e-03, -1.70939006e-02, -8.67925677e-03,
       -5.46040498e-02,  7.22847804e-02,  4.65393215e-02, -2.10457984e-02,
        1.11277122e-02,  3.48885618e-02,  2.52261758e-02, -2.88687572e-02,
        1.64042618e-02,  

In [9]:
def search_embeddings(db, query, top_k=5):
    """Rank stored embeddings against a text query and return the best matches.

    The query is encoded with the module-level ``model`` (normalized). Each
    stored vector is decoded from bytes as float32 and scored with a dot
    product against the query vector. Chunk text is fetched only for the
    ``top_k`` highest-scoring rows, not for every stored embedding.

    Args:
        db: Database handle exposing the ``embeddings`` and ``chunks`` tables
            through ``db.t``.
        query: Text to search for.
        top_k: Number of highest-scoring results to print and return.

    Returns:
        A list of ``(score, chunk_id, text)`` tuples sorted by descending
        score, or an empty list when the embeddings table has no rows.
    """
    embeddings = list(db.t.embeddings())
    if not embeddings:
        print("No embeddings found in database")
        return []

    query_embedding = model.encode(
        [query],
        normalize_embeddings=True,
        show_progress_bar=False,
    )[0]

    # Score every row first. No chunk lookups happen here, so the cost of
    # ranking does not include one database read per stored embedding.
    ranked = []
    for row in embeddings:
        embedding = np.frombuffer(row["embedding"], dtype=np.float32)
        score = float(np.dot(query_embedding, embedding))
        ranked.append((score, row["chunk_id"]))

    # Sort on the score only; the sort is stable, so tied scores keep the
    # order in which the rows were read.
    ranked.sort(key=lambda pair: pair[0], reverse=True)

    # Fetch chunk text for the winners only.
    top_results = [
        (score, chunk_id, db.t.chunks[chunk_id]["text"])
        for score, chunk_id in ranked[:top_k]
    ]

    print(f'Query: "{query}"')
    for score, chunk_id, text in top_results:
        preview = text.replace("\n", " ").strip()[:200]
        print(f"score={score:.4f} chunk_id={chunk_id} text={preview}...")

    return top_results

In [10]:
def show_parent_extracts(db, scored_results, max_chars=None):
    """Print the parent extract behind each scored chunk, once per extract.

    Args:
        db: Database handle exposing the ``chunks`` and ``extracts`` tables
            through ``db.t``.
        scored_results: Iterable of ``(score, chunk_id, text)`` tuples, as
            returned by ``search_embeddings``.
        max_chars: When given, each extract's text is cut to this many
            characters before printing.

    Returns:
        None. Output is written with ``print``.
    """
    if not scored_results:
        print("No results to display")
        return

    printed_extract_ids = set()
    for score, chunk_id, _text in scored_results:
        extract_id = db.t.chunks[chunk_id]["extract_id"]

        # Several chunks can share one extract; only the first hit is shown.
        if extract_id not in printed_extract_ids:
            printed_extract_ids.add(extract_id)

            body = db.t.extracts[extract_id]["text"].strip()
            body = body if max_chars is None else body[:max_chars]

            print(f"\nscore={score:.4f} chunk_id={chunk_id} extract_id={extract_id}")
            print(body)

In [11]:
# Run a sample query against the stored embeddings, keeping the top 5 hits.
results = search_embeddings(db, "residential electric rate", top_k=5)
# Print the full parent extract behind each hit, once per extract.
show_parent_extracts(db, results)

Query: "residential electric rate"
score=0.8012 chunk_id=97 text=Context: Home > My Account > Rates  General Service Demand Electric Rates (Large Commercial) Rate Class | Basic Monthly Charge | Demand Rate per kW | Energy Rate per kWh  | Fuel Rate per kWh | Environ...
score=0.7956 chunk_id=100 text=Context: Home > My Account > Rates  General Service Large Demand Electric Rates (Industrial) Rate Class | Basic Monthly Charge | Demand Rate per kW  | Energy Rate per kWh | Fuel Rate per kWh | Environ...
score=0.7856 chunk_id=103 text=Context: Home > My Account > Rates  Standby and Supplemental Electric Rates Rate Class | Basic Monthly Charge | Facilities Demand Rate per kW  | Supplemental Demand Rate per kW | Standby Demand Rate p...
score=0.7856 chunk_id=113 text=Context: Home > My Account > Rates  ged by the State of Florida on the commercial electric customers of JEA. The State tax rate for electric service is 7% but may be increased by any local option sale...
score=0.7828 chunk_id=91 t