# Embedding

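This notebook computes sentence embeddings for the prepared train and validation chunk tables (column `chunk_embed_text`) and saves the resulting matrices as NumPy `.npy` files.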

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm

from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


# Classes and functions

In [2]:
class EmbeddingGenerator:
    """
    Turn a list of texts into a NumPy embedding matrix.

    The back end is chosen by `provider`:
      - "hf":     a local SentenceTransformer model (`self.model`)
      - "openai": the OpenAI embeddings endpoint (`self.client`)
    """

    # The OpenAI embeddings endpoint rejects input arrays longer than this,
    # so larger batches are split into several requests.
    _OPENAI_MAX_INPUTS = 2048

    def __init__(self, model_name="all-MiniLM-L6-v2", provider="hf"):
        """
        provider: "hf" for HuggingFace, "openai" for OpenAI embeddings
        model_name: any SentenceTransformer model or OpenAI embedding model name

        Raises ValueError when provider is neither "hf" nor "openai".
        """
        self.provider = provider
        self.model_name = model_name

        if provider == "hf":
            self.model = SentenceTransformer(model_name)
        elif provider == "openai":
            # Imported lazily so the "hf" path works without the openai package.
            from openai import OpenAI
            self.client = OpenAI()
        else:
            raise ValueError("provider must be 'hf' or 'openai'")

    def embed_batch(self, texts):
        """
        Compute embeddings for a list of texts

        Returns a NumPy array; for a list input there is one row per text,
        in input order. An empty input on the "openai" path returns an empty
        array without contacting the API.

        Raises ValueError if `self.provider` is not a supported value (only
        possible if the attribute was changed after construction).
        """
        if self.provider == "hf":
            return self.model.encode(texts, show_progress_bar=False, convert_to_numpy=True)

        if self.provider == "openai":
            # Example: OpenAI embeddings (text-embedding-3-large)
            texts = list(texts)
            if not texts:
                return np.array([])

            embeddings = []
            step = self._OPENAI_MAX_INPUTS
            for start in range(0, len(texts), step):
                # One request per slice instead of one request per text.
                resp = self.client.embeddings.create(
                    input=texts[start:start + step], model=self.model_name
                )
                # Every returned item carries the index of the input it belongs
                # to; order by it rather than trusting the response order.
                for item in sorted(resp.data, key=lambda d: d.index):
                    embeddings.append(item.embedding)
            return np.array(embeddings)

        raise ValueError("provider must be 'hf' or 'openai'")

In [3]:
def add_embeddings(df, text_column="chunk_embed_text", model_name="all-MiniLM-L6-v2", provider="hf", batch_size=64):
    """
    Embed the texts in `df[text_column]` batch by batch.

    Parameters
    ----------
    df : DataFrame holding the texts to embed.
    text_column : name of the column containing the texts.
    model_name, provider : forwarded to EmbeddingGenerator.
    batch_size : number of rows passed to the embedder per call (must be >= 1).

    Returns
    -------
    (df, embeddings)
        `df` is the input DataFrame, returned unchanged: no 'embedding'
        column is written to it.
        `embeddings` is a NumPy matrix with one row per DataFrame row, in row
        order, for fast clustering. An empty DataFrame gives shape (0, 0).

    Raises
    ------
    ValueError : if batch_size is smaller than 1.
    KeyError : if text_column is not a column of df.
    """
    # Validate cheap things first so mistakes surface before the model is loaded.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if text_column not in df.columns:
        raise KeyError(text_column)
    if len(df) == 0:
        # Nothing to embed; np.vstack would raise on an empty list.
        return df, np.empty((0, 0), dtype=np.float32)

    embedder = EmbeddingGenerator(model_name=model_name, provider=provider)
    texts = df[text_column]
    batch_matrices = []

    for start in tqdm(range(0, len(df), batch_size)):
        batch_texts = texts.iloc[start:start + batch_size].tolist()
        # Keep each batch as one matrix instead of unpacking it into rows.
        batch_matrices.append(embedder.embed_batch(batch_texts))

    return df, np.vstack(batch_matrices)

# Workflow

In [5]:
# Load the prepared training chunk table from the Excel export.
df_semantic_train = pd.read_excel(
    "./data/prepared/squad_train_v1_semantic_chunking.xlsx"
)

In [6]:
# Embed every training chunk; returns the DataFrame plus the embedding matrix.
df_semantic_train, emb_semantic_train = add_embeddings(
    df_semantic_train,
    text_column="chunk_embed_text",
    model_name="all-MiniLM-L6-v2",
    provider="hf",
)

100%|██████████| 6288/6288 [13:40<00:00,  7.67it/s]


In [7]:
# (rows, columns) of the training DataFrame after the embedding call.
df_semantic_train.shape

(402424, 13)

In [8]:
# Shape of the training embedding matrix returned by add_embeddings.
emb_semantic_train.shape

(402424, 384)

In [9]:
# Load the prepared validation chunk table from the Excel export.
df_semantic_val = pd.read_excel(
    "./data/prepared/squad_val_v1_semantic_chunking.xlsx"
)

In [10]:
# Embed every validation chunk with the same model used for the training set.
df_semantic_val, emb_semantic_val = add_embeddings(
    df_semantic_val,
    text_column="chunk_embed_text",
    model_name="all-MiniLM-L6-v2",
    provider="hf",
)

100%|██████████| 2475/2475 [05:36<00:00,  7.36it/s]


In [11]:
# Show the validation DataFrame shape and the embedding matrix shape, one per line.
print(df_semantic_val.shape, emb_semantic_val.shape, sep="\n")

(158351, 13)
(158351, 384)


# Save embeddings as NumPy arrays

In [12]:
# Create the output directory up front: np.save does not create missing
# directories, and a failure here would throw away the long embedding run.
tensor_dir = Path("./data/tensors")
tensor_dir.mkdir(parents=True, exist_ok=True)

np.save(tensor_dir / "squad_train_v1_semantic_chunking.npy", emb_semantic_train)
np.save(tensor_dir / "squad_val_v1_semantic_chunking.npy", emb_semantic_val)