# Embedding

This notebook contains the code for creating text embeddings.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm

from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


# Classes and functions

In [2]:
class EmbeddingGenerator:
    """
    Turn lists of texts into embedding matrices.

    Two back ends are supported, selected by ``provider``: ``"hf"`` loads a
    ``SentenceTransformer`` model, ``"openai"`` calls the embeddings endpoint
    of an ``OpenAI`` client.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2", provider="hf"):
        """
        provider: "hf" for HuggingFace, "openai" for OpenAI embeddings
        model_name: any SentenceTransformer model or OpenAI embedding model name

        Raises ValueError if provider is neither "hf" nor "openai".
        """
        self.provider = provider
        self.model_name = model_name

        if provider == "hf":
            self.model = SentenceTransformer(model_name)
        elif provider == "openai":
            # Imported here so the openai package is only needed for this provider.
            from openai import OpenAI
            self.client = OpenAI()
        else:
            raise ValueError("provider must be 'hf' or 'openai'")

    def embed_batch(self, texts):
        """
        Compute embeddings for a list of texts.

        Returns a NumPy array with one row per input text, in input order.
        For the "openai" provider the whole list is sent in a single request,
        so callers should keep batches within the API's per-request limits
        (add_embeddings sends 64 texts at a time by default).

        Raises ValueError if self.provider is not "hf" or "openai".
        """
        if self.provider == "hf":
            return self.model.encode(texts, show_progress_bar=False, convert_to_numpy=True)

        if self.provider == "openai":
            texts = list(texts)
            if not texts:
                # Nothing to send; keep the empty-array result the per-text loop produced.
                return np.array([])
            # One request for the whole batch instead of one request per text.
            resp = self.client.embeddings.create(input=texts, model=self.model_name)
            # Each returned item carries the index of its input; sort on it so
            # row i of the result always corresponds to texts[i].
            ordered = sorted(resp.data, key=lambda item: item.index)
            return np.array([item.embedding for item in ordered])

        # provider was changed after construction; fail loudly instead of returning None.
        raise ValueError("provider must be 'hf' or 'openai'")

In [3]:
def add_embeddings(df, text_column="chunk_embed_text", model_name="all-MiniLM-L6-v2", provider="hf", batch_size=64, embedder=None):
    """
    Embed the texts in ``df[text_column]`` batch by batch.

    The DataFrame is returned unchanged: no 'embedding' column is added, the
    embeddings are only returned as a NumPy matrix (one row per DataFrame row,
    in row order) for fast clustering.

    Args:
        df: DataFrame holding the texts.
        text_column: name of the column whose values are embedded.
        model_name: forwarded to EmbeddingGenerator when ``embedder`` is None.
        provider: forwarded to EmbeddingGenerator when ``embedder`` is None.
        batch_size: number of rows passed to ``embed_batch`` per call (>= 1).
        embedder: optional object exposing ``embed_batch(list_of_texts)``.
            Pass an existing EmbeddingGenerator to avoid re-loading the model
            when embedding several DataFrames.

    Returns:
        Tuple ``(df, matrix)``. For an empty ``df`` the matrix has shape
        ``(0, 0)`` because the embedding width is unknown without any input.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    n_rows = len(df)
    if n_rows == 0:
        # np.vstack([]) raises; return an explicit empty matrix and skip model loading.
        return df, np.empty((0, 0))

    if embedder is None:
        embedder = EmbeddingGenerator(model_name=model_name, provider=provider)

    texts = df[text_column]
    batches = []
    for start in tqdm(range(0, n_rows, batch_size)):
        batch_texts = texts.iloc[start:start + batch_size].tolist()
        # Keep each batch as one block; they are stacked once after the loop.
        batches.append(embedder.embed_batch(batch_texts))

    return df, np.vstack(batches)

# Workflow

In [5]:
# Load the training chunks workbook (semantic-chunking variant, per the file name).
df_semantic_train = pd.read_excel("../data/prepared/squad_train_v2_semantic_chunking.xlsx")

In [6]:
# Embed the "chunk_embed_text" column with all-MiniLM-L6-v2; yields the frame and the embedding matrix.
df_semantic_train, emb_semantic_train = add_embeddings(df_semantic_train, text_column="chunk_embed_text" ,model_name="all-MiniLM-L6-v2", provider="hf")

100%|██████████| 1313/1313 [17:53<00:00,  1.22it/s]


In [7]:
# (rows, columns) of the training frame.
df_semantic_train.shape

(84007, 9)

In [8]:
# (rows, embedding width) of the training matrix; the row count should equal len(df_semantic_train).
emb_semantic_train.shape

(84007, 384)

In [9]:
# Load the validation chunks workbook (semantic-chunking variant, per the file name).
df_semantic_val = pd.read_excel("../data/prepared/squad_val_v2_semantic_chunking.xlsx")

In [10]:
# Embed the validation chunks with the same model and column as the training split.
df_semantic_val, emb_semantic_val = add_embeddings(df_semantic_val, text_column="chunk_embed_text" ,model_name="all-MiniLM-L6-v2", provider="hf")

100%|██████████| 146/146 [02:00<00:00,  1.21it/s]


In [11]:
# Frame shape on the first line, embedding-matrix shape on the second.
print(df_semantic_val.shape, emb_semantic_val.shape, sep="\n")

(9308, 9)
(9308, 384)


# Save embeddings as numpy array

In [12]:
# Persist both embedding matrices as .npy files; rows are in the same order as the
# corresponding DataFrames, so the two must be kept aligned by position.
# NOTE(review): the target directory is assumed to exist already — confirm.
np.save("../data/tensors/squad_train_v2_semantic_chunking.npy", emb_semantic_train)
np.save("../data/tensors/squad_val_v2_semantic_chunking.npy", emb_semantic_val)

# Prepare missing queries

In [4]:
# Reload the inputs for building the "missing" (unanswerable) variant of the training data.
# NOTE(review): the `_l2` tensor and the `clustered` workbook are not written anywhere in the
# visible notebook — presumably produced by a separate normalisation/clustering step; confirm.
# NOTE(review): df_semantic_train is rebound here to the clustered workbook, replacing the
# frame loaded earlier under the same name.
X_semantic_train = np.load("../data/tensors/squad_train_v2_semantic_chunking_l2.npy")
df_semantic_train = pd.read_excel("../data/labelled/squad_train_v2_semantic_chunking_clustered.xlsx")
df_queries_train = pd.read_excel("../data/prepared/squad_train_v2_queries.xlsx")

In [5]:
# 1️⃣ Draw 15% of the distinct contexts (fixed seed, without replacement)
unique_contexts = df_queries_train['context_id'].unique()
n_unanswerable = int(len(unique_contexts) * 0.15)
np.random.seed(42)
unanswerable_contexts = np.random.choice(
    unique_contexts,
    size=n_unanswerable,
    replace=False,
)

# 2️⃣ Blank out answer_start for every query that belongs to a drawn context
df_queries_train.loc[
    df_queries_train['context_id'].isin(unanswerable_contexts),
    'answer_start',
] = None

# 3️⃣ Boolean mask over the chunk rows: True for chunks whose context was NOT drawn
mask_keep = ~df_semantic_train['context_id'].isin(unanswerable_contexts)

# 4️⃣ Drop the chunks of the drawn contexts and renumber the remaining rows
df_train = df_semantic_train.loc[mask_keep].reset_index(drop=True)

# 5️⃣ Apply the same positional mask to the embedding matrix so rows stay aligned
X_semantic_train_filtered = X_semantic_train[mask_keep.to_numpy()]

In [6]:
# Row counts of the filtered frame and the filtered matrix must match.
df_train.shape, X_semantic_train_filtered.shape

((71426, 10), (71426, 384))

In [10]:
# Distinct contexts, number of queries with a missing answer_start (includes any that were
# already missing in the source file), and the shape of the query frame.
df_queries_train['context_id'].nunique(), df_queries_train['answer_start'].isna().sum(), df_queries_train.shape

(18891, np.int64(13134), (87599, 4))

In [13]:
# Distinct contexts left after removing the drawn ones, and the shape of the filtered frame.
df_train['context_id'].nunique(), df_train.shape

(16058, (71426, 10))

In [14]:
# Write the "missing" variant: filtered embeddings, filtered chunk frame, and the query frame
# with answer_start blanked for the drawn contexts. index=False keeps the pandas index out
# of the workbooks.
np.save("../data/tensors/squad_train_v2_semantic_chunking_l2_missing.npy", X_semantic_train_filtered)
df_train.to_excel("../data/labelled/squad_train_v2_semantic_chunking_clustered_missing.xlsx", index=False)
df_queries_train.to_excel("../data/prepared/squad_train_v2_queries_missing.xlsx", index=False)

In [16]:
# Number of distinct cluster labels still present after filtering.
df_train['cluster'].nunique()

500

In [18]:
# Chunks per cluster in the filtered frame, largest first.
df_train['cluster'].value_counts()

cluster
157    359
119    344
17     337
34     333
209    294
      ... 
75      46
130     42
362     38
278     33
407     31
Name: count, Length: 500, dtype: int64

In [19]:
# Chunks per cluster in the unfiltered frame, for comparison with the filtered counts.
df_semantic_train['cluster'].value_counts()

cluster
119    402
17     399
157    397
34     378
294    333
      ... 
75      52
362     48
130     46
278     39
407     35
Name: count, Length: 500, dtype: int64