In [1]:
import json
import os
import pickle

import faiss
import numpy as np

from src.tagging import build_tfidf_tags
from src.embedder import SentenceTransformer

%load_ext autoreload
%autoreload 1

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pre-chunked document sections. Later cells read each entry's
# "content" and "heading" keys.
# The context manager closes the file handle deterministically, and the
# explicit encoding keeps the read independent of the platform's locale.
with open("data/chunks/custom_sections.json", encoding="utf-8") as chunks_file:
	all_chunks = json.load(chunks_file)

In [None]:
# Parallel per-chunk lists (text, heading, metadata dict): position i in each
# list refers to the same chunk. They are populated in the following cell.
chunks, sources, metadata = [], [], []

In [4]:
# Build the three parallel per-chunk lists in one pass over all_chunks.
# They are rebuilt from scratch here (not appended to), so re-running this
# cell is idempotent and cannot duplicate entries or shift list positions
# relative to the tags and embeddings computed from `chunks` later on.
chunks = [c["content"] for c in all_chunks]
sources = [c["heading"] for c in all_chunks]

# One metadata record per chunk; "chunk_id" is the chunk's position in `chunks`.
# NOTE: table detection is not wired up in this notebook, so "has_table" is
# always False, and no tokenizer/sliding-window fields are recorded.
metadata = [
	{
		"filename": "data/chunks/custom_sections.json",
		"chunk_id": i,
		"mode": "section",
		"keep_tables": True,
		"char_len": len(text),
		"word_len": len(text.split()),  # whitespace-delimited word count
		"has_table": False,
	}
	for i, text in enumerate(chunks)
]

In [5]:
# Settings handed to the project's TF-IDF tagger.
# NOTE(review): the option names look like scikit-learn TfidfVectorizer
# settings plus a per-chunk tag cap; confirm against src.tagging.
tfidf_options = {
	"ngram_range": (1, 3),
	"max_features": 25000,
	"min_df": 2,
	"max_df": 0.6,
	"top_k_per_chunk": 10,
}

# Returns a pair: the fitted vectorizer object and the per-chunk tags
# (one entry per element of `chunks`, consumed positionally later on).
vectorizer, chunk_tags = build_tfidf_tags(chunks, **tfidf_options)

In [6]:
# Attach each chunk's tags to its metadata record under the "tags" key.
# The pairing is purely positional, so fail loudly if the two sequences have
# drifted apart instead of tagging the wrong records or leaving some untagged.
if len(chunk_tags) != len(metadata):
	raise ValueError(
		f"chunk_tags has {len(chunk_tags)} entries but metadata has {len(metadata)}; "
		"rebuild both from the same chunks before attaching tags"
	)
for meta_entry, tags in zip(metadata, chunk_tags):
	meta_entry["tags"] = tags

In [None]:
# Load the local GGUF embedding model through the project's own
# SentenceTransformer class (imported from src.embedder).
# NOTE(review): n_ctx / n_threads are presumably the context window and the
# CPU thread count; confirm in src.embedder and tune for the host machine.
embedder = SentenceTransformer(
	"models/Qwen3-Embedding-4B-Q8_0.gguf",
	n_ctx=40960,
	n_threads=16,
)

# Embed every chunk text, four at a time, with a progress bar.
embeddings = embedder.encode(chunks, batch_size=4, show_progress_bar=True)

In [13]:
# FAISS works on C-contiguous float32 matrices. Coerce once under a new name
# (a no-op when the embedder already returns that layout) so `embeddings`
# itself is left untouched and this cell stays re-runnable.
embeddings_f32 = np.ascontiguousarray(embeddings, dtype=np.float32)

# Index row i must correspond to chunks[i], because the index and the chunk
# list are persisted separately and matched up by position.
if embeddings_f32.ndim != 2 or embeddings_f32.shape[0] != len(chunks):
	raise ValueError(
		f"expected a ({len(chunks)}, dim) embedding matrix, "
		f"got shape {embeddings_f32.shape}"
	)

dim = embeddings_f32.shape[1]
index = faiss.IndexFlatL2(dim)  # flat index with L2 distance
index.add(embeddings_f32)

In [14]:
# Filename prefix shared by every artifact the persistence cell writes:
# the FAISS index, the chunk/source/metadata pickles, and the files under meta/.
out_prefix = "test_custom"

In [16]:
# Vector index goes into the working directory under the shared prefix.
faiss.write_index(index, f"{out_prefix}.faiss")

# Per-chunk lists, pickled next to the index.
index_sidecars = (
	(f"{out_prefix}_chunks.pkl", chunks),
	(f"{out_prefix}_sources.pkl", sources),
	(f"{out_prefix}_meta.pkl", metadata),
)
for target_path, payload in index_sidecars:
	with open(target_path, "wb") as fh:
		pickle.dump(payload, fh)

# Tagging artifacts (fitted vectorizer and per-chunk tags) live under meta/.
os.makedirs("meta", exist_ok=True)
tagging_artifacts = (
	(os.path.join("meta", f"{out_prefix}_tfidf.pkl"), vectorizer),
	(os.path.join("meta", f"{out_prefix}_tags.pkl"), chunk_tags),
)
for target_path, payload in tagging_artifacts:
	with open(target_path, "wb") as fh:
		pickle.dump(payload, fh)