# Hybrid Chunking and FAISS Search
1. Import libraries and configuration
2. Core functions
3. Query execution

In [14]:
# Import libraries
from __future__ import annotations
import re
from typing import List
import os
from os.path import join as joint
import numpy as np
import pandas as pd
import pickle
import faiss
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# Configuration
DATA_DIR = r"C:\Users\USER\RAG_learning-project"
BOARD_NAME = "gossiping"
MODEL_NAME = "shibing624/text2vec-base-multilingual"

# File paths
save_dir = joint(DATA_DIR, BOARD_NAME)
index_path = joint(save_dir, "index_text2vec.faiss")
metadata_path = joint(save_dir, "metadata_text2vec.pkl")

In [17]:
df = pd.read_csv(save_dir + f"/{BOARD_NAME}_content.csv")

In [None]:
# Core functions
def initial_split(text: str) -> List[str]:
    """Split text using hybrid rules without discarding separators."""
    pattern = re.compile(
        r"(?:\n\s*\n)+"      # multiple newlines
        r"|(?=【[^】]+】)"      # section headers
        r"|(?=\d+[\.\u3001])"  # ordered lists like 1. or 1、
    )
    parts = [part.strip() for part in pattern.split(text) if part.strip()]
    return parts

def build_documents_hybrid_from_df(
    df: pd.DataFrame, 
    chunk_size: int = 300, 
    chunk_overlap: int = 30
) -> List[Document]:
    """Convert DataFrame rows into Document objects using hybrid chunking.

    Args:
        df: The DataFrame containing a ``content`` column and metadata columns.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Overlap size passed to ``RecursiveCharacterTextSplitter``.

    Returns:
        List[Document]: Documents built from all rows of ``df``.
    """
    documents: List[Document] = []
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    for idx, row in df.iterrows():
        # Convert row to dictionary to ensure proper access
        row_dict = row.to_dict()
        if "content" not in row_dict:
            continue
            
        text = str(row_dict["content"])
        # Create metadata excluding content column
        metadata = {k: v for k, v in row_dict.items() if k != "content"}

        segments = initial_split(text)
        for segment in segments:
            if len(segment) > chunk_size:
                sub_segments = splitter.split_text(segment)
            else:
                sub_segments = [segment]
            for sub in sub_segments:
                if sub.strip():  # Only add non-empty segments
                    documents.append(Document(page_content=sub, metadata=metadata))

    return documents

def build_faiss_index_from_documents(
    documents: List[Document], 
    save_dir: str,
    model_name: str = MODEL_NAME
) -> tuple:
    """Build FAISS index from documents and save to specified directory."""
    # Create save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    
    # Define file paths
    index_path = joint(save_dir, "index_text2vec.faiss")
    metadata_path = joint(save_dir, "metadata_text2vec.pkl")
    
    # Step 1: Load model
    model = SentenceTransformer(model_name)

    # Step 2: Extract texts and metadata
    texts = [doc.page_content for doc in documents]
    metadata_list = [doc.metadata | {"text": doc.page_content} for doc in documents]

    # Step 3: Encode
    embeddings = model.encode(texts, show_progress_bar=True)
    embeddings = np.array(embeddings).astype("float32")

    # Step 4: Build and save FAISS
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, index_path)

    # Step 5: Save metadata
    with open(metadata_path, "wb") as f:
        pickle.dump(metadata_list, f)

    return len(documents), index_path, metadata_path

In [None]:
document = build_documents_hybrid_from_df(df)


In [20]:
build_faiss_index_from_documents(document,save_dir=save_dir)  # documents is a list of Document objects as you showed

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

(104,
 'C:\\Users\\USER\\RAG_learning-project\\gossiping\\index_text2vec.faiss',
 'C:\\Users\\USER\\RAG_learning-project\\gossiping\\metadata_text2vec.pkl')

In [27]:
# FAISS loading functions
def load_faiss_components(index_path: str, metadata_path: str) -> tuple:
    """Load FAISS index and metadata."""
    try:
        # Load FAISS index
        index = faiss.read_index(index_path)
        
        # Load metadata
        with open(metadata_path, 'rb') as f:
            metadata_list = pickle.load(f)
            
        print(f"Successfully loaded index with {index.ntotal} vectors")
        return index, metadata_list
    except Exception as e:
        print(f"Error loading FAISS components: {e}")
        raise

# Load FAISS components once
FAISS_INDEX, METADATA_LIST = load_faiss_components(index_path, metadata_path)

Successfully loaded index with 104 vectors


In [None]:
def search_index(
    query: str,
    index=FAISS_INDEX,
    metadata_list=METADATA_LIST,
    model_name: str = MODEL_NAME,
    top_k: int = 5,
    distance_threshold: float = 1.5  # Add threshold to filter irrelevant results
) -> List[tuple[dict, float]]:
    """Search the loaded FAISS index with distance threshold."""
    # Load model and encode query
    model = SentenceTransformer(model_name)
    query_vec = model.encode([query]).astype("float32")
    
    # Search
    distances, indices = index.search(query_vec, top_k * 2)  # Get more results initially
    
    # Get results with distances
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        if idx < len(metadata_list) and dist < distance_threshold:
            results.append((metadata_list[idx], float(dist)))
    
    # Sort by distance and take top_k
    results.sort(key=lambda x: x[1])
    return results[:top_k]

def execute_query(query: str, top_k: int = 5) -> List[dict]:
    """Execute a query and format results with relevance scores."""
    results = search_index(query, top_k=top_k)
    
    if results:
        print(f"\n🔍 Query: {query}")
        for i, (res, distance) in enumerate(results, 1):
            relevance = max(0, (1 - distance/2)) * 100  # Convert distance to relevance score
            print(f"\n🔹 Result {i} (Relevance: {relevance:.1f}%)")
            print("📄 Title:", res.get("title"))
            print("📅 Date:", res.get("date"))
            print("🔗 Link:", res.get("link"))
            print("📝 Text:", res.get("text")[:200], "...")
    else:
        print(f"\n❌ No relevant results found for query: {query}")
    
    return [r[0] for r in results]  # Return just the metadata dictionaries

# Test with different queries
test_queries = [
    "颱風造成什麼災情？",
    "台南有什麼狀況？",
    "通訊中斷的情況？",
]

for query in test_queries:
    print("\n" + "="*50)
    results = execute_query(query, top_k=3)


🔹 Result 1
📄 Title: [新聞] 丹娜絲災後21天 台南學甲區仍持續沒網路
📅 Date: 7/27
🔗 Link: https://www.ptt.cc/bbs/Gossiping/M.1753595102.A.562.html
📝 Text: 1.媒體來源:
聯合報 ...

🔹 Result 2
📄 Title: [新聞] 大罷免大挫敗戰犯還在卸責！楊蕙如：沒救
📅 Date: 7/27
🔗 Link: https://www.ptt.cc/bbs/Gossiping/M.1753630801.A.7AC.html
📝 Text: 8.htm ...

🔹 Result 3
📄 Title: [新聞] 謝淑薇：自由不是拿來傷人的武器 一發文
📅 Date: 7/27
🔗 Link: https://www.ptt.cc/bbs/Gossiping/M.1753630970.A.44C.html
📝 Text: 8.htm ...

🔹 Result 4
📄 Title: [新聞] 災區「至今手機難通訊」沒第四台 網傻眼
📅 Date: 7/28
🔗 Link: https://www.ptt.cc/bbs/Gossiping/M.1753635718.A.4B2.html
📝 Text: 讓受災居民早日恢復正常生活。
網友感嘆「只因你不是郭國文吧」、「中華電信也是只有一格訊號，沒網路」、「老宅三
民路也如此，沒Wi-Fi沒第四台」、「真爛，什麼政府」。 ...

🔹 Result 5
📄 Title: [新聞] 災區「至今手機難通訊」沒第四台 網傻眼
📅 Date: 7/28
🔗 Link: https://www.ptt.cc/bbs/Gossiping/M.1753635718.A.4B2.html
📝 Text: 2.記者署名:
祝潤霖
災區「至今手機難通訊」沒第四台 網傻眼：你不是郭國文 ...
