# Books Preparation

In [None]:
!unzip /content/categories.zip


Archive:  /content/categories.zip
   creating: categories/
   creating: categories/الفقه الحنبلي/
  inflating: categories/الفقه الحنبلي/كشاف-القناع-عن-متن-الإقناع_البُهُوتي.txt  
   creating: categories/الفقه الحنفي/
  inflating: categories/الفقه الحنفي/البحر-الرائق-شرح-كنز-الدقائق-ومنحة-الخالق-وتكملة-الطوري_ابن-نجيم،-زين-الدين.txt  
   creating: categories/الفقه الشافعي/
  inflating: categories/الفقه الشافعي/نهاية-المحتاج-إلى-شرح-المنهاج_الرملي،-شمس-الدين.txt  
   creating: categories/الفقه العام/
  inflating: categories/الفقه العام/المجموع شرح المهذب النووي - أبو زكريا محيي الدين يحيى بن شرف النووي.txt  
   creating: categories/الفقه المالكي/
  inflating: categories/الفقه المالكي/شرح-مختصر-خليل-للخرشي_محمد-بن-عبد-الله-الخرشي.txt  


In [1]:
import os
def generate_books_dictionary(root_dir):
    """
    Build a list of book records from a two-level directory tree.

    Every immediate subdirectory of ``root_dir`` is treated as a category and
    every ``.txt`` file directly inside it as one book. The book name is the
    file name without the ``.txt`` suffix, with hyphens replaced by spaces
    (underscores are left untouched).

    Example:
        categories/
        ├── الفقه المالكي/
        │   ├── شرح-مختصر-خليل.txt
        ├── الفقه العام/
        │   ├── المجموع-شرح-المهذب_النووي.txt

        Returns (categories, then files, in sorted order):
        [
            {'path': 'categories/الفقه العام/المجموع-شرح-المهذب_النووي.txt',
             'name': 'المجموع شرح المهذب_النووي', 'category': 'الفقه العام'},
            {'path': 'categories/الفقه المالكي/شرح-مختصر-خليل.txt',
             'name': 'شرح مختصر خليل', 'category': 'الفقه المالكي'}
        ]

    Args:
        root_dir (str): Directory whose subdirectories are the categories.

    Returns:
        list[dict]: One dict per book with the keys "path", "name" and "category".

    Raises:
        ValueError: If ``root_dir`` is not a directory, if no ``.txt`` file is
            found, or if a file is missing or its first characters cannot be
            decoded as UTF-8. For the last two cases the underlying exception
            is attached as ``__cause__``.
    """
    # Ensure root_dir exists
    if not os.path.isdir(root_dir):
        raise ValueError(f"Root directory does not exist: {root_dir}")

    books = []

    # os.listdir() returns entries in arbitrary order; sorting makes the book
    # order (and everything derived from it) the same on every run.
    for category in sorted(os.listdir(root_dir)):
        category_path = os.path.join(root_dir, category)
        if not os.path.isdir(category_path):
            continue  # Skip non-directories

        for file_name in sorted(os.listdir(category_path)):
            if not file_name.endswith(".txt"):
                continue  # Skip non-.txt files

            file_path = os.path.join(category_path, file_name)

            # Drop the ".txt" suffix and turn hyphens into spaces.
            book_name = file_name[: -len(".txt")].replace("-", " ")

            books.append({
                "path": file_path,
                "name": book_name,
                "category": category
            })

    # Validate books list
    if not books:
        raise ValueError(f"No .txt files found in {root_dir}")

    # Verify file readability and UTF-8 encoding
    for book in books:
        try:
            with open(book["path"], "r", encoding="utf-8") as f:
                f.read(100)  # Only the first 100 characters are checked
        except UnicodeDecodeError as exc:
            raise ValueError(f"File is not UTF-8 encoded: {book['path']}") from exc
        except FileNotFoundError as exc:
            raise ValueError(f"File not found: {book['path']}") from exc

    return books



In [2]:
# Sanity check: build the book index and print the category of its first record.

root_dir = "/content/categories"  # Replace with your root directory path
books = generate_books_dictionary(root_dir)
for book in books[:1]:  # only the first record is inspected
    print(book["category"])

ValueError: Root directory does not exist: /content/categories

# Vector Database Generation

In [3]:
! pip install langchain



In [4]:
def split_by_page_separator(file_path, separator="الصفحة:"):
    """
    Split a UTF-8 text file into pages on a separator string.

    Args:
        file_path (str): Path of the text file to read.
        separator (str): Marker that delimits pages. Defaults to "الصفحة:".

    Returns:
        list[tuple[str, int]]: ``(page_content, page_number)`` tuples with the
        content stripped of surrounding whitespace. ``page_number`` is the
        1-based position of the piece among ALL pieces produced by the split;
        pieces that are empty after stripping are dropped but still consume a
        number, so the numbers can have gaps.

    Raises:
        ValueError: If ``separator`` is empty.
    """
    if not separator:
        # str.split("") raises and str.split(None) would silently split on
        # whitespace; reject both up front with a clear message.
        raise ValueError("separator must be a non-empty string")

    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    page_list = []
    for page_number, raw_page in enumerate(text.split(separator), start=1):
        page_content = raw_page.strip()
        if page_content:
            page_list.append((page_content, page_number))

    return page_list

In [5]:
from langchain.docstore.document import Document

def create_page_documents(book):
    """
    Turn one book record into page-level LangChain Documents.

    Args:
        book (dict): Record with the keys "path", "name" and "category".

    Returns:
        List[Document]: One Document per page returned by
        split_by_page_separator(book["path"]). Each carries the page text as
        ``page_content`` and "book_name", "category" and "page" in its metadata.
    """
    documents = []
    for page_content, page_number in split_by_page_separator(book["path"]):
        page_metadata = {
            "book_name": book["name"],
            "category": book["category"],
            "page": page_number,
        }
        documents.append(Document(page_content=page_content, metadata=page_metadata))
    return documents

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_documents(documents, chunk_size=1000, chunk_overlap=400):
    """
    Split documents into smaller, overlapping chunks for embedding.

    Args:
        documents (List[Document]): Documents to split.
        chunk_size (int): Value passed to RecursiveCharacterTextSplitter as
            ``chunk_size``. Defaults to 1000.
        chunk_overlap (int): Value passed to RecursiveCharacterTextSplitter as
            ``chunk_overlap``. Defaults to 400.

    Returns:
        List[Document]: Whatever ``split_documents`` returns for the input.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    return text_splitter.split_documents(documents)

In [7]:
!pip install -U langchain-community



In [8]:
!pip install chromadb



In [9]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import os
# Initialize embeddings (multilingual for Arabic)
# The trailing comment on the model line records an alternative model name (LaBSE).
# NOTE(review): the E5 model card describes "query: " / "passage: " input prefixes; the
# cells visible here embed raw text without them — confirm whether that is intentional.
embeddings = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-large" #sentence-transformers/LaBSE
)

  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/160k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

In [10]:

def store_embeddings(chunks, persist_directory="./Fiqh_Books_db"):
    """
    Embed the chunks, store them in a Chroma database, and print a metadata sample.

    Uses the module-level ``embeddings`` object as the embedding function.

    Args:
        chunks (List[Document]): List of chunked Document objects.
        persist_directory (str): Directory to store Chroma database.

    Returns:
        None. Progress and a sample of up to five metadata dicts are printed.
    """
    # Store in Chroma
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=persist_directory
    )

    # persist() is deprecated in the LangChain Chroma wrapper (this notebook's
    # own output shows the warning). Call it only where the method still
    # exists so the function keeps working if it is removed.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        persist()
    print(f"Vector store created at {persist_directory}")

    # Validate metadata: ask the store for five records only, instead of
    # loading the entire collection just to print a handful of rows.
    sample = vectorstore.get(limit=5, include=["metadatas"])
    for meta in sample["metadatas"]:
        print(f"Metadata: {meta}")

In [11]:
def preprocess_arabic_books(books, persist_directory="./Fiqh_Books_db"):
    """
    Preprocess Arabic text books: load, add metadata, chunk, embed, and store.

    For every book the pages are turned into Documents, the Documents are
    chunked, and a preview of the first chunk is printed. All chunks from all
    books are then handed to store_embeddings in a single call.

    Args:
        books (List[Dict]): List of dicts with 'path', 'name', 'category'.
        persist_directory (str): Directory to store Chroma database.
    """
    all_chunks = []

    # Process each book
    for book in books:
        # Create page-level documents
        page_documents = create_page_documents(book)

        # Chunk documents
        chunks = chunk_documents(page_documents)
        print(f"Generated {len(chunks)} chunks for {book['name']}")

        if not chunks:
            # A book that produces no chunks has no first chunk to preview;
            # indexing chunks[0] here would raise IndexError and abort the run.
            print(f"Warning: no chunks generated for {book['name']}; nothing to add.")
            continue

        first_chunk = chunks[0]
        print(f"First Chunk Content:{first_chunk.page_content} \nMetadata:")
        for key, value in first_chunk.metadata.items():
            print(f"{key}: {value}")
        print("***********************************")
        all_chunks.extend(chunks)

    # Store embeddings
    store_embeddings(all_chunks, persist_directory)

In [None]:
 # Create vector store
# Skip the build when ./Fiqh_Books_db already exists, so re-running this cell does not
# re-embed the books.
# NOTE(review): a partially written directory left by an interrupted run also skips the build.
if not os.path.exists("./Fiqh_Books_db"):
    preprocess_arabic_books(books)

Generated 15720 chunks for كشاف القناع عن متن الإقناع_البُهُوتي
First Chunk Content:كشاف القناع عن متن الإقناع البهوتي الكتاب: كشاف القناع عن متن الإقناعالمؤلف: منصور بن يونس بن صلاح الدين ابن حسن بن إدريس البهوتى الحنبلى (المتوفى: 1051هـ)الناشر: دار الكتب العلميةعدد الأجزاء:6[ترقيم الكتاب موافق للمطبوع]
* * *[مقدمة الكتاب]بسم الله الرحمن الرحيم الحمد لله الذي شرح صدورنا بالهداية إلى الإسلام، ووفقنا للتفقه في الدين وما شرعه من بديع محكم الأحكام، أحمده سبحانه وتعالى على جزيل الإنعام، وأشكره أن علم بالقلم علم الإنسان ما لم يعلم فأتقن وأحكم أي إحكام
الجزء: 1 ¦ 
Metadata:
book_name: كشاف القناع عن متن الإقناع_البُهُوتي
category: الفقه الحنبلي
page: 1
***********************************
Generated 25247 chunks for البحر الرائق شرح كنز الدقائق ومنحة الخالق وتكملة الطوري_ابن نجيم، زين الدين
First Chunk Content:البحر الرائق شرح كنز الدقائق ومنحة الخالق وتكملة الطوريابن نجيمالكتاب: البحر الرائق شرح كنز الدقائقالمؤلف: زين الدين بن إبراهيم بن محمد، المعروف بابن نجيم المصري (المتوفى: 970هـ)وفي آخره

  vectorstore.persist()


Vector store created at ./Fiqh_Books_db
Metadata: {'book_name': 'كشاف القناع عن متن الإقناع_البُهُوتي', 'page': 1, 'category': 'الفقه الحنبلي'}
Metadata: {'category': 'الفقه الحنبلي', 'book_name': 'كشاف القناع عن متن الإقناع_البُهُوتي', 'page': 2}
Metadata: {'page': 2, 'book_name': 'كشاف القناع عن متن الإقناع_البُهُوتي', 'category': 'الفقه الحنبلي'}
Metadata: {'book_name': 'كشاف القناع عن متن الإقناع_البُهُوتي', 'page': 2, 'category': 'الفقه الحنبلي'}
Metadata: {'page': 2, 'category': 'الفقه الحنبلي', 'book_name': 'كشاف القناع عن متن الإقناع_البُهُوتي'}


In [None]:
!cp -r /content/Fiqh_Books_db /content/drive/MyDrive/



# Retrieval Example

In [12]:
# import drive to get saved database
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import os


In [None]:
# Initialize the same embedding model that the store-building section of this notebook uses
# ("intfloat/multilingual-e5-large"), so that queries are embedded by the same model as the
# stored chunks.
embeddings = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-large"
)

In [29]:

# Step 2: Load vector store
def load_vector_store(persist_directory="/content/drive/MyDrive/Fiqh_Books_db", metadata_filter=None):
    """
    Open the persisted Chroma store and build a similarity retriever on it.

    Uses the module-level ``embeddings`` object as the embedding function.

    Args:
        persist_directory (str): Directory of the persisted Chroma database.
        metadata_filter (dict, optional): If truthy, passed to the retriever
            as the "filter" entry of its search kwargs.

    Returns:
        tuple: ``(vectorstore, retriever)``; the retriever uses similarity
        search with k=10.
    """
    vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

    search_kwargs = {"k": 10}
    if metadata_filter:
        search_kwargs["filter"] = metadata_filter
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs=search_kwargs)

    # Report success only once a store has actually been opened; this message
    # used to run at module level, i.e. when the function was merely defined.
    print("Vector store loaded successfully.")
    return vectorstore, retriever

def retrieve_chunks(vectorstore, query, metadata_filter=None, k=30):
    """
    Run a similarity search for ``query`` and return the matching chunks.

    Args:
        vectorstore: Object exposing ``similarity_search`` (the Chroma store).
        query (str): User query (e.g., in Arabic).
        metadata_filter (dict, optional): Forwarded as ``filter`` when truthy
            (e.g., {"book_name": "AI Ethics"}); omitted from the call otherwise.
        k (int): Number of chunks to retrieve (default: 30).

    Returns:
        List[Document]: Whatever ``vectorstore.similarity_search`` returns.
    """
    # The filter keyword is only sent when a non-empty filter was supplied.
    filter_kwargs = {"filter": metadata_filter} if metadata_filter else {}
    return vectorstore.similarity_search(query=query, k=k, **filter_kwargs)

# Demo: run one query restricted to a single category and print every hit with its page.
query = "ما هي أركان الحج؟"
vectorstore, retriever = load_vector_store()
retrieved_docs = retrieve_chunks(vectorstore, query, metadata_filter={"category": "الفقه العام"})
context_parts = []
for i, doc in enumerate(retrieved_docs):
    context_parts.append(
        f"\nRetrieved Chunk {i + 1}:\nContent: {doc.page_content}\nMetadata: {doc.metadata.get('page')}\n"
    )
context = "".join(context_parts)
print(context)

Vector store loaded successfully.

Retrieved Chunk 1:
Content: المصنف في آخر باب المواقيت والله أعلم * قال المصنف رحمه الله* (اركان الحج أربعة الاحرام والوقوف بعرفه وطواف الافاضة والسعي بين الصفا والمروة* وواجباته الاحرام من الميقات والرمي وفي الوقوف بعرفة إلى أن تغرب الشمس والمبيت بالمزدلفة والمبيت بمنى في ليالي الرمي وفي طواف الوداع قولان (احدهما) أنه واجب (والثاني) ليس بواجب* وسننه الغسل وطواف القدوم والرمل والاضطباع في الطواف والسعي واستلام الركن وتقبيله والسعي في موضع السعي والمشي في موضع المشي والخطب والاذكار والادعية* وأفعال العمرة كلها أركان الا الحلق* فمن ترك ركنا لم يتم نسكه ولا تحلل حتى يأتي به* ومن ترك واجبا لزمه الدم* ومن ترك سنة لم يلزمه شئ)*(الشرح) قال أصحابنا أعمال الحج ثلاثة أقسام - أركان - وواجبات - وسنن - (أما) الأركان فخمسة - الإحرام - والوقوف - وطواف الإفاضة - والسعي - والحلق إذا قلنا بالأصح إن الحلق نسك وإن قلنا ليس بنسك فأركانه الأربعة الأولى (وأما) الواجبات فاثنان متفق عليهما وأربعة مختلف
Metadata: 4241

Retrieved Chunk 2:
Content: فالأركان لا يتم الحج ويجزئ حتى

In [36]:
import pandas as pd
import time
from pathlib import Path
import json
from typing import List, Dict, Any
import os # For potential environment variables or path joining

# --- Vector Store and Retrieval Imports & Setup (Actual Implementations) ---
from langchain_community.vectorstores import Chroma # Corrected import
# from langchain_community.embeddings import HuggingFaceEmbeddings # Assuming 'embeddings' is globally defined
from langchain_core.documents import Document # For type hinting and if your VS returns this

# --- Actual Embeddings and Vector Store Loading for WORKER ---
print("WORKER: Initializing embeddings...")
try:
    # Reuse the `embeddings` object that an earlier cell is expected to have created. If that
    # name is undefined, the assignment below raises NameError, which is handled underneath
    # by setting embeddings_worker to None.
    # Example of creating it first:
    # from langchain_community.embeddings import HuggingFaceEmbeddings
    # embeddings_model_name = "sentence-transformers/all-MiniLM-L6-v2"
    # embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
    # NOTE(review): the example model above differs from "intfloat/multilingual-e5-large",
    # the model this notebook uses when it builds the store — confirm which one applies here.
    embeddings_worker = embeddings # Critical: 'embeddings' must be defined before this script runs
    print("WORKER: Embeddings initialized successfully.")
except NameError: # If 'embeddings' is not defined
    print("WORKER: FATAL Error initializing embeddings: 'embeddings' global variable not found. Please define it.")
    embeddings_worker = None
except Exception as e:
    # Any other failure is reported the same way: the worker continues without embeddings.
    print(f"WORKER: FATAL Error initializing embeddings: {e}")
    embeddings_worker = None

def load_actual_vector_store_for_worker(persist_dir: str) -> Chroma:
    """
    Open the persisted Chroma store for the worker.

    Args:
        persist_dir: Directory holding the persisted Chroma database.

    Returns:
        The Chroma instance, or None when ``embeddings_worker`` is falsy, the
        directory does not exist, or opening the store raises. Every failure
        is reported with a printed message instead of an exception.
    """
    if not embeddings_worker:
        print("WORKER: Cannot load vector store without embeddings.")
        return None

    print(f"WORKER: Attempting to load vector store from: {persist_dir}")
    try:
        store_path = Path(persist_dir)
        if store_path.exists():
            store = Chroma(
                persist_directory=persist_dir,
                embedding_function=embeddings_worker,
            )
            print("WORKER: Vector store loaded successfully.")
            return store

        # Missing directory: report it rather than opening a store at that path.
        print(f"WORKER: FATAL - Persist directory not found: {persist_dir}")
        return None
    except Exception as load_error:
        print(f"WORKER: FATAL Error loading vector store from {persist_dir}: {load_error}")
        import traceback
        traceback.print_exc()
        return None

# Directory of the persisted Chroma database that the worker serves.
VECTOR_STORE_PERSIST_DIR = "/content/drive/MyDrive/Fiqh_Books_db" # Ensure this is correct

# Module-level handle read by the worker functions; stays None if loading is skipped or fails.
vectorstore_global_worker = None

if not embeddings_worker:
    print("WORKER: Skipping vector store load due to embedding initialization failure.")
else:
    vectorstore_global_worker = load_actual_vector_store_for_worker(VECTOR_STORE_PERSIST_DIR)

if vectorstore_global_worker is None:
    print("WORKER: FATAL - Vector store could not be loaded. Worker will not be able to process requests.")

def perform_actual_retrieval(query_text: str, metadata_filter_dict: Dict = None, k_results: int = 10) -> List[str]:
    """
    Run a similarity search on the module-level worker vector store and format the hits.

    Args:
        query_text: The query to search for.
        metadata_filter_dict: Optional metadata filter; forwarded to the search
            as ``filter`` only when truthy.
        k_results: Requested number of results. It is only logged: the search
            always uses a fixed k of 30 (see the comment in the body).

    Returns:
        List[str]: One string per hit, in the form
        "<page_content>. Metadata: page: <page>. book_name: <book_name>".
        An empty list is returned when the vector store is not loaded or the
        search raises (the error and its traceback are printed).
    """
    if vectorstore_global_worker is None:
        print("ERROR (Worker): Global vector store for worker is not available or not loaded.")
        return []

    print(f"WORKER: Performing REAL retrieval for query='{query_text}', k={k_results}, filter={metadata_filter_dict}")
    try:
        # Deliberate override kept from the original script: the k requested
        # through the Excel sheet is ignored and a fixed value is used instead.
        # To honour the requested value, replace 30 with k_results.
        effective_k = 30

        search_kwargs = {"k": effective_k}
        if metadata_filter_dict:
            search_kwargs["filter"] = metadata_filter_dict

        # Search once. The same query used to be issued twice in a row, which
        # doubled the work and duplicated the log line without changing the result.
        retrieved_docs = vectorstore_global_worker.similarity_search(
            query=query_text,
            **search_kwargs
        )
        print(f"WORKER: Retrieved {len(retrieved_docs)} documents from actual vector store using k={effective_k}.")

        return [
            f"{doc.page_content}. Metadata: page: {doc.metadata.get('page')}. book_name: {doc.metadata.get('book_name')}"
            for doc in retrieved_docs
        ]
    except Exception as e:
        print(f"WORKER: Error during similarity search with actual vector store: {e}")
        import traceback
        traceback.print_exc()
        return []

# def serialize_documents_to_json_string(retrieved_docs_list: List[Document]) -> str:
#     """
#     Serializes a list of Document objects into a JSON string.
#     (This function remains unchanged)
#     """
#     if not retrieved_docs_list:
#         return "[]"

#     docs_as_dicts = []
#     for doc in retrieved_docs_list:
#         if hasattr(doc, 'page_content') and hasattr(doc, 'metadata'):
#             docs_as_dicts.append({
#                 "page_content": doc.page_content,
#                 "metadata": doc.metadata
#             })
#         else:
#             print(f"WORKER: Warning - Document object {type(doc)} is missing page_content or metadata. Skipping.")
#     try:
#         return json.dumps(docs_as_dicts, ensure_ascii=False, indent=2)
#     except TypeError as e:
#         print(f"WORKER: Error serializing documents to JSON: {e}. Ensure metadata contains JSON-serializable types.")
#         return json.dumps([{"error": "Serialization failure on worker", "details": str(e)}], ensure_ascii=False)
import json
from typing import List

def serialize_documents_to_json_string(retrieved_docs_list: List[str]) -> str:
    """
    Serialize a list of text strings into a JSON array of {"content": ...} objects.

    Items that are not ``str`` are skipped with a printed warning. A falsy
    input yields the literal "[]". The JSON is pretty-printed with a
    two-space indent and non-ASCII characters are kept as-is.
    """
    if not retrieved_docs_list:
        return "[]"

    payload = []
    for item in retrieved_docs_list:
        if not isinstance(item, str):
            print(f"WORKER: Warning - Item {type(item)} is not a string. Skipping.")
            continue
        payload.append({"content": item})

    try:
        serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    except TypeError as e:
        print(f"WORKER: Error serializing documents to JSON: {e}. Ensure content contains JSON-serializable types.")
        serialized = json.dumps([{"error": "Serialization failure on worker", "details": str(e)}], ensure_ascii=False)
    return serialized
# --- Configuration for the Excel Bridge ---
# Workbook the worker polls for requests and writes responses into.
# NOTE(review): this file sits inside the Chroma persist directory (Fiqh_Books_db) — confirm
# that mixing it with the database files is intended.
EXCEL_BRIDGE_PATH = "/content/drive/MyDrive/Fiqh_Books_db/retrieval_bridge.xlsx"
# Passed to time.sleep() between polls, i.e. a number of seconds.
WORKER_CHECK_INTERVAL = 5

# Input columns read by the worker. The filter cell is parsed with json.loads and must decode
# to a dict to be used.
COL_INPUT_QUERY = "Input_Query"
COL_INPUT_FILTER = "Input_Filter_Dict"
COL_INPUT_K = "Input_K_Results"
# COL_OUTPUT_RESPONSE = "Output_Response_From_Other_Notebook" # Original single output column name
# Output columns are named "<base>_Part_0", "<base>_Part_1", ...; a response longer than
# MAX_CELL_CHARS is split across consecutive part columns.
COL_OUTPUT_BASE_NAME = "Output_Response_From_Other_Notebook" # MODIFIED: Base name for output parts
# Presumably chosen to stay below Excel's per-cell character limit — TODO confirm.
MAX_CELL_CHARS = 30000 # MODIFIED: Max characters per Excel cell part

def _ensure_object_column(df_bridge, column_name):
    """Cast ``column_name`` to object dtype (in place) so string cells can be assigned."""
    # An output column with no values yet is typically read back by pandas as
    # float64 NaN. Assigning a str into such a column depends on a silent
    # upcast that pandas has deprecated, so make the dtype explicit first.
    if df_bridge[column_name].dtype != object:
        df_bridge[column_name] = df_bridge[column_name].astype(object)


def process_retrieval_requests():
    """
    Poll the Excel bridge file forever and answer the retrieval requests in it.

    Each cycle reads EXCEL_BRIDGE_PATH, adds any missing required column, and
    treats every row that has a query and a K value but an empty first output
    part as a new request. For each request it runs perform_actual_retrieval,
    serializes the result to JSON, and writes the JSON into the
    "<COL_OUTPUT_BASE_NAME>_Part_<n>" columns in slices of MAX_CELL_CHARS
    characters. The workbook is written back once per cycle, after all new
    requests have been handled.

    A row whose K value cannot be turned into a positive integer gets a JSON
    error object in its first output part instead of a result.

    Returns:
        None. Returns immediately if the worker vector store is not loaded;
        otherwise loops until interrupted. The worker does not create the
        bridge file itself; it waits until the file exists.
    """
    print("--- Retrieval Bridge Worker Started (Using Actual Vector Store & Multi-Part Output) ---")

    if vectorstore_global_worker is None:
        print("WORKER: Aborting process_retrieval_requests because vector store is not loaded.")
        return

    while True:
        try:
            if not Path(EXCEL_BRIDGE_PATH).exists():
                print(f"WORKER: Waiting for bridge file to be created at {EXCEL_BRIDGE_PATH}...")
                time.sleep(WORKER_CHECK_INTERVAL)
                continue

            df_bridge = pd.read_excel(EXCEL_BRIDGE_PATH)

            first_output_part_col = f"{COL_OUTPUT_BASE_NAME}_Part_0"

            # Make sure the input columns and the first output part exist.
            all_cols_present = True
            required_cols = [COL_INPUT_QUERY, COL_INPUT_FILTER, COL_INPUT_K, first_output_part_col]
            for col in required_cols:
                if col not in df_bridge.columns:
                    print(f"WORKER: Column '{col}' missing in {EXCEL_BRIDGE_PATH}. Adding it.")
                    df_bridge[col] = pd.NA # Use pd.NA for object columns
                    all_cols_present = False

            if not all_cols_present:
                df_bridge.to_excel(EXCEL_BRIDGE_PATH, index=False)
                print("WORKER: Added missing columns. Will re-check in next cycle.")
                time.sleep(WORKER_CHECK_INTERVAL)
                continue

            # A request is new when it has a query and a K but no first output part yet.
            new_requests = df_bridge[
                df_bridge[COL_INPUT_QUERY].notna()
                & df_bridge[COL_INPUT_K].notna()
                & df_bridge[first_output_part_col].isna()
            ]

            if not new_requests.empty:
                print(f"WORKER: Found {len(new_requests)} new retrieval requests.")
                for index, row in new_requests.iterrows():
                    query = str(row[COL_INPUT_QUERY])
                    filter_str = str(row[COL_INPUT_FILTER]) if pd.notna(row[COL_INPUT_FILTER]) else ""
                    k_val = row[COL_INPUT_K]

                    try:
                        k = int(k_val)
                        if k <= 0:
                            raise ValueError("K must be a positive integer.")
                    except (ValueError, TypeError, OverflowError) as e:
                        # TypeError / OverflowError (e.g. a non-numeric cell type or an
                        # infinite value) must be answered here as well. Otherwise they
                        # escape to the outer handler, nothing is written back, and the
                        # whole batch is retried on every cycle.
                        print(f"WORKER: Invalid K value '{k_val}' for request at index {index}. Error: {e}. Skipping request.")
                        error_json_string = json.dumps([{"error": f"Invalid K value: {k_val}"}], ensure_ascii=False)
                        _ensure_object_column(df_bridge, first_output_part_col)
                        df_bridge.loc[index, first_output_part_col] = error_json_string
                        # Clear other potential part columns for this row
                        part_idx_clear = 1
                        while True:
                            next_part_col_clear = f"{COL_OUTPUT_BASE_NAME}_Part_{part_idx_clear}"
                            if next_part_col_clear in df_bridge.columns:
                                df_bridge.loc[index, next_part_col_clear] = pd.NA
                                part_idx_clear += 1
                            else:
                                break
                        continue

                    print(f"WORKER: Processing request at index {index}: Query='{query}', FilterStr='{filter_str}', K={k}")

                    # The filter cell is optional; anything that is not a JSON object is ignored.
                    metadata_filter = None
                    if filter_str.strip() and filter_str.lower() != 'nan' and filter_str.lower() != '<na>':
                        try:
                            metadata_filter = json.loads(filter_str)
                            if not isinstance(metadata_filter, dict):
                                print(f"WORKER: Warning - Parsed filter is not a dict: {metadata_filter}. Using no filter.")
                                metadata_filter = None
                        except json.JSONDecodeError:
                            print(f"WORKER: Warning - Could not decode filter string: '{filter_str}'. Using no filter.")
                            metadata_filter = None

                    response_json_string = ""
                    try:
                        retrieved_document_objects = perform_actual_retrieval(query, metadata_filter, k)
                        response_json_string = serialize_documents_to_json_string(retrieved_document_objects)
                        print(f"WORKER: Serialized {len(retrieved_document_objects)} documents to JSON for index {index}.")
                    except Exception as e_retrieve:
                        print(f"WORKER: Critical error during retrieval/serialization for index {index}: {e_retrieve}")
                        error_report = {"error": "Worker retrieval/serialization failed", "details": str(e_retrieve)}
                        response_json_string = json.dumps([error_report], ensure_ascii=False)

                    # --- Write response_json_string into parts ---
                    start_char_idx = 0
                    part_num = 0

                    if not response_json_string: # Handle empty response string case
                        response_json_string = "[]" # Default to empty JSON list if fully empty

                    while start_char_idx < len(response_json_string):
                        current_part_col_name = f"{COL_OUTPUT_BASE_NAME}_Part_{part_num}"
                        chunk = response_json_string[start_char_idx : start_char_idx + MAX_CELL_CHARS]

                        if current_part_col_name not in df_bridge.columns:
                            df_bridge[current_part_col_name] = pd.NA
                            print(f"WORKER: Added new output part column to DataFrame: {current_part_col_name}")

                        _ensure_object_column(df_bridge, current_part_col_name)
                        df_bridge.loc[index, current_part_col_name] = chunk

                        start_char_idx += MAX_CELL_CHARS
                        part_num += 1

                    # Clear any subsequent part columns for this row if they exist from a previous, longer response
                    # (e.g. if this row was processed before with more parts)
                    while True:
                        potential_old_part_col = f"{COL_OUTPUT_BASE_NAME}_Part_{part_num}"
                        if potential_old_part_col in df_bridge.columns:
                            if pd.notna(df_bridge.loc[index, potential_old_part_col]):
                                df_bridge.loc[index, potential_old_part_col] = pd.NA
                            part_num += 1
                        else:
                            break
                    # --- End of writing logic ---

                # One write per cycle, after every new request has been handled.
                df_bridge.to_excel(EXCEL_BRIDGE_PATH, index=False)
                print(f"WORKER: Updated {EXCEL_BRIDGE_PATH} with {len(new_requests)} responses (potentially multi-part).")
            else:
                print("WORKER: No new retrieval requests found.")
        except FileNotFoundError:
            print(f"WORKER: Bridge file {EXCEL_BRIDGE_PATH} not found. Waiting...")
        except Exception as e:
            # Best effort: log and keep polling rather than stopping the worker.
            print(f"WORKER: Error in main loop: {str(e)}")
            import traceback
            traceback.print_exc()

        time.sleep(WORKER_CHECK_INTERVAL)

if __name__ == "__main__":
    # Prerequisites, both handled by earlier cells rather than here:
    #   * Google Drive is mounted, because the store and the bridge file live under /content/drive.
    #   * The global `embeddings` object exists, because the worker initialization reads it.
    if not vectorstore_global_worker:
        print("WORKER: Exiting because global vector store was not loaded (or embeddings failed). Review initialization messages.")
    else:
        # Runs until the cell is interrupted.
        process_retrieval_requests()

WORKER: Initializing embeddings...
WORKER: Embeddings initialized successfully.
WORKER: Attempting to load vector store from: /content/drive/MyDrive/Fiqh_Books_db
WORKER: Vector store loaded successfully.
--- Retrieval Bridge Worker Started (Using Actual Vector Store & Multi-Part Output) ---
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No

  {
    "content": "المصنف في آخر باب المواقيت والله أعلم * قال المصنف رحمه الله* (اركان الحج أربعة الاحرام والوقوف بعرفه وطواف الافاضة والسعي بين الصفا والمروة* وواجباته الاحرام من الميقات والرمي وفي الوقوف بعرفة إلى أن تغرب الشمس والمبيت بالمزدلفة والمبيت بمنى في ليالي الرمي وفي طواف الوداع قولان (احدهما) أنه واجب (والثاني) ليس بواجب* وسننه الغسل وطواف القدوم والرمل والاضطباع في الطواف والسعي واستلام الركن وتقبيله والسعي في موضع السعي والمشي في موضع المشي والخطب والاذكار والادعية* وأفعال العمرة كلها أركان الا الحلق* فمن ترك ركنا لم يتم نسكه ولا تحلل حتى يأتي به* ومن ترك واجبا لزمه الدم* ومن ترك سنة لم يلزمه شئ)*(الشرح) قال أصحابنا أعمال الحج ثلاثة أقسام - أركان - وواجبات - وسنن - (أما) الأركان فخمسة - الإحرام - والوقوف - وطواف الإفاضة - والسعي - والحلق إذا قلنا بالأصح إن الحلق نسك وإن قلنا ليس بنسك فأركانه الأربعة الأولى (وأما) الواجبات فاثنان متفق عليهما وأربعة مختلف. Metadata: page: 4241. book_name: المجموع شرح المهذب النووي   أبو زكريا محيي الدين يحيى بن شرف النووي"
  },
  {
    "

WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new retrieval requests found.
WORKER: No new r

KeyboardInterrupt: 