# In [None]:
import os
import re

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

# Location of the multilingual FAQ text file that gets indexed.
FAQ_FILE = "faqs.txt"

# Embedding model (SentenceTransformer weights loaded through HuggingFace)
# used to turn FAQ text into vectors.
# NOTE(review): all-MiniLM-L6-v2 is published as an English model -- confirm
# it is adequate for the multilingual content mentioned above.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Function to extract Q&A pairs from the text (handles Q1:, A1:, etc.)
# A label is the keyword, an optional number, an optional parenthesised tag and
# then a colon, anchored at the start of the (stripped) line -- for example
# "Q1:", "q 2:", "Question 3:", "A1:", "Ans:", "Answer (EN):".
# Anchoring matters: testing only "starts with q/a and contains a colon" would
# misread continuation lines such as "Available plans: basic, pro".
_QUESTION_LABEL = re.compile(
    r"(?:question|ques|que|q)[\s.#-]*\d*\s*(?:\([^)]*\))?\s*:", re.IGNORECASE
)
_ANSWER_LABEL = re.compile(
    r"(?:answer|ans|a)[\s.#-]*\d*\s*(?:\([^)]*\))?\s*:", re.IGNORECASE
)


def _parse_qa_pairs(text):
    """Return one "question<newline>answer" string per complete pair in *text*."""
    pairs = []
    question = ""
    answer = ""

    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        if _QUESTION_LABEL.match(line):
            # A new question closes the previous pair; a question that never
            # received an answer is dropped.
            if question and answer:
                pairs.append(f"{question}\n{answer}")
            question, answer = line, ""
        elif _ANSWER_LABEL.match(line):
            # A repeated answer label replaces the answer collected so far.
            answer = line
        elif answer:
            # Unlabelled line: continuation of the answer in progress...
            answer += " " + line
        elif question:
            # ...or of the question when no answer has started yet.
            question += " " + line
        # Lines that appear before the first question are ignored.

    if question and answer:
        pairs.append(f"{question}\n{answer}")

    return pairs


def extract_qa_pairs_from_text(text: str, source: str = "faqs.txt") -> "list[Document]":
    """Split FAQ text into one ``Document`` per question/answer pair.

    A pair starts at a line carrying a question label ("Q1:", "Question:",
    ...) and contains the following answer-labelled line ("A1:", "Answer:",
    ...). Unlabelled non-empty lines are joined, with a single space, onto
    whichever part is currently being read. Blank lines are skipped.
    Matching is case-insensitive.

    Prints the number of extracted pairs and a 200-character preview of the
    first five as a debugging aid.

    Args:
        text: Raw contents of the FAQ file.
        source: Value stored under the ``"source"`` metadata key of every
            returned document.

    Returns:
        A list of ``Document`` objects whose ``page_content`` is
        ``"<question line>\\n<answer line>"``; empty when no complete pair
        was found.
    """
    qa_pairs = _parse_qa_pairs(text)

    # Debug output: pair count plus a short preview of the first few pairs.
    print(f"Extracted {len(qa_pairs)} Q&A pairs.")
    for i, qa in enumerate(qa_pairs[:5], start=1):
        print(f"Q&A {i}: {qa[:200]}...")

    return [Document(page_content=qa, metadata={"source": source}) for qa in qa_pairs]

# Function to process the FAQ file and store the data in FAISS
def process_faq_file():
    """Build a FAISS index from ``FAQ_FILE`` and save it to ``faiss_faq_db``.

    Steps: read the file as UTF-8, extract Q&A documents with
    ``extract_qa_pairs_from_text``, split them into chunks of at most 800
    characters (50 overlap), embed the chunk texts with ``embedding_model``
    and persist the resulting index with ``save_local``.

    Prints the first 500 characters of the file as a debugging aid.

    Returns:
        A human-readable status string: a success message with the number of
        stored chunks, or a message starting with "❌" describing why nothing
        was stored. Errors raised while embedding or saving are caught and
        reported in the returned string rather than propagated.
    """
    # Open directly and handle the missing-file case, instead of checking for
    # existence first (the file could disappear between the check and the open).
    try:
        with open(FAQ_FILE, "r", encoding="utf-8") as f:
            text = f.read()
    except FileNotFoundError:
        return f"❌ File '{FAQ_FILE}' not found."

    # Debug: show the beginning of the raw file for inspection.
    print(f"Raw content from {FAQ_FILE}:")
    print(text[:500])

    qa_docs = extract_qa_pairs_from_text(text)
    if not qa_docs:
        return f"❌ No Q&A pairs extracted. Please check the format of '{FAQ_FILE}'."

    # Optional chunking: long Q&A pairs are split into smaller pieces.
    splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=50)
    chunks = splitter.split_documents(qa_docs)
    if not chunks:
        return "❌ No content to process after splitting."

    texts = [chunk.page_content for chunk in chunks]
    if not all(texts):
        return "❌ Some text chunks are empty."

    try:
        embeddings = embedding_model.embed_documents(texts)
        if not embeddings or len(embeddings) != len(texts):
            return "❌ Error: Mismatch in number of embeddings and texts."

        # Build the index from the vectors computed above. FAISS.from_documents
        # would embed every chunk a second time.
        vector_store = FAISS.from_embeddings(
            text_embeddings=list(zip(texts, embeddings)),
            embedding=embedding_model,
            metadatas=[chunk.metadata for chunk in chunks],
        )
        vector_store.save_local("faiss_faq_db")

        return f"✅ {len(texts)} multilingual Q&A chunks stored in FAISS!"
    except Exception as e:
        # Top-level boundary of the script: report the failure as a status string.
        return f"❌ Error during FAISS processing: {str(e)}"

# Entry point: reuse a previously saved index if one is on disk, otherwise build it.
if os.path.exists("faiss_faq_db"):
    print("✅ FAISS FAQ DB already exists.")
else:
    print(process_faq_file())
