In [None]:
# ================================================
# 🔧 Step 1: Install Required Dependencies
# - faiss-cpu: for vector similarity search
# - sentence-transformers: to embed text as vectors
# - openai: to query GPT models
# - python-docx: to parse Word (.docx) files
# - PyMuPDF (fitz): to extract text from PDFs
# - huggingface_hub[hf_xet]: enables XetHub, a storage backend Hugging Face supports for faster and more efficient file downloads
# ================================================
!pip install -q faiss-cpu sentence-transformers openai python-docx PyMuPDF
!pip install huggingface_hub[hf_xet]

# ================================================
# 🔐 Step 2: Input your OpenAI API key securely
# - We use getpass so the key isn't visible
# - Sets the API key in an environment variable
# ================================================
from getpass import getpass
import os
import warnings

# Suppress unnecessary warnings from Hugging Face (if not using HF tokens).
# NOTE: this filter hides every UserWarning raised in the session, not only
# the Hugging Face ones.
warnings.filterwarnings("ignore", category=UserWarning)

# Ask for the OpenAI API key without echoing it to the screen.
# Pasted keys often carry stray whitespace, which would make every later API
# call fail authentication, so strip it and fail fast on an empty entry
# instead of surfacing the problem only at query time.
_api_key = getpass("🔐 Enter your OpenAI API key: ").strip()
if not _api_key:
    raise ValueError("No OpenAI API key was entered.")
os.environ["OPENAI_API_KEY"] = _api_key
del _api_key  # the environment variable is the single place the key is kept

# ================================================
# 📁 Step 3: Upload Documents
# - Supports .txt, .pdf, and .docx files
# - Uses Google Colab’s file upload interface
# ================================================
from google.colab import files
# Open Colab's upload widget; the keys of the returned mapping are used as
# the uploaded filenames further down.
uploaded = files.upload()

# Optional: suppress Hugging Face token warnings in Colab.
# The filter criteria are gathered in one place and applied in a single call.
_hf_token_filter = {
    "message": "The secret `HF_TOKEN` does not exist in your Colab secrets.",
    "category": UserWarning,
    "module": "huggingface_hub.utils._auth",
}
warnings.filterwarnings("ignore", **_hf_token_filter)

# ================================================
# 📄 Step 4: Extract Text from Uploaded Files
# - PDF: uses PyMuPDF
# - DOCX: uses python-docx
# - TXT: plain read
# ================================================
import fitz  # for PDF files
import docx  # for Word files

# Helper function to extract text based on file type
def extract_text_from_file(filename):
    """Return the text content of a .txt, .pdf, or .docx file.

    The file type is chosen from the filename's extension, compared
    case-insensitively so that uploads such as ``REPORT.PDF`` or
    ``Notes.TXT`` are not silently treated as unsupported.

    Args:
        filename: Path of the file to read.

    Returns:
        The extracted text, or an empty string when the extension is not
        one of the supported types.
    """
    lowered = filename.lower()

    if lowered.endswith(".txt"):
        with open(filename, "r", encoding="utf-8") as f:
            return f.read()

    if lowered.endswith(".pdf"):
        with fitz.open(filename) as pdf:
            # Join once rather than growing a string page by page.
            return "".join(page.get_text() for page in pdf)

    if lowered.endswith(".docx"):
        word_doc = docx.Document(filename)
        return "\n".join(para.text for para in word_doc.paragraphs)

    return ""  # Unsupported file type

# Extract and collect text from uploaded files.
# Files that yield no text (unsupported type or nothing extractable) are
# reported instead of being dropped silently, so a missing document is easy
# to notice before querying.
documents = []
for fname in uploaded:
    text = extract_text_from_file(fname)
    if text:
        documents.append(text)
    else:
        print(f"⚠️ Skipped '{fname}': unsupported file type or no extractable text.")

# ================================================
# ✂️ Step 5: Chunk the Extracted Text
# - Chunks help with LLM context limits
# - Default: 500 characters with 50 character overlap
# ================================================
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into fixed-size character chunks that overlap.

    Each chunk starts ``chunk_size - overlap`` characters after the previous
    one, so consecutive chunks share ``overlap`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order; an empty list for empty text.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # overlap >= chunk_size would give a non-positive step (the loop would
    # never advance); a negative overlap would silently skip text.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# Chunk every extracted document and flatten the results into a single list,
# keeping the documents (and the chunks within each) in their original order.
all_chunks = [piece for document in documents for piece in chunk_text(document)]

# ================================================
# 🧠 Step 6: Embed the Chunks Using Sentence Transformers
# - Converts each chunk into a vector for semantic search
# - 'all-MiniLM-L6-v2' is a fast, general-purpose embedding model
# ================================================
from sentence_transformers import SentenceTransformer
import numpy as np

# Fail early with a clear message: with no chunks there is nothing to embed,
# and the index-building step would otherwise fail with an opaque error when
# it reads the embedding dimension.
if not all_chunks:
    raise ValueError(
        "No text chunks to embed: upload at least one non-empty "
        ".txt, .pdf, or .docx file."
    )

# Load the embedding model and encode every chunk into a vector.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embed_model.encode(all_chunks)

# ================================================
# 🧭 Step 7: Build a FAISS Index for Fast Similarity Search
# - FAISS lets us efficiently retrieve similar text chunks
# ================================================
import faiss

# Materialise the embeddings as an array once, then size the index from it.
embedding_matrix = np.array(embeddings)
dimension = embedding_matrix.shape[1]  # length of each embedding vector

# Flat L2 index holding every chunk embedding.
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)

# ================================================
# 🧑‍💻 Step 8: Set Up the OpenAI Client (v1.x SDK)
# - Uses the API key stored in the environment
# ================================================
from openai import OpenAI

# Build the client from the key placed in the environment earlier.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# ================================================
# 🤖 Step 9: Define the RAG Query Function
# - Accepts a question and retrieves the top-k most relevant chunks
# - Builds a prompt with retrieved context and sends it to an OpenAI chat model (gpt-4o-mini by default)
# ================================================
def rag_query(question, k=3):
    """Answer a question using the most similar document chunks as context.

    The question is embedded, the FAISS index is searched for its nearest
    chunks, and those chunks are placed in a prompt sent to the chat model.

    Args:
        question: The natural-language question to answer.
        k: Maximum number of chunks to retrieve. It is capped at the number
            of vectors in the index.

    Returns:
        The model's answer with surrounding whitespace removed, or an empty
        string if the model returned no text content.

    Raises:
        ValueError: If ``k`` is less than 1 or the index holds no vectors.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Asking FAISS for more neighbours than it holds yields -1 placeholders,
    # so never request more than the index contains.
    top_k = min(k, index.ntotal)
    if top_k == 0:
        raise ValueError("The index is empty; add document chunks before querying.")

    # Embed the user question
    query_embedding = embed_model.encode([question])

    # Search the FAISS index for the nearest chunks (distances are not needed)
    _, indices = index.search(np.array(query_embedding), top_k)

    # Drop any -1 placeholder: as a Python index it would silently select the
    # last chunk instead of signalling "no result".
    retrieved = [all_chunks[i] for i in indices[0] if i >= 0]
    context = "\n---\n".join(retrieved)

    # Construct a simple prompt using the retrieved context
    prompt = f"""Use the context below to answer the question. Be concise and accurate.

Context:
{context}

Question: {question}
Answer:"""

    # Call the OpenAI chat model
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # Change as needed (e.g., "gpt-4" or "gpt-3.5-turbo")
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,  # Low temperature = more deterministic responses
    )

    # The message content can be None, so guard before stripping.
    answer = response.choices[0].message.content
    return (answer or "").strip()

# ================================================
# ❓ Step 10: Ask a Question to Your RAG System!
# - You can change the question below to test your documents
# ================================================
# Edit the question text to probe your own documents, then run the cell.
demo_question = "What are the key topics discussed in the uploaded document(s)?"
demo_answer = rag_query(demo_question)
print(demo_answer)