# YIL-GPT — GenAI & RAG Project

This notebook represents a multi-module project flattened into a single `.ipynb`.
Each section below corresponds to a Python module (e.g. `config.py`, `ingestion.py`, etc.).
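You can export the cells back into separate `.py` files, or run them directly here — in that case the intra-project imports (e.g. `from config import ...`, `from utils.logger import ...`) only resolve if those modules also exist on the import path, so remove them or create the files first.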


# config.py


In [None]:

"""config.py — Project-wide settings for YIL-GPT (GenAI + RAG system)."""

# --- Models -------------------------------------------------------------------
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL = "gpt-4o-mini"  # placeholder for the main LLM

# --- Context / chunking -------------------------------------------------------
CONTEXT_WINDOW = 4096
# Chunk size and overlap are counted in whitespace-separated words by
# ingestion.create_chunks.
CHUNK_SIZE = 800
CHUNK_OVERLAP = 120

# --- Vector DB ----------------------------------------------------------------
VECTOR_DB_PATH = "data/vdb"
INDEX_NAME = "yilgpt_index"
# Number of hits VectorDB.search returns when the caller gives no top_k.
TOP_K = 5

# --- Document locations (all rooted at DATA_ROOT) -------------------------------
DATA_ROOT = "documents"
MANUALS_DIR = DATA_ROOT + "/manuals"
SOPS_DIR = DATA_ROOT + "/sops"
LOGS_DIR = DATA_ROOT + "/alarm_logs"


# utils/text_cleaning.py


In [None]:

"""Utility functions for text cleaning and normalization."""

import re

def normalize_whitespace(text: str) -> str:
    """Return *text* with every whitespace run squeezed to one space and both ends trimmed."""
    whitespace_run = re.compile(r"\s+")
    squeezed = whitespace_run.sub(" ", text)
    return squeezed.strip()

def clean_text(text: str) -> str:
    """Apply the baseline clean-up that runs before chunking and embedding.

    Non-breaking spaces are turned into ordinary spaces, then all whitespace
    is normalized via ``normalize_whitespace``. Deliberately minimal; add
    domain-specific rules here when they are needed.
    """
    without_nbsp = text.replace("\u00a0", " ")  # non‑breaking spaces
    return normalize_whitespace(without_nbsp)


# utils/pdf_reader.py


In [None]:

"""PDF / file reading helpers.

In a real project you might use:
- pdfplumber
- pymupdf
- pypdf

Here we only stub the interface so the rest of the pipeline is clear.
"""

from pathlib import Path


def extract_pdf_text(path: str) -> str:
    """Extract text from a PDF file.

    This is a placeholder implementation: it does not parse the PDF
    structure, it simply reads the file as text. Replace with a real
    extractor such as pdfplumber when running in production.

    Args:
        path: Location of the file to read.

    Returns:
        The file content decoded as UTF-8; bytes that cannot be decoded
        are dropped.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(path)
    # TODO: implement real PDF parsing
    # Decode explicitly as UTF-8: without an encoding argument the result
    # depends on the platform's locale, so the same file could yield different
    # text (and therefore different chunks) on different machines.
    return p.read_text(encoding="utf-8", errors="ignore")


# utils/logger.py


In [None]:

"""Simple logging wrapper used across the project."""

import logging


LOGGER_NAME = "yil_gpt"


def get_logger(name: str = LOGGER_NAME) -> logging.Logger:
    """Return the logger called *name*, configuring it the first time it is requested.

    A logger that already has at least one handler is returned untouched.
    Otherwise its level is set to INFO and a stream handler with a
    ``[time] [level] message`` format is attached.
    """
    log = logging.getLogger(name)
    if log.handlers:
        # Already configured: adding another handler would duplicate output.
        return log

    stream = logging.StreamHandler()
    stream.setFormatter(
        logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")
    )
    log.setLevel(logging.INFO)
    log.addHandler(stream)
    return log


# ingestion.py


In [None]:

"""Document ingestion and chunking logic for YIL-GPT."""

from typing import Dict, List
from pathlib import Path

from utils.text_cleaning import clean_text
from utils.pdf_reader import extract_pdf_text
from utils.logger import get_logger
from config import CHUNK_SIZE, CHUNK_OVERLAP

logger = get_logger()


def load_document(path: str) -> str:
    """Load a document from disk and return its cleaned text.

    Files with a ``.pdf`` suffix (case-insensitive) go through
    ``extract_pdf_text``; every other file is read as plain text.
    You can extend this with logic for `.txt`, `.docx`, etc.

    Args:
        path: Location of the document.

    Returns:
        The document text after ``clean_text`` has been applied.
    """
    path_obj = Path(path)
    if path_obj.suffix.lower() == ".pdf":
        text = extract_pdf_text(path)
    else:
        # Explicit UTF-8 keeps decoding independent of the platform locale;
        # undecodable bytes are dropped rather than aborting ingestion.
        text = path_obj.read_text(encoding="utf-8", errors="ignore")
    return clean_text(text)


def create_chunks(
    text: str,
    metadata: Dict,
    chunk_size: int | None = None,
    chunk_overlap: int | None = None,
) -> List[Dict]:
    """Split *text* into overlapping word-based chunks.

    A window of ``chunk_size`` whitespace-separated words slides over the
    text, advancing by ``chunk_size - chunk_overlap`` words (at least one)
    per step. Sliding stops as soon as a window reaches the end of the text,
    so no trailing chunk is emitted that merely repeats the overlap of the
    previous one.

    Args:
        text: Text to split.
        metadata: Document metadata; each chunk receives its own shallow copy.
        chunk_size: Words per chunk. Defaults to ``CHUNK_SIZE`` from config.
        chunk_overlap: Words shared by consecutive chunks. Defaults to
            ``CHUNK_OVERLAP`` from config.

    Returns:
        A list of ``{"text": ..., "metadata": ...}`` dicts in document order;
        empty when *text* contains no words.

    Raises:
        ValueError: If the effective chunk size is smaller than 1.
    """
    size = CHUNK_SIZE if chunk_size is None else chunk_size
    overlap = CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap
    if size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {size}")

    # Guarantee forward progress even when overlap >= size.
    step = max(size - overlap, 1)

    words = text.split()
    chunks: List[Dict] = []
    for start in range(0, len(words), step):
        end = start + size
        chunks.append({
            "text": " ".join(words[start:end]),
            "metadata": metadata.copy(),
        })
        if end >= len(words):
            # This window already covers the tail of the text; a further
            # window would contain only words that are in this chunk.
            break
    logger.info("Created %d chunks", len(chunks))
    return chunks


# vector_db.py


In [None]:

"""In-memory Vector DB stub for YIL-GPT.

This is intentionally simple so the notebook is self‑contained.
In production you would replace this with FAISS, Chroma, Qdrant, Weaviate, etc.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional

import numpy as np

from config import TOP_K
from utils.logger import get_logger

logger = get_logger()


# --- Very simple embedding stub -------------------------------------------------

def embed_text(text: str) -> np.ndarray:
    """Dummy embedding function.

    Replace this with a real embedding model (e.g. SentenceTransformers).
    For now the text is hashed into a seed and a pseudo-random vector is
    drawn from it, so equal texts always map to equal vectors.

    Args:
        text: Text to embed.

    Returns:
        A 1-D ``float32`` array of length 384.
    """
    # Local import keeps this block self-contained.
    import hashlib

    # The built-in hash() of a str is salted per interpreter process, which
    # would give the same text a different vector on every run. A
    # cryptographic digest gives a seed that is stable across processes.
    digest = hashlib.sha256(text.encode("utf-8", "surrogatepass")).digest()
    seed = int.from_bytes(digest[:8], "big")
    rng = np.random.default_rng(seed)
    return rng.normal(size=384).astype("float32")


@dataclass
class VectorDB:
    """A toy Vector DB with linear search.

    This keeps the code easy to run in a notebook while still showing
    clearly how retrieval works conceptually: every stored vector is
    compared against the query by cosine similarity.
    """

    # vectors[i] is the embedding of docs[i]; both lists grow in lockstep.
    vectors: List[np.ndarray] = field(default_factory=list)
    docs: List[Dict[str, Any]] = field(default_factory=list)

    def add_documents(self, docs: List[Dict[str, Any]]) -> None:
        """Embed and store *docs*.

        Each doc must have a ``"text"`` entry; the doc dict itself (not a
        copy) is stored.
        """
        for doc in docs:
            vec = embed_text(doc["text"])
            self.vectors.append(vec)
            self.docs.append(doc)
        logger.info("Indexed %d documents (total=%d)", len(docs), len(self.docs))

    def search(self, query: str, top_k: Optional[int] = None, filters: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
        """Return the stored docs most similar to *query*.

        Args:
            query: Text to search for.
            top_k: Maximum number of hits; defaults to ``TOP_K`` from config.
                Zero or a negative value yields no hits.
            filters: Optional ``{metadata_key: required_value}`` mapping. A
                doc is a candidate only if all entries match its metadata;
                docs without metadata never match a non-empty filter.

        Returns:
            Shallow copies of the matching docs, best match first, each
            extended with ``"score"`` (cosine similarity) and ``"rank"``
            (0-based position).
        """
        if top_k is None:
            top_k = TOP_K
        if top_k <= 0:
            # A negative slice bound would otherwise drop hits from the end
            # instead of limiting the result size.
            return []

        # Apply simple metadata filtering. `or {}` tolerates docs that were
        # indexed without (or with a None) "metadata" entry.
        candidate_indices = [
            i
            for i, doc in enumerate(self.docs)
            if not filters
            or all((doc.get("metadata") or {}).get(k) == v for k, v in filters.items())
        ]
        if not candidate_indices:
            return []

        # Embed only once we know there is something to compare against.
        q_vec = embed_text(query)

        # Cosine similarity; the epsilon guards against zero-length vectors.
        mat = np.stack([self.vectors[i] for i in candidate_indices])
        q_norm = q_vec / (np.linalg.norm(q_vec) + 1e-9)
        mat_norm = mat / (np.linalg.norm(mat, axis=1, keepdims=True) + 1e-9)
        sims = mat_norm @ q_norm

        # Stable sort: equally scored docs keep their insertion order.
        top_idx = np.argsort(-sims, kind="stable")[:top_k]
        results: List[Dict[str, Any]] = []
        for rank, idx in enumerate(top_idx):
            doc_idx = candidate_indices[int(idx)]
            d = self.docs[doc_idx].copy()
            d["score"] = float(sims[int(idx)])
            d["rank"] = int(rank)
            results.append(d)
        return results


# llm_client.py


In [None]:

"""LLM client wrapper used for query rewriting and answer generation.

This uses placeholder logic so that the notebook is runnable without real API keys.
Replace `rewrite_query` and `generate_answer` with actual LLM calls in production.
"""

from typing import List, Dict

from utils.logger import get_logger

logger = get_logger()


class LLMClient:
    def __init__(self, model_name: str = None):
        self.model_name = model_name or "gpt-4o-mini"

    def rewrite_query(self, query: str) -> str:
        """Lightweight query rewriting stub.

        In production, this would call an LLM to:
        - infer system/unit
        - normalize terminology
        - expand abbreviations
        """
        logger.info("Rewriting query: %s", query)
        return query.strip()

    def generate_answer(self, query: str, retrieved_docs: List[Dict]) -> str:
        """Generate an answer from retrieved docs.

        For demonstration, we simply concatenate top documents.
        In production this would call an LLM with a RAG prompt template.
        """
        logger.info("Generating answer for query with %d context docs", len(retrieved_docs))
        context_summary = "\n\n".join(
            f"[Doc {i} | score={d.get('score', 0):.3f}]\n{d['text'][:400]}..." for i, d in enumerate(retrieved_docs)
        )
        answer = (
            "YIL-GPT (stub) answer\n\n"
            f"Question: {query}\n\n"
            "Context used (top documents):\n"
            f"{context_summary}"
        )
        return answer


# retrieval.py


In [None]:

"""Retrieval pipeline for YIL-GPT.

Implements:
- Query rewriting
- Metadata-aware vector search
- Returns top-k documents for RAG
"""

from typing import Dict, List

from vector_db import VectorDB
from llm_client import LLMClient
from utils.logger import get_logger

logger = get_logger()


class Retriever:
    """Rewrites a query with the LLM client and runs it against the vector DB."""

    def __init__(self, vector_db: VectorDB | None = None, llm: LLMClient | None = None):
        """Use the given collaborators, creating fresh ones for any left as ``None``."""
        # Compare against None rather than relying on truthiness, so that an
        # injected object that happens to evaluate as false (e.g. a store
        # defining __len__ that is still empty) is not silently replaced.
        self.vdb = VectorDB() if vector_db is None else vector_db
        self.llm = LLMClient() if llm is None else llm

    def retrieve(self, query: str, filters: Dict | None = None, top_k: int | None = None) -> List[Dict]:
        """Return the documents most relevant to *query*.

        Args:
            query: Raw user query; it is rewritten before searching.
            filters: Optional metadata filters forwarded to the vector DB.
            top_k: Optional result limit forwarded to the vector DB, which
                applies its own default when this is ``None``.

        Returns:
            The search hits as returned by the vector DB.
        """
        rewritten = self.llm.rewrite_query(query)
        logger.info("Running retrieval with filters=%s", filters)
        return self.vdb.search(rewritten, top_k=top_k, filters=filters)


# rag_pipeline.py


In [None]:

"""High-level RAG pipeline for YIL-GPT."""

from typing import Dict

from llm_client import LLMClient
from retrieval import Retriever
from vector_db import VectorDB


class YILGPT:
    """Facade over the full RAG stack.

    Usage:
        bot = YILGPT()
        answer = bot.query("Why did compressor C-101 trip yesterday?")
    """

    def __init__(self, vector_db: VectorDB | None = None, llm: LLMClient | None = None):
        """Wire up the pipeline.

        Args:
            vector_db: Store to search; a new empty ``VectorDB`` when ``None``.
            llm: Client used for both query rewriting and answer generation;
                a new ``LLMClient`` when ``None``.
        """
        # Compare against None rather than relying on truthiness, so that an
        # injected object that evaluates as false is not silently replaced.
        self.vdb = VectorDB() if vector_db is None else vector_db
        self.llm = LLMClient() if llm is None else llm
        # The retriever shares the same store and client as this facade.
        self.retriever = Retriever(self.vdb, self.llm)

    def query(self, user_query: str, filters: Dict | None = None) -> str:
        """Retrieve context for *user_query* and return the generated answer.

        Args:
            user_query: The user's question.
            filters: Optional metadata filters applied during retrieval.
        """
        retrieved = self.retriever.retrieve(user_query, filters=filters)
        return self.llm.generate_answer(user_query, retrieved)


# api.py


In [None]:

"""Simple FastAPI wrapper exposing YIL-GPT as an HTTP API.

To run (after installing fastapi + uvicorn):

    uvicorn api:app --reload

Then send POST requests to /query with JSON: {"query": "..."}
"""

# Only the imports are guarded: a missing optional dependency should not stop
# the rest of the notebook, but a genuine failure while building the app must
# not be swallowed and misreported as "FastAPI not available".
try:
    from fastapi import FastAPI
    from pydantic import BaseModel

    from rag_pipeline import YILGPT
except ImportError as e:  # FastAPI might not be installed in the notebook env
    # This allows the rest of the notebook to run without FastAPI.
    print("FastAPI not available or failed to import:", e)
else:
    app = FastAPI(title="YIL-GPT API")
    # One pipeline instance is created at import time and shared by all requests.
    bot = YILGPT()

    class QueryRequest(BaseModel):
        """Request body for POST /query."""

        query: str

    @app.post("/query")
    def query(req: QueryRequest):
        """Answer the submitted query and return it as ``{"answer": ...}``."""
        answer = bot.query(req.query)
        return {"answer": answer}


# README (project-level)

This notebook corresponds to a modular GenAI + RAG project called **YIL-GPT**.
In a real repo these sections would live in separate `.py` files.

Key ideas covered:
- Chunking strategy for long industrial manuals and logs
- Embedding + Vector DB for semantic search
- Retrieval pipeline (rewrite → filter → search)
- RAG answer generation (stubbed in this notebook) and, in a production version, hallucination-aware prompting
- Optional FastAPI layer for serving the model
