In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

# CONFIGURATION
BASE_URL = "https://wiki.metakgp.org"
SEED_URL = "https://wiki.metakgp.org/w/Special:AllPages"

# Namespaces to strictly ignore
IGNORED_NAMESPACES = [
    "Special:", "Talk:", "User:", "User_talk:", "Metakgp:", 
    "Metakgp_talk:", "File:", "File_talk:", "MediaWiki:", 
    "Template:", "Help:", "Category:", "Category_talk:"
]

def crawl_all_urls():
    print("üï∏Ô∏è  Starting Universal Crawler (v3)...")
    current_url = SEED_URL
    all_links = []
    page_counter = 1

    while current_url:
        print(f"üìñ Reading Page {page_counter}...")
        
        try:
            response = requests.get(current_url)
            if response.status_code != 200:
                print(f"‚ùå Failed to load: {current_url}")
                break

            soup = BeautifulSoup(response.text, 'html.parser')

            # --- 1. REMOVE NOISE (Sidebar & Footer) ---
            # We destroy the sidebar and footer from the soup object before searching.
            # This ensures we don't accidentally grab "Main Page" or "About" links.
            for garbage in soup.find_all(class_=['mw-panel', 'vector-menu-portal', 'footer', 'mw-footer']):
                garbage.decompose()
            for garbage in soup.find_all(id=['mw-panel', 'footer', 'mw-navigation']):
                garbage.decompose()

            # --- 2. FIND ALL REMAINING LINKS ---
            # Now the only links left should be in the content area.
            links = soup.find_all('a', href=True)
            found_on_this_page = 0
            
            for link in links:
                href = link['href']
                full_url = urljoin(BASE_URL, href)
                
                # --- 3. FILTER LOGIC ---
                # A. Must be a Wiki link
                if "/wiki/" not in href:
                    continue
                
                # B. Must NOT be an Admin/System page
                is_banned = False
                for ns in IGNORED_NAMESPACES:
                    if ns in href:
                        is_banned = True
                        break
                
                if is_banned:
                    continue

                # C. Must NOT be the "Next Page" pagination link
                if "Next page" in link.text or "Previous page" in link.text:
                    continue

                # If we passed all checks, it's a valid article!
                all_links.append(full_url)
                found_on_this_page += 1

            print(f"   -> Found {found_on_this_page} valid links on this page.")
            
            # --- DEBUG: If 0 found, print what we DID see to help debug ---
            if found_on_this_page == 0:
                print("   ‚ö†Ô∏è DEBUG: Here are the first 5 raw links I saw (and rejected):")
                for l in links[:5]:
                    print(f"      - Text: '{l.text}' | Href: '{l['href']}'")

            # --- 4. PAGINATION ---
            # We look for the "Next page" link specifically.
            next_link = None
            # Re-fetch all links including navigation (since we decomposed them earlier, 
            # we might need to check if we deleted the nav. 
            # Actually, the 'Next' link is usually in the content body or top/bottom of list.
            # If we decomposed 'mw-navigation', we might have killed it.
            # Let's check the UN-MODIFIED text for pagination link.
            
            # Strategy: Search the raw text for the 'Next page' link pattern if soup failed
            pagination_soup = BeautifulSoup(response.text, 'html.parser') # Fresh soup
            nav_links = pagination_soup.find_all("a", href=True)
            
            for link in nav_links:
                if "Next page" in link.text:
                    next_link = urljoin(BASE_URL, link['href'])
                    break
            
            if next_link:
                current_url = next_link
                page_counter += 1
                time.sleep(0.5)
            else:
                print("‚úÖ Reached end of the list (No 'Next page' link found).")
                break

        except Exception as e:
            print(f"CRITICAL ERROR: {e}")
            break

    print(f"\nüéâ Crawler Finished! Found {len(all_links)} total pages.")
    return all_links

if __name__ == "__main__":
    final_list = crawl_all_urls()

üï∏Ô∏è  Starting Universal Crawler (v3)...
üìñ Reading Page 1...
   -> Found 0 valid links on this page.
   ‚ö†Ô∏è DEBUG: Here are the first 5 raw links I saw (and rejected):
      - Text: 'Jump to content' | Href: '#bodyContent'
      - Text: 'Main page' | Href: '/w/Main_Page'
      - Text: 'Yellow pages' | Href: '/w/Yellow_pages'
      - Text: 'Recent changes' | Href: '/w/Special:RecentChanges'
      - Text: 'Random article' | Href: '/w/Special:Random'
üìñ Reading Page 2...
   -> Found 0 valid links on this page.
   ‚ö†Ô∏è DEBUG: Here are the first 5 raw links I saw (and rejected):
      - Text: 'Jump to content' | Href: '#bodyContent'
      - Text: 'Main page' | Href: '/w/Main_Page'
      - Text: 'Yellow pages' | Href: '/w/Yellow_pages'
      - Text: 'Recent changes' | Href: '/w/Special:RecentChanges'
      - Text: 'Random article' | Href: '/w/Special:Random'
üìñ Reading Page 3...
   -> Found 0 valid links on this page.
   ‚ö†Ô∏è DEBUG: Here are the first 5 raw links I saw (and 

In [7]:
import json
import glob
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# CONFIGURATION
INPUT_DIR = "C:\programming\prg\Devsoc-hackathon\scraped_data"  # Directory containing your batch_*.json files
OUTPUT_FILE = "graph_chunks.json" # Where we save the processed chunks (optional debug)

def load_all_data(directory):
    """Loads all batch JSON files into a single list."""
    all_pages = []
    files = glob.glob(os.path.join(directory, "batch_*.json"))
    print(f"üìÇ Loading data from {len(files)} files...")
    
    for f_path in files:
        try:
            with open(f_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                all_pages.extend(data)
        except Exception as e:
            print(f"‚ö†Ô∏è Error reading {f_path}: {e}")
            
    print(f"‚úÖ Loaded {len(all_pages)} source pages.")
    return all_pages

def create_graph_chunks():
    # 1. Load Data
    raw_pages = load_all_data(INPUT_DIR)
    
    # 2. Define the Splitter
    # We use a smaller chunk size to keep facts precise.
    # Overlap is critical to not cut a sentence in half.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    
    graph_documents = []
    
    print("üï∏Ô∏è  Generating Graph Chunks...")
    
    for page in raw_pages:
        # Extract Core Data
        title = page.get('title', 'Unknown')
        url = page.get('url', 'Unknown')
        last_mod = page.get('last_modified', 'Unknown')
        
        # --- GRAPH EDGE LOGIC ---
        # The 'graph_connections' list (from your scraper) is the KEY.
        # We must attach these neighbors to *every* chunk of this page.
        neighbors = page.get('graph_connections', [])
        neighbors_str = ", ".join(neighbors[:50]) # Limit to 50 links to save space
        
        # Clean Content
        content = page.get('content', '')
        if not content: continue
            
        # Split the content
        text_chunks = splitter.split_text(content)
        
        for i, chunk_text in enumerate(text_chunks):
            
            # --- THE "GRAPH CHUNK" MAGIC ---
            # We inject the metadata directly into the TEXT so the LLM reads it.
            # This allows the LLM to say: "I see a link to 'Gymkhana' here, let me ask about that."
            
            contextualized_text = f"""
SOURCE_PAGE: {title}
LAST_UPDATED: {last_mod}
RELATED_TOPICS: {neighbors_str}
---------------------
{chunk_text}
"""
            # Create the Document Object (Standard LangChain format)
            # We also keep clean metadata for code-level filtering
            doc = Document(
                page_content=contextualized_text,
                metadata={
                    "source": url,
                    "title": title,
                    "chunk_id": i,
                    "last_modified": last_mod,
                    "graph_neighbors": neighbors # Keep the raw list for code logic
                }
            )
            
            graph_documents.append(doc)

    print(f"‚úÖ Generated {len(graph_documents)} Graph Chunks.")
    return graph_documents

# --- EXECUTION ---
if __name__ == "__main__":
    final_chunks = create_graph_chunks()
    
    # Debug: Print one chunk to see the structure
    if final_chunks:
        print("\n--- SAMPLE GRAPH CHUNK ---")
        print(final_chunks[20].page_content)
        print("\n--- METADATA ---")
        print(final_chunks[20].metadata)

  INPUT_DIR = "C:\programming\prg\Devsoc-hackathon\scraped_data"  # Directory containing your batch_*.json files


üìÇ Loading data from 24 files...
‚úÖ Loaded 3582 source pages.
üï∏Ô∏è  Generating Graph Chunks...
‚úÖ Generated 9344 Graph Chunks.

--- SAMPLE GRAPH CHUNK ---

SOURCE_PAGE: AG60044: Advanced Groundwater Hydrology
LAST_UPDATED: Unknown
RELATED_TOPICS: 
---------------------
meteorological fluctuations, tidal fluctuations, impacts of urbanization,
earthquakes and external loads on groundwater, land subsidence.
Aquifer Tests and Parameter Estimation: Need of aquifer tests, type and design
of aquifer tests, test procedures, merits and demerits of pumping test, steady
and transient methods for determining aquifer parameters from pumping test
data, recovery test, analysis of step-drawdown test data, overview of slug
tests.
Groundwater Quality and Contamination: Definitions, water-quality parameters
and characteristics, monitoring of groundwater quality, water-quality criteria
and standards, collection of groundwater samples, vadose zone monitoring,
groundwater contamination, sources and c

In [None]:
import os
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings


# CONFIGURATION
DB_DIR = "C:/programming/prg/Devsoc-hackathon/chroma_db_graph"  # Separate DB for Graph RAG
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"

# 2. Force GPU Usage (Crucial for Speed!)
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True} # MPNet performs better with normalization

print(f"üß† Loading Model: {EMBEDDING_MODEL} on GPU...")

def ingest_knowledge_graph():
    # 1. Generate the "Smart" Graph Chunks
    print("üöÄ Starting Knowledge Graph Ingestion...")
    graph_docs = create_graph_chunks()
    
    if not graph_docs:
        print("‚ùå No documents found. Check your json files.")
        return

    print(f"üß© Prepared {len(graph_docs)} Graph-Enhanced Chunks.")

    print("üîß Sanitizing metadata for ChromaDB compatibility...")
    for doc in graph_docs:
        if "graph_neighbors" in doc.metadata:
            # Convert ['Link A', 'Link B'] -> "Link A, Link B"
            neighbors = doc.metadata["graph_neighbors"]
            if isinstance(neighbors, list):
                doc.metadata["graph_neighbors"] = ", ".join(neighbors)
            else:
                doc.metadata["graph_neighbors"] = str(neighbors)

    # 2. Initialize the Embedding Model (The "Translator" to Math)
    print(f"üß† Loading Model: {EMBEDDING_MODEL}...")
    embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

    # 3. Store in Vector Database (The "Memory")
    print(f"üíæ Saving to {DB_DIR}...")
    
    # We use batching to ensure we don't crash memory
    BATCH_SIZE = 100
    total_batches = (len(graph_docs) // BATCH_SIZE) + 1
    
    vectorstore = Chroma(
        persist_directory=DB_DIR, 
        embedding_function=embeddings
    )
    
    for i in range(0, len(graph_docs), BATCH_SIZE):
        batch = graph_docs[i : i + BATCH_SIZE]
        print(f"   -> Indexing Batch {i//BATCH_SIZE + 1}/{total_batches}")
        vectorstore.add_documents(batch)
        
    print("‚úÖ Knowledge Graph Successfully Built!")
    print(f"   You can now query this DB at: {DB_DIR}")

if __name__ == "__main__":
    ingest_knowledge_graph()

üß† Loading Model: sentence-transformers/all-mpnet-base-v2 on GPU...
üöÄ Starting Knowledge Graph Ingestion...
üìÇ Loading data from 24 files...
‚úÖ Loaded 3582 source pages.
üï∏Ô∏è  Generating Graph Chunks...
‚úÖ Generated 9344 Graph Chunks.
üß© Prepared 9344 Graph-Enhanced Chunks.
üîß Sanitizing metadata for ChromaDB compatibility...
üß† Loading Model: sentence-transformers/all-mpnet-base-v2...


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 1a5302f7-4a45-48cb-8ce5-6f1924bb38df)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/./config_sentence_transformers.json
Retrying in 1s [Retry 1/5].


üíæ Saving to C:/programming/prg/Devsoc-hackathon/chroma_db_graph...
   -> Indexing Batch 1/94
   -> Indexing Batch 2/94
   -> Indexing Batch 3/94
   -> Indexing Batch 4/94
   -> Indexing Batch 5/94
   -> Indexing Batch 6/94
   -> Indexing Batch 7/94
   -> Indexing Batch 8/94
   -> Indexing Batch 9/94
   -> Indexing Batch 10/94
   -> Indexing Batch 11/94
   -> Indexing Batch 12/94
   -> Indexing Batch 13/94
   -> Indexing Batch 14/94
   -> Indexing Batch 15/94
   -> Indexing Batch 16/94
   -> Indexing Batch 17/94
   -> Indexing Batch 18/94
   -> Indexing Batch 19/94
   -> Indexing Batch 20/94
   -> Indexing Batch 21/94
   -> Indexing Batch 22/94
   -> Indexing Batch 23/94
   -> Indexing Batch 24/94
   -> Indexing Batch 25/94
   -> Indexing Batch 26/94
   -> Indexing Batch 27/94
   -> Indexing Batch 28/94
   -> Indexing Batch 29/94
   -> Indexing Batch 30/94
   -> Indexing Batch 31/94
   -> Indexing Batch 32/94
   -> Indexing Batch 33/94
   -> Indexing Batch 34/94
   -> Indexing Batch 

In [8]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# CONFIGURATION
DB_DIR = "C:/programming/prg/Devsoc-hackathon/chroma_db_graph"
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"

# 1. Initialize
print("üß† Loading Model...")
embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    model_kwargs={'device': 'cuda'}, # Use your RTX 4050
    encode_kwargs={'normalize_embeddings': True}
)

vectorstore = Chroma(persist_directory=DB_DIR, embedding_function=embeddings)

# 2. Ask a Question
query = "Who are the governors of the Technology Literary Society?"
print(f"\nüîé Query: {query}")

results = vectorstore.similarity_search(query, k=3)

# 3. Show Results
for i, doc in enumerate(results):
    print(f"\n--- Result {i+1} ---")
    print(f"üìÑ Source: {doc.metadata['title']}")
    print(f"üîó Related: {doc.metadata.get('graph_neighbors', 'None')[:50]}...")
    print(f"üìù Text Snippet:\n{doc.page_content[:200]}...")

üß† Loading Model...

üîé Query: Who are the governors of the Technology Literary Society?

--- Result 1 ---
üìÑ Source: Technology Literary Society
üîó Related: ...
üìù Text Snippet:

SOURCE_PAGE: Technology Literary Society
LAST_UPDATED: Unknown
RELATED_TOPICS: 
---------------------
From the year 2010-11, there have been only 4 Governors in TLS every year.
There has also been a ...

--- Result 2 ---
üìÑ Source: Technology Literary Society
üîó Related: ...
üìù Text Snippet:

SOURCE_PAGE: Technology Literary Society
LAST_UPDATED: Unknown
RELATED_TOPICS: 
---------------------
From the year 2010-11, there have been only 4 Governors in TLS every year.
There has also been a ...

--- Result 3 ---
üìÑ Source: Constitution of the Technology Students' Gymkhana
üîó Related: ...
üìù Text Snippet:

SOURCE_PAGE: Constitution of the Technology Students' Gymkhana
LAST_UPDATED: Unknown
RELATED_TOPICS: 
---------------------
a) Entertainment Subcommittee
b) Dramatics Subcommittee
c) Journ

In [None]:
import os
import re
import time
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
# --- CHANGE 1: Import Groq ---
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# ==========================================
# CONFIGURATION
# ==========================================
DB_DIR = "C:/programming/prg/Devsoc-hackathon/chroma_db_graph"
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"

# --- CHANGE 2: Paste your Groq Key here ---
# Get it from: https://console.groq.com/keys
MY_GROQ_KEY = ""

class GraphRAGAgent:
    def __init__(self):
        print("üß† Initializing Graph Agent (Groq Llama-3.3-70B)...")
        
        # 1. Load Memory (Local GPU)
        self.embeddings = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL,
            model_kwargs={'device': 'cuda'},
            encode_kwargs={'normalize_embeddings': True}
        )
        self.db = Chroma(persist_directory=DB_DIR, embedding_function=self.embeddings)
        
        # 2. Load Brain (Groq)
        if not MY_GROQ_KEY or "..." in MY_GROQ_KEY:
            raise ValueError("‚ùå You forgot to paste your Groq API Key!")

        # --- CHANGE 3: Initialize ChatGroq ---
        self.llm = ChatGroq(
            api_key=MY_GROQ_KEY,
            model="llama-3.3-70b-versatile", # Powerful & Fast model
            temperature=0
        )
        
        # 3. Navigator Prompt (Kept exactly the same)
        self.navigator_prompt = ChatPromptTemplate.from_template("""
        You are a precise Graph RAG Agent.
        
        GOAL: {goal}
        
        CONTEXT FROM DATABASE:
        --------------------------------------------------
        {context}
        --------------------------------------------------
        
        INSTRUCTIONS:
        1. Analyze the context and the "RELATED_TOPICS" links.
        2. If the text answers the GOAL, reply ONLY with:
           ANSWER: [The answer]
        3. If you need to search a related topic to find the answer, reply ONLY with:
           HOP: [Topic Name]
           
        CONSTRAINT: Output ONLY the line starting with ANSWER or HOP. No explanations.
        """)
        
    def search(self, query):
        results = self.db.similarity_search(query, k=1)
        return results[0] if results else None

    def solve(self, user_query, max_hops=3):
        print(f"\nüöÄ STARTING TRAVERSAL: '{user_query}'")
        current_query = user_query
        visited_context = []
        
        for step in range(max_hops):
            print(f"\nüë£ Step {step + 1}: Searching for '{current_query}'...")
            
            # Keep sleep to avoid rate limits (Groq has limits too!)
            time.sleep(2) 
            
            node = self.search(current_query)
            if not node:
                print("   ‚ùå Dead end. No information found.")
                break
                
            content = node.page_content
            source = node.metadata.get('title', 'Unknown')
            neighbors = node.metadata.get('graph_neighbors', 'None')
            
            print(f"   üìÑ Found Node: {source}")
            
            visited_context.append(f"SOURCE: {source}\nRELATED_TOPICS: {neighbors}\nCONTENT: {content}")
            full_context = "\n\n".join(visited_context)
            
            chain = self.navigator_prompt | self.llm | StrOutputParser()
            try:
                decision = chain.invoke({"goal": user_query, "context": full_context})
                decision = decision.strip()
                print(f"   ü§î Decision: {decision}")
                
                match = re.search(r"(ANSWER|HOP):\s*(.*)", decision, re.DOTALL)
                
                if match:
                    action = match.group(1)
                    value = match.group(2).strip()
                    
                    # ... inside solve() method, when you find an ANSWER ...

                    if action == "ANSWER":
                        # --- MOE INTEGRATION ---
                        print("\n‚úã Holding Answer for Verification...")
                        
                        # We pass the full history of text we read as "Context"
                        full_context_log = "\n".join(visited_context)
                        
                        is_valid, reason = self.verifier.verify(user_query, value, full_context_log)
                        
                        if is_valid:
                            return value
                        else:
                            return f"‚ùå Verification Failed: {reason} \n(Original Draft: {value})"
                    elif action == "HOP":
                        print(f"   üîó Hopping to -> {value}")
                        current_query = value
                else:
                    if step == max_hops - 1:
                        return decision
                    
            except Exception as e:
                return f"‚ùå Error: {e}"

        return "‚ùå Max steps reached without a final answer."

if __name__ == "__main__":
    agent = GraphRAGAgent()
    
    # Test Query
    answer = agent.solve("Who is Elon Musk")
    
    print("\n" + "="*40)
    print("FINAL RESULT:")
    print("="*40)
    print(answer)

üß† Initializing Graph Agent (Groq Llama-3.3-70B)...

üöÄ STARTING TRAVERSAL: 'Who is Elon Musk'

üë£ Step 1: Searching for 'Who is Elon Musk'...
   üìÑ Found Node: EP60007: Techno-Entrepreneurial Leadership
   ü§î Decision: HOP: Techno-Entrepreneurial Leadership
   üîó Hopping to -> Techno-Entrepreneurial Leadership

üë£ Step 2: Searching for 'Techno-Entrepreneurial Leadership'...
   üìÑ Found Node: EP60007: Techno-Entrepreneurial Leadership
   ü§î Decision: HOP: Elon Musk
   üîó Hopping to -> Elon Musk

üë£ Step 3: Searching for 'Elon Musk'...
   üìÑ Found Node: EP60007: Techno-Entrepreneurial Leadership
   ü§î Decision: HOP: Elon Musk
   üîó Hopping to -> Elon Musk

FINAL RESULT:
‚ùå Max steps reached without a final answer.


In [None]:
import os
import re
import time
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
# --- CHANGE 1: Import Google Gemini ---
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser

# ==========================================
# CONFIGURATION
# ==========================================
DB_DIR = "C:/programming/prg/Devsoc-hackathon/chroma_db_graph"
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"

# --- CHANGE 2: Paste your Google Gemini Key here ---
# Get it from: https://aistudio.google.com/app/apikey
MY_GEMINI_KEY = ""

# ==========================================
# 1. THE MIXTURE OF EXPERTS (THE AUDITOR)
# ==========================================
class MoEVerifier:
    def __init__(self, llm):
        self.llm = llm

    def verify(self, question, answer, context_used):
        print(f"\n   üïµÔ∏è  MoE Verifier is grading the answer...")
        
        # --- EXPERT 1: SOURCE MATCHER ---
        source_prompt = ChatPromptTemplate.from_template("""
        You are a strict Fact Checker.
        
        CONTEXT FROM DATABASE:
        {context}
        
        PROPOSED ANSWER:
        {answer}
        
        TASK:
        Does the CONTEXT fully support the PROPOSED ANSWER? 
        If the answer contains names, dates, or facts NOT in the context, you must flag it.
        
        OUTPUT JSON: {{ "is_supported": boolean, "reason": "string" }}
        """)
        
        try:
            chain1 = source_prompt | self.llm | JsonOutputParser()
            result1 = chain1.invoke({"context": context_used, "answer": answer})
            
            if not result1['is_supported']:
                print(f"      ‚ùå REJECTED by Source Matcher: {result1['reason']}")
                return False, f"Hallucination Detected: {result1['reason']}"
        except:
            pass # If json parse fails, we skip strict check to be safe

        # --- EXPERT 2: LOGIC GUARD ---
        logic_prompt = ChatPromptTemplate.from_template("""
        You are a Logic Analyst.
        
        QUESTION: {question}
        ANSWER: {answer}
        
        TASK:
        Does the ANSWER directly address the QUESTION?
        
        OUTPUT JSON: {{ "is_relevant": boolean, "reason": "string" }}
        """)
        
        try:
            chain2 = logic_prompt | self.llm | JsonOutputParser()
            result2 = chain2.invoke({"question": question, "answer": answer})
            
            if not result2['is_relevant']:
                print(f"      ‚ùå REJECTED by Logic Guard: {result2['reason']}")
                return False, f"Irrelevant Answer: {result2['reason']}"
        except:
            pass

        print("      ‚úÖ Verified.")
        return True, "Verified"

# ==========================================
# 2. THE GRAPH AGENT (THE RESEARCHER)
# ==========================================
class GraphRAGAgent:
    def __init__(self):
        print("üß† Initializing Graph Agent & Verifiers (Gemini Flash)...")
        
        # Load Memory
        self.embeddings = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL,
            model_kwargs={'device': 'cuda'},
            encode_kwargs={'normalize_embeddings': True}
        )
        self.db = Chroma(persist_directory=DB_DIR, embedding_function=self.embeddings)
        
        # Load Brain
        if not MY_GEMINI_KEY or "..." in MY_GEMINI_KEY:
            raise ValueError("‚ùå You forgot to paste your Gemini API Key!")

        # --- CHANGE 3: Initialize Gemini ---
        self.llm = ChatGoogleGenerativeAI(
            google_api_key=MY_GEMINI_KEY,
            model="gemini-2.5-flash", # Or "gemini-2.5-flash-exp" if you have access
            temperature=0
        )
        
        # Attach Verifier
        self.verifier = MoEVerifier(self.llm)
        
        # Navigator Prompt
        self.navigator_prompt = ChatPromptTemplate.from_template("""
        You are a Research Agent.
        
        GOAL: {goal}
        
        CURRENT INFORMATION FOUND:
        {context}
        
        INSTRUCTIONS:
        1. Read the text. 
        2. If you have the COMPLETE answer, output: ANSWER: [Your Answer]
        3. If you need to search a related topic, output: HOP: [Topic Name]
        
        CONSTRAINT: Output ONLY the line starting with ANSWER or HOP.
        """)
        
    def search(self, query):
        results = self.db.similarity_search(query, k=1)
        return results[0] if results else None

    def solve(self, user_query, max_hops=3):
        print(f"\nüöÄ THINKING PROCESS: '{user_query}'")
        current_query = user_query
        visited_context = []
        
        for step in range(max_hops):
            print(f"   üë£ Step {step + 1}: Searching for '{current_query}'...")
            
            # Gemini has higher rate limits, but a small sleep is still good practice
            time.sleep(1) 
            
            node = self.search(current_query)
            if not node:
                print("      ‚ö†Ô∏è Dead end in graph.")
                break
                
            content = node.page_content
            source = node.metadata.get('title', 'Unknown')
            neighbors = node.metadata.get('graph_neighbors', 'None')
            
            print(f"      üìÑ Reading: {source}")
            
            # Store context for the Verifier to check later
            visited_context.append(f"SOURCE: {source}\nCONTENT: {content}\nLINKS: {neighbors}")
            full_context = "\n\n".join(visited_context)
            
            # Ask LLM what to do
            chain = self.navigator_prompt | self.llm | StrOutputParser()
            try:
                decision = chain.invoke({"goal": user_query, "context": full_context})
                decision = decision.strip()
                
                # Parse Decision
                match = re.search(r"(ANSWER|HOP):\s*(.*)", decision, re.DOTALL)
                if match:
                    action = match.group(1)
                    value = match.group(2).strip()
                    
                    if action == "ANSWER":
                        # *** CRITICAL STEP: VERIFY BEFORE RETURNING ***
                        is_valid, reason = self.verifier.verify(user_query, value, full_context)
                        
                        if is_valid:
                            return value
                        else:
                            return f"‚ùå I found an answer, but my internal auditor rejected it.\nReason: {reason}"
                    
                    elif action == "HOP":
                        print(f"      üîó Hopping to -> {value}")
                        current_query = value
                else:
                    if step == max_hops - 1: return decision
                    
            except Exception as e:
                return f"‚ùå Error: {e}"

        return "‚ùå I searched the graph but could not find a complete answer within the limit."

# ==========================================
# 3. THE CHAT LOOP (USER INTERFACE)
# ==========================================
if __name__ == "__main__":
    agent = GraphRAGAgent()
    
    # PUT YOUR QUESTION HERE
    question = "What is Gymkhana?"
    
    print(f"Asking: {question}")
    response = agent.solve(question)
    
    print("FINAL ANSWER:", response)

üß† Initializing Graph Agent & Verifiers (Gemini Flash)...
Asking: What is Gymkhana?

üöÄ THINKING PROCESS: 'What is Gymkhana?'
   üë£ Step 1: Searching for 'What is Gymkhana?'...
      üìÑ Reading: Constitution of the Technology Students' Gymkhana
      üîó Hopping to -> Gymkhana definition
   üë£ Step 2: Searching for 'Gymkhana definition'...
      üìÑ Reading: Constitution of the Technology Students' Gymkhana
      üîó Hopping to -> Gymkhana (general term)
   üë£ Step 3: Searching for 'Gymkhana (general term)'...
      üìÑ Reading: Sports Facilities

   üïµÔ∏è  MoE Verifier is grading the answer...
      ‚úÖ Verified.
FINAL ANSWER: Gymkhana, specifically the Technology Students' Gymkhana, is a student-managed hub for numerous extra-curricular and co-curricular activities, ranging from sports to music. It provides various facilities such as a gym, squash courts, billiards, and a swimming pool, which may require fees and forms available on its website.


In [None]:
import os
import re
import time
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser

# ==========================================
# CONFIGURATION
# ==========================================
DB_DIR = "C:/programming/prg/Devsoc-hackathon/chroma_db_graph"
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
MY_GEMINI_KEY = ""

# ==========================================
# 1. THE MIXTURE OF EXPERTS (THE AUDITOR)
# ==========================================
class MoEVerifier:
    def __init__(self, llm):
        self.llm = llm

    def verify(self, question, answer, context_used):
        print(f"\n   üïµÔ∏è  MoE Verification Council is in session...")
        
        # ---------------------------------------------------------
        # EXPERT 1: SOURCE MATCHER
        # Role: Checks if the answer is physically present in the source text.
        # ---------------------------------------------------------
        print("      üîç Expert 1 (Source Matcher) checking evidence...")
        source_prompt = ChatPromptTemplate.from_template("""
        You are the Source Matcher Expert.
        
        SOURCE TEXT:
        {context}
        
        CLAIM:
        {answer}
        
        TASK:
        Does the SOURCE TEXT contain the specific information required to support the CLAIM?
        Ignore logic or reasoning. Just check if the data points (names, dates, numbers) exist in the source.
        
        OUTPUT JSON: {{ "is_supported": boolean, "reason": "string" }}
        """)
        
        try:
            chain1 = source_prompt | self.llm | JsonOutputParser()
            result1 = chain1.invoke({"context": context_used, "answer": answer})
            if not result1['is_supported']:
                print(f"      ‚ùå REJECTED by Source Matcher: {result1['reason']}")
                return False, f"Source Error: {result1['reason']}"
        except Exception as e:
            print(f"      ‚ö†Ô∏è Expert 1 Error: {e}")

        # ---------------------------------------------------------
        # EXPERT 2: HALLUCINATION HUNTER
        # Role: Checks for invented details or external knowledge usage.
        # ---------------------------------------------------------
        print("      üëª Expert 2 (Hallucination Hunter) scanning for inventions...")
        hallucination_prompt = ChatPromptTemplate.from_template("""
        You are the Hallucination Hunter.
        
        SOURCE TEXT:
        {context}
        
        GENERATED ANSWER:
        {answer}
        
        TASK:
        Did the answer invent any details that are NOT in the source text?
        Even if the fact is true in the real world (like "The sun is hot"), if it is NOT in the source text, it is a HALLUCINATION.
        
        OUTPUT JSON: {{ "is_clean": boolean, "reason": "string" }}
        """)
        
        try:
            chain2 = hallucination_prompt | self.llm | JsonOutputParser()
            result2 = chain2.invoke({"context": context_used, "answer": answer})
            if not result2['is_clean']:
                print(f"      ‚ùå REJECTED by Hallucination Hunter: {result2['reason']}")
                return False, f"Hallucination Detected: {result2['reason']}"
        except Exception as e:
            print(f"      ‚ö†Ô∏è Expert 2 Error: {e}")

        # ---------------------------------------------------------
        # EXPERT 3: LOGIC EXPERT
        # Role: Checks if the conclusion actually follows from the premises.
        # ---------------------------------------------------------
        print("      üß† Expert 3 (Logic Expert) validating reasoning...")
        logic_prompt = ChatPromptTemplate.from_template("""
        You are the Logic Expert.
        
        USER QUESTION: {question}
        DERIVED ANSWER: {answer}
        PREMISES (CONTEXT): {context}
        
        TASK:
        Does the conclusion (Answer) logically follow from the Premises?
        Check for logical fallacies, jumping to conclusions, or answering a different question than asked.
        
        OUTPUT JSON: {{ "is_logical": boolean, "reason": "string" }}
        """)
        
        try:
            chain3 = logic_prompt | self.llm | JsonOutputParser()
            result3 = chain3.invoke({"question": question, "answer": answer, "context": context_used})
            if not result3['is_logical']:
                print(f"      ‚ùå REJECTED by Logic Expert: {result3['reason']}")
                return False, f"Logic Error: {result3['reason']}"
        except Exception as e:
            print(f"      ‚ö†Ô∏è Expert 3 Error: {e}")

        print("      ‚úÖ All Experts Passed. Answer Verified.")
        return True, "Verified"

# ==========================================
# 2. THE GRAPH AGENT (THE RESEARCHER)
# ==========================================
class GraphRAGAgent:
    def __init__(self):
        print("üß† Initializing Graph Agent & MoE Verifiers (Gemini Flash)...")
        
        # Load Memory
        self.embeddings = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL,
            model_kwargs={'device': 'cuda'},
            encode_kwargs={'normalize_embeddings': True}
        )
        self.db = Chroma(persist_directory=DB_DIR, embedding_function=self.embeddings)
        
        # Load Brain
        if not MY_GEMINI_KEY or "..." in MY_GEMINI_KEY:
            raise ValueError("‚ùå You forgot to paste your Gemini API Key!")

        self.llm = ChatGoogleGenerativeAI(
            google_api_key=MY_GEMINI_KEY,
            model="gemini-2.0-flash",
            temperature=0
        )
        
        # Attach Verifier
        self.verifier = MoEVerifier(self.llm)
        
        # Navigator Prompt
        self.navigator_prompt = ChatPromptTemplate.from_template("""
        You are a Research Agent.
        
        GOAL: {goal}
        
        CURRENT INFORMATION FOUND:
        {context}
        
        INSTRUCTIONS:
        1. Read the text. 
        2. If you have the COMPLETE answer, output: ANSWER: [Your Answer]
        3. If you need to search a related topic, output: HOP: [Topic Name]
        
        CONSTRAINT: Output ONLY the line starting with ANSWER or HOP.
        """)
        
    def search(self, query):
        results = self.db.similarity_search(query, k=1)
        return results[0] if results else None

    def solve(self, user_query, max_hops=3):
        print(f"\nüöÄ THINKING PROCESS: '{user_query}'")
        current_query = user_query
        visited_context = []
        
        for step in range(max_hops):
            print(f"   üë£ Step {step + 1}: Searching for '{current_query}'...")
            time.sleep(1) 
            
            node = self.search(current_query)
            if not node:
                print("      ‚ö†Ô∏è Dead end in graph.")
                break
                
            content = node.page_content
            source = node.metadata.get('title', 'Unknown')
            neighbors = node.metadata.get('graph_neighbors', 'None')
            
            print(f"      üìÑ Reading: {source}")
            
            # Store context for the Verifier to check later
            visited_context.append(f"SOURCE: {source}\nCONTENT: {content}\nLINKS: {neighbors}")
            full_context = "\n\n".join(visited_context)
            
            # Ask LLM what to do
            chain = self.navigator_prompt | self.llm | StrOutputParser()
            try:
                decision = chain.invoke({"goal": user_query, "context": full_context})
                decision = decision.strip()
                
                match = re.search(r"(ANSWER|HOP):\s*(.*)", decision, re.DOTALL)
                if match:
                    action = match.group(1)
                    value = match.group(2).strip()
                    
                    if action == "ANSWER":
                        # *** CRITICAL STEP: VERIFY BEFORE RETURNING ***
                        is_valid, reason = self.verifier.verify(user_query, value, full_context)
                        
                        if is_valid:
                            return value
                        else:
                            return f"‚ùå I found an answer, but my MoE Council rejected it.\nReason: {reason}"
                    
                    elif action == "HOP":
                        
                        print(f"      üîó Hopping to -> {value}")
                        current_query = value
                else:
                    if step == max_hops - 1: return decision
                    
            except Exception as e:
                return f"‚ùå Error: {e}"

        return "‚ùå I searched the graph but could not find a complete answer within the limit."

# ==========================================
# 3. THE CHAT LOOP (USER INTERFACE)
# ==========================================
if __name__ == "__main__":
    agent = GraphRAGAgent()
    
    # PUT YOUR QUESTION HERE
    question = "what is civil department?"
    
    print(f"Asking: {question}")
    response = agent.solve(question)
    
    print("FINAL ANSWER:", response)

üß† Initializing Graph Agent & MoE Verifiers (Gemini Flash)...
Asking: what is civil department?

üöÄ THINKING PROCESS: 'what is civil department?'
   üë£ Step 1: Searching for 'what is civil department?'...
      üìÑ Reading: Civil Services Club
FINAL ANSWER: ‚ùå Error: Error calling model 'gemini-2.0-flash' (RESOURCE_EXHAUSTED): 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_input_token_count, limit: 0, model: gemini-2.0-flash\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_fr

In [None]:
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

# üîπ import your LLM function
# example:
# from graphmind import run_graphmind

app = FastAPI()

# ‚úÖ allow frontend to talk to backend
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

class ChatRequest(BaseModel):
    query: str

@app.post("/chat")
def chat(req: ChatRequest):
    question = req.query

    # üî• CALL YOUR EXISTING LLM CODE HERE
    # answer = run_graphmind(question)

    answer = "Backend connected successfully"  # temporary test

    return {
        "answer": answer
    }