In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

# CONFIGURATION
BASE_URL = "https://wiki.metakgp.org"
SEED_URL = "https://wiki.metakgp.org/w/Special:AllPages"

# Namespaces to strictly ignore
IGNORED_NAMESPACES = [
    "Special:", "Talk:", "User:", "User_talk:", "Metakgp:", 
    "Metakgp_talk:", "File:", "File_talk:", "MediaWiki:", 
    "Template:", "Help:", "Category:", "Category_talk:"
]

def crawl_all_urls():
    print("üï∏Ô∏è  Starting Universal Crawler (v3)...")
    current_url = SEED_URL
    all_links = []
    page_counter = 1

    while current_url:
        print(f"üìñ Reading Page {page_counter}...")
        
        try:
            response = requests.get(current_url)
            if response.status_code != 200:
                print(f"‚ùå Failed to load: {current_url}")
                break

            soup = BeautifulSoup(response.text, 'html.parser')

            # --- 1. REMOVE NOISE (Sidebar & Footer) ---
            # We destroy the sidebar and footer from the soup object before searching.
            # This ensures we don't accidentally grab "Main Page" or "About" links.
            for garbage in soup.find_all(class_=['mw-panel', 'vector-menu-portal', 'footer', 'mw-footer']):
                garbage.decompose()
            for garbage in soup.find_all(id=['mw-panel', 'footer', 'mw-navigation']):
                garbage.decompose()

            # --- 2. FIND ALL REMAINING LINKS ---
            # Now the only links left should be in the content area.
            links = soup.find_all('a', href=True)
            found_on_this_page = 0
            
            for link in links:
                href = link['href']
                full_url = urljoin(BASE_URL, href)
                
                # --- 3. FILTER LOGIC ---
                # A. Must be a Wiki link
                if "/wiki/" not in href:
                    continue
                
                # B. Must NOT be an Admin/System page
                is_banned = False
                for ns in IGNORED_NAMESPACES:
                    if ns in href:
                        is_banned = True
                        break
                
                if is_banned:
                    continue

                # C. Must NOT be the "Next Page" pagination link
                if "Next page" in link.text or "Previous page" in link.text:
                    continue

                # If we passed all checks, it's a valid article!
                all_links.append(full_url)
                found_on_this_page += 1

            print(f"   -> Found {found_on_this_page} valid links on this page.")
            
            # --- DEBUG: If 0 found, print what we DID see to help debug ---
            if found_on_this_page == 0:
                print("   ‚ö†Ô∏è DEBUG: Here are the first 5 raw links I saw (and rejected):")
                for l in links[:5]:
                    print(f"      - Text: '{l.text}' | Href: '{l['href']}'")

            # --- 4. PAGINATION ---
            # We look for the "Next page" link specifically.
            next_link = None
            # Re-fetch all links including navigation (since we decomposed them earlier, 
            # we might need to check if we deleted the nav. 
            # Actually, the 'Next' link is usually in the content body or top/bottom of list.
            # If we decomposed 'mw-navigation', we might have killed it.
            # Let's check the UN-MODIFIED text for pagination link.
            
            # Strategy: Search the raw text for the 'Next page' link pattern if soup failed
            pagination_soup = BeautifulSoup(response.text, 'html.parser') # Fresh soup
            nav_links = pagination_soup.find_all("a", href=True)
            
            for link in nav_links:
                if "Next page" in link.text:
                    next_link = urljoin(BASE_URL, link['href'])
                    break
            
            if next_link:
                current_url = next_link
                page_counter += 1
                time.sleep(0.5)
            else:
                print("‚úÖ Reached end of the list (No 'Next page' link found).")
                break

        except Exception as e:
            print(f"CRITICAL ERROR: {e}")
            break

    print(f"\nüéâ Crawler Finished! Found {len(all_links)} total pages.")
    return all_links

if __name__ == "__main__":
    final_list = crawl_all_urls()

üï∏Ô∏è  Starting Universal Crawler (v3)...
üìñ Reading Page 1...
   -> Found 0 valid links on this page.
   ‚ö†Ô∏è DEBUG: Here are the first 5 raw links I saw (and rejected):
      - Text: 'Jump to content' | Href: '#bodyContent'
      - Text: 'Main page' | Href: '/w/Main_Page'
      - Text: 'Yellow pages' | Href: '/w/Yellow_pages'
      - Text: 'Recent changes' | Href: '/w/Special:RecentChanges'
      - Text: 'Random article' | Href: '/w/Special:Random'
üìñ Reading Page 2...
   -> Found 0 valid links on this page.
   ‚ö†Ô∏è DEBUG: Here are the first 5 raw links I saw (and rejected):
      - Text: 'Jump to content' | Href: '#bodyContent'
      - Text: 'Main page' | Href: '/w/Main_Page'
      - Text: 'Yellow pages' | Href: '/w/Yellow_pages'
      - Text: 'Recent changes' | Href: '/w/Special:RecentChanges'
      - Text: 'Random article' | Href: '/w/Special:Random'
üìñ Reading Page 3...
   -> Found 0 valid links on this page.
   ‚ö†Ô∏è DEBUG: Here are the first 5 raw links I saw (and 

In [2]:
import json
import glob
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# CONFIGURATION
INPUT_DIR = "C:\programming\prg\Devsoc-hackathon\scraped_data"  # Directory containing your batch_*.json files
OUTPUT_FILE = "graph_chunks.json" # Where we save the processed chunks (optional debug)

def load_all_data(directory):
    """Loads all batch JSON files into a single list."""
    all_pages = []
    files = glob.glob(os.path.join(directory, "batch_*.json"))
    print(f"üìÇ Loading data from {len(files)} files...")
    
    for f_path in files:
        try:
            with open(f_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                all_pages.extend(data)
        except Exception as e:
            print(f"‚ö†Ô∏è Error reading {f_path}: {e}")
            
    print(f"‚úÖ Loaded {len(all_pages)} source pages.")
    return all_pages

def create_graph_chunks():
    # 1. Load Data
    raw_pages = load_all_data(INPUT_DIR)
    
    # 2. Define the Splitter
    # We use a smaller chunk size to keep facts precise.
    # Overlap is critical to not cut a sentence in half.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    
    graph_documents = []
    
    print("üï∏Ô∏è  Generating Graph Chunks...")
    
    for page in raw_pages:
        # Extract Core Data
        title = page.get('title', 'Unknown')
        url = page.get('url', 'Unknown')
        last_mod = page.get('last_modified', 'Unknown')
        
        # --- GRAPH EDGE LOGIC ---
        # The 'graph_connections' list (from your scraper) is the KEY.
        # We must attach these neighbors to *every* chunk of this page.
        neighbors = page.get('graph_connections', [])
        neighbors_str = ", ".join(neighbors[:50]) # Limit to 50 links to save space
        
        # Clean Content
        content = page.get('content', '')
        if not content: continue
            
        # Split the content
        text_chunks = splitter.split_text(content)
        
        for i, chunk_text in enumerate(text_chunks):
            
            # --- THE "GRAPH CHUNK" MAGIC ---
            # We inject the metadata directly into the TEXT so the LLM reads it.
            # This allows the LLM to say: "I see a link to 'Gymkhana' here, let me ask about that."
            
            contextualized_text = f"""
SOURCE_PAGE: {title}
LAST_UPDATED: {last_mod}
RELATED_TOPICS: {neighbors_str}
---------------------
{chunk_text}
"""
            # Create the Document Object (Standard LangChain format)
            # We also keep clean metadata for code-level filtering
            doc = Document(
                page_content=contextualized_text,
                metadata={
                    "source": url,
                    "title": title,
                    "chunk_id": i,
                    "last_modified": last_mod,
                    "graph_neighbors": neighbors # Keep the raw list for code logic
                }
            )
            
            graph_documents.append(doc)

    print(f"‚úÖ Generated {len(graph_documents)} Graph Chunks.")
    return graph_documents

# --- EXECUTION ---
if __name__ == "__main__":
    final_chunks = create_graph_chunks()
    
    # Debug: Print one chunk to see the structure
    if final_chunks:
        print("\n--- SAMPLE GRAPH CHUNK ---")
        print(final_chunks[0].page_content)
        print("\n--- METADATA ---")
        print(final_chunks[0].metadata)

  INPUT_DIR = "C:\programming\prg\Devsoc-hackathon\scraped_data"  # Directory containing your batch_*.json files
  from .autonotebook import tqdm as notebook_tqdm
  INPUT_DIR = "C:\programming\prg\Devsoc-hackathon\scraped_data"  # Directory containing your batch_*.json files


KeyboardInterrupt: 

In [None]:
import os
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings


# CONFIGURATION
DB_DIR = "C:/programming/prg/Devsoc-hackathon/chroma_db_graph"  # Separate DB for Graph RAG
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"

# 2. Force GPU Usage (Crucial for Speed!)
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True} # MPNet performs better with normalization

print(f"üß† Loading Model: {EMBEDDING_MODEL} on GPU...")

def ingest_knowledge_graph():
    # 1. Generate the "Smart" Graph Chunks
    print("üöÄ Starting Knowledge Graph Ingestion...")
    graph_docs = create_graph_chunks()
    
    if not graph_docs:
        print("‚ùå No documents found. Check your json files.")
        return

    print(f"üß© Prepared {len(graph_docs)} Graph-Enhanced Chunks.")

    print("üîß Sanitizing metadata for ChromaDB compatibility...")
    for doc in graph_docs:
        if "graph_neighbors" in doc.metadata:
            # Convert ['Link A', 'Link B'] -> "Link A, Link B"
            neighbors = doc.metadata["graph_neighbors"]
            if isinstance(neighbors, list):
                doc.metadata["graph_neighbors"] = ", ".join(neighbors)
            else:
                doc.metadata["graph_neighbors"] = str(neighbors)

    # 2. Initialize the Embedding Model (The "Translator" to Math)
    print(f"üß† Loading Model: {EMBEDDING_MODEL}...")
    embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

    # 3. Store in Vector Database (The "Memory")
    print(f"üíæ Saving to {DB_DIR}...")
    
    # We use batching to ensure we don't crash memory
    BATCH_SIZE = 100
    total_batches = (len(graph_docs) // BATCH_SIZE) + 1
    
    vectorstore = Chroma(
        persist_directory=DB_DIR, 
        embedding_function=embeddings
    )
    
    for i in range(0, len(graph_docs), BATCH_SIZE):
        batch = graph_docs[i : i + BATCH_SIZE]
        print(f"   -> Indexing Batch {i//BATCH_SIZE + 1}/{total_batches}")
        vectorstore.add_documents(batch)
        
    print("‚úÖ Knowledge Graph Successfully Built!")
    print(f"   You can now query this DB at: {DB_DIR}")

if __name__ == "__main__":
    ingest_knowledge_graph()

üß† Loading Model: sentence-transformers/all-mpnet-base-v2 on GPU...
üöÄ Starting Knowledge Graph Ingestion...
üìÇ Loading data from 24 files...
‚úÖ Loaded 3582 source pages.
üï∏Ô∏è  Generating Graph Chunks...
‚úÖ Generated 9344 Graph Chunks.
üß© Prepared 9344 Graph-Enhanced Chunks.
üîß Sanitizing metadata for ChromaDB compatibility...
üß† Loading Model: sentence-transformers/all-mpnet-base-v2...


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 1a5302f7-4a45-48cb-8ce5-6f1924bb38df)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/./config_sentence_transformers.json
Retrying in 1s [Retry 1/5].


üíæ Saving to C:/programming/prg/Devsoc-hackathon/chroma_db_graph...
   -> Indexing Batch 1/94
   -> Indexing Batch 2/94
   -> Indexing Batch 3/94
   -> Indexing Batch 4/94
   -> Indexing Batch 5/94
   -> Indexing Batch 6/94
   -> Indexing Batch 7/94
   -> Indexing Batch 8/94
   -> Indexing Batch 9/94
   -> Indexing Batch 10/94
   -> Indexing Batch 11/94
   -> Indexing Batch 12/94
   -> Indexing Batch 13/94
   -> Indexing Batch 14/94
   -> Indexing Batch 15/94
   -> Indexing Batch 16/94
   -> Indexing Batch 17/94
   -> Indexing Batch 18/94
   -> Indexing Batch 19/94
   -> Indexing Batch 20/94
   -> Indexing Batch 21/94
   -> Indexing Batch 22/94
   -> Indexing Batch 23/94
   -> Indexing Batch 24/94
   -> Indexing Batch 25/94
   -> Indexing Batch 26/94
   -> Indexing Batch 27/94
   -> Indexing Batch 28/94
   -> Indexing Batch 29/94
   -> Indexing Batch 30/94
   -> Indexing Batch 31/94
   -> Indexing Batch 32/94
   -> Indexing Batch 33/94
   -> Indexing Batch 34/94
   -> Indexing Batch 

In [5]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# CONFIGURATION
DB_DIR = "C:/programming/prg/Devsoc-hackathon/chroma_db_graph"
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"

# 1. Initialize
print("üß† Loading Model...")
embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    model_kwargs={'device': 'cuda'}, # Use your RTX 4050
    encode_kwargs={'normalize_embeddings': True}
)

vectorstore = Chroma(persist_directory=DB_DIR, embedding_function=embeddings)

# 2. Ask a Question
query = "Who are the governors of the Technology Literary Society?"
print(f"\nüîé Query: {query}")

results = vectorstore.similarity_search(query, k=3)

# 3. Show Results
for i, doc in enumerate(results):
    print(f"\n--- Result {i+1} ---")
    print(f"üìÑ Source: {doc.metadata['title']}")
    print(f"üîó Related: {doc.metadata.get('graph_neighbors', 'None')[:50]}...")
    print(f"üìù Text Snippet:\n{doc.page_content[:200]}...")

ImportError: cannot import name 'LanguageModelInput' from 'langchain_core.language_models' (c:\programming\newenv\Lib\site-packages\langchain_core\language_models\__init__.py)

In [2]:
import os
import re
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from time import sleep

# ==========================================
# CONFIGURATION
# ==========================================
# 1. Database Path (Must match your ingest script)
DB_DIR = "C:/programming/prg/Devsoc-hackathon/chroma_db_graph"
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"

# 2. LLM Setup (The Reasoning Engine)
# Get a free key at gemini/keys
MY_GEMINI_KEY= "AIzaSyBr2OYi0aLoqAXvT_dSsSRch3ixrdYWO_M" # <--- PASTE YOUR KEY HERE
os.environ["google_api_key"] = MY_GEMINI_KEY
# ==========================================
# CORE LOGIC
# ==========================================

class GraphRAGAgent:
    def __init__(self):
        print("üß† Initializing Graph Agent...")
        
        # 1. Load the Memory (Vector DB)
        self.embeddings = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL,
            model_kwargs={'device': 'cuda'},
            encode_kwargs={'normalize_embeddings': True}
        )
        self.db = Chroma(persist_directory=DB_DIR, embedding_function=self.embeddings)
        
        # 2. Load the Brain (LLM)
        self.llm = ChatGoogleGenerativeAI(
            google_api_key=MY_GEMINI_KEY,
            model="gemini-3-flash-preview", # Or "gemini-1.5-flash" for speed
            temperature=0.1 #SEE IF A LITTLE CREATIVITY HELPS
        )
       
        # 3. Define the "Navigator" Prompt 
        # This prompt forces the LLM to decide: Answer vs. I don't know
        self.navigator_prompt = ChatPromptTemplate.from_template("""
        You are a Graph RAG Agent.
        
        GOAL: {goal}
        
        CONTEXT NODE:
        --------------------------------------------------
        {context}
        --------------------------------------------------
        
        INSTRUCTIONS:
        1. Check "RELATED_TOPICS" in the text.
        2. If the text answers the GOAL, reply ONLY with:
           ANSWER: [The answer]
        3. If you need to search a related topic, reply ONLY with:
           HOP: [Topic Name]
           
        CONSTRAINT: Do not write any explanations. Start your response strictly with ANSWER or HOP.
        If In the end you did not get any answer, reply with: I don't know.
        """)
        
    def search(self, query):
        """Standard Retrieval"""
        results = self.db.similarity_search(query, k=1)
        if not results:
            return None
        return results[0] # Return the best chunk

    def solve(self, user_query, max_hops=5):
        print(f"\nüöÄ STARTING GRAPH TRAVERSAL: '{user_query}'")
        current_query = user_query
        visited_context = []
        
        for step in range(max_hops):
            sleep(1)
            print(f"\nüë£ Step {step + 1}: Searching for '{current_query}'...")
            
            node = self.search(current_query)
            if not node:
                print("   ‚ùå Dead end. No information found.")
                break
                
            content = node.page_content
            source = node.metadata.get('title', 'Unknown')
            # Fix: Ensure we actually get the neighbors if they exist in metadata
            neighbors = node.metadata.get('graph_neighbors', '')
            
            print(f"   üìÑ Found Node: {source}")
            
            # Inject neighbors explicitly into context so LLM sees them clearly
            visited_context.append(f"SOURCE: {source}\nRELATED_TOPICS: {neighbors}\nCONTENT: {content}")
            full_context = "\n\n".join(visited_context)
            
            chain = self.navigator_prompt | self.llm | StrOutputParser()
            try:
                decision = chain.invoke({"goal": user_query, "context": full_context})
                print(f"   ü§î Thought: {decision}")
                
                # --- ROBUST PARSING LOGIC ---
                # This regex finds "HOP: ..." or "ANSWER: ..." anywhere in the text
                # even if the LLM adds extra fluff.
                match = re.search(r"(ANSWER|HOP):\s*(.*)", decision, re.DOTALL)
                
                if match:
                    action = match.group(1) # "ANSWER" or "HOP"
                    value = match.group(2).strip()
                    
                    if action == "ANSWER":
                        return value
                    elif action == "HOP":
                        print(f"   üîó Graph Hop Triggered! Jumping to -> {value}")
                        current_query = value
                        continue
                else:
                    # If LLM failed format, treat it as an answer if it's the last step
                    if step == max_hops - 1:
                        return decision
                    
            except Exception as e:
                return f"‚ùå Error during reasoning: {e}"

        return "‚ùå I ran out of steps (Max Hops Reached)."

# ==========================================
# RUN IT
# ==========================================
if __name__ == "__main__":
    agent = GraphRAGAgent()
    
    # Test Query
    # This query requires a "Hop": 
    # It starts at "Inter-IIT" -> Finds "Sports" -> Finds "Specific Sport Details"
    response = agent.solve("What is Gymkhana?")
    print("\n" + "="*40)
    print("FINAL ANSWER:")
    print("="*40)
    print(response)

üß† Initializing Graph Agent...

üöÄ STARTING GRAPH TRAVERSAL: 'What is Gymkhana?'

üë£ Step 1: Searching for 'What is Gymkhana?'...
   üìÑ Found Node: Constitution of the Technology Students' Gymkhana
   ü§î Thought: ANSWER: Technology Students‚Äô Gymkhana is the hub of numerous extra-curricular and co-curricular activities ranging from sports to music, and it is managed by the students.

FINAL ANSWER:
Technology Students‚Äô Gymkhana is the hub of numerous extra-curricular and co-curricular activities ranging from sports to music, and it is managed by the students.
