In [19]:
# 🧠 RAG from Website using LangChain + FAISS + OpenAI

# This notebook will:
# - Scrape a website and its internal child pages
# - Chunk the text into small pieces
# - Embed the chunks using OpenAI Embeddings
# - Store them in a FAISS vector database
# - Use Retrieval-Augmented Generation (RAG) to answer user questions


In [20]:
# ✅ Step 1: Install required libraries (only run once)
!pip install -q langchain openai faiss-cpu beautifulsoup4 tiktoken langchain-community langchain-openai


In [21]:
# ✅ Step 2: Import necessary libraries

import os  # For setting environment variables
import requests  # To make HTTP requests to webpages
from bs4 import BeautifulSoup  # To parse and extract content from HTML pages
from urllib.parse import urljoin, urlparse  # For resolving and parsing URLs

# LangChain modules
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain  # ✅ Needed for custom LLM prompt chain
from langchain.chains.combine_documents import create_stuff_documents_chain    # ✅ Needed for strict prompt with context
from langchain.schema import Document


# For Google Colab secrets (API key storage)
from google.colab import userdata

# Set OpenAI API key from Colab secrets
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

In [22]:
# ✅ Step 3: Define functions to scrape the website and extract internal links

def get_child_links(base_url, max_links=10):
    """Fetches internal links from the base URL up to a given limit."""
    base = urlparse(base_url)  # Parse base URL
    domain = base.netloc  # Extract domain to compare later
    links = set()  # Use a set to avoid duplicates

    try:
        response = requests.get(base_url, timeout=10)  # Send GET request to the page
        soup = BeautifulSoup(response.text, "html.parser")  # Parse HTML content

        for a in soup.find_all("a", href=True):  # Find all <a href=...>
            full_url = urljoin(base_url, a['href'])  # Convert relative URL to full URL
            parsed = urlparse(full_url)

            if parsed.netloc == domain and parsed.scheme.startswith("http"):  # Only same-domain links
                links.add(full_url)

            if len(links) >= max_links:  # Stop if limit is reached
                break

        return list(links)

    except Exception as e:
        print("❌ Error while collecting links:", e)
        return []

def scrape_pages(base_url, max_links=10):
    """Scrapes visible text and returns a list of Documents with source metadata."""
    all_pages = [base_url] + get_child_links(base_url, max_links)
    documents = []

    print("🔗 Pages being scraped:")
    for page in all_pages:
        print(" -", page)
        try:
            res = requests.get(page, timeout=10)
            soup = BeautifulSoup(res.text, "html.parser")
            for tag in soup(["script", "style", "noscript"]):
                tag.decompose()
            text = soup.get_text(separator="\n").strip()
            documents.append(Document(page_content=text, metadata={"source": page}))
        except Exception as e:
            print(f"⚠️ Failed to read {page}: {e}")

    return documents  # A list of Document objects

In [23]:
# ✅ Step 4: Scrape, chunk and embed website content

# Set the URL of the website you want to analyze. We have used a newswebsite for testing
base_url = "https://timesofindia.indiatimes.com/"

# Step 1: Scrape the website and collect text
raw_docs = scrape_pages(base_url, max_links=10)     # scrapts and extract data from child pages as document. We have put default to 10 child pages

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
documents = splitter.split_documents(raw_docs)

# 🛠️ DEBUG: Check metadata after splitting. Run this to check if the child page url is added as metadata in the docuements extracted. This is required to cross validate the LLM outcome is from the right child page url
for doc in documents[:3]:
    print(doc.metadata)

# 🛠️ DEBUG: Check content extracted. Run this to check what content is ectracted per child page.
#for doc in raw_docs[:5]:
#    print(doc.metadata['source'])
#    print(doc.page_content[:300])
#    print("="*80)

print(f"✅ Total number of chunks created: {len(documents)}")

# Step 3: Generate vector embeddings using OpenAI and store them in FAISS
embedding_model = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embedding_model)

# Save the FAISS vector index to local storage
vectorstore.save_local("website_index")


🔗 Pages being scraped:
 - https://timesofindia.indiatimes.com/
 - https://timesofindia.indiatimes.com/city/mumbai
 - https://timesofindia.indiatimes.com/
 - https://timesofindia.indiatimes.com
 - https://timesofindia.indiatimes.com/city
 - https://timesofindia.indiatimes.com/weather
 - https://timesofindia.indiatimes.com/?loc=in
 - https://timesofindia.indiatimes.com/city/bangalore
 - https://timesofindia.indiatimes.com/city/delhi
 - https://timesofindia.indiatimes.com/city/hyderabad
 - https://timesofindia.indiatimes.com/us
{'source': 'https://timesofindia.indiatimes.com/'}
{'source': 'https://timesofindia.indiatimes.com/'}
{'source': 'https://timesofindia.indiatimes.com/'}
✅ Total number of chunks created: 521


In [26]:
# ✅ Step 5: Load the vector store and prepare the RAG pipeline

# Load the FAISS index
vectorstore = FAISS.load_local("website_index", embedding_model, allow_dangerous_deserialization=True)

# Convert vectorstore into a retriever object
retriever = vectorstore.as_retriever()

# ✅ Step 1: Define strict RAG-only prompt
# Use ChatPromptTemplate instead of PromptTemplate for better compatibility with chat models
from langchain_core.prompts import ChatPromptTemplate # ✅ Import ChatPromptTemplate
strict_prompt = ChatPromptTemplate.from_template("""
You are a helpful assistant. Use only the context provided below to answer the question.
If the answer is not in the context, say "I don't know."

Context:
{context}

Question: {input}
Answer:
""")

# ✅ Step 2: Create a retrieval chain
from langchain.chains import create_retrieval_chain # ✅ Import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain # ✅ Import create_stuff_documents_chain

# Create a chain to combine documents
combine_docs_chain = create_stuff_documents_chain(
    llm=ChatOpenAI(model_name="gpt-4"),
    prompt=strict_prompt
)

# Create the retrieval chain
qa_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=combine_docs_chain
)

In [27]:
query = "Tell me about bangalore news"  # ask the LLM agent query. As we have used a newspaper website and have extracted only 10 child pages, we have asked a specific question here within the scope

# Pass query as dict matching the prompt variable name
result = qa_chain.invoke({"input": query})

print("🧠 Answer:")
print(result['answer'])  # ✅ this works

# Print source documents (from 'context' key). This is to check if the content LLM shares is from relevent child urls
print("\n📚 Sources:")
for doc in result['context']:
    print("-", doc.metadata.get('source', 'No metadata available'))



🧠 Answer:
The latest news in Bengaluru covers a wide range of topics such as politics, crime, sports, fashion, movies, culture, dance and music, industries, civic bodies, traffic updates, weather updates, new trends, education and BBMP issues. Recently, three burglars who stole gold and other valuables from a political party functionary's house in Bengaluru were arrested. There was also an incident where a man was attacked with a machete by two youths because he refused to give his phone for a call. Moreover, experts are expressing concerns over the cut in SSLC pass mark in Karnataka, arguing that it devalues education.

📚 Sources:
- https://timesofindia.indiatimes.com/city/bangalore
- https://timesofindia.indiatimes.com/city/bangalore
- https://timesofindia.indiatimes.com/city/bangalore
- https://timesofindia.indiatimes.com/city/bangalore
