In [3]:
!pip install langchain langchain-community langchain-openai openai requests beautifulsoup4




In [4]:
!pip install -q sentence-transformers


In [5]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [28]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import requests
from bs4 import BeautifulSoup
import torch
import gradio as gr
import nltk

# Fetch the NLTK resources up front, before any call to nltk.sent_tokenize.
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource)

# ---------------------------
# Global models (load once)
# ---------------------------
# Question-answering model and its tokenizer, both loaded from the same checkpoint.
BERT_MODEL_NAME = "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad"
bert_tokenizer, bert_model = (
    AutoTokenizer.from_pretrained(BERT_MODEL_NAME),
    AutoModelForQuestionAnswering.from_pretrained(BERT_MODEL_NAME),
)
# Sentence encoder used to embed both the text chunks and the user's question.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# ---------------------------
# Data processing
# ---------------------------
def process_url(url_link, timeout=15):
    """Download a web page and return its readable text as one string.

    The text of every ``<p>`` element longer than 30 characters (after
    stripping) is joined with single spaces. If that yields nothing, the
    text of the whole document is returned instead.

    Args:
        url_link: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would freeze the calling UI callback.

    Returns:
        The extracted text; may be an empty string if the page has no text.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # A browser-like User-Agent is sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/115.0.0.0 Safari/537.36"
    }
    response = requests.get(url_link, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract each paragraph's text once and keep only the substantial ones.
    paragraphs = []
    for tag in soup.find_all("p"):
        content = tag.get_text().strip()
        if len(content) > 30:
            paragraphs.append(content)

    text = " ".join(paragraphs)
    if not text:
        # No usable <p> content: fall back to all text in the document.
        text = soup.get_text(separator=" ", strip=True)
    return text

def chunk_by_sentences(text, max_tokens=400, overlap=50):
    """Split ``text`` into sentence-aligned chunks bounded by a token budget.

    Sentences are accumulated until adding the next one would push the chunk
    past ``max_tokens`` (counted with ``bert_tokenizer``). The chunk is then
    emitted and the next chunk starts with the trailing whole sentences of the
    previous one, as long as those carried sentences total at most ``overlap``
    tokens.

    Args:
        text: Text to split.
        max_tokens: Target upper bound on tokens per chunk. A single sentence
            longer than this still becomes its own (oversized) chunk.
        overlap: Maximum number of tokens carried over from the end of one
            chunk to the start of the next; ``0`` disables the overlap.

    Returns:
        A list of chunk strings rebuilt from the tokenizer's tokens. If no
        chunk could be produced, the list contains ``text`` unchanged.
    """
    def tokens_to_text(sentence_token_lists):
        # Flatten in linear time (sum(list_of_lists, []) is quadratic).
        flat = [tok for sent_toks in sentence_token_lists for tok in sent_toks]
        return bert_tokenizer.convert_tokens_to_string(flat)

    sentences = nltk.sent_tokenize(text)
    chunks = []
    current = []         # one token list per sentence in the open chunk
    current_length = 0   # total number of tokens across `current`

    for sent in sentences:
        sent_tokens = bert_tokenizer.tokenize(sent)

        # Only flush a non-empty chunk; otherwise an over-long first sentence
        # would emit an empty string as a chunk.
        if current and current_length + len(sent_tokens) > max_tokens:
            chunks.append(tokens_to_text(current))

            # Carry over trailing sentences, measured in tokens (the budget is
            # a token count, not a sentence count).
            carried, carried_length = [], 0
            for previous in reversed(current):
                if carried_length + len(previous) > overlap:
                    break
                carried.append(previous)
                carried_length += len(previous)
            carried.reverse()

            # Drop the overlap when it would make the new chunk exceed the budget.
            if carried_length + len(sent_tokens) > max_tokens:
                carried, carried_length = [], 0

            current, current_length = carried, carried_length

        current.append(sent_tokens)
        current_length += len(sent_tokens)

    if current:
        chunks.append(tokens_to_text(current))
    if not chunks:
        chunks = [text]  # fallback if text too short
    return chunks

def embeddings_vector_store(chunks):
    """Embed ``chunks`` and index the vectors in a flat L2 FAISS index.

    Args:
        chunks: Non-empty list of text chunks.

    Returns:
        A ``(index, chunks, embedding_model)`` tuple.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    if not chunks:
        raise ValueError("No chunks were generated from the text!")

    matrix = embedding_model.encode(chunks, convert_to_numpy=True)
    # A 1-D result is promoted to a single-row matrix before indexing.
    if matrix.ndim == 1:
        matrix = matrix[np.newaxis, :]

    faiss_index = faiss.IndexFlatL2(matrix.shape[1])
    faiss_index.add(matrix)
    return faiss_index, chunks, embedding_model

def retriever(vectorstore, question, k=8):
    """Return the chunks closest to ``question`` in the vector store.

    Args:
        vectorstore: ``(index, chunks, emb_model)`` tuple as returned by
            ``embeddings_vector_store``.
        question: Query text to embed and search for.
        k: Maximum number of chunks to return. Coerced to ``int`` and clamped
            to the range ``1..len(chunks)``.

    Returns:
        Up to ``k`` chunk strings, in the order returned by the index search.
        An empty list if the store holds no chunks.
    """
    index, chunks, emb_model = vectorstore
    if not chunks:
        return []

    # Never ask for more neighbours than there are chunks; UI widgets may also
    # deliver k as a float.
    k = max(1, min(int(k), len(chunks)))

    query_vector = emb_model.encode([question])
    _distances, indices = index.search(query_vector, k)

    # FAISS fills missing results with -1. A bare `i < len(chunks)` test lets
    # -1 through, and chunks[-1] would then silently return the last chunk.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

# ---------------------------
# QA with BERT
# ---------------------------
def run_bert(question, chunks, max_answer_len=30):
    """Extract the highest-scoring answer span for ``question`` from ``chunks``.

    Each chunk is scored separately with the module-level ``bert_model``. For
    every chunk the best *valid* span is searched jointly: the start must not
    come after the end, the span must lie inside the chunk (not the question
    or special tokens), and it may cover at most ``max_answer_len`` tokens.
    Picking start and end with two independent argmax calls can yield
    ``end < start``, which decodes to an empty answer with a high score.

    Args:
        question: The question text.
        chunks: Iterable of context strings to search.
        max_answer_len: Maximum answer length in tokens.

    Returns:
        The decoded text of the best span over all chunks, or ``""`` when
        ``chunks`` is empty or no valid span exists.
    """
    best_answer = ""
    best_score = -float("inf")
    neg_inf = float("-inf")

    for chunk in chunks:
        # Calling the tokenizer directly replaces the legacy encode_plus().
        inputs = bert_tokenizer(
            question,
            chunk,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )
        input_ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]

        with torch.no_grad():
            outputs = bert_model(input_ids, token_type_ids=token_type_ids)
        start_scores = outputs.start_logits[0]
        end_scores = outputs.end_logits[0]

        # Segment id 1 marks the second sequence (the chunk); its closing
        # separator token is excluded so it can never be part of an answer.
        in_context = (token_type_ids[0] == 1) & (
            input_ids[0] != bert_tokenizer.sep_token_id
        )
        if not bool(in_context.any()):
            continue  # nothing from the chunk is present in the input

        in_context = in_context.to(start_scores.device)
        start_scores = start_scores.masked_fill(~in_context, neg_inf)
        end_scores = end_scores.masked_fill(~in_context, neg_inf)

        # span_scores[s, e] = start logit at s + end logit at e.
        span_scores = start_scores.unsqueeze(1) + end_scores.unsqueeze(0)
        seq_len = span_scores.size(0)
        positions = torch.arange(seq_len, device=span_scores.device)
        span_length = positions.unsqueeze(0) - positions.unsqueeze(1)  # e - s
        allowed = (span_length >= 0) & (span_length < max_answer_len)
        span_scores = span_scores.masked_fill(~allowed, neg_inf)

        start_idx, end_idx = divmod(int(torch.argmax(span_scores)), seq_len)
        score = span_scores[start_idx, end_idx].item()

        if score > best_score:
            best_score = score
            answer_ids = input_ids[0][start_idx:end_idx + 1].tolist()
            best_answer = bert_tokenizer.convert_tokens_to_string(
                bert_tokenizer.convert_ids_to_tokens(answer_ids)
            )

    return best_answer

# ---------------------------
# Main pipeline
# ---------------------------
def answer_question_from_url(url, question, chunk_size=400, chunk_overlap=50, top_k=3):
    """Answer ``question`` using the text of the page at ``url``.

    Pipeline: fetch the page text, split it into chunks, embed and index the
    chunks, retrieve the ``top_k`` closest to the question, then run the
    question-answering model over them. The retrieved chunks are also printed
    (first 300 characters each).

    Args:
        url: Address of the page to read.
        question: Question to answer.
        chunk_size: Token budget per chunk, passed to ``chunk_by_sentences``.
        chunk_overlap: Overlap passed to ``chunk_by_sentences``.
        top_k: Number of chunks to retrieve; coerced to ``int``.

    Returns:
        ``(answer, retrieved_chunks)``. On invalid input, a failed download or
        an empty page, ``answer`` is an error message starting with "❌" and
        the list is empty.
    """
    url = (url or "").strip()
    question = (question or "").strip()
    if not url or not question:
        return "❌ Please provide both a URL and a question.", []

    # Report fetch problems the same way as the empty-page case below instead
    # of letting the exception escape into the UI callback.
    try:
        text = process_url(url)
    except requests.RequestException as exc:
        return f"❌ Could not fetch the URL: {exc}", []

    if not text.strip():
        return "❌ Could not extract text from the URL.", []

    chunked_text = chunk_by_sentences(text, chunk_size, chunk_overlap)
    vectorstore = embeddings_vector_store(chunked_text)
    # UI sliders may deliver the value as a float; the index search needs an int.
    retrieved_chunks = retriever(vectorstore, question, int(top_k))

    print("\n--- Retrieved Chunks ---\n")
    for i, c in enumerate(retrieved_chunks, 1):
        print(f"Chunk {i}:\n{c[:300]}...\n")

    answer = run_bert(question, retrieved_chunks)
    return answer, retrieved_chunks

# ---------------------------
# Gradio UI
# ---------------------------
# Layout: inputs and controls in the left column, answer and retrieved chunks
# in the right column. Event handlers are wired up after the layout.
with gr.Blocks(title="Websites Analyzer for Lazy Readers", theme=gr.themes.Soft()) as demo:
    gr.Markdown("📄🔎 **RAG website:** drop your URL, ask a question, read the answer — it's that easy!")

    with gr.Row():
        with gr.Column(scale=1):
            url_input = gr.Textbox(label="Website URL")
            query_input = gr.Textbox(label="Ask a question about the WEBSITE")
            top_k_slider = gr.Slider(1, 10, value=3, step=1, label="Top-K Chunks")
            ask_btn = gr.Button("Answer", variant="primary")
            # The handler only clears the output panes, so the label says so.
            clear_btn = gr.Button("Clear results")
            status = gr.Markdown("Status: _waiting for URL_")

        with gr.Column(scale=2):
            answer_output = gr.Markdown(label="Answer")
            with gr.Accordion("Show retrieved chunks", open=False):
                chunks_view = gr.Markdown()

    def answer_query(url, question, top_k):
        """Run the pipeline and return (answer, retrieved chunks text, status)."""
        # The slider value is coerced to int before it reaches the retriever.
        answer, retrieved = answer_question_from_url(url, question, top_k=int(top_k))
        retrieved_text = "\n\n---\n\n".join(retrieved)
        # An empty chunk list means the pipeline returned an error message
        # rather than an answer, so do not report success in that case.
        status_text = "✅ Done!" if retrieved else "⚠️ No content retrieved."
        return answer, retrieved_text, status_text

    def reset_models():
        """Clear the answer and chunk panes.

        The models are module-level globals and are not unloaded here, so the
        status message must not claim that memory was freed.
        """
        return "", "", "Status: _results cleared_"

    ask_btn.click(
        fn=answer_query,
        inputs=[url_input, query_input, top_k_slider],
        outputs=[answer_output, chunks_view, status]
    )

    clear_btn.click(
        fn=reset_models,
        inputs=[],
        outputs=[answer_output, chunks_view, status]
    )

demo.queue().launch(debug=True)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Some weights of the model checkpoint at google-bert/bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://d15046ce3e30eac51d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)



--- Retrieved Chunks ---

Chunk 1:
[ 9 ] [ 10 ] [ 11 ] [ 12 ] his maternal grandfather, joshua n. haldeman, who died in a plane crash when elon was a toddler, was an american - born canadian chiropractor, aviator and political activist in the technocracy movement [ 13 ] [ 14 ] who moved to south africa in 1950. [ 15 ] haldeman ' s an...

Chunk 2:
[ 31 ] elon has recounted trips to a wilderness school that he described as a " paramilitary lord of the flies " where " bullying was a virtue " and children were encouraged to fight over rations. [ 32 ] in one incident, after an altercation with a fellow pupil, elon was thrown down concrete steps a...

Chunk 3:
in early 2025, he served as senior advisor to united states president donald trump and as the de facto head of doge. after a public feud with trump, musk left the trump administration and announced he was creating his own political party, the america party. musk ' s political activities, views, and ...


--- Retrieved Chunks ---

Chun

