<a href="https://colab.research.google.com/github/giggsy1106/NLP-HW3_KOTA/blob/main/NLP_HW3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ============================================
# End-to-End: Scrape → Clean → Sentence Split
# → TF-IDF + SBERT embeddings → Similarity
# (Robust extraction with fallbacks for paywalls/noisy HTML)
# ============================================
# --------- Install (run once) ----------
# pip install requests beautifulsoup4 lxml nltk scikit-learn sentence-transformers

import re
import json
import numpy as np
import requests
from bs4 import BeautifulSoup

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer


# =========================
# Step 0: Helpers
# =========================
def normalize_text(s: str) -> str:
    """Clean up whitespace and weird unicode spacing."""
    s = re.sub(r"\s+", " ", s).strip()
    return s


def looks_like_paywall_or_junk(text: str) -> bool:
    """Heuristic check for common paywall / junk outcomes."""
    t = (text or "").lower()
    patterns = [
        "subscribe",
        "sign in",
        "already a subscriber",
        "continue reading",
        "enable javascript",
        "cookies",
        "accept all cookies",
        "to continue",
        "create an account",
        "this content is not available",
    ]
    # If too short or matches many paywall tokens, treat as junk
    hits = sum(1 for p in patterns if p in t)
    return (len(t) < 600) or (hits >= 2)


def extract_article_text_from_html(html: str) -> dict:
    """
    Try multiple extraction strategies:
    1) <article> paragraphs
    2) JSON-LD (NewsArticle/articleBody)
    3) Meta description + <p> fallback
    Returns dict with: {title, text, method}
    """
    soup = BeautifulSoup(html, "lxml")

    # Title (best-effort)
    title = None
    if soup.title and soup.title.get_text(strip=True):
        title = soup.title.get_text(strip=True)
    og_title = soup.find("meta", property="og:title")
    if og_title and og_title.get("content"):
        title = og_title["content"].strip()

    # --- Strategy 1: <article> tag paragraphs ---
    article = soup.find("article")
    if article:
        ps = [p.get_text(" ", strip=True) for p in article.find_all("p")]
        text = normalize_text(" ".join([p for p in ps if p]))
        if text and not looks_like_paywall_or_junk(text):
            return {"title": title, "text": text, "method": "article_tag_paragraphs"}

    # --- Strategy 2: JSON-LD (often contains articleBody) ---
    json_ld_blocks = soup.find_all("script", type="application/ld+json")
    for block in json_ld_blocks:
        try:
            data = json.loads(block.get_text(strip=True))
        except Exception:
            continue

        candidates = data if isinstance(data, list) else [data]
        for obj in candidates:
            if not isinstance(obj, dict):
                continue

            # Some sites nest in @graph
            graph = obj.get("@graph")
            if isinstance(graph, list):
                candidates.extend([g for g in graph if isinstance(g, dict)])

            # Check for articleBody in common schemas
            body = obj.get("articleBody") or obj.get("text")
            if isinstance(body, str) and body.strip():
                text = normalize_text(body)
                if text and not looks_like_paywall_or_junk(text):
                    # Title sometimes lives here too
                    if not title and isinstance(obj.get("headline"), str):
                        title = obj["headline"].strip()
                    return {"title": title, "text": text, "method": "json_ld_articleBody/text"}

    # --- Strategy 3: meta description + all <p> as last resort ---
    desc = ""
    og_desc = soup.find("meta", property="og:description")
    if og_desc and og_desc.get("content"):
        desc = og_desc["content"].strip()

    ps = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
    text = normalize_text(" ".join([p for p in ps if p]))
    # combine desc at top if helpful
    if desc and desc.lower() not in text.lower():
        text = normalize_text(desc + " " + text)

    return {"title": title, "text": text, "method": "fallback_all_paragraphs"}


def safe_sentence_tokenize(text: str) -> list[str]:
    """Tokenize into sentences with NLTK, handling common NLTK resource quirks."""
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt")

    # Some environments also need punkt_tab
    try:
        sents = nltk.sent_tokenize(text)
    except LookupError:
        nltk.download("punkt_tab")
        sents = nltk.sent_tokenize(text)

    # Basic cleanup + remove tiny junk sentences
    sents = [normalize_text(s) for s in sents if normalize_text(s)]
    sents = [s for s in sents if len(s) >= 20]
    return sents


# =========================
# Step 1: Fetch HTML
# =========================
url = "https://www.gutenberg.org/files/1342/1342-h/1342-h.htm" # Changed to a generic, public URL for testing

headers = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "en-US,en;q=0.9",
}

resp = requests.get(url, headers=headers, timeout=60) # Increased timeout to 60 seconds
resp.raise_for_status()
html = resp.text


# =========================
# Step 2: Extract + Clean Article Text
# =========================
extracted = extract_article_text_from_html(html)
title = extracted["title"]
raw_text = extracted["text"]
method = extracted["method"]

print("Extraction method:", method)
print("Title:", title)
print("\nFirst 700 chars:\n", raw_text[:700])
print("Length:", len(raw_text))

if looks_like_paywall_or_junk(raw_text):
    print("\n⚠️ Warning: This looks like paywall/junk content.")
    print("Try a different article URL, or use a source that allows scraping in your environment.")


# =========================
# Step 3: Sentence Split
# =========================
sentences = safe_sentence_tokenize(raw_text)

print("\nTotal sentences:", len(sentences))
print("\nFirst 10 sentences:")
for i, s in enumerate(sentences[:10], 1):
    print(f"{i}. {s}")


# Safety guard
if len(sentences) < 2:
    raise SystemExit("Not enough clean sentences extracted to proceed.")


# =========================
# Step 4: TF-IDF (demo on first 10 sentences)
# =========================
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = vectorizer.fit_transform(sentences[:10])

print("\nTF-IDF shape (first 10 sentences):", tfidf_matrix.shape)


# =========================
# Step 5: Sentence Embeddings (SBERT)
# =========================
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode all sentences
embeddings = model.encode(sentences, convert_to_numpy=True, show_progress_bar=True)
print("\nEmbedding matrix shape:", embeddings.shape)
print("Embedding shape for first sentence:", embeddings[0].shape)


# =========================
# Step 6: Cosine Similarity (Sentence 1 vs Sentence 2)
# =========================
sim_1_2 = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
print("\nCosine similarity between sentence 1 and 2:", float(sim_1_2))


# =========================
# Step 7 (Useful): Find Top-K Sentences Similar to a Query
# =========================
# Query idea: use title if available; else first sentence
query_text = title if (title and len(title) > 10) else sentences[0]
query_emb = model.encode([query_text], convert_to_numpy=True)

scores = cosine_similarity(query_emb, embeddings)[0]  # similarity vs all sentences
top_k = 6
top_idx = np.argsort(scores)[::-1][:top_k]

print("\n=== Top-K sentences most similar to the query ===")
print("Query:", query_text)
for rank, idx in enumerate(top_idx, 1):
    print(f"\n{rank}) score={scores[idx]:.4f}")
    print(sentences[idx])


# =========================
# Step 8 (Optional): Simple Extractive Summary
# =========================
# Take top sentences by similarity, then re-order them by original position
summary_k = 5
summary_idx = sorted(top_idx[:summary_k])
summary = " ".join(sentences[i] for i in summary_idx)

print("\n=== Simple Extractive Summary ===")
print(summary)


Extraction method: fallback_all_paragraphs
Title: Pride and prejudice | Project Gutenberg

First 700 chars:
 PREFACE. List of Illustrations. Chapter: I., II., III., IV., V., VI., VII., VIII., IX., X., XI., XII., XIII., XIV., XV., XVI., XVII., XVIII., XIX., XX., XXI., XXII., XXIII., XXIV., XXV., XXVI., XXVII., XXVIII., XXIX., XXX., XXXI., XXXII., XXXIII., XXXIV., XXXV., XXXVI., XXXVII., XXXVIII., XXXIX., XL., XLI., XLII., XLIII., XLIV., XLV., XLVI., XLVII., XLVIII., XLIX., L., LI., LII., LIII., LIV., LV., LVI., LVII., LVIII., LIX., LX., LXI. by Jane Austen, with a Preface by George Saintsbury and Illustrations by Hugh Thomson CHISWICK PRESS:—CHARLES WHITTINGHAM AND CO. TOOKS COURT, CHANCERY LANE, LONDON. Walt Whitman has somewhere a fine and just distinction between âloving by allowanceâ and â
Length: 720732

Try a different article URL, or use a source that allows scraping in your environment.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Total sentences: 4694

First 10 sentences:
1. List of Illustrations.
2. Chapter: I., II., III., IV., V., VI., VII., VIII., IX., X., XI., XII., XIII., XIV., XV., XVI., XVII., XVIII., XIX., XX., XXI., XXII., XXIII., XXIV., XXV., XXVI., XXVII., XXVIII., XXIX., XXX., XXXI., XXXII., XXXIII., XXXIV., XXXV., XXXVI., XXXVII., XXXVIII., XXXIX., XL., XLI., XLII., XLIII., XLIV., XLV., XLVI., XLVII., XLVIII., XLIX., L., LI., LII., LIII., LIV., LV., LVI., LVII., LVIII., LIX., LX., LXI.
3. by Jane Austen, with a Preface by George Saintsbury and Illustrations by Hugh Thomson CHISWICK PRESS:—CHARLES WHITTINGHAM AND CO. TOOKS COURT, CHANCERY LANE, LONDON.
4. Walt Whitman has somewhere a fine and just distinction between âloving by allowanceâ and âloving with personal love.â This distinction applies to books as well as to men and women; and in the case of the not very numerous authors who are the objects of the personal affection, it brings a curious consequence with it.
5. There is much more d

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/147 [00:00<?, ?it/s]


Embedding matrix shape: (4694, 384)
Embedding shape for first sentence: (384,)

Cosine similarity between sentence 1 and 2: 0.20047548413276672

=== Top-K sentences most similar to the query ===
Query: Pride and prejudice | Project Gutenberg

1) score=0.6378
The goodness of the minor characters in Pride and Prejudice has been already alluded to, and it makes a detailed dwelling on their beauties difficult in any space, and impossible in this.

2) score=0.6184
And despite the ability which Miss Austen has shown in working out the story, I for one should put Pride and Prejudice far lower if it did not contain what seem to me the very masterpieces of Miss Austenâs humour and of her faculty of character-creation—masterpieces who may indeed admit John Thorpe, the Eltons, Mrs. Norris, and one or two others to their company, but who, in one instance certainly, and perhaps in others, are still superior to them.

3) score=0.5623
by Jane Austen, with a Preface by George Saintsbury and Illustr

In [4]:
import argparse
import json
import re
from pathlib import Path
import sys # Import sys

import numpy as np
import requests
from bs4 import BeautifulSoup

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer


# -----------------------
# NLTK safety (your ask)
# -----------------------
def ensure_nltk():
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        raise RuntimeError(
            "NLTK punkt not found. Run: python -c \"import nltk; nltk.download('punkt')\""
        )


# -----------------------
# Text utilities
# -----------------------
def normalize_text(s: str) -> str:
    s = re.sub(r"\s+", " ", (s or "")).strip()
    return s


def looks_like_paywall_or_junk(text: str) -> bool:
    t = (text or "").lower()
    patterns = [
        "subscribe",
        "sign in",
        "already a subscriber",
        "continue reading",
        "enable javascript",
        "cookies",
        "accept all cookies",
        "create an account",
        "this content is not available",
    ]
    hits = sum(1 for p in patterns if p in t)
    return (len(t) < 600) or (hits >= 2)


def extract_article_text_from_html(html: str) -> dict:
    """
    Tries:
      1) <article> + <p> extraction
      2) JSON-LD articleBody/text
      3) all <p> fallback
    Returns: {title, text, method}
    """
    soup = BeautifulSoup(html, "lxml")

    # Title best-effort
    title = None
    if soup.title and soup.title.get_text(strip=True):
        title = soup.title.get_text(strip=True)
    og_title = soup.find("meta", property="og:title")
    if og_title and og_title.get("content"):
        title = og_title["content"].strip()

    # Strategy 1: <article>
    article = soup.find("article")
    if article:
        ps = [p.get_text(" ", strip=True) for p in article.find_all("p")]
        text = normalize_text(" ".join([p for p in ps if p]))
        if text and not looks_like_paywall_or_junk(text):
            return {"title": title, "text": text, "method": "article_tag_paragraphs"}

    # Strategy 2: JSON-LD
    json_ld_blocks = soup.find_all("script", type="application/ld+json")
    for block in json_ld_blocks:
        try:
            data = json.loads(block.get_text(strip=True))
        except Exception:
            continue

        candidates = data if isinstance(data, list) else [data]
        for obj in candidates:
            if not isinstance(obj, dict):
                continue

            # handle @graph
            graph = obj.get("@graph")
            if isinstance(graph, list):
                candidates.extend([g for g in graph if isinstance(g, dict)])

            body = obj.get("articleBody") or obj.get("text")
            if isinstance(body, str) and body.strip():
                text = normalize_text(body)
                if text and not looks_like_paywall_or_junk(text):
                    if not title and isinstance(obj.get("headline"), str):
                        title = obj["headline"].strip()
                    return {"title": title, "text": text, "method": "json_ld_articleBody/text"}

    # Strategy 3: fallback all <p>
    ps = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
    text = normalize_text(" ".join([p for p in ps if p]))
    return {"title": title, "text": text, "method": "fallback_all_paragraphs"}


def sentence_split(text: str) -> list[str]:
    ensure_nltk()
    sents = nltk.sent_tokenize(text)
    sents = [normalize_text(s) for s in sents if normalize_text(s)]
    # remove tiny junk
    sents = [s for s in sents if len(s) >= 20]
    return sents


def read_input_text(url: str | None, html_file: str | None, text_file: str | None, timeout: int = 30) -> dict:
    """
    Exactly one of {url, html_file, text_file} should be provided.
    Returns dict: {title, text, source, method}
    """
    if url:
        headers = {"User-Agent": "Mozilla/5.0", "Accept-Language": "en-US,en;q=0.9"}
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
        extracted = extract_article_text_from_html(resp.text)
        return {
            "title": extracted.get("title"),
            "text": extracted.get("text", ""),
            "source": url,
            "method": extracted.get("method", "unknown"),
        }

    if html_file:
        html = Path(html_file).read_text(encoding="utf-8", errors="ignore")
        extracted = extract_article_text_from_html(html)
        return {
            "title": extracted.get("title"),
            "text": extracted.get("text", ""),
            "source": html_file,
            "method": extracted.get("method", "unknown"),
        }

    if text_file:
        text = Path(text_file).read_text(encoding="utf-8", errors="ignore")
        return {"title": None, "text": normalize_text(text), "source": text_file, "method": "plain_text"}

    raise ValueError("Provide one input: --url OR --html-file OR --text-file")


def main():
    parser = argparse.ArgumentParser(description="Scrape/Load text → sentence split → TF-IDF + SBERT → similarity")
    parser.add_argument("--url", type=str, default=None, help="Article URL (may be paywalled)")
    parser.add_argument("--html-file", type=str, default=None, help="Local HTML file path")
    parser.add_argument("--text-file", type=str, default=None, help="Local text file path")
    parser.add_argument("--topk", type=int, default=6, help="Top-K similar sentences to query")
    parser.add_argument("--summary-k", type=int, default=5, help="How many sentences to include in extractive summary")
    parser.add_argument("--timeout", type=int, default=30, help="HTTP timeout seconds")
    args, unknown = parser.parse_known_args() # Modified to parse_known_args

    # Validate inputs
    provided = [bool(args.url), bool(args.html_file), bool(args.text_file)]
    if sum(provided) != 1:
        raise SystemExit("❌ Provide exactly one: --url OR --html-file OR --text-file")

    data = read_input_text(args.url, args.html_file, args.text_file, timeout=args.timeout)
    title, raw_text, source, method = data["title"], data["text"], data["source"], data["method"]

    print("Source:", source)
    print("Extraction method:", method)
    print("Title:", title)
    print("\nFirst 700 chars:\n", raw_text[:700])
    print("Length:", len(raw_text))

    # Paywall/junk guard (important for WaPo)
    if looks_like_paywall_or_junk(raw_text) and args.url:
        print(
            "\n⚠️ Looks like paywall/junk HTML (common with Washington Post). "
            "Tip: save the HTML/text locally and run with --html-file or --text-file."
        )

    sentences = sentence_split(raw_text)
    print("\nTotal sentences:", len(sentences))
    if len(sentences) < 2:
        raise SystemExit("❌ Not enough clean sentences to continue.")

    print("\nFirst 10 sentences:")
    for i, s in enumerate(sentences[:10], 1):
        print(f"{i}. {s}")

    # TF-IDF demo on first 10 sentences
    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(sentences[:10])
    print("\nTF-IDF shape (first 10 sentences):", tfidf_matrix.shape)

    # SBERT embeddings
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(sentences, convert_to_numpy=True, show_progress_bar=True)
    print("\nEmbedding matrix shape:", embeddings.shape)
    print("Embedding shape for first sentence:", embeddings[0].shape)

    # Cosine similarity between sentence 1 and 2
    sim_1_2 = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    print("\nCosine similarity between sentence 1 and 2:", float(sim_1_2))

    # Query: title if available else first sentence
    query_text = title if (title and len(title) > 10) else sentences[0]
    query_emb = model.encode([query_text], convert_to_numpy=True)
    scores = cosine_similarity(query_emb, embeddings)[0]

    topk = min(args.topk, len(sentences))
    top_idx = np.argsort(scores)[::-1][:topk]

    print("\n=== Top-K sentences most similar to the query ===")
    print("Query:", query_text)
    for rank, idx in enumerate(top_idx, 1):
        print(f"\n{rank}) score={scores[idx]:.4f}")
        print(sentences[idx])

    # Simple extractive summary: take best K, then re-order by original index
    summary_k = min(args.summary_k, len(top_idx))
    summary_idx = sorted(top_idx[:summary_k])
    summary = " ".join(sentences[i] for i in summary_idx)

    print("\n=== Simple Extractive Summary ===")
    print(summary)


if __name__ == "__main__":
    # Simulate command-line arguments for notebook execution
    original_argv = sys.argv
    sys.argv = ['script_name.py', '--url', 'https://www.gutenberg.org/files/1342/1342-h/1342-h.htm']
    try:
        main()
    finally:
        sys.argv = original_argv # Restore original sys.argv after execution


Source: https://www.gutenberg.org/files/1342/1342-h/1342-h.htm
Extraction method: fallback_all_paragraphs
Title: Pride and prejudice | Project Gutenberg

First 700 chars:
 PREFACE. List of Illustrations. Chapter: I., II., III., IV., V., VI., VII., VIII., IX., X., XI., XII., XIII., XIV., XV., XVI., XVII., XVIII., XIX., XX., XXI., XXII., XXIII., XXIV., XXV., XXVI., XXVII., XXVIII., XXIX., XXX., XXXI., XXXII., XXXIII., XXXIV., XXXV., XXXVI., XXXVII., XXXVIII., XXXIX., XL., XLI., XLII., XLIII., XLIV., XLV., XLVI., XLVII., XLVIII., XLIX., L., LI., LII., LIII., LIV., LV., LVI., LVII., LVIII., LIX., LX., LXI. by Jane Austen, with a Preface by George Saintsbury and Illustrations by Hugh Thomson CHISWICK PRESS:—CHARLES WHITTINGHAM AND CO. TOOKS COURT, CHANCERY LANE, LONDON. Walt Whitman has somewhere a fine and just distinction between âloving by allowanceâ and â
Length: 720732

Total sentences: 4694

First 10 sentences:
1. List of Illustrations.
2. Chapter: I., II., III., IV., V., VI., V

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Batches:   0%|          | 0/147 [00:00<?, ?it/s]


Embedding matrix shape: (4694, 384)
Embedding shape for first sentence: (384,)

Cosine similarity between sentence 1 and 2: 0.20047548413276672

=== Top-K sentences most similar to the query ===
Query: Pride and prejudice | Project Gutenberg

1) score=0.6378
The goodness of the minor characters in Pride and Prejudice has been already alluded to, and it makes a detailed dwelling on their beauties difficult in any space, and impossible in this.

2) score=0.6184
And despite the ability which Miss Austen has shown in working out the story, I for one should put Pride and Prejudice far lower if it did not contain what seem to me the very masterpieces of Miss Austenâs humour and of her faculty of character-creation—masterpieces who may indeed admit John Thorpe, the Eltons, Mrs. Norris, and one or two others to their company, but who, in one instance certainly, and perhaps in others, are still superior to them.

3) score=0.5623
by Jane Austen, with a Preface by George Saintsbury and Illustr

In [7]:
import sys
import argparse
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

# ... keep all your existing helper functions above (ensure_nltk, extract, etc.) ...

def main(argv=None):
    """
    - Terminal: python main.py --url "..."  (argv=None → uses sys.argv)
    - Colab/Jupyter: main(["--text-file","article.txt"])  (argv=list)
    """
    parser = argparse.ArgumentParser(
        description="Scrape/Load text → sentence split → TF-IDF + SBERT → similarity",
        add_help=True,
    )
    parser.add_argument("--url", type=str, default=None, help="Article URL (may be paywalled)")
    parser.add_argument("--html-file", type=str, default=None, help="Local HTML file path")
    parser.add_argument("--text-file", type=str, default=None, help="Local text file path")
    parser.add_argument("--topk", type=int, default=6, help="Top-K similar sentences to query")
    parser.add_argument("--summary-k", type=int, default=5, help="Extractive summary sentence count")
    parser.add_argument("--timeout", type=int, default=30, help="HTTP timeout seconds")

    # ✅ KEY FIX: ignore unknown args like "-f" from Jupyter
    args, _unknown = parser.parse_known_args(argv)

    provided = [bool(args.url), bool(args.html_file), bool(args.text_file)]
    if sum(provided) != 1:
        # ✅ In notebooks, don't crash hard; show help + clear message
        parser.print_help()
        raise ValueError("Provide exactly one: --url OR --html-file OR --text-file")

    data = read_input_text(args.url, args.html_file, args.text_file, timeout=args.timeout)
    title, raw_text, source, method = data["title"], data["text"], data["source"], data["method"]

    print("Source:", source)
    print("Extraction method:", method)
    print("Title:", title)
    print("\nFirst 700 chars:\n", raw_text[:700])
    print("Length:", len(raw_text))

    if looks_like_paywall_or_junk(raw_text) and args.url:
        print(
            "\n⚠️ Looks like paywall/junk HTML (common with WaPo). "
            "Use --html-file/--text-file for stable results."
        )

    sentences = sentence_split(raw_text)
    print("\nTotal sentences:", len(sentences))
    if len(sentences) < 2:
        raise ValueError("Not enough clean sentences to continue.")

    print("\nFirst 10 sentences:")
    for i, s in enumerate(sentences[:10], 1):
        print(f"{i}. {s}")

    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(sentences[:10])
    print("\nTF-IDF shape (first 10 sentences):", tfidf_matrix.shape)

    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(sentences, convert_to_numpy=True, show_progress_bar=True)
    print("\nEmbedding matrix shape:", embeddings.shape)
    print("Embedding shape for first sentence:", embeddings[0].shape)

    sim_1_2 = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    print("\nCosine similarity between sentence 1 and 2:", float(sim_1_2))

    query_text = title if (title and len(title) > 10) else sentences[0]
    query_emb = model.encode([query_text], convert_to_numpy=True)
    scores = cosine_similarity(query_emb, embeddings)[0]

    topk = min(args.topk, len(sentences))
    top_idx = np.argsort(scores)[::-1][:topk]

    print("\n=== Top-K sentences most similar to the query ===")
    print("Query:", query_text)
    for rank, idx in enumerate(top_idx, 1):
        print(f"\n{rank}) score={scores[idx]:.4f}")
        print(sentences[idx])

    summary_k = min(args.summary_k, len(top_idx))
    summary_idx = sorted(top_idx[:summary_k])
    summary = " ".join(sentences[i] for i in summary_idx)

    print("\n=== Simple Extractive Summary ===")
    print(summary)


if __name__ == "__main__":
    # Simulate command-line arguments for notebook execution
    main(argv=['--url', 'https://www.gutenberg.org/files/1342/1342-h/1342-h.htm'])

Source: https://www.gutenberg.org/files/1342/1342-h/1342-h.htm
Extraction method: fallback_all_paragraphs
Title: Pride and prejudice | Project Gutenberg

First 700 chars:
 PREFACE. List of Illustrations. Chapter: I., II., III., IV., V., VI., VII., VIII., IX., X., XI., XII., XIII., XIV., XV., XVI., XVII., XVIII., XIX., XX., XXI., XXII., XXIII., XXIV., XXV., XXVI., XXVII., XXVIII., XXIX., XXX., XXXI., XXXII., XXXIII., XXXIV., XXXV., XXXVI., XXXVII., XXXVIII., XXXIX., XL., XLI., XLII., XLIII., XLIV., XLV., XLVI., XLVII., XLVIII., XLIX., L., LI., LII., LIII., LIV., LV., LVI., LVII., LVIII., LIX., LX., LXI. by Jane Austen, with a Preface by George Saintsbury and Illustrations by Hugh Thomson CHISWICK PRESS:—CHARLES WHITTINGHAM AND CO. TOOKS COURT, CHANCERY LANE, LONDON. Walt Whitman has somewhere a fine and just distinction between âloving by allowanceâ and â
Length: 720732

Total sentences: 4694

First 10 sentences:
1. List of Illustrations.
2. Chapter: I., II., III., IV., V., VI., V

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Batches:   0%|          | 0/147 [00:00<?, ?it/s]


Embedding matrix shape: (4694, 384)
Embedding shape for first sentence: (384,)

Cosine similarity between sentence 1 and 2: 0.20047548413276672

=== Top-K sentences most similar to the query ===
Query: Pride and prejudice | Project Gutenberg

1) score=0.6378
The goodness of the minor characters in Pride and Prejudice has been already alluded to, and it makes a detailed dwelling on their beauties difficult in any space, and impossible in this.

2) score=0.6184
And despite the ability which Miss Austen has shown in working out the story, I for one should put Pride and Prejudice far lower if it did not contain what seem to me the very masterpieces of Miss Austenâs humour and of her faculty of character-creation—masterpieces who may indeed admit John Thorpe, the Eltons, Mrs. Norris, and one or two others to their company, but who, in one instance certainly, and perhaps in others, are still superior to them.

3) score=0.5623
by Jane Austen, with a Preface by George Saintsbury and Illustr