# Visit Grand Junction – End‑to‑End RAG Notebook (GPU, LoRA, UX Extras)
This single Jupyter notebook now contains **all 12 requested upgrades**:

1. Incremental recrawl with SHA‑256 hashing  
2. Back‑off & retry on fetch failures  
3. MIME‑type detection (saves PDFs/Docs separately)  
4. Boilerplate removal via **trafilatura**  
5. Semantic chunking (sentence/window)  
6. Retrieval **+ BGE reranking**  
7. Local **Mistral‑7B** with LoRA support (4‑bit)  
8. In‑notebook LoRA fine‑tuning on branded Q&A  
9. Streamlit front‑end with citation pop‑overs  
10. Image carousel placeholder  
11. Follow‑up suggestion chips  
12. Chat‑history export to PDF

Run sections sequentially (0 → 11).  
LoRA training needs ~24 GB GPU VRAM or adjust batch/accumulation steps.


In [None]:
!pip -q install faiss-cpu aiohttp trafilatura peft sentence-transformers trl accelerate qdrant-client pdfkit reportlab streamlit pymupdf nltk

In [None]:
# import asyncio, hashlib, json, mimetypes, os, random, re, time, uuid, xml.etree.ElementTree as ET
# from pathlib import Path
# from urllib.parse import urljoin, urldefrag, urlparse

# import aiohttp, faiss, numpy as np, requests, torch
# from bs4 import BeautifulSoup
# from sentence_transformers import SentenceTransformer, CrossEncoder
# from tqdm.notebook import tqdm
# from trafilatura import extract

In [None]:
import asyncio, aiohttp, hashlib, json, mimetypes, os, random, re, time, uuid, textwrap, datetime, xml.etree.ElementTree as ET
from pathlib import Path
from urllib.parse import urlparse, urljoin, urldefrag
import bs4, trafilatura, nltk, requests
import torch, faiss, numpy as np
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
nltk.download("punkt", quiet=True)

In [None]:
import nltk

for res in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{res}")
    except LookupError:
        nltk.download(res, quiet=True)

In [None]:
BASE_URL              = "https://www.visitgrandjunction.com"
ADDITIONAL_DOMAINS    = ["campaign-archive.com", "mailchi.mp"]
DATA_DIR_TXT          = Path("data/html_txt");    DATA_DIR_TXT.mkdir(parents=True, exist_ok=True)
RAW_HTML_DIR          = Path("data/raw_html");    RAW_HTML_DIR.mkdir(parents=True, exist_ok=True)
NO_TEXT_HTML_DIR      = Path("data/html_no_text");NO_TEXT_HTML_DIR.mkdir(parents=True, exist_ok=True)
MIME_DIR              = Path("data/mime");        MIME_DIR.mkdir(parents=True, exist_ok=True)
CRAWL_DELAY           = 0.5
N_WORKERS             = 10
MAX_RETRIES           = 5
BACKOFF_FACTOR        = 1.5
EMBED_MODEL_NAME      = "sentence-transformers/all-MiniLM-L6-v2"
RERANKER_NAME         = "BAAI/bge-reranker-base"
LLM_BASE              = "mistralai/Mistral-7B-Instruct-v0.2"
LORA_ADAPTER_OUT      = Path("lora-vgj-checkpoint")
MANUAL_QA_JL          = Path("vgj_lora_dataset.jsonl")
AUTO_QA_JL            = Path("vgj_auto_dataset.jsonl")
COMBINED_QA_JL        = Path("vgj_combined.jsonl")
INDEX_PATH            = Path("faiss.index")
CHUNK_TOKENS          = 200
OVERLAP_TOKENS        = 40
DEVICE                = "cuda" if torch.cuda.is_available() else "cpu"
HASH_RECORDS          = Path("data/hashes.json")
DEBUG                 = True

## 1  Helper functions (robots, hashing, sitemap)

In [None]:
def dbg(*msg): 
    if DEBUG: print(*msg)

def robots_disallow(domain):
    try:
        txt = requests.get(f"https://{domain}/robots.txt", timeout=10).text.lower()
        return [l.split(":",1)[1].strip() for l in txt.splitlines() if l.startswith("disallow")]
    except Exception:
        return []

def internal_set(base, extra):
    return {urlparse(base).netloc, *extra}

def allowed(url, nets, dis_map):
    p = urlparse(url)
    if p.scheme not in {"http", "https"}:
        return False

    # accept if the host ends with any entry in nets
    if not any(p.netloc == n or p.netloc.endswith("." + n) for n in nets):
        return False

    # robots.txt disallows
    return not any(url.startswith(path) for path in dis_map.get(p.netloc, []))

async def sitemap_seed(base, nets):
    try:
        r = requests.get(f"{base}/sitemap.xml", timeout=15); r.raise_for_status()
        locs = re.findall(r"<loc>(.*?)</loc>", r.text)
        urls = [u for u in locs if allowed(u, nets, {})]
        dbg(f"Sitemap OK – {len(urls)} internal URLs")
        return urls or [base]
    except Exception as e:
        dbg("Sitemap fetch failed:", e)
        return [base]


In [None]:
def sha256(b): return hashlib.sha256(b).hexdigest()
HASH_DB = json.loads(HASH_RECORDS.read_text()) if HASH_RECORDS.exists() else {}
def upsert_hash(url, h):
    HASH_DB[url] = h
    HASH_RECORDS.write_text(json.dumps(HASH_DB))

In [None]:
class RateLimiter:
    def __init__(self, delay): self.delay, self.next_ts, self.lock = delay, 0, asyncio.Lock()
    async def __aenter__(self):
        async with self.lock:
            await asyncio.sleep(max(0, self.next_ts - time.time()))
            self.next_ts = time.time() + self.delay
    async def __aexit__(self, *_): pass

In [None]:
USER_AGENTS = [
    # Windows Chrome
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/125.0.0.0 Safari/537.36",

    # macOS Safari
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5) "
    "AppleWebKit/605.1.15 (KHTML, like Gecko) "
    "Version/17.4 Safari/605.1.15",

    # Firefox
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) "
    "Gecko/20100101 Firefox/124.0",

    # Microsoft Edge
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0",
]

async def fetch(session, url, rl):
    for attempt in range(MAX_RETRIES):
        try:
            async with rl:
                async with session.get(url, timeout=20) as r:
                    r.raise_for_status()
                    mime = r.headers.get("content-type","text/html").split(";")[0]
                    return mime, await r.read()
        except Exception:
            await asyncio.sleep(BACKOFF_FACTOR ** attempt)
    return None, None

In [None]:
async def crawl(seed):
    assert seed, "Seed list empty – nothing to crawl."
    nets = internal_set(BASE_URL, ADDITIONAL_DOMAINS)
    dis  = {n: robots_disallow(n) for n in nets}
    q    = asyncio.Queue();  [q.put_nowait(u) for u in seed]
    seen = set()

    ua = random.choice(USER_AGENTS)
    headers = {
        "User-Agent":        ua,
        "Accept":            "text/html,application/xhtml+xml,application/xml;q=0.9,"
                             "image/avif,image/webp,*/*;q=0.8",
        "Accept-Language":   "en-US,en;q=0.9",
        "Accept-Encoding":   "gzip, deflate, br",
        "Upgrade-Insecure-Requests": "1",
        "Connection":        "keep-alive",
    }
    async with aiohttp.ClientSession(headers=headers) as session:
        tasks = [asyncio.create_task(
                    worker(f"w{i}", i, session, q, seen, CRAWL_DELAY, nets, dis))
                 for i in range(N_WORKERS)]
        await q.join()
        for t in tasks: t.cancel()


In [None]:
async def worker(name, idx, session, q, seen, delay, nets, dis):
    rl  = RateLimiter(delay)
    bar = tqdm(total=0, position=idx+1, desc=name, unit="pg", leave=True)

    while True:
        url = await q.get(); q.task_done()
        if url in seen:
            continue
        seen.add(url)
        bar.set_description(f"{name} {url}")

        mime, body = await fetch(session, url, rl)
        if not body:
            bar.update(); continue

        uid = hashlib.md5(url.encode()).hexdigest()

        # ───── Non-HTML assets ─────
        if mime != "text/html":
            ext = mimetypes.guess_extension(mime) or ".bin"
            (MIME_DIR / f"{uid}{ext}").write_bytes(body)
            bar.update(); continue

        # Always parse links from *any* HTML page
        soup = bs4.BeautifulSoup(body, "lxml")
        for a in soup.find_all("a", href=True):
            link, _ = urldefrag(urljoin(url, a["href"]))
            if allowed(link, nets, dis):
                q.put_nowait(link)

        # Clean-text extraction to decide where to store the page
        text = trafilatura.extract(body) or ""
        unsupported = "your browser is not supported for this experience" in text.lower()

        if len(text) < 100 or unsupported:
            # HTML with little/no useful text
            (NO_TEXT_HTML_DIR / f"{uid}.html").write_bytes(body)
            bar.update(); continue

        # “Good” page: keep everything
        (RAW_HTML_DIR  / f"{uid}.html").write_bytes(body)
        (DATA_DIR_TXT  / f"{uid}.txt").write_text(text)
        (DATA_DIR_TXT  / f"{uid}.url").write_text(url)
        bar.update()


In [None]:
DEBUG      = True
seed_urls  = await sitemap_seed(BASE_URL, internal_set(BASE_URL, ADDITIONAL_DOMAINS))
await crawl(seed_urls)
DEBUG      = False

## 3  Semantic chunking

In [None]:
# ────────────────────────────────────────────────────────────────
# 1.  Chunk helper  (sentence-aware, overlap aware)
# ────────────────────────────────────────────────────────────────
def chunks(text: str, max_tok: int = CHUNK_TOKENS, ov: int = OVERLAP_TOKENS):
    """
    Yield overlapping chunks of <= max_tok words.
    Overlap = last `ov` sentences of the previous chunk.
    """
    sents = nltk.sent_tokenize(text)
    buf, cur = [], 0                 # buffer of sentences, current token count

    for s in sents:
        n = len(s.split())
        if cur + n > max_tok and buf:
            yield " ".join(buf)
            # keep the *sentences* overlap, not tokens
            buf = buf[-ov:] if ov else []
            cur = sum(len(t.split()) for t in buf)

        buf.append(s)
        cur += n

    if buf:
        yield " ".join(buf)

# ────────────────────────────────────────────────────────────────
# 2.  Build / refresh FAISS index
# ────────────────────────────────────────────────────────────────
from tqdm.auto import tqdm
import faiss, json, numpy as np
from sentence_transformers import SentenceTransformer

TXT_DIR   = DATA_DIR_TXT                 # data/html_txt/
META_JSON = Path("meta.jsonl")           # one JSON per line

embedder  = SentenceTransformer(EMBED_MODEL_NAME, device=DEVICE)

index = None                # lazy-init once we know the dim
meta_f = META_JSON.open("w")

EXCLUDE_PREFIX = "https://www.visitgrandjunction.com/blog/all-posts"

# stream through all .txt files
files = sorted(TXT_DIR.glob("*.txt"))
for f in tqdm(files, desc="chunk->embed->index", unit="doc"):
    url = (f.parent / f"{f.stem}.url").read_text().strip()

    # ─── skip anything under /blog/all-posts ───
    if url.startswith(EXCLUDE_PREFIX):
        continue
    
    txt = f.read_text()

    for chunk in chunks(txt):
        vec = embedder.encode([chunk],
                              convert_to_numpy=True,
                              normalize_embeddings=True)[0]

        if index is None:                # first vector → build index shell
            index = faiss.IndexFlatIP(vec.shape[0])

        index.add(vec.reshape(1, -1))

        # keep meta aligned with vector order
        meta_f.write(json.dumps({"url": url, "text": chunk}) + "\n")

meta_f.close()
faiss.write_index(index, str(INDEX_PATH))   # ← convert Path → str
print(f"Indexed {index.ntotal:,} chunks → {INDEX_PATH}")
print(f"Meta written           → {META_JSON}")

In [None]:
# ────────────────────────────────────────────────────────────────
# 3.  Reranker + retrieval helper
# ────────────────────────────────────────────────────────────────
from sentence_transformers.cross_encoder import CrossEncoder
import itertools

# reload meta into memory (URLs are small)
meta_records = [json.loads(l) for l in META_JSON.open()]

reranker = CrossEncoder(RERANKER_NAME, device=DEVICE)

def retrieve(query: str, k: int = 20, n: int = 5):
    """
    1) semantic search k candidates with SBERT+FAISS
    2) Cross-Encoder rerank → top-n paragraphs
    Returns list of (paragraph, url).
    """
    qvec = embedder.encode([query],
                           convert_to_numpy=True,
                           normalize_embeddings=True)
    D, I = index.search(qvec, k)
    if not len(I[0]):
        return []

    # candidate texts / meta
    cand_texts = [meta_records[i]["text"] for i in I[0]]
    cand_urls  = [meta_records[i]["url"]  for i in I[0]]

    scores = reranker.predict(list(zip(itertools.repeat(query), cand_texts)))
    best   = np.argsort(scores)[::-1][:n]

    return [(cand_texts[i], cand_urls[i]) for i in best]


## 5  LoRA‑ready model loader

In [None]:
# from peft import LoraConfig,get_peft_model,prepare_model_for_kbit_training
# from transformers import AutoModelForCausalLM,AutoTokenizer

# def load_lora_model(ckpt=None):
#     base=AutoModelForCausalLM.from_pretrained(BASE_LLM,load_in_4bit=True,device_map="auto",torch_dtype=torch.float16)
#     base=prepare_model_for_kbit_training(base)
#     if ckpt and Path(ckpt).exists():
#         print("Loading LoRA weights…"); base.load_adapter(ckpt)
#     tok=AutoTokenizer.from_pretrained(BASE_LLM)
#     return base,tok

## 6  Dataset creation

In [None]:
# ───────────────────────── configuration ─────────────────────────
LLM_NAME         = "mistralai/Mistral-7B-Instruct-v0.2"
PARA_MAX         = 3
ANSWER_TOK_CAP   = 220

TXT_DIR          = Path("data/html_txt")   # crawled .txt files
RAW_HTML_DIR     = Path("data/raw_html")   # raw HTML files

MANUAL_QA_JL     = Path("vgj_lora_dataset.jsonl")
AUTO_QA_JL       = Path("vgj_auto_dataset.jsonl")
COMBINED_QA_JL   = Path("vgj_combined.jsonl")

# ───────── load base model once (4-bit) ─────────
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch, json, re, bs4, nltk
from tqdm.auto import tqdm

quant_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
tok = AutoTokenizer.from_pretrained(LLM_NAME, use_fast=True)
llm = AutoModelForCausalLM.from_pretrained(
    LLM_NAME,
    quantization_config=quant_cfg,
    torch_dtype=torch.float16,
    device_map={"": 0},
)

# ───────── helper: generate a question  ─────────
def gen_question(passage: str) -> str:
    sys = ("You are a helpful travel assistant. Read the PASSAGE and invent one "
           "concise, natural-sounding traveler question that could be answered "
           "by the same passage. Return ONLY the question text.")
    prompt = (f"<s>[INST] <<SYS>>\n{sys}\n<</SYS>>\n\n"
              f"PASSAGE:\n'''{passage}'''\n[/INST]")
    ids  = tok(prompt, return_tensors="pt").to(llm.device)
    with torch.no_grad():
        out = llm.generate(**ids, max_new_tokens=40,
                           pad_token_id=tok.eos_token_id)[0]
    q = tok.decode(out[ids.input_ids.shape[-1]:],
                   skip_special_tokens=True).strip()
    return q if q.endswith("?") else q + "?"

DEBUG = True   # turn off for production

# ───────── boiler-plate detector ─────────
BOILER_PAT = re.compile(
    r"(click here|minute read|photo credit|browser is not supported)",
    flags=re.I,
)

# ───────── build auto Q-A (drop dirty rows) ─────────
collapse = lambda s: re.sub(r"\s+", " ", s).strip()
auto_examples, skipped = [], 0

for txt_f in tqdm(sorted(TXT_DIR.glob("*.txt")), desc="auto-QA", unit="page"):
    url  = txt_f.with_suffix(".url").read_text().strip()
    html = (RAW_HTML_DIR / f"{txt_f.stem}.html").read_text()
    soup = bs4.BeautifulSoup(html, "lxml")

    paras = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
    paras = [p for p in paras if len(p.split()) > 25][:PARA_MAX]
    if not paras:
        continue

    passage  = "\n\n".join(paras)
    question = gen_question(passage)

    # cap answer length
    words, answer_words = 0, []
    for p in paras:
        if words + len(p.split()) > ANSWER_TOK_CAP:
            break
        answer_words.extend(p.split()); words += len(p.split())
    answer = " ".join(answer_words) or paras[0]

    # ── skip if answer contains boiler-plate ──
    if BOILER_PAT.search(answer):
        skipped += 1
        continue

    print(f"\nQ: {question}\nA: {answer[:120]}…\n")
    auto_examples.append({"instruction": question,
                          "input": "",
                          "output": answer})

print(f"Skipped {skipped} junk excerpts")

# ───────── write JSONL files ─────────
with AUTO_QA_JL.open("w") as f:
    for ex in auto_examples:
        f.write(json.dumps(ex) + "\n")

with COMBINED_QA_JL.open("w") as out:
    for src in (MANUAL_QA_JL, AUTO_QA_JL):
        if src.exists():
            out.writelines(src.open())

print(f"Generated {len(auto_examples):,} clean pairs → {AUTO_QA_JL}")
print(f"Combined dataset written       → {COMBINED_QA_JL}")


## 7  In‑notebook LoRA fine‑tuning

In [1]:
# ───────────────────── 0. house-keeping  ───────────────────────
from pathlib import Path
import torch, gc, os
gc.collect(); torch.cuda.empty_cache()          # clear VRAM

# ───────────────────── 1. paths & knobs  ────────────────────────
BASE_MODEL      = "mistralai/Mistral-7B-Instruct-v0.2"
COMBINED_QA_JL  = "vgj_auto_dataset.jsonl"          # built earlier
CHECKPOINT_DIR  = "lora-vgj-checkpoint"

# LoRA hyper-params
LORA_R          = 16
LORA_ALPHA      = 32
LORA_DROPOUT    = 0.05

# training hyper-params
BATCH_PER_GPU   = 4          # ← tweak 3
GRAD_ACC_STEPS  = 4
LOG_STEPS       = 1           # ← tweak 1
EVAL_STEPS      = 1          # ← tweak 2
PATIENCE   = 3           # stop after 3 stagnant evals

EPOCHS = 10
LR = 2e-4

# ───────────────────── 2. tokenizer & base model  ───────────────
from transformers import (AutoTokenizer, AutoModelForCausalLM,
                          BitsAndBytesConfig)

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit               = True,
    bnb_4bit_quant_type        = "nf4",
    bnb_4bit_compute_dtype     = torch.float16,
    bnb_4bit_use_double_quant  = True
)

tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
tok.pad_token = tok.eos_token           # needed for packed batches

base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config = bnb_cfg,
    device_map          = {"": 0},      # push everything to cuda:0
    torch_dtype         = torch.float16
)

# ───────────────────── 3. wrap with LoRA  ───────────────────────
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

base = prepare_model_for_kbit_training(base)

lora_cfg = LoraConfig(
    r              = LORA_R,
    lora_alpha     = LORA_ALPHA,
    lora_dropout   = LORA_DROPOUT,
    bias           = "none",
    task_type      = "CAUSAL_LM"
)

model = get_peft_model(base, lora_cfg)
model.print_trainable_parameters()      # sanity check

# ───────────────────── 4. dataset  → chat template  ─────────────
from datasets import load_dataset
from sklearn.model_selection import train_test_split   # only to shuffle indices

def to_chat(example):
    user = example["instruction"].strip()
    if example["input"]:
        user += "\n" + example["input"].strip()
    return {
        "text": (
            f"<s>[INST] {user} [/INST] "
            f"{example['output'].strip()} </s>"
        )
    }

# load + template
dataset = (load_dataset("json",
                        data_files=COMBINED_QA_JL,
                        split="train")
           .map(to_chat,
                remove_columns=["instruction", "input", "output"]))

# 90 / 10 random split   (seed-stable)
train_idx, eval_idx = train_test_split(
    list(range(len(dataset))), test_size=0.1, random_state=42
)
train_set = dataset.select(train_idx)
eval_set  = dataset.select(eval_idx)

print("train =", len(train_set), "rows • eval =", len(eval_set))

# ───────────────────── 5. training args & trainer  ──────────────
from transformers import TrainingArguments, EarlyStoppingCallback
from trl import SFTTrainer

train_args = TrainingArguments(
    output_dir                   = CHECKPOINT_DIR,
    per_device_train_batch_size  = BATCH_PER_GPU,
    gradient_accumulation_steps  = GRAD_ACC_STEPS,
    num_train_epochs             = EPOCHS,
    learning_rate                = LR,
    lr_scheduler_type            = "cosine",
    warmup_ratio                 = 0.03,
    logging_steps                = LOG_STEPS,
    eval_strategy                = "steps",   # <─ ADD THIS
    eval_steps                   = EVAL_STEPS,
    load_best_model_at_end       = True,        # ← keep best epoch
    metric_for_best_model        = "eval_loss",
    greater_is_better            = False,
    save_strategy                = "steps",
    fp16                         = True,
    report_to                    = [],
)

trainer = SFTTrainer(
    model         = model,
    args          = train_args,
    train_dataset = train_set,
    eval_dataset  = eval_set,
    callbacks     = [EarlyStoppingCallback(
                        early_stopping_patience=PATIENCE,
                        early_stopping_threshold=0.0)]
)


# ───────────────────── 6. launch fine-tuning  ───────────────────
trainer.train()
model.save_pretrained(CHECKPOINT_DIR)
tok.save_pretrained(CHECKPOINT_DIR)

print(f"\nLoRA adapter + tokenizer saved to → {CHECKPOINT_DIR}")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 6,815,744 || all params: 7,248,547,840 || trainable%: 0.0940
train = 417 rows • eval = 47


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
1,3.4934,3.641377
2,3.5313,3.602441
3,3.3887,3.523367
4,3.2957,3.401173
5,3.2061,3.23756
6,3.0772,3.041271
7,2.8831,2.831852
8,2.7063,2.636621
9,2.5791,2.496101
10,2.4665,2.430332



LoRA adapter + tokenizer saved to → lora-vgj-checkpoint
