<a href="https://colab.research.google.com/github/ergul13/RAG/blob/main/RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import sys, subprocess
# Path of the interpreter running this notebook; pip is invoked through it
# (`python -m pip`) rather than through whatever `pip` is first on PATH.
py = sys.executable
# Quietly pip-install the whitespace-separated requirement/option string `x`.
# check_call raises CalledProcessError when pip exits with a non-zero status.
def pipi(x): subprocess.check_call([py, "-m", "pip", "install", "--quiet"] + x.split())

# torch is pulled from the cu121 wheel index given on the command line.
pipi("torch --index-url https://download.pytorch.org/whl/cu121")
# LLM loading stack (4-bit loading in load_llm relies on bitsandbytes).
pipi("transformers accelerate bitsandbytes sentencepiece")
# Retrieval and document-parsing dependencies.
pipi("faiss-cpu langdetect rank_bm25 pypdf beautifulsoup4 lxml")
# Provides BGEM3FlagModel and FlagReranker.
pipi("FlagEmbedding")
# Web UI.
pipi("gradio")

In [9]:
import os, shutil, glob, zipfile, pathlib
# Working tree of the app; every other path is derived from BASE_DIR.
BASE_DIR = "/content/rag_suite"
DATA_DIR = f"{BASE_DIR}/data/raw"          # uploaded / extracted source documents
PROC_DIR = f"{BASE_DIR}/data/processed"    # created here; not written to by the code in this notebook
INDEX_DIR = f"{BASE_DIR}/data/index"
FAISS_DIR = f"{INDEX_DIR}/faiss"           # index.faiss + meta.pkl
BM25_DIR = f"{INDEX_DIR}/bm25"             # bm25.pkl

# Create the leaf directories (parents are created implicitly); safe to re-run.
for p in [DATA_DIR, PROC_DIR, FAISS_DIR, BM25_DIR]:
    os.makedirs(p, exist_ok=True)

# If a bundle was placed at /content/Rag.zip, unpack it into the raw-data folder.
if os.path.exists("/content/Rag.zip"):
    with zipfile.ZipFile("/content/Rag.zip","r") as z: z.extractall(DATA_DIR)

# Optionally, to upload files manually instead:
# from google.colab import files
# files.upload()  # e.g. select data.zip
# !unzip -o data.zip -d /content/rag_suite/data/raw

In [10]:
import os, re, gc, json, pickle, glob, hashlib, shutil, time
from typing import List, Tuple, Dict
import torch, faiss
from transformers import AutoTokenizer, AutoModelForCausalLM
from FlagEmbedding import BGEM3FlagModel, FlagReranker
from rank_bm25 import BM25Okapi
from langdetect import detect
from pypdf import PdfReader
from bs4 import BeautifulSoup

# Central configuration read by the functions below.
# "Tokens" in the CHUNK_* and CTX_TOKEN_LIMIT settings are whitespace-delimited
# words (see split_chunks / simple_token_count), not model tokenizer tokens.
CFG = {
    # Hugging Face model ids for the answering LLM; USE_LLAMA picks between them in load_llm.
    "LLM_MODEL_QWEN": "Qwen/Qwen2.5-7B-Instruct",
    "LLM_MODEL_LLAMA": "meta-llama/Llama-3.1-8B-Instruct",
    "USE_LLAMA": False,
    # When True, load_llm requests 4-bit loading; otherwise float16 weights.
    "LOAD_4BIT": True,
    # Generation budget for the final answer (default of llm_generate).
    "MAX_NEW_TOKENS": 512,
    # Embedding model (BGEM3FlagModel) and cross-encoder reranker (FlagReranker).
    "EMB_MODEL": "BAAI/bge-m3",
    "RERANK_MODEL": "BAAI/bge-reranker-v2-m3",
    # Chunking: minimum size of a trailing chunk, window size, and window overlap.
    "CHUNK_TOK_MIN": 200,
    "CHUNK_TOK_MAX": 600,
    "CHUNK_OVERLAP_TOK": 80,
    # Hits requested per query variant from the dense index and from BM25.
    "TOPK_DENSE": 30,
    "TOPK_BM25": 30,
    # Candidates kept after reciprocal-rank fusion, then after reranking.
    "MERGE_TOPK": 50,
    "FINAL_CONTEXTS": 8,
    # Constant k in the RRF term 1 / (k + rank).
    "RRF_K": 60,
    # Number of LLM-generated query rephrasings requested per query.
    "MQ_COUNT": 3,
    # Budget applied by truncate_contexts before the prompt is built.
    "CTX_TOKEN_LIMIT": 2800,
    "SEED": 42,
}

# Seed torch's RNG with the configured seed.
torch.manual_seed(CFG["SEED"])

def read_txt(p):
    """Return the whole file at ``p`` decoded as UTF-8.

    Bytes that are not valid UTF-8 are skipped instead of raising.
    """
    with open(p, mode="r", encoding="utf-8", errors="ignore") as src:
        contents = src.read()
    return contents

def read_pdf(p):
    """Return the text of every page of the PDF at ``p``, joined by newlines.

    A page whose ``extract_text()`` result is falsy contributes an empty string.
    """
    reader = PdfReader(p)
    page_texts = []
    for page in reader.pages:
        extracted = page.extract_text()
        page_texts.append(extracted or "")
    return "\n".join(page_texts)

def read_html(p):
    """Return the visible text of the HTML file at ``p``.

    The markup is parsed with the lxml parser, ``script``/``style``/``noscript``
    elements are removed, and the remaining text nodes are joined by newlines.
    """
    soup = BeautifulSoup(read_txt(p), "lxml")
    non_content = soup.find_all(["script","style","noscript"])
    for tag in non_content:
        tag.extract()
    return soup.get_text("\n")

def read_md(p):
    """Return the Markdown file at ``p`` as raw text (no rendering is done)."""
    markdown_source = read_txt(p)
    return markdown_source

def read_code(p):
    """Return the source-code file at ``p`` as raw text."""
    source_text = read_txt(p)
    return source_text

def detect_lang_safe(text):
    """Best-effort language detection on the first 1000 characters of ``text``.

    Returns whatever ``detect`` reports, or the string "unknown" when
    detection raises.
    """
    try:
        return detect(text[:1000])
    except Exception:
        # Detection is advisory, so any library error degrades to "unknown".
        # Unlike a bare `except:`, this still lets KeyboardInterrupt and
        # SystemExit propagate, so a long ingest can be interrupted.
        return "unknown"

def tokenize_for_bm25(text):
    """Lower-case ``text`` and return its runs of word characters, in order."""
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\w+", lowered)]

def simple_token_count(s):
    """Count whitespace-delimited tokens in ``s``; the result is never below 1."""
    n_tokens = sum(1 for _ in re.finditer(r"\S+", s))
    return n_tokens if n_tokens > 1 else 1

def split_chunks(text, min_tok, max_tok, overlap_tok):
    """Split ``text`` into overlapping windows of whitespace-delimited tokens.

    Windows are ``max_tok`` tokens long and start every
    ``max(1, max_tok - overlap_tok)`` tokens. Each chunk is the window's
    tokens re-joined with single spaces, so original line breaks and
    indentation are not preserved.

    The first window is always emitted, however short. A later window that
    is shorter than ``min_tok`` ends the split. If that short window reaches
    the end of the text and holds tokens no earlier chunk contains, a final
    chunk made of the last ``min(min_tok, max_tok)`` tokens is emitted so the
    end of the document is not silently dropped (it overlaps the previous
    chunk more than ``overlap_tok``).

    Args:
        text: Input text.
        min_tok: Minimum size of any chunk after the first.
        max_tok: Window size in tokens.
        overlap_tok: Tokens shared between consecutive windows.

    Returns:
        List of chunk strings; empty when ``text`` has no tokens.
    """
    toks = re.findall(r"\S+", text)
    total = len(toks)
    step = max(1, max_tok - overlap_tok)
    chunks = []
    start = 0
    covered = 0  # toks[:covered] already appear in an emitted chunk
    while start < total:
        window = toks[start:start + max_tok]
        if start != 0 and len(window) < min_tok:
            reaches_end = start + len(window) >= total
            if reaches_end and covered < total:
                # Back-extend the tail so it is min_tok long instead of
                # discarding the tokens after the previous window.
                tail_len = max(1, min(min_tok, max_tok))
                chunks.append(" ".join(toks[max(0, total - tail_len):]))
            break
        chunks.append(" ".join(window))
        covered = start + len(window)
        start += step
    return chunks

def smart_chunk(file_path, text):
    """Chunk ``text`` with a strategy chosen from the extension of ``file_path``.

    * Source-code extensions: lines are grouped into blocks that are closed
      when a structural line (``def``, ``class``, ``function``, a control
      keyword, an access modifier, a ``#`` comment that is not ``#!``, or a
      decorator) is met after at least 30 lines have accumulated. The
      structural line opens the next block, so a definition header stays
      with its body. Each block is then windowed by ``split_chunks`` with
      half the configured minimum size and overlap.
    * ``.md``: the text is split before every line starting with ``#`` and
      each section is windowed with the configured sizes.
    * Anything else: plain windowing with the configured sizes.

    Returns:
        List of chunk strings (token-joined by ``split_chunks``).
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext in [".py",".js",".ts",".java",".cpp",".c",".ipynb",".rs",".go",".rb",".php",".cs",".scala",".kt",".swift"]:
        boundary = re.compile(r"^\s*(def |class |function |if |for |while |switch|public |private |protected |#[^!]|@)")
        blocks, buf = [], []
        for line in text.splitlines():
            # Flush *before* adding the structural line: appending first
            # would leave e.g. a `def` header at the end of the previous block.
            if len(buf) >= 30 and boundary.match(line):
                blocks.append("\n".join(buf))
                buf = []
            buf.append(line)
        if buf:
            blocks.append("\n".join(buf))
        text_blocks = []
        for blk in blocks:
            text_blocks.extend(
                split_chunks(blk, CFG["CHUNK_TOK_MIN"]//2, CFG["CHUNK_TOK_MAX"], CFG["CHUNK_OVERLAP_TOK"]//2)
            )
        if not text_blocks:
            text_blocks = split_chunks(text, CFG["CHUNK_TOK_MIN"], CFG["CHUNK_TOK_MAX"], CFG["CHUNK_OVERLAP_TOK"])
        return text_blocks
    if ext == ".md":
        out = []
        for part in re.split(r"\n(?=#)", text):
            out += split_chunks(part, CFG["CHUNK_TOK_MIN"], CFG["CHUNK_TOK_MAX"], CFG["CHUNK_OVERLAP_TOK"])
        return out
    return split_chunks(text, CFG["CHUNK_TOK_MIN"], CFG["CHUNK_TOK_MAX"], CFG["CHUNK_OVERLAP_TOK"])

def load_file(path):
    """Read ``path`` with the reader matching its (case-insensitive) extension.

    Returns the extracted text, or an empty string for unsupported extensions.
    """
    suffix = os.path.splitext(path)[1].lower()
    if suffix == ".pdf":
        return read_pdf(path)
    if suffix in (".html", ".htm"):
        return read_html(path)
    if suffix == ".md":
        return read_md(path)
    if suffix in (".txt", ".log", ".csv", ".tsv", ".json", ".yml", ".yaml"):
        return read_txt(path)
    if suffix in (".py", ".js", ".ts", ".java", ".cpp", ".c", ".ipynb", ".rs", ".go",
                  ".rb", ".php", ".cs", ".scala", ".kt", ".swift"):
        return read_code(path)
    return ""

def hash_str(s):
    """Return the first 16 hex characters of the SHA-256 digest of ``s`` (UTF-8)."""
    digest = hashlib.sha256()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()[:16]

def build_index(input_files: List[str]):
    """Chunk ``input_files`` and write the dense (FAISS) and BM25 indices to disk.

    For every file that yields text, the text is chunked with ``smart_chunk``;
    each non-blank chunk is recorded with metadata ``{"id", "source", "lang"}``
    (``id`` is ``<hash of path>_<chunk position>``) and tokenized for BM25.
    All chunks are then embedded in batches, L2-normalized and stored in an
    inner-product FAISS index.

    Files written:
        ``{FAISS_DIR}/index.faiss``: the FAISS index.
        ``{FAISS_DIR}/meta.pkl``: ``{"texts", "meta"}`` aligned with index rows.
        ``{BM25_DIR}/bm25.pkl``: ``{"bm25", "texts", "meta"}``.

    Args:
        input_files: Paths of the files to index.

    Raises:
        RuntimeError: If no text could be extracted from any file.
    """
    model = BGEM3FlagModel(CFG["EMB_MODEL"], use_fp16=True, device="cuda" if torch.cuda.is_available() else "cpu")
    faiss_path = f"{FAISS_DIR}/index.faiss"
    meta_path = f"{FAISS_DIR}/meta.pkl"
    bm25_path = f"{BM25_DIR}/bm25.pkl"

    corpus_tokens = []
    corpus_texts = []
    corpus_meta = []

    for fp in input_files:
        raw = load_file(fp)
        if not raw.strip():
            continue
        lang = detect_lang_safe(raw)
        file_key = hash_str(fp)
        for idx, ch in enumerate(smart_chunk(fp, raw)):
            if not ch.strip():
                continue
            corpus_texts.append(ch)
            corpus_meta.append({"id": f"{file_key}_{idx}", "source": fp, "lang": lang})
            corpus_tokens.append(tokenize_for_bm25(ch))

    if not corpus_texts:
        raise RuntimeError("No text extracted. Upload supported files.")

    # Encode all chunks as one list instead of one call per chunk: this lets
    # the model batch the work, and a list input is expected to yield a 2-D
    # (n_chunks, dim) array, whereas the shape returned for a bare string may
    # differ between FlagEmbedding versions (TODO confirm against the
    # installed version).
    dense = model.encode(corpus_texts, batch_size=CFG.get("EMB_BATCH_SIZE", 12))["dense_vecs"]
    # astype makes a fresh float32 copy that normalize_L2 can modify in place.
    xb = dense.astype("float32").reshape(len(corpus_texts), -1)
    index = faiss.IndexFlatIP(xb.shape[1])
    faiss.normalize_L2(xb)  # unit vectors => inner product equals cosine similarity
    index.add(xb)
    faiss.write_index(index, faiss_path)

    with open(meta_path, "wb") as f:
        pickle.dump({"texts": corpus_texts, "meta": corpus_meta}, f)
    bm25 = BM25Okapi(corpus_tokens)
    with open(bm25_path, "wb") as f:
        pickle.dump({"bm25": bm25, "texts": corpus_texts, "meta": corpus_meta}, f)

def load_indices():
    """Load the persisted retrieval artifacts from disk.

    Returns:
        ``(faiss_index, texts, meta, bm25)`` where ``texts`` and ``meta`` come
        from ``meta.pkl`` and ``bm25`` from ``bm25.pkl``.

    Note: the pickles are the files written by ``build_index``; unpickling
    executes arbitrary code, so these paths must never hold untrusted files.
    """
    index = faiss.read_index(f"{FAISS_DIR}/index.faiss")
    with open(f"{FAISS_DIR}/meta.pkl", "rb") as fh:
        meta_blob = pickle.load(fh)
    with open(f"{BM25_DIR}/bm25.pkl", "rb") as fh:
        bm25_blob = pickle.load(fh)
    return index, meta_blob["texts"], meta_blob["meta"], bm25_blob["bm25"]

def dense_search(emb_model, index, texts, query, topk):
    """Return the ``topk`` nearest index rows for ``query`` by cosine similarity.

    Args:
        emb_model: Embedding model exposing ``encode(...)["dense_vecs"]``.
        index: FAISS index built from L2-normalized vectors.
        texts: Unused; kept for interface compatibility with callers.
        query: Query string.
        topk: Number of neighbours to request.

    Returns:
        List of ``(row_index, score)`` tuples in the order FAISS returns them;
        placeholder rows (negative index) are omitted.
    """
    # Encode a one-element list so "dense_vecs" is a 2-D batch and [0] is the
    # query vector; with a bare string the returned shape may differ between
    # FlagEmbedding versions (TODO confirm against the installed version).
    vecs = emb_model.encode([query], batch_size=1)["dense_vecs"]
    q_vec = vecs[0].astype("float32").reshape(1, -1)
    faiss.normalize_L2(q_vec)
    D, I = index.search(q_vec, topk)
    # FAISS pads with -1 when fewer than topk vectors are available.
    return [(int(i), float(d)) for d, i in zip(D[0], I[0]) if i >= 0]

def bm25_search(bm25, query, topk):
    """Return up to ``topk`` BM25 hits for ``query`` as ``(doc_index, score)``.

    The query is lower-cased and split into runs of word characters. Only
    documents with a non-zero score are returned: a score of exactly 0 carries
    no ranking signal, and padding the list with such documents would hand
    them rank credit in the later reciprocal-rank fusion.

    Args:
        bm25: Object exposing ``get_scores(tokens)`` with one score per document.
        query: Query string.
        topk: Maximum number of hits.

    Returns:
        Hits sorted by descending score (ties keep document order); empty when
        the query has no word tokens or nothing scores.
    """
    toks = re.findall(r"\w+", query.lower())
    if not toks:
        return []
    scores = bm25.get_scores(toks)
    hits = [i for i in range(len(scores)) if scores[i] != 0]
    hits.sort(key=lambda i: scores[i], reverse=True)
    return [(i, float(scores[i])) for i in hits[:topk]]

def rrf_fuse(ranklists: List[List[Tuple[int,float]]], k=60, topk=50):
    """Merge ranked hit lists with reciprocal-rank fusion.

    A document at zero-based position ``r`` of a list contributes
    ``1 / (k + r + 1)``; contributions are summed over all lists and the
    original scores are ignored.

    Returns:
        Up to ``topk`` ``(doc_index, fused_score)`` pairs, best first. Ties
        keep the order in which documents were first seen.
    """
    fused_scores = {}
    for ranking in ranklists:
        for r, (doc_idx, _score) in enumerate(ranking):
            previous = fused_scores.get(doc_idx, 0.0)
            fused_scores[doc_idx] = previous + 1.0 / (k + r + 1)
    ordered = list(fused_scores.items())
    ordered.sort(key=lambda pair: pair[1], reverse=True)
    return ordered[:topk]

def load_llm():
    """Load the tokenizer and causal LM selected by ``CFG``.

    ``CFG["USE_LLAMA"]`` picks the model id. With ``CFG["LOAD_4BIT"]`` the
    weights are loaded 4-bit quantized (float16 compute) through an explicit
    ``BitsAndBytesConfig``; otherwise they are loaded as float16. Placement is
    delegated to ``device_map="auto"``.

    Returns:
        ``(tokenizer, model)``.
    """
    model_id = CFG["LLM_MODEL_LLAMA"] if CFG["USE_LLAMA"] else CFG["LLM_MODEL_QWEN"]
    kwargs = {"device_map": "auto"}
    if CFG["LOAD_4BIT"]:
        # Passing load_in_4bit / bnb_* straight to from_pretrained is the
        # deprecated path; the supported one is a quantization_config object.
        from transformers import BitsAndBytesConfig
        kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
        )
    else:
        kwargs["torch_dtype"] = torch.float16
    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True, trust_remote_code=True)
    mdl = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, **kwargs)
    return tok, mdl

def load_reranker():
    """Build the ``FlagReranker`` named in ``CFG["RERANK_MODEL"]`` (fp16).

    The device is "cuda" when torch reports CUDA as available, else "cpu".
    """
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"
    return FlagReranker(CFG["RERANK_MODEL"], use_fp16=True, device=target_device)

def llm_generate(tok, mdl, prompt, temperature=0.2, max_new_tokens=CFG["MAX_NEW_TOKENS"]):
    """Generate a completion for ``prompt`` and return only the new text.

    Args:
        tok: Tokenizer matching ``mdl``.
        mdl: Causal language model exposing ``generate`` and ``device``.
        prompt: Full prompt string.
        temperature: Sampling temperature; a value of 0 (or None) switches to
            greedy decoding, since sampling requires a positive temperature.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        The decoded completion, without the prompt and without special tokens.
    """
    ids = tok(prompt, return_tensors="pt").to(mdl.device)
    gen_kwargs = dict(max_new_tokens=max_new_tokens, eos_token_id=tok.eos_token_id)
    if temperature and temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature)
    else:
        gen_kwargs["do_sample"] = False
    out = mdl.generate(**ids, **gen_kwargs)
    # For a decoder-only model the output sequence begins with the prompt
    # tokens; slice them off so callers do not get the prompt (instructions
    # and context) echoed back as part of the answer.
    prompt_len = ids["input_ids"].shape[1]
    return tok.decode(out[0][prompt_len:], skip_special_tokens=True)

def multi_query_variants(tok, mdl, q, n=3, lang="en"):
    """Ask the LLM for up to ``n`` rephrasings of the query ``q``.

    The model output is split into lines and cleaned: leading list markers
    ("1.", "2)", "-", "*", "•") are stripped; lines that merely echo the
    prompt (system message, role markers, the query itself), bare
    ``<|...|>`` role markers, and lines starting with "assistant", "system"
    or "user" are discarded; duplicates are removed case-insensitively.

    Args:
        tok, mdl: Tokenizer and model passed through to ``llm_generate``.
        q: Original user query.
        n: Maximum number of variants to return.
        lang: "tr" selects the Turkish instruction, anything else English.

    Returns:
        List of variant strings; if nothing usable was generated, the fallback
        ``[q, q + "?", q + " details"]``.
    """
    sysmsg = "You generate diverse rephrasings of a user query for better retrieval. Return each variant on a new line without numbering."
    if lang == "tr":
        sysmsg = "Kullanıcı sorgusunu daha iyi arama için çeşitli yeniden ifadeler üret. Numaralandırma yapmadan her varyantı yeni satırda döndür."
    prompt = f"<|system|>\n{sysmsg}\n<|user|>\n{q}\n"
    raw = llm_generate(tok, mdl, prompt, temperature=0.7, max_new_tokens=128)

    # The generation may come back with the prompt prepended; without this
    # filter the instruction text itself would be returned as "variants".
    echoed = {p.strip().lower() for p in prompt.splitlines() if p.strip()}
    uniq, seen = [], set()
    for line in raw.splitlines():
        cand = re.sub(r"^\s*(?:[-*\u2022]+|\d+[.)])\s+", "", line).strip()
        if not cand:
            continue
        key = cand.lower()
        if key in echoed or re.fullmatch(r"<\|[^|>]*\|>", cand):
            continue
        if key.startswith(("assistant","system","user")):
            continue
        if key not in seen:
            uniq.append(cand)
            seen.add(key)
        if len(uniq) >= n:
            break
    if not uniq:
        uniq = [q, q+"?", q+" details"]
    return uniq

def truncate_contexts(ctxs: List[str], limit_tokens=2800):
    """Keep the longest prefix of ``ctxs`` that fits in ``limit_tokens``.

    Sizes are measured with ``simple_token_count``. Iteration stops at the
    first context that would exceed the budget; later, smaller contexts are
    not considered.
    """
    kept = []
    used = 0
    for ctx in ctxs:
        projected = used + simple_token_count(ctx)
        if projected > limit_tokens:
            break
        kept.append(ctx)
        used = projected
    return kept

def build_prompt(query, contexts, metas, lang="en"):
    """Assemble the answering prompt and a matching citation list.

    Contexts are numbered from 1 in the prompt. One citation line
    ``[i] <basename> | <source>`` is produced per (context, meta) pair, so
    the citation list is as long as the shorter of the two inputs.

    Args:
        query: User question.
        contexts: Context passages, in prompt order.
        metas: Metadata dicts; the "source" key is read (default "").
        lang: "tr" selects the Turkish instruction, anything else English.

    Returns:
        ``(prompt, citations)``, where citations is newline-joined.
    """
    sources = [m.get("source", "") for _ctx, m in zip(contexts, metas)]
    citation_lines = [
        f"[{n}] {os.path.basename(src)} | {src}"
        for n, src in enumerate(sources, start=1)
    ]

    numbered = []
    for pos, ctx in enumerate(contexts):
        numbered.append(f"[{pos+1}] {ctx}")
    ctx_block = "\n\n".join(numbered)

    if lang != "tr":
        instr = "Answer the question based strictly on the context. Cite the used source numbers at the end. If information is missing, say 'insufficient context'."
    else:
        instr = "Aşağıdaki bağlam parçalarına dayanarak soruyu yanıtla. Yanıtın sonunda kullandığın kaynak numaralarını belirt. Bağlamda yoksa 'yetersiz bağlam' de."

    prompt = f"<|system|>\n{instr}\n<|user|>\nQuestion:\n{query}\n\nContext:\n{ctx_block}\n\nAnswer:\n"
    return prompt, "\n".join(citation_lines)

def rerank_and_select(reranker, query, texts, metas, n_final):
    """Score ``texts`` against ``query`` and keep the ``n_final`` best.

    Args:
        reranker: Object exposing ``compute_score(pairs, normalize=True)``.
        query: User query.
        texts: Candidate passages.
        metas: Metadata aligned with ``texts``.
        n_final: Number of passages to keep.

    Returns:
        ``(selected_texts, selected_metas)`` ordered by descending score
        (ties keep candidate order). Both are empty when ``texts`` is empty.
    """
    if not texts:
        # Nothing to score; avoid calling the model with an empty batch.
        return [], []
    pairs = [(query, t) for t in texts]
    scores = reranker.compute_score(pairs, normalize=True)
    try:
        scores = list(scores)
    except TypeError:
        # A bare scalar can come back for a single pair; normalise it to a list.
        scores = [scores]
    n_scored = min(len(texts), len(scores))
    order = sorted(range(n_scored), key=lambda i: scores[i], reverse=True)[:n_final]
    sel_texts = [texts[i] for i in order]
    sel_metas = [metas[i] for i in order]
    return sel_texts, sel_metas

def list_supported():
    """Return the file extensions the ingest pipeline accepts (lower-case, dotted)."""
    document_exts = [".pdf", ".html", ".htm", ".md"]
    plain_text_exts = [".txt", ".log", ".csv", ".tsv", ".json", ".yml", ".yaml"]
    source_code_exts = [
        ".py", ".js", ".ts", ".java", ".cpp", ".c", ".ipynb", ".rs",
        ".go", ".rb", ".php", ".cs", ".scala", ".kt", ".swift",
    ]
    return document_exts + plain_text_exts + source_code_exts

def build_all_indices():
    """Index every supported file found under ``DATA_DIR``.

    The directory is walked recursively, so files inside folders created by
    extracting an archive are included, and extensions are matched
    case-insensitively (the same way ``load_file`` dispatches on them).
    Hidden entries (names starting with ".") and ``__MACOSX`` folders are
    skipped. Files are visited in sorted order.

    Returns:
        A status message with the number of files indexed.

    Raises:
        RuntimeError: If no supported file is found, or if ``build_index``
            extracts no text.
    """
    supported = set(list_supported())
    files = []
    for root, dirs, names in os.walk(DATA_DIR):
        # Prune in place so os.walk does not descend into skipped folders.
        dirs[:] = sorted(d for d in dirs if d != "__MACOSX" and not d.startswith("."))
        for name in sorted(names):
            if name.startswith("."):
                continue
            if os.path.splitext(name)[1].lower() in supported:
                files.append(os.path.join(root, name))
    if not files:
        raise RuntimeError("Upload some files first.")
    build_index(files)
    return f"Indexed {len(files)} files."

# Loaded models keyed by role ("emb", "llm", "rerank") -> (identity, model).
_MODEL_CACHE = {}


def _cached_model(kind, identity, loader):
    """Return the cached model for ``kind``, calling ``loader`` only when ``identity`` changed."""
    cached = _MODEL_CACHE.get(kind)
    if cached is not None and cached[0] == identity:
        return cached[1]
    # Drop the stale model before loading its replacement so that two models
    # of the same role are not held in memory at the same time.
    _MODEL_CACHE.pop(kind, None)
    del cached
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    model = loader()
    _MODEL_CACHE[kind] = (identity, model)
    return model


def run_retrieval(query, lang_hint=None):
    """Retrieve and rerank context passages for ``query``.

    Pipeline: load the on-disk indices, generate query rephrasings with the
    LLM, run dense and BM25 search for the query and every rephrasing, fuse
    all hit lists with reciprocal-rank fusion, then rerank the fused
    candidates against the original query.

    The embedding model, LLM and reranker are kept in a module-level cache and
    reused across calls instead of being reloaded for every query; the LLM is
    reloaded only when the configured model id or 4-bit setting changes.

    Args:
        query: User query.
        lang_hint: Optional language code; when omitted, "tr" is chosen if the
            query contains Turkish-specific letters, else "en".

    Returns:
        ``(texts, metas, tokenizer, model, lang)`` for the selected contexts.
    """
    index, texts, meta, bm25 = load_indices()
    emb = _cached_model(
        "emb",
        CFG["EMB_MODEL"],
        lambda: BGEM3FlagModel(CFG["EMB_MODEL"], use_fp16=True, device="cuda" if torch.cuda.is_available() else "cpu"),
    )
    llm_id = CFG["LLM_MODEL_LLAMA"] if CFG["USE_LLAMA"] else CFG["LLM_MODEL_QWEN"]
    tok, mdl = _cached_model("llm", (llm_id, CFG["LOAD_4BIT"]), load_llm)
    lang = lang_hint or ("tr" if any(ch in query for ch in "ığüşöçİĞÜŞÖÇ") else "en")
    variants = [query] + multi_query_variants(tok, mdl, query, n=CFG["MQ_COUNT"], lang=lang)
    dense_lists = [dense_search(emb, index, texts, v, CFG["TOPK_DENSE"]) for v in variants]
    bm25_lists = [bm25_search(bm25, v, CFG["TOPK_BM25"]) for v in variants]
    fused = rrf_fuse(dense_lists + bm25_lists, k=CFG["RRF_K"], topk=CFG["MERGE_TOPK"])
    cand_idx = [i for i, _ in fused]
    cand_texts = [texts[i] for i in cand_idx]
    cand_metas = [meta[i] for i in cand_idx]
    if not cand_texts:
        # Nothing retrieved: skip loading/running the reranker.
        return [], [], tok, mdl, lang
    rr = _cached_model("rerank", CFG["RERANK_MODEL"], load_reranker)
    sel_texts, sel_metas = rerank_and_select(rr, query, cand_texts, cand_metas, CFG["FINAL_CONTEXTS"])
    return sel_texts, sel_metas, tok, mdl, lang

def answer_query(query, lang_hint=None):
    """Answer ``query`` from the indexed documents.

    Retrieves contexts, trims them to ``CFG["CTX_TOKEN_LIMIT"]``, builds the
    prompt and generates the answer.

    Args:
        query: User question.
        lang_hint: Optional language code forwarded to ``run_retrieval``.

    Returns:
        ``(answer, metas)`` where ``metas`` describes exactly the contexts
        that were placed in the prompt, or ``("No context found.", [])`` when
        no context is available.
    """
    ctx_texts, ctx_metas, tok, mdl, lang = run_retrieval(query, lang_hint)
    ctx_texts = truncate_contexts(ctx_texts, CFG["CTX_TOKEN_LIMIT"])
    if not ctx_texts:
        return "No context found.", []
    # truncate_contexts keeps a prefix, so slicing keeps the metas aligned and
    # stops sources that never reached the prompt from being reported.
    ctx_metas = ctx_metas[:len(ctx_texts)]
    prompt, _cite_list = build_prompt(query, ctx_texts, ctx_metas, lang)
    ans = llm_generate(tok, mdl, prompt, temperature=0.2, max_new_tokens=CFG["MAX_NEW_TOKENS"])
    return ans, ctx_metas

In [11]:
import gradio as gr

def ui_ingest(files):
    """Copy uploaded files into ``DATA_DIR`` and rebuild the indices.

    Args:
        files: Value of the upload component: ``None``, or a list whose items
            are either path strings or objects with a ``.name`` path
            attribute (which one depends on the installed Gradio version).

    Returns:
        Status message with the number of files copied and indexed.

    Raises:
        gr.Error: If indexing fails because there is nothing to index.
    """
    saved = []
    for f in files or []:
        # Work from the temp-file path rather than calling f.read(): path-like
        # upload values have no read() method.
        src = f if isinstance(f, (str, os.PathLike)) else f.name
        dst = os.path.join(DATA_DIR, os.path.basename(src))
        if os.path.abspath(src) != os.path.abspath(dst):
            shutil.copyfile(src, dst)
        saved.append(dst)
    try:
        msg = build_all_indices()
    except RuntimeError as err:
        # Surface "nothing to index" as a readable UI error.
        raise gr.Error(str(err)) from err
    return f"Uploaded {len(saved)} files. {msg}"

def ui_query(q):
    """Gradio handler: answer ``q`` and list the sources of the contexts used.

    Returns:
        ``(answer, citations)`` where citations is a Markdown bullet list.

    Raises:
        gr.Error: If no FAISS index file exists yet.
    """
    index_file = f"{FAISS_DIR}/index.faiss"
    if not os.path.exists(index_file):
        raise gr.Error("Index not found. Upload and Build index first.")
    ans, metas = answer_query(q)
    cite_lines = []
    for m in metas:
        cite_lines.append(f"- {os.path.basename(m['source'])} ({m['source']})")
    return ans, "\n".join(cite_lines)

def set_model(choice):
    """Record the LLM chosen in the UI in ``CFG["USE_LLAMA"]`` and return a status line."""
    wants_llama = choice == "Llama-3.1-8B-Instruct"
    CFG["USE_LLAMA"] = wants_llama
    status = f"Model set to: {choice}"
    return status

# Gradio UI with three tabs: Ingest (upload + index), Chat (model choice +
# question answering) and Utils (wipe the working tree).
with gr.Blocks(title="Hybrid RAG Suite") as demo:
    gr.Markdown("# Hybrid RAG Suite (TR/EN) • Dense+BM25 • Rerank • Multi-Query • Qwen/Llama • Gradio")

    with gr.Tab("Ingest"):
        up = gr.File(file_count="multiple", label="Upload files")
        btn_idx = gr.Button("Build/Refresh Index")
        out_idx = gr.Markdown()
        # Any change of the upload value triggers ui_ingest (copy + re-index).
        up.change(ui_ingest, inputs=up, outputs=out_idx)
        # Manual rebuild over whatever is already in DATA_DIR.
        btn_idx.click(lambda: build_all_indices(), outputs=out_idx)
        gr.Markdown("Supported: " + ", ".join(list_supported()))

    with gr.Tab("Chat"):
        # The radio labels must match the string compared in set_model.
        model_choice = gr.Radio(["Qwen2.5-7B-Instruct","Llama-3.1-8B-Instruct"], value="Qwen2.5-7B-Instruct", label="Response LLM")
        setbtn = gr.Button("Set Model")
        setout = gr.Markdown()
        setbtn.click(lambda c: set_model(c), inputs=model_choice, outputs=setout)
        q = gr.Textbox(label="Query")
        ask = gr.Button("Ask")
        a = gr.Markdown()
        cites = gr.Markdown()
        # ui_query returns (answer, citations), mapped onto the two Markdown outputs.
        ask.click(ui_query, inputs=q, outputs=[a, cites])

    with gr.Tab("Utils"):
        btn_clean = gr.Button("Reset All")
        msg = gr.Markdown()
        # Deletes BASE_DIR (raw data and indices) and recreates the empty
        # directory layout; rmtree errors are ignored.
        def reset_all():
            shutil.rmtree(BASE_DIR, ignore_errors=True)
            os.makedirs(DATA_DIR, exist_ok=True)
            os.makedirs(PROC_DIR, exist_ok=True)
            os.makedirs(FAISS_DIR, exist_ok=True)
            os.makedirs(BM25_DIR, exist_ok=True)
            return "Reset done."
        btn_clean.click(reset_all, outputs=msg)

# Start the app without a public share link.
demo.launch(debug=False, share=False)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

