In [1]:
!pip install -U --no-cache-dir faiss-gpu-cu11
!pip -q install -U transformers sentence-transformers tqdm bitsandbytes accelerate
!pip install flash-attn --no-build-isolation
!pip install -U qwen

Collecting faiss-gpu-cu11
  Downloading faiss_gpu_cu11-1.13.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting nvidia-cuda-runtime-cu11>=11.8.89 (from faiss-gpu-cu11)
  Downloading nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cublas-cu11>=11.11.3.6 (from faiss-gpu-cu11)
  Downloading nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Downloading faiss_gpu_cu11-1.13.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (48.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.4/48.4 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux2014_x86_64.whl (417.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m417.9/417.9 MB[0m [31m89.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl (875 kB)
[2K   [9

In [2]:
import json
from pathlib import Path
from google.colab import drive
import faiss

In [3]:
drive.mount('/content/drive')
DEV_DIR = Path("/content/drive/MyDrive/dataset/dev.jsonl")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
COLLECTION_TO_RESOURCES = {
    "clapnq": {
        "corpus_path": "/content/drive/MyDrive/shared/dataset/clapnq/corpus.jsonl",
        "index_path":  "/content/drive/MyDrive/shared/indexes_qwen3_emb4b/clapnq-qwen3-emb4b/index.faiss",
        "id_path": "/content/drive/MyDrive/shared/indexes_qwen3_emb4b/clapnq-qwen3-emb4b/doc_ids.json",
    },

    "cloud": {
        "corpus_path": "/content/drive/MyDrive/shared/dataset/cloud/corpus.jsonl",
        "index_path":  "/content/drive/MyDrive/shared/indexes_qwen3_emb4b/cloud-qwen3-emb4b/index.faiss",
        "id_path": "/content/drive/MyDrive/shared/indexes_qwen3_emb4b/cloud-qwen3-emb4b/doc_ids.json",
    },
    "fiqa": {
        "corpus_path": "/content/drive/MyDrive/shared/dataset/fiqa/corpus.jsonl",
        "index_path":  "/content/drive/MyDrive/shared/indexes_qwen3_emb4b/fiqa-qwen3-emb4b/index.faiss",
        "id_path": "/content/drive/MyDrive/shared/indexes_qwen3_emb4b/fiqa-qwen3-emb4b/doc_ids.json",
    },
    "govt": {
        "corpus_path": "/content/drive/MyDrive/shared/dataset/govt/corpus.jsonl",
        "index_path":  "/content/drive/MyDrive/shared/indexes_qwen3_emb4b/govt-qwen3-emb4b/index.faiss",
        "id_path": "/content/drive/MyDrive/shared/indexes_qwen3_emb4b/govt-qwen3-emb4b/doc_ids.json",
    },
}
QUERIES_PATH = Path("/content/drive/MyDrive/dataset/all_last_turn.jsonl")

In [5]:
def get_corpus(obj):
    coll = obj.get("Collection") or obj.get("collection")
    if "clapnq" in coll:
        return "clapnq"
    elif "cloud" in coll:
        return "cloud"
    elif "fiqa" in coll:
        return "fiqa"
    elif "govt" in coll:
        return "govt"

In [6]:
CACHE = {}  # corpus_name -> dict(index, ids, offsets, corpus_path)

def load_corpus_cache(corpus_name):
    if corpus_name in CACHE:
        return CACHE[corpus_name]

    corpus_path = COLLECTION_TO_RESOURCES[corpus_name]["corpus_path"]
    index_path  = COLLECTION_TO_RESOURCES[corpus_name]["index_path"]

    # 1) load faiss index once
    index = faiss.read_index(index_path)

    # 2) build (ids, offsets) without loading all texts
    ids = []
    texts = []

    with open(corpus_path, "rb") as f:          # binary mode for exact offsets
        for line in f:
            obj = json.loads(line.decode("utf-8"))
            ids.append(obj["_id"])
            texts.append(obj["text"])

    CACHE[corpus_name] = {
        "index": index,
        "ids": ids,
        "texts": texts,
        "corpus_path": corpus_path,
    }
    return CACHE[corpus_name]

In [7]:
from sentence_transformers import SentenceTransformer
import torch

embedding_model = SentenceTransformer(
    "Qwen/Qwen3-Embedding-4B",
    device="cpu",
    model_kwargs={
        "dtype": torch.float16,
    },
    tokenizer_kwargs={"padding_side": "left"},
)
embedding_model.max_seq_length = 512

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

In [8]:
@torch.no_grad()
def embed_text(text):
    emb = embedding_model.encode(
        [text],
        batch_size=1,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=True,
    ).astype("float32")                # shape: (1, dim)
    return emb

In [9]:
def retrieve(obj, question, top_k=10): # the question should use query rewrite
    corpus = get_corpus(obj)
    cache = load_corpus_cache(corpus)

    q = embed_text(question)  # (1, dim) float32 normalized
    scores, idxs = cache["index"].search(q, top_k)

    results = []
    for score, i in zip(scores[0].tolist(), idxs[0].tolist()):
        if i == -1:
            continue
        results.append({
            "doc_id": cache["ids"][i],
            "score": float(score),
            "text": cache["texts"][i],
        })
    return results

In [10]:
from sentence_transformers import CrossEncoder

cross_encoder_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L12-v2", device="cpu")

def rerank(question, contexts, top_k=5, batch_size=32):
    # keep the consine score
    for c in contexts:
        c["cos_score"] = c.get("score", None)

    pairs = [(question, c["text"]) for c in contexts]
    ce_scores = cross_encoder_model.predict(pairs, batch_size=batch_size)

    for c, s in zip(contexts, ce_scores):
        c["re_score"] = float(s)

    # rerank and only return top 5
    contexts = sorted(contexts, key=lambda x: x["re_score"], reverse=True)
    return contexts[:top_k]

config.json:   0%|          | 0.00/791 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
import os

base_model_name = "Qwen/Qwen3-14B"

generation_tokenizer = AutoTokenizer.from_pretrained(base_model_name)


# Load base model
generation_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)

# adding LoRA adapter

generation_model = PeftModel.from_pretrained(
    generation_model,
    "/content/drive/MyDrive/rag_lora_adapter_qwen3_14B_try3"

)

generation_model.eval()

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/3.84G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/1.91G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 5120)
        (layers): ModuleList(
          (0-39): 40 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=5120, out_features=5120, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=5120, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=5120, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(

In [12]:
def generate_answer(history, current, context):
    formatted = []
    for i, r in enumerate(context, start=1):
        formatted.append(f"[Document #{i}]\n{r['text']}\n")
    context_with_score = "\n".join(formatted)
    if history:
      limited = []
      if(len(history) > 2):
        limited = history[-4:]
      else:
        limited = history[-2:]
      history_text = "\n".join(f"{h[0]}: {h[1]}" for h in limited)
    else:
        history_text = ""

    instruct = """
               You are a RAG answer generator.
                Use the reference documents and the conversation history as the main source of information to answer the question.
                If the reference contains partial clues, synthesize them to answer the question.
                Answer concisely (1–2 sentences).
                If the reference is empty, answer: I don't know.
                """.strip()
    prompt = f"""
              REFERENCE:
              {context_with_score}

              HISTORY:
              {history_text}

              QUESTION:
              {current}

              Answer:
            """.strip()
    messages = [
      {"role": "system", "content": instruct},
      {"role": "user", "content": prompt}
    ]
    input_text = generation_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True,enable_thinking=False,)
    inputs = generation_tokenizer(input_text, return_tensors="pt").to(generation_model.device)

    output_ids = generation_model.generate(
          **inputs,
          max_new_tokens=256,
          temperature=0.0,
          do_sample=False,
          num_beams=3,
          early_stopping=True,
          eos_token_id=generation_tokenizer.eos_token_id,
      )


    generated = output_ids[0]
    answer_ids = generated[len(inputs["input_ids"][0]):]
    answer = generation_tokenizer.decode(answer_ids, skip_special_tokens=True)
    #fin_answer = json.loads(json_answer)
    #answer = fin_answer["text"]
    return answer.strip()

In [13]:
import json, re
from tqdm import tqdm

output_path = "/content/drive/MyDrive/dataset/rag_pipeline3.jsonl"

USER_LINE_RE = re.compile(r'^\|user\|\s*:\s*(.*)\s*$', re.M)

# 1) build map: task_id -> rewritten query
qid2query = {}

with open(QUERIES_PATH, "r") as fq:
    for line in fq:
        q_obj = json.loads(line)
        qid = q_obj.get("_id")
        if not qid:
            continue
        text = q_obj.get("text", "")
        query_list = USER_LINE_RE.findall(text)
        if query_list:
            qid2query[qid] = query_list[0].strip()


with open(DEV_DIR, "r") as fin, open(output_path, "w") as fout:
    for line in tqdm(fin, desc="RAG pipeline"):
        obj = json.loads(line)

        qid = obj["task_id"]
        history = [(cov["speaker"], cov["text"]) for cov in obj["input"]][:-1]
        current = obj["input"][-1]["text"]

        query = qid2query.get(qid) or current
        candidates = retrieve(obj, query, top_k = 20)
        contexts = rerank(query, candidates)

        prediction = generate_answer(history, query, contexts)
        prediction = " ".join(prediction.split())

        obj["predictions"] = [{"text": prediction}]
        fout.write(json.dumps(obj, ensure_ascii=False) + "\n")

RAG pipeline: 0it [00:00, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
RAG pipeline: 169it [39:38, 14.08s/it]
