In [None]:
!pip install -U qwen
!pip -q install faiss-cpu datasets transformers

In [None]:
import json
from pathlib import Path
from google.colab import drive
import faiss

In [None]:
# Mount Google Drive at /content/drive; every path used below lives under it.
drive.mount('/content/drive')
# NOTE: despite the name, REF_DIR points at a single JSONL file, not a directory.
REF_DIR = Path("/content/drive/MyDrive/dataset/reference.jsonl")

In [None]:
# Root folder for project files on the mounted Drive.
# NOTE(review): BASE_DIR was referenced below without ever being defined, which
# raised NameError on a fresh kernel. The value here is assumed from the Drive
# root shared by every other path in this notebook -- confirm.
BASE_DIR = "/content/drive/MyDrive"

# For each corpus key: where its JSONL corpus lives and where the FAISS index
# built over it lives.
COLLECTION_TO_RESOURCES = {
    "clapnq": {
        "corpus_path": "/content/drive/MyDrive/corpus/clapnq_corpus.jsonl",
        "index_path":  "/content/drive/MyDrive/index/clapnq.faiss",
    },
    "cloud": {
        "corpus_path": "/content/drive/MyDrive/corpus/cloud_.jsonl",
        "index_path":  "/content/drive/MyDrive/index/cloud.faiss",
    },
    "fiqa": {
        "corpus_path": "/content/drive/MyDrive/corpus/fiqa_.jsonl",
        "index_path":  "/content/drive/MyDrive/index/fiqa.faiss",
    },
    "govt": {
        "corpus_path": "/content/drive/MyDrive/corpus/govt_corpus.jsonl",
        "index_path":  "/content/drive/MyDrive/index/mt_rag_govt.faiss",
    },
}

# Path of the rewritten-query file, kept alongside the other resource paths.
QUERIES_PATH = f"{BASE_DIR}/history_selected_rewrite_queries/rewritten_last_turn_qwen3_30B.jsonl"

In [None]:
def get_corpus(obj):
    """Map a record's collection label to one of the known corpus keys.

    The label is read from ``obj["Collection"]`` or, failing that,
    ``obj["collection"]``. The first of "clapnq", "cloud", "fiqa", "govt"
    that is contained in the label is returned.

    Args:
        obj: a dict-like record carrying the collection label.

    Returns:
        One of "clapnq", "cloud", "fiqa", "govt".

    Raises:
        ValueError: if the record has no collection label, or the label
            matches none of the known corpora. Failing here gives a clear
            message instead of a later ``KeyError: None`` during lookup.
    """
    coll = obj.get("Collection") or obj.get("collection")
    if not coll:
        raise ValueError("record has no 'Collection' / 'collection' value")

    # Order matters only if a label could contain more than one key; it is
    # kept identical to the original if/elif chain.
    for name in ("clapnq", "cloud", "fiqa", "govt"):
        if name in coll:
            return name

    raise ValueError(f"unrecognised collection: {coll!r}")

In [None]:
CACHE = {}  # corpus_name -> dict(index, ids, offsets, corpus_path)

def load_corpus_cache(corpus_name):
    """Return the retrieval resources for ``corpus_name``, loading them on first use.

    The result is a dict with the FAISS index, the document ids in file order,
    the byte offset at which each record's line starts, and the corpus path.
    Only ids and offsets are held in memory; document texts stay on disk.
    """
    if corpus_name not in CACHE:
        resources = COLLECTION_TO_RESOURCES[corpus_name]
        corpus_path = resources["corpus_path"]
        index_path = resources["index_path"]

        faiss_index = faiss.read_index(index_path)

        doc_ids = []
        line_starts = []
        position = 0
        # Binary mode: offsets must be byte positions usable with seek().
        with open(corpus_path, "rb") as corpus_file:
            for raw_line in corpus_file:
                record = json.loads(raw_line.decode("utf-8"))
                line_starts.append(position)
                doc_ids.append(record["_id"])
                position += len(raw_line)

        CACHE[corpus_name] = {
            "index": faiss_index,
            "ids": doc_ids,
            "offsets": line_starts,
            "corpus_path": corpus_path,
        }

    return CACHE[corpus_name]

def read_doc_by_idx(cache, i):
    """Return the "text" field of the ``i``-th corpus record, read from disk.

    The byte offset stored for row ``i`` is used to seek directly to the
    record's line, so the file is not scanned.
    """
    path, start = cache["corpus_path"], cache["offsets"][i]
    with open(path, "rb") as handle:
        handle.seek(start)
        raw_line = handle.readline()
    return json.loads(raw_line.decode("utf-8"))["text"]

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

MODEL_NAME = "Qwen/Qwen3-Embedding-4B"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()

In [None]:
@torch.no_grad()
def embed_text(text, max_length=512):
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model(**inputs)
    last_hidden = outputs.last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1)

    emb = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1)
    emb = emb.cpu().numpy().astype("float32")

    # cosine normalize
    emb = emb / np.linalg.norm(emb, axis=1, keepdims=True)
    return emb   # shape: (1, dim)

In [None]:
def retrieve(obj, question, top_k=5):
    """Return up to ``top_k`` corpus documents nearest to ``question``.

    The corpus is chosen from ``obj`` via get_corpus(); the question is
    embedded and searched against that corpus's FAISS index. Each hit is a
    dict with "doc_id", "score" and "text".
    """
    cache = load_corpus_cache(get_corpus(obj))

    query_vec = embed_text(question)  # (1, dim) float32 normalized
    scores, idxs = cache["index"].search(query_vec, top_k)

    hits = []
    for hit_score, row in zip(scores[0].tolist(), idxs[0].tolist()):
        # Rows equal to -1 carry no document and are skipped.
        if row != -1:
            doc_text = read_doc_by_idx(cache, row)
            hits.append({
                "doc_id": cache["ids"][row],
                "score": float(hit_score),
                "text": doc_text,
            })
    return hits

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
import os

base_model = "Qwen/Qwen3-14B"

tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load the base model in float16 and wrap it with the LoRA adapter in one step.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        base_model,
        device_map="auto",
        torch_dtype=torch.float16,
    ),
    "/content/drive/MyDrive/rag_lora_adapter_qwen3_14B_try3",
)

model.eval()

In [None]:
def generate_answer(history, current, context, max_history_turns=4, max_new_tokens=256):
    """Generate an answer to ``current`` grounded in the retrieved ``context``.

    Args:
        history: earlier turns as (speaker, text) pairs, oldest first; may be
            empty or None.
        current: the question text for this turn.
        context: retrieved documents, each a dict with "score" and "text".
        max_history_turns: how many of the most recent history turns go into
            the prompt (default 4, the original fixed window).
        max_new_tokens: generation budget (default 256, the original value).

    Returns:
        The decoded answer with surrounding whitespace stripped.
    """
    # Best-scoring documents first; only their text is shown to the model.
    contexts_sorted = sorted(context, key=lambda x: x["score"], reverse=True)
    context_with_score = "\n".join(
        f"[Document #{i}]\n{r['text']}\n"
        for i, r in enumerate(contexts_sorted, start=1)
    )

    # Keep the most recent turns. The explicit > 0 check matters because
    # history[-0:] would return the whole list rather than nothing.
    if history and max_history_turns > 0:
        recent_turns = history[-max_history_turns:]
    else:
        recent_turns = []
    history_text = "\n".join(f"{h[0]}: {h[1]}" for h in recent_turns)

    # NOTE(review): "1–2" previously appeared as mojibake ("1â€“2"); the en dash
    # is restored here. Confirm it matches the prompt used to train the adapter.
    # The indentation inside both prompt strings is part of the prompt text.
    instruct = """
                You are a RAG answer generator.

                Use the provided reference documents and the conversation history as evidence.
                Do not add information that is not supported by the reference.

                If the reference fully answers the question:
                - Provide a direct answer in 1–2 sentences.

                If the reference provides only partial information:
                - Sentence 1 must state that a specific detail needed to fully answer the question is missing.
                  You may describe the missing detail in natural language (for example: "whether", "how", "under what conditions", or "which case").
                - Sentence 2 must start with "However," and include what the reference supports.

                If the reference does not contain relevant information:
                - Respond with exactly: "I don't know."

                If the question is conversational AND does not request factual information:
                - Reply conversationally and briefly.
                """.strip()
    prompt = f"""
              REFERENCE:
              {context_with_score}

              HISTORY:
              {history_text}

              QUESTION:
              {current}

              Answer:
            """.strip()
    messages = [
        {"role": "system", "content": instruct},
        {"role": "user", "content": prompt},
    ]
    input_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Greedy decoding. `temperature` only applies when sampling, so it is not
    # passed alongside do_sample=False.
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + continuation; drop the prompt tokens.
    prompt_len = inputs["input_ids"].shape[1]
    answer_ids = output_ids[0][prompt_len:]
    answer = tokenizer.decode(answer_ids, skip_special_tokens=True)
    return answer.strip()


In [None]:
output_path = "/content/drive/MyDrive/dataset/rag_pipeline.jsonl"

# Both files are opened as UTF-8 explicitly: the output is written with
# ensure_ascii=False, so it must not depend on the platform default encoding.
with open(REF_DIR, "r", encoding="utf-8") as fin, \
        open(output_path, "w", encoding="utf-8") as fout:
    for line in fin:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        obj = json.loads(line)

        turns = obj["input"]
        # All turns but the last are history; the last turn is the question.
        history = [(turn["speaker"], turn["text"]) for turn in turns[:-1]]
        current = turns[-1]["text"]
        context = retrieve(obj, current)

        # generator
        prediction = generate_answer(history, current, context)

        # Collapse all whitespace runs (including newlines) to single spaces.
        prediction = " ".join(prediction.split())

        obj["predictions"] = [{"text": prediction}]

        fout.write(json.dumps(obj, ensure_ascii=False) + "\n")