In [20]:
from pathlib import Path
import json, re
from functools import lru_cache

In [21]:
# Project root: the resolved parent of the current working directory.
BASE_DIR = Path("..").resolve()

# Captures the message of every "|user|: ..." line in a record's text.
# Only horizontal whitespace is accepted around the colon, so an empty turn
# ("|user|:" followed directly by a newline) cannot swallow the next line, and the
# lazy capture leaves trailing spaces / tabs / "\r" out of the message.
USER_LINE_RE = re.compile(r'^\|user\|[ \t]*:[ \t]*(.*?)[ \t\r]*$', re.M)

In [22]:
"""
`dataset` is the name of one of the files in the queries data directory
"""
def read_data(dataset: str) -> list[str]:
    """Read one file from the ``queries data`` directory under ``BASE_DIR``.

    Args:
        dataset: File name inside ``queries data``
            (for example ``"cloud_questions.jsonl"``).

    Returns:
        Every line of the file, each still carrying its line terminator.
    """
    path = BASE_DIR / "queries data" / dataset
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default locale encoding.
    with open(path, "r", encoding="utf-8") as file:
        return file.readlines()
       

In [23]:
# Peek at the first record: the raw JSON line, then its "_id" field.
first_line = read_data("cloud_questions.jsonl")[0]
print(first_line)
print(json.loads(first_line)["_id"])

{"_id":"d5b1e735a040853ed361a3dfde1b8ef0<::>1","text":"|user|: does IBM offer document databases?"}

d5b1e735a040853ed361a3dfde1b8ef0<::>1


In [24]:
"""
Build the dataset dictionary in the format (id, turn): {"history": list, "current": string}.
"""
def build_dict(lines: list[str]) -> dict:
    """Build the dataset dictionary keyed by ``(main_id, turn)``.

    Each value has the form ``{"history": list[str], "current": str}``: the user
    messages that ``USER_LINE_RE`` finds in the record's ``"text"`` field, with the
    last message as ``"current"`` and all earlier ones as ``"history"``.

    Args:
        lines: JSON Lines records. Each is an object with an ``"_id"`` of the form
            ``"<main_id><::><turn>"`` and a ``"text"`` field.

    Returns:
        Mapping ``(main_id, turn) -> {"history": [...], "current": "..."}``.
        Blank lines and records without any non-empty user message are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"_id"``.
        ValueError: If ``"_id"`` has no ``"<::>"`` separator or the turn is not an integer.
    """
    query_dict = {}
    for line in lines:
        # Tolerate blank lines (e.g. an empty line at the end of the file).
        if not line.strip():
            continue
        obj = json.loads(line)
        # Split on the last separator only, so a main id that itself contains
        # "<::>" still unpacks into exactly two parts.
        main_id, turn = obj["_id"].rsplit("<::>", 1)
        # A missing or null "text" is treated as empty instead of being handed to findall.
        text = obj.get("text") or ""
        queries = [q.strip() for q in USER_LINE_RE.findall(text)]
        queries = [q for q in queries if q]
        if not queries:
            # No user message to rewrite; indexing queries[-1] would raise IndexError.
            continue
        query_dict[(main_id, int(turn))] = {
            "history": queries[:-1],
            "current": queries[-1],
        }
    return query_dict

In [25]:
cloud_dict = build_dict(read_data("cloud_questions.jsonl"))
# Show the size and a small sample rather than dumping the whole dictionary into
# the cell output.
print(len(cloud_dict), "queries loaded")
for key, item in list(cloud_dict.items())[:3]:
    print(key, item)



In [26]:
import ollama

In [27]:
"""the model we are using in queries rewrite"""
# Model name passed to ollama.embed in get_embed.
EMBEDDING_MODEL = "qwen3-embedding:8b"
# Model name passed to ollama.generate in rewrite_pronouns.
PRON_MODEL = "myaniu/qwen2.5-1m:7b"
# Model name passed to ollama.chat in rewrite_query.
REWRITE_MODEL = "qwen3:30b"

In [28]:
def get_embed(text: str) -> list[float]:
    """Embed ``text`` with ``EMBEDDING_MODEL`` and return the first embedding vector."""
    response = ollama.embed(model=EMBEDDING_MODEL, input=text)
    vectors = response['embeddings']
    return vectors[0]

In [30]:
def rewrite_pronouns(sentence: str, context_hint: str | None = None) -> str:
    prompt = f"""Replace anaphoric pronouns with their most likely explicit referents inferred from the sentence itself{(" and the hint: " + context_hint) if context_hint else ""}.
Rules:
- Preserve facts and meaning.
- Only replace anaphoric pronouns (he, she, it, they, this, that, those, these, his, her, their, etc.) when unambiguous.
- Do NOT add new facts. Keep one sentence. Output only the rewritten sentence.

Sentence: {sentence}
Rewritten:"""
    opts = {"temperature":0.1, "top_p":0.9, "num_predict": 50, "seed": 42}
    r = ollama.generate(model=PRON_MODEL, prompt=prompt, options=opts)
    return r["response"].strip()

PRON_RE = re.compile(r'\b(he|she|it|they|them|his|her|their|this|that|these|those)\b', re.I)

@lru_cache(maxsize=4096)
def rewrite_if_pronoun(sent: str, hint: str | None):
    # only if it contains the pronoun to do the rewrite pronouns, others would be original sentence
    if PRON_RE.search(sent):
        return rewrite_pronouns(sent, hint)
    return sent



In [31]:
def cosine_similarity(a, b):
    """Return the cosine similarity of two numeric sequences.

    Args:
        a: First vector (sequence of numbers).
        b: Second vector. The dot product pairs elements with ``zip``, so extra
            trailing elements of the longer sequence do not contribute to it.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero norm
        (including empty input), where the ratio is undefined.
    """
    dot_product = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x ** 2 for x in a) ** 0.5
    norm_b = sum(x ** 2 for x in b) ** 0.5
    denominator = norm_a * norm_b
    # Guard the division: a zero vector would otherwise raise ZeroDivisionError.
    if denominator == 0:
        return 0.0
    return dot_product / denominator

def select_history(history, current, k=3, keep_recent=1):
    """Resolve pronouns in the conversation and pick the history turns to keep.

    Every history turn is passed through ``rewrite_if_pronoun`` with the previous
    rewritten turn as its hint; ``current`` gets the last rewritten turn as hint.
    When the history has more than ``k`` turns, the most recent ``keep_recent``
    turns are always kept and the remaining slots are filled with the earlier
    turns whose embeddings are most similar to the rewritten current question.

    Args:
        history: Earlier user questions, oldest first.
        current: The question being rewritten.
        k: Number of history turns to return when the history is longer than ``k``
            (more are returned only if ``keep_recent`` exceeds ``k``).
        keep_recent: Number of most recent turns that are always kept.

    Returns:
        ``(selected_history, current_clean)``. ``selected_history`` lists the
        similarity-picked earlier turns (best match first) followed by the recent
        turns; a history of at most ``k`` turns is returned whole.
    """
    history_clean = []
    prev_clean = None
    for turn in history:
        prev_clean = rewrite_if_pronoun(turn, prev_clean)
        history_clean.append(prev_clean)
    # prev_clean is now the last rewritten turn, or None for an empty history.
    current_clean = rewrite_if_pronoun(current, prev_clean)

    if not history_clean:
        return [], current_clean
    if len(history_clean) <= k:
        return history_clean, current_clean

    if keep_recent > 0:
        recent = history_clean[-keep_recent:]
        pool = history_clean[:-keep_recent]
    else:
        recent = []
        # Candidates must be the pronoun-resolved turns, same as in the branch above.
        pool = history_clean

    need = max(0, k - len(recent))
    if need == 0 or not pool:
        # Nothing left to pick: skip the embedding calls entirely.
        return recent, current_clean

    cur_vec = get_embed(current_clean)
    scored = [(h, cosine_similarity(get_embed(h), cur_vec)) for h in pool]
    scored.sort(key=lambda x: x[1], reverse=True)
    picked = [h for h, _ in scored[:need]]
    # Earlier-turn picks come first, the most recent turns last.
    return picked + recent, current_clean
        
        

In [32]:
# Six-turn example: longer than the default k=3, so the similarity-based
# selection in select_history is exercised.
example_history = [
    'where is bone marrow found what does it do for the body',
    'What happens if it does not work well?',
    'How is Sickle cell treated?',
    'Will it kill me?',
    'How about transplant?',
    "Is Huntington's disease also inherited?",
]
history_for_rewrite = select_history(example_history, 'Any cures for it?')
print(history_for_rewrite)

(['How about transplant?', 'How is Sickle cell treated?', "Is Huntington's disease also inherited?"], "Any cures for Huntington's disease?")


In [33]:
# Single-sentence check of the pronoun rewrite with no hint available.
sample_sentence = 'where is bone marrow found what does it do for the body'
print(rewrite_if_pronoun(sample_sentence, None))

Where is bone marrow found? What does it do for the body?


In [34]:
def rewrite_query(history_for_rewrite: list[str], current: str) -> str:
    history_text = "\n".join(f"- {h}" for h in history_for_rewrite if h)

    system_msg= (
        "You are a query rewriter for retrieval. "
        "Output must be a single-line JSON object only, with one key: question. "
        "Never invent; use ONLY info from HISTORY/CURRENT."
    )

    user_msg = f"""
                Rewrite CURRENT into one standalone question.

                Rules:
                - USE ONLY info from HISTORY/CURRENT.
                - Keep names explicit; avoid pronouns where possible.
                - Be minimal; no new terms.
                
                IMPORTANT:
                - If CURRENT is already standalone AND sufficiently specific for document retrieval,
                  return it unchanged.
                - If CURRENT is short, generic, or a yes/no follow-up
                  (e.g., "Do I need scripts?", "Is it required?", "How does it help?"),
                  rewrite it to explicitly include:
                    - the relevant product/system
                    - the relevant task or operation
                  using ONLY information inferable from HISTORY.
                HISTORY:
                {history_text}
                
                CURRENT:
                {current}
                
                Return JSON exactly like:
                {{"question": "<one-line question>"}}
                """


    resp = ollama.chat(
            model=REWRITE_MODEL,
            messages=[
                {"role": "system", "content": f"{system_msg} /no_think"},
                {"role": "user",   "content": f"{user_msg} /no_think"},
            ],
            options={"temperature": 0.0}
        )
        
    text = resp["message"]["content"].strip()
    data = json.loads(text)
    q = (data.get("question") or "").strip()
        # 保險：把可能混進來的包裹符號清一下
    q = q.strip("`\"' \n\r\t")
    return q

In [35]:
# history_for_rewrite is the (selected history, rewritten current question) tuple
# returned by select_history; unpack it and run the final standalone rewrite.
select, current_rewrite = history_for_rewrite
print(rewrite_query(select, current_rewrite))

Are there any cures for Huntington's disease?


In [None]:
import json

# Output file: one rewritten query per line, as {"_id": ..., "text": "|user|: ..."}.
OUT = Path("cloud_rewritten_last_turn_qwen3_30B2.jsonl")

written = 0
skipped = []  # record ids whose rewrite came back empty
failed = []   # record ids whose selection/rewrite raised

with OUT.open("w", encoding="utf-8") as f:

    for (cid, turn), item in cloud_dict.items():
        record_id = f"{cid}<::>{turn}"
        history = item.get("history") or []
        current = item.get("current") or ""
        # do the select and rewrite
        try:
            selected, current_clean = select_history(history, current)
            rewrite = rewrite_query(selected, current_clean)
        except Exception as exc:
            # One bad record must not abort the whole run; report it and move on.
            # KeyboardInterrupt is not an Exception, so the cell can still be stopped.
            failed.append(record_id)
            print(f"failed: {record_id}: {exc!r}")
            continue

        if not rewrite:
            skipped.append(record_id)
            continue

        rec = {
            "_id": record_id,
            "text": f"|user|: {rewrite}",
        }
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")
        # Flush per record so finished work is on disk even if the kernel dies.
        f.flush()
        written += 1

print("wrote:", OUT)
print(f"{written} written, {len(skipped)} skipped (empty rewrite), {len(failed)} failed")