# 2025 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [None]:
import re, json, ujson, numpy as np
from pathlib import Path
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import download
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import CountVectorizer

download("punkt")
download("stopwords")

STOP = set(stopwords.words("english"))
STEM = PorterStemmer().stem

In [None]:
# ---------- 1. 读取 evidence.json ----------
ev_path = Path("./data/evidence.json")
with ev_path.open("r", encoding="utf-8") as f:
    evid_dict = ujson.load(f)

evid_ids   = list(evid_dict.keys())
raw_texts  = [evid_dict[eid] for eid in evid_ids]

In [None]:
cv = CountVectorizer(
        lowercase=True,
        ngram_range=(1, 2),
        token_pattern=r"(?u)\b[a-z]+\b",    # 只要字母组成的词
)

def nltk_stem_preprocessor(text: str) -> str:
    """
    先用正则粗清洗，再词干化，再过滤停用词，最后以空格连接——
    CountVectorizer 将把空格视为 token 分界。
    """
    tokens = re.findall(r"[A-Za-z]+", text.lower())
    tokens = [STEM(t) for t in tokens if t not in STOP]
    return " ".join(tokens)

# 使用自定义预处理器，scikit‑learn 会在内部调用它
cv.set_params(preprocessor=nltk_stem_preprocessor, stop_words=None)

# fit 只是为了构造 analyzer；不关心矩阵
cv.fit(raw_texts)
analyzer = cv.build_analyzer()

# ---------- 3. 得到每篇文档的 token 列表 -------------------
token_corpus = [analyzer(doc) for doc in tqdm(raw_texts, desc="Tokenize")]

In [None]:
# ---------- 3. 构建 BM25 索引 ----------
bm25 = BM25Okapi(token_corpus, k1=1.5, b=0.75)

In [None]:
def retrieve_topk(claim_text: str, topk: int = 100):
    query_tokens = analyzer(claim_text)
    scores       = bm25.get_scores(query_tokens)
    idx_sorted   = np.argsort(scores)[-topk:][::-1]
    return [(evid_ids[i], float(scores[i])) for i in idx_sorted]

# -------- DEMO --------
demo_claim = "South Australia has the most expensive electricity in the world."
top_hits   = retrieve_topk(demo_claim, 5)   # [(id, score), ...]


print("Top-5 results:\n")
for rank, (eid, score) in enumerate(top_hits, 1):
    print(f"#{rank:02d}  {eid}   BM25={score:.4f}")
    print("     ", evid_dict[eid])
    print()


In [None]:
# batch process
def process_claim_file(claim_json: str, out_json: str):
    with open(claim_json, "r", encoding="utf-8") as f:
        claims = json.load(f)            # {claim_id: {...}}
    results = {}
    for cid, obj in tqdm(claims.items(), desc="Retrieve"):
        hits = retrieve_topk(obj["claim_text"])
        results[cid] = {"evidences": [h[0] for h in hits]}
    with open(out_json, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

In [None]:
# process_claim_file("./data/train-claims.json", "./data/train-claims-top100.json")
# process_claim_file("./data/dev-claims.json", "./data/dev-claims-top100.json")
# process_claim_file("./data/test-claims-unlabelled.json", "./data/test-claims-top100.json")

In [None]:
import json, ujson, numpy as np
from pathlib import Path
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

DATA_DIR   = Path("data")
TOP100_FNS = {
    "train": "train-claims-top100.json",
    "dev"  : "dev-claims-top100.json",
    "test" : "test-claims-top100.json"
}
TOP_M = 6

# ---------- 0. evidence ----------
with (DATA_DIR / "evidence.json").open() as f:
    evid_dict = ujson.load(f)
evid_ids = list(evid_dict.keys())
id2row   = {eid: i for i, eid in enumerate(evid_ids)}

# ---------- 1. encode evidence ----------
print("Encoding evidence vectors ...")
bi_model   = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
BATCH = 1024
chunks = []
for i in tqdm(range(0, len(evid_ids), BATCH)):
    txts = [evid_dict[eid] for eid in evid_ids[i:i+BATCH]]
    chunks.append(
        bi_model.encode(txts, batch_size=32, normalize_embeddings=True).astype("float32")
    )
evid_matrix = np.vstack(chunks)

# ---------- 2. process each split ----------
for split, fn in TOP100_FNS.items():
    top100_path = DATA_DIR / fn
    if not top100_path.exists():
        continue

    with top100_path.open() as f:
        top100 = ujson.load(f)

    claim_texts = {}
    cfile = DATA_DIR / f"{split}-claims.json"

    with cfile.open() as f:
        raw = ujson.load(f)
    claim_texts = {
        cid: raw[cid]["claim_text"] if isinstance(raw[cid], dict) else raw[cid] for cid in raw
    }

    dense_out, text_out = {}, {}
    for cid, entry in tqdm(top100.items(), desc=f"{split} rerank"):
        id_list = entry["evidences"] if isinstance(entry, dict) else entry
        claim_emb = bi_model.encode(claim_texts.get(cid, ""), normalize_embeddings=True)
        vecs = evid_matrix[[id2row[eid] for eid in id_list]]
        scores = vecs @ claim_emb
        top_idx = scores.argsort()[-TOP_M:][::-1]
        top_ids = [id_list[i] for i in top_idx]

        # dense
        dense_out[cid] = top_ids

        # text
        text_out[cid] = {
            "claim_text": claim_texts.get(cid, ""),
            "ranked_evidences": [
                {"id": eid, "text": evid_dict[eid]} for eid in top_ids
            ]
        }

    # output
    (DATA_DIR / f"{split}-claims-top{TOP_M}-dense.json").write_text(
        json.dumps(dense_out, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    (DATA_DIR / f"{split}-claims-top{TOP_M}-text.json").write_text(
        json.dumps(text_out, ensure_ascii=False, indent=2), encoding="utf-8"
    )

print("All splits processed ✅")

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [2]:
import json
def build_prompt_with_example(json_path: str) -> str:
    # 1. few‐shot 示例
    example = {
        "claim-2152": {
            "claim_text": "Venus doesn't have a runaway greenhouse effect",
            "ranked_evidences": [
                {
                    "id": "evidence-1018575",
                    "text": (
                        "A runaway greenhouse effect involving carbon dioxide and water vapor "
                        "has long ago been hypothesized to have occurred on Venus, this idea "
                        "is still largely accepted."
                    )
                }
            ],
            "claim_label": "REFUTES",
            "evidences": ["evidence-1018575"]
        }
    }

    # 2. 读取 test‐claims-top6-text.json 并取第一条
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    test_claim_id, test_claim = next(iter(data.items()))
    
    # 3. 拼接 prompt
    prompt_lines = []
    prompt_lines.append("You are a fact-checking assistant. "
                 "For the given Claim and Candidate Evidences, "
                 "determine the correct Claim Label and list the IDs of those evidences you deem relevant."
                 "The label is [SUPPORTS, REFUTES, NOT_ENOUGH_INFO, DISPUTED]" )
    for cid, info in example.items():
        prompt_lines.append(f"\"{cid}\": \"{info['claim_text']}\",")
        prompt_lines.append("  \"evidences\": [")
        for ev in info['ranked_evidences']:
            prompt_lines.append(f"    {{\"{ev['id']}\": \"{ev['text']}\"}},")
        prompt_lines.append("  ],")
        prompt_lines.append(f"label: \"{info['claim_label']}\"")

    prompt_lines.append(f"Now, given the following claim and its candidate evidences, "
                 "please output in the JSON format: {\"label\":\"\", \"evidences\":\"\"}")
    prompt_lines.append(f"\"{test_claim_id}\": \"{test_claim['claim_text']}\",")
    prompt_lines.append("  \"evidences\": [")
    for ev in test_claim['ranked_evidences']:
        prompt_lines.append(f"    {{\"{ev['id']}\": \"{ev['text']}\"}},")
    prompt_lines.append("  ]")
    prompt_lines.append("label:")
    prompt_lines.append("evidences:")
    return "\n".join(prompt_lines)


if __name__ == "__main__":
    prompt = build_prompt_with_example('./data/test-claims-top6-text.json')
    print(prompt)

You are a fact-checking assistant. For the given Claim and Candidate Evidences, determine the correct Claim Label and list the IDs of those evidences you deem relevant.The label is [SUPPORTS, REFUTES, NOT_ENOUGH_INFO, DISPUTED]
"claim-2152": "Venus doesn't have a runaway greenhouse effect",
  "evidences": [
    {"evidence-1018575": "A runaway greenhouse effect involving carbon dioxide and water vapor has long ago been hypothesized to have occurred on Venus, this idea is still largely accepted."},
  ],
label: "REFUTES"
Now, given the following claim and its candidate evidences, please output in the JSON format: {"label":"", "evidences":""}
"claim-2967": "The contribution of waste heat to the global climate is 0.028 W/m2.",
  "evidences": [
    {"evidence-308923": "Global forcing from waste heat was 0.028 W/m2 in 2005."},
    {"evidence-1185839": "It could prove to be the most inexorable, however, if we are fortunate enough to evade all the rest.” Simple global-scale estimates that recen

In [33]:
char_len = len(prompt)
print(f"Prompt 字符数：{char_len}")

Prompt 字符数：1813


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# model_name = "Qwen/Qwen3-0.6B"
# model_name = "Qwen/Qwen3-4B"
# model_name = "Qwen/Qwen3-4B-FP8"

model_name = "Qwen/Qwen3-1.7B"
# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
)
print(text)
tokens = tokenizer(text)
num_tokens = len(tokens.input_ids)
print(num_tokens)

<|im_start|>user
You are a fact-checking assistant. For the given Claim and Candidate Evidences, determine the correct Claim Label and list the IDs of those evidences you deem relevant.The label is [SUPPORTS, REFUTES, NOT_ENOUGH_INFO, DISPUTED]
"claim-2152": "Venus doesn't have a runaway greenhouse effect",
  "evidences": [
    {"evidence-1018575": "A runaway greenhouse effect involving carbon dioxide and water vapor has long ago been hypothesized to have occurred on Venus, this idea is still largely accepted."},
  ],
label: "REFUTES"
Now, given the following claim and its candidate evidences, please output in the JSON format: {"label":"", "evidences":""}
"claim-2967": "The contribution of waste heat to the global climate is 0.028 W/m2.",
  "evidences": [
    {"evidence-308923": "Global forcing from waste heat was 0.028 W/m2 in 2005."},
    {"evidence-1185839": "It could prove to be the most inexorable, however, if we are fortunate enough to evade all the rest.” Simple global-scale est

In [4]:
# prepare the model input

model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# conduct text completion
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32768
)
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 

# parsing thinking content
try:
    # rindex finding 151668 (</think>)
    index = len(output_ids) - output_ids[::-1].index(151668)
except ValueError:
    index = 0

thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

print("thinking content:", thinking_content)
print("content:", content)

thinking content: <think>
Okay, let's tackle this fact-checking task. The user provided a claim and a list of candidate evidences, and I need to determine the correct label: SUPPORTS, REFUTES, NOT_ENOUGH_INFO, or DISPUTED. 

First, the claim is: "The contribution of waste heat to the global climate is 0.028 W/m2." The evidences include several entries. Let me go through each one.

Evidence 308923 says that in 2005, the global forcing from waste heat was 0.028 W/m². That seems directly related to the claim. But I need to check if this is accurate. However, the claim is about the contribution, and the evidence states a specific value for 2005. But wait, the claim is about the contribution being 0.028 W/m². If the evidence says that in 2005 it was 0.028, but maybe that's just a specific instance. But the claim is about the contribution in general. However, the evidence might not be the best because it's a specific year. Also, the other evidence mentions that after 2000, there's a noticeab

In [1]:
import os
import json
import re
import time
from tqdm import tqdm

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# ———— Config ————
TEST_CLAIMS_FILE = './data/test-claims-top6-text.json'
RESULTS_FILE     = 'results.json'
CHECKPOINT_FILE  = 'checkpoint.json'
MODEL_NAME       = 'Qwen/Qwen3-1.7B'

# ———— Few-shot Example ————
FEW_SHOT_EXAMPLE = {
    "claim-2152": {
        "claim_text": "Venus doesn't have a runaway greenhouse effect",
        "ranked_evidences": [
            {
                "id": "evidence-1018575",
                "text": (
                    "A runaway greenhouse effect involving carbon dioxide and water vapor "
                    "has long ago been hypothesized to have occurred on Venus, this idea "
                    "is still largely accepted."
                )
            },
            {
                "id": "evidence-791159",
                "text": (
                    "Venus receives about twice the sunlight that Earth does, which is "
                    "thought to have contributed to its runaway greenhouse effect."
                )
            },
            {
                "id": "evidence-500249",
                "text": (
                    "In the extreme, the planet Venus is thought to have experienced a "
                    "very large increase in greenhouse effect over its lifetime, so much "
                    "so that its poles have warmed sufficiently to render its surface "
                    "temperature effectively isothermal."
                )
            }
        ],
        "claim_label": "REFUTES",
        "evidences": ["evidence-1018575", "evidence-791159"]
    }
}

# ———— Helpers ————
def load_json(path, default):
    if os.path.isfile(path):
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    return default

def save_json(obj, path):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

def build_prompt(claim_id, claim_obj):
    lines = []
    # system/user instruction
    lines.append(
        "You are a fact-checking assistant. "
        "For the given Claim and Candidate Evidences, determine the correct Claim Label "
        "and list the IDs of those evidences you deem relevant, at least one evidence. "
        "The label is one of [SUPPORTS, REFUTES, NOT_ENOUGH_INFO, DISPUTED]."
    )
    lines.append("")  # blank

    # few-shot block
    for ex_id, ex in FEW_SHOT_EXAMPLE.items():
        lines.append(f'"{ex_id}": "{ex["claim_text"]}",')
        lines.append("  \"ranked_evidences\": [")
        for ev in ex["ranked_evidences"]:
            lines.append(f'    {{"{ev["id"]}": "{ev["text"]}"}},')
        lines.append("  ],")
        lines.append(f'label: "{ex["claim_label"]}"')
        lines.append(f'evidences: {ex["evidences"]}')
        lines.append("")  # separator

    # target claim
    lines.append(
        "Now, given the following claim and its candidate evidences, "
        "please output **only** valid JSON in the format: {\"label\":\"\", \"evidences\":[]}"
    )
    lines.append(f'"{claim_id}": "{claim_obj["claim_text"]}",')
    lines.append("  \"evidences\": [")
    for ev in claim_obj["ranked_evidences"]:
        lines.append(f'    {{"{ev["id"]}": "{ev["text"]}"}},')
    lines.append("  ]")
    lines.append("label:")
    lines.append("evidences:")
    return "\n".join(lines)


In [2]:
# ———— Load Model ————
print(f"Loading model {MODEL_NAME} ...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype="auto",
    device_map="auto"
)
model.eval()

Loading model Qwen/Qwen3-1.7B ...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=2048, out_features=6144, bias=False)
          (up_proj): Linear(in_features=2048, out_features=6144, bias=False)
          (down_proj): Linear(in_features=6144, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): Qwe

In [3]:
# ———— Main Loop ————
def main():
    test_data   = load_json(TEST_CLAIMS_FILE, {})
    results     = load_json(RESULTS_FILE, {})
    checkpoint  = load_json(CHECKPOINT_FILE, {"last_id": None})
    started = checkpoint["last_id"] is None

    for cid, claim in tqdm(test_data.items(), desc="Claims"):
        # skip until after last checkpoint
        if not started:
            if cid == checkpoint["last_id"]:
                started = True
            continue

        prompt = build_prompt(cid, claim)
        # apply chat template
        messages = [{"role": "user", "content": prompt}]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=True
        )
        # print(text)
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        # conduct text completion
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=32768
        )
        output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

        # parsing thinking content
        try:
        # rindex finding 151668 (</think>)
            index = len(output_ids) - output_ids[::-1].index(151668)
        except ValueError:
            index = 0

        content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
        
        # parse JSON
        try:
            # clean
            m = re.search(r'\{.*\}', content, flags=re.DOTALL)
            json_str = m.group(0) if m else content

            parsed = json.loads(json_str)
            results[cid] = {
                "claim_label": parsed["label"],
                "evidences":   parsed["evidences"]
            }
        except json.JSONDecodeError:
            print(f"[WARN] JSON parse failed for {cid}, raw output:\n{content}")
            break

        # persist
        save_json(results, RESULTS_FILE)
        checkpoint["last_id"] = cid
        save_json(checkpoint, CHECKPOINT_FILE)

        # courtesy pause
        time.sleep(0.5)

    print("✅ All done.")

if __name__ == "__main__":
    main()


Claims:   0%|          | 0/153 [00:00<?, ?it/s]

Claims:  84%|████████▎ | 128/153 [20:34<04:01,  9.64s/it]


KeyboardInterrupt: 

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*