# 2025 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
## Group: 61

| Name              | Student ID | Email                               |
| ----------------- | ---------- | ----------------------------------- |
| Jingcheng Qian    | 1640690    | jingchengq@student.unimelb.edu.au   |
| Weichen Wang      |            |                                     |
| Yue Zhang         |            |                                     |
## Overview

This notebook contains the essential functions required for task implementation. 

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [None]:
import re, json, ujson, numpy as np
from pathlib import Path
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import download
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import CountVectorizer
import unicodedata

from pathlib import Path
from tqdm import tqdm

import torch
from sentence_transformers import CrossEncoder

# Fetch the NLTK resources requested by this notebook (no-op when already cached).
for resource in ("punkt", "stopwords"):
    download(resource)

# English stop words as a set, so membership tests during preprocessing are O(1).
STOP = set(stopwords.words("english"))
# Bound method of a single PorterStemmer instance: STEM(word) returns the stem.
STEM = PorterStemmer().stem

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\77280\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\77280\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the evidence corpus (a JSON object keyed by evidence ID).
# NOTE: evidence.json is not shipped with the data folder; copy it into ./data first.
ev_path = Path("./data/evidence.json")
evid_dict = ujson.loads(ev_path.read_text(encoding="utf-8"))

# Parallel lists in dict order: raw_texts[i] is the value stored under evid_ids[i].
evid_ids = list(evid_dict)
raw_texts = list(evid_dict.values())

## Two-Stage Evidence Retrieval

**Data cleaning**

Strip accents and non-ASCII characters, then keep only letters, digits and spaces.

In [None]:
def full_clean(text: str) -> str:
    """
    Reduce *text* to ASCII letters, digits and spaces.

    Steps:
      1. Normalize to NFC form.
      2. Remove soft hyphens and dash characters, which joins hyphenated
         words ("well-known" -> "wellknown").
      3. Decompose with NFKD and drop every character that has no ASCII
         form, which strips diacritics ("é" -> "e").
      4. Remove every remaining character that is not an ASCII letter,
         an ASCII digit or a space.

    The previous version called ``unidecode`` and ``regex.sub`` after step 3,
    but this notebook imports neither name, so the function raised
    ``NameError``. Both calls operated on text that was already pure ASCII:
    there was nothing left to transliterate, and stripping the Unicode control
    and symbol categories followed by the alphanumeric filter keeps exactly the
    characters kept by step 4. Tabs and newlines are control characters, so
    (as before) they are deleted rather than converted to spaces.

    Args:
        text: arbitrary Unicode text.

    Returns:
        The cleaned string; may be empty.
    """
    txt = unicodedata.normalize("NFC", text)
    txt = re.sub(r"[\u00AD\u2010-\u2014\-]", "", txt)
    txt = unicodedata.normalize("NFKD", txt)
    # After this round-trip the text is guaranteed to be ASCII-only.
    txt = txt.encode("ASCII", "ignore").decode("utf-8")
    # Only a literal space survives as whitespace: every other ASCII
    # whitespace character is a control character and is dropped here.
    txt = re.sub(r"[^A-Za-z0-9 ]", "", txt)
    return txt

def nltk_stem_preprocessor(text: str) -> str:
    """Clean and lowercase *text*, drop stop words, and stem what remains.

    The text is split on whitespace after cleaning; tokens found in STOP are
    discarded and the rest are passed through STEM.

    Returns:
        The stems joined by single spaces.
    """
    tokens = full_clean(text).lower().split()
    return " ".join(STEM(tok) for tok in tokens if tok not in STOP)

### BM25 Retrieval

Use BM25 for first-stage evidence retrieval: for each claim, retrieve the top-100 candidate evidence passages from evidence.json.

In [None]:
# CountVectorizer is used only for its analyzer: each document is cleaned and
# stemmed by nltk_stem_preprocessor, split on whitespace, and expanded into
# unigrams plus bigrams.
cv = CountVectorizer(
    preprocessor=nltk_stem_preprocessor,
    tokenizer=str.split,
    token_pattern=None,
    stop_words=None,
    ngram_range=(1, 2),
)
analyzer = cv.build_analyzer()

# Tokenize every evidence text once, then build the BM25 index over the tokens.
token_corpus = list(map(analyzer, tqdm(raw_texts, desc="Tokenize")))

bm25 = BM25Okapi(token_corpus, k1=1.2, b=0.75)

In [None]:
# query claim and return Top-k relevant evidence
def retrieve_topk(claim_text: str, topk: int = 100):
    """Return the *topk* highest-scoring evidences for a claim under BM25.

    Args:
        claim_text: raw claim text; it is tokenized with the same analyzer
            that was used to build the BM25 index.
        topk: maximum number of results. Values <= 0 yield an empty list.

    Returns:
        A list of ``(evidence_id, score)`` tuples ordered from highest to
        lowest score, with at most ``topk`` entries.
    """
    # Guard first: scores[-0:] would select the whole corpus, not nothing.
    if topk <= 0:
        return []
    query_tokens = analyzer(claim_text)
    scores = bm25.get_scores(query_tokens)
    # argsort is ascending, so take the tail and reverse it.
    idx_sorted = np.argsort(scores)[-topk:][::-1]
    return [(evid_ids[i], float(scores[i])) for i in idx_sorted]

# batch process
def process_claim_file(claim_json: str, out_json: str):
    """Run BM25 retrieval for every claim in a file and save the ranked IDs.

    Args:
        claim_json: path to a JSON object ``{claim_id: {"claim_text": ...}}``.
        out_json: destination path; receives
            ``{claim_id: {"evidences": [evidence_id, ...]}}`` with IDs in
            descending score order.
    """
    with open(claim_json, "r", encoding="utf-8") as src:
        claims = json.load(src)            # {claim_id: {...}}

    results = {
        cid: {
            "evidences": [
                evid_id for evid_id, _score in retrieve_topk(obj["claim_text"])
            ]
        }
        for cid, obj in tqdm(claims.items(), desc="Retrieve")
    }

    with open(out_json, "w", encoding="utf-8") as dst:
        json.dump(results, dst, ensure_ascii=False, indent=2)

For efficiency, the batch-processing calls below are commented out. Uncomment them to regenerate the top-100 candidate files.

In [None]:
# process_claim_file("./data/train-claims.json", "./data/train-claims-top100.json")
# process_claim_file("./data/dev-claims.json", "./data/dev-claims-top100.json")
# process_claim_file("./data/test-claims-unlabelled.json", "./data/test-claims-top100.json")

### Cross-Encoder Reranker
Reads top-100 candidates per claim, re-scores them with a pretrained CrossEncoder, and emits the top-M evidences in both ID form and full-text form.

In [None]:
# ---------- Config ----------
DATA_DIR   = Path("data")
# Per-split file holding the first-stage (BM25) candidate lists.
TOP100_FNS = {
    "train": "train-claims-top100.json",
    "dev"  : "dev-claims-top100.json",
    "test" : "test-claims-top100.json"
}
TOP_M      = 6      # number of evidences kept per claim after reranking
MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
BATCH_SIZE = 32


# Initialize Cross-Encoder
device = "cuda" if torch.cuda.is_available() else "cpu"
ce_model = CrossEncoder(MODEL_NAME, device=device)

# ---------- Process each split ----------
for split, fn in TOP100_FNS.items():
    top100_path = DATA_DIR / fn
    # Splits whose candidate file has not been generated are silently skipped.
    if not top100_path.exists():
        continue

    print(f"[{split}] Loading top-100 lists…")
    # The candidate files are written as UTF-8, so read them as UTF-8
    # explicitly instead of relying on the platform default encoding.
    with top100_path.open("r", encoding="utf-8") as f:
        top100 = ujson.load(f)

    # Load raw claim texts
    cfile = DATA_DIR / f"{split}-claims.json"
    if not cfile.exists():
        # The BM25 stage reads the test claims from
        # "test-claims-unlabelled.json"; accept that naming here as well.
        unlabelled = DATA_DIR / f"{split}-claims-unlabelled.json"
        if unlabelled.exists():
            cfile = unlabelled
    with cfile.open("r", encoding="utf-8") as f:
        raw = ujson.load(f)
    # Accept both {cid: {"claim_text": ...}} and {cid: "<claim text>"}.
    claim_texts = {
        cid: (raw[cid]["claim_text"] if isinstance(raw[cid], dict) else raw[cid])
        for cid in raw
    }

    dense_out = {}
    text_out  = {}

    print(f"[{split}] Reranking with Cross-Encoder…")
    for cid, entry in tqdm(top100.items(), desc=f"{split} split"):
        # Accept both {"evidences": [...]} and a bare list of IDs.
        cand_ids = entry["evidences"] if isinstance(entry, dict) else entry
        claim    = claim_texts.get(cid, "")

        if cand_ids:
            # Build (claim, evidence) pairs
            pairs = [(claim, evid_dict[eid]) for eid in cand_ids]

            # Score
            scores = ce_model.predict(pairs, batch_size=BATCH_SIZE)

            # Pick top-M (argsort is ascending: take the tail, then reverse)
            top_idx = scores.argsort()[-TOP_M:][::-1]
            top_ids = [cand_ids[i] for i in top_idx]
        else:
            # Nothing to score for this claim; emit an empty ranking.
            top_ids = []

        # --- dense (just IDs) ---
        dense_out[cid] = top_ids

        # --- text (with full evidence text + claim_text) ---
        text_out[cid] = {
            "claim_text": claim,
            "ranked_evidences": [
                {"id": eid, "text": evid_dict[eid]}
                for eid in top_ids
            ]
        }

    # Write outputs
    out_dense_path = DATA_DIR / f"{split}-claims-top{TOP_M}-dense-ce.json"
    out_text_path  = DATA_DIR / f"{split}-claims-top{TOP_M}-text-ce.json"

    out_dense_path.write_text(
        json.dumps(dense_out, ensure_ascii=False, indent=2),
        encoding="utf-8"
    )
    out_text_path.write_text(
        json.dumps(text_out, ensure_ascii=False, indent=2),
        encoding="utf-8"
    )

    print(f"[{split}] → wrote {out_dense_path.name} and {out_text_path.name}")

[dev] Loading top-100 lists…
[dev] Reranking with Cross-Encoder…


dev split: 100%|██████████| 154/154 [00:16<00:00,  9.54it/s]


[dev] → wrote dev-claims-top6-dense-ce.json and dev-claims-top6-text-ce.json
[test] Loading top-100 lists…
[test] Reranking with Cross-Encoder…


test split: 100%|██████████| 153/153 [00:16<00:00,  9.36it/s]

[test] → wrote test-claims-top6-dense-ce.json and test-claims-top6-text-ce.json





# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [None]:
import os
import json
import re
import time
from tqdm import tqdm

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# ———— Config ————
# NOTE(review): the reranker cell writes "data/<split>-claims-top{TOP_M}-text-ce.json"
# with TOP_M = 6, while this path points at "data2/" and a top-10 file — confirm the
# file exists, or regenerate it with TOP_M = 10 and adjust the directory.
TEST_CLAIMS_FILE = './data2/test-claims-top10-text-ce.json'
# Predictions accumulated so far, keyed by claim ID.
RESULTS_FILE     = 'results.json'
# Resume marker of the form {"last_id": <claim ID>}.
CHECKPOINT_FILE  = 'checkpoint.json'
# NOTE: this rebinds MODEL_NAME, which the reranker cell set to the cross-encoder name.
MODEL_NAME       = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'

# ———— Few-shot Example ————
# One worked example rendered into every prompt by build_prompt. It has the same
# "claim_text" / "ranked_evidences" fields as the test entries, plus the expected
# answer ("claim_label" and the subset of evidence IDs in "evidences").
FEW_SHOT_EXAMPLE = {
    "claim-2152": {
        "claim_text": "Venus doesn't have a runaway greenhouse effect",
        "ranked_evidences": [
            {
                "id": "evidence-1018575",
                "text": (
                    "A runaway greenhouse effect involving carbon dioxide and water vapor "
                    "has long ago been hypothesized to have occurred on Venus, this idea "
                    "is still largely accepted."
                )
            },
            {
                "id": "evidence-791159",
                "text": (
                    "Venus receives about twice the sunlight that Earth does, which is "
                    "thought to have contributed to its runaway greenhouse effect."
                )
            },
            {
                "id": "evidence-500249",
                "text": (
                    "In the extreme, the planet Venus is thought to have experienced a "
                    "very large increase in greenhouse effect over its lifetime, so much "
                    "so that its poles have warmed sufficiently to render its surface "
                    "temperature effectively isothermal."
                )
            }
        ],
        "claim_label": "REFUTES",
        "evidences": ["evidence-1018575", "evidence-791159"]
    }
}

In [None]:
# ———— Helpers ————
def load_json(path, default):
    """Return the parsed JSON stored at *path*, or *default* if it is not a file.

    Args:
        path: location of a UTF-8 encoded JSON file.
        default: value handed back unchanged when *path* does not name an
            existing regular file.
    """
    if not os.path.isfile(path):
        return default
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

def save_json(obj, path):
    """Serialize *obj* to *path* as pretty-printed UTF-8 JSON, atomically.

    The data is written to a sibling temporary file ("<path>.tmp") and then
    renamed over *path*. If the process is interrupted or *obj* cannot be
    serialized, the previous content of *path* stays intact instead of being
    left truncated — important because this function persists the results
    and checkpoint files used to resume a long run.

    Args:
        obj: JSON-serializable object.
        path: destination file path.

    Raises:
        TypeError / ValueError: propagated from ``json.dump`` when *obj* is
            not serializable.
        OSError: propagated from the file operations.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(obj, f, ensure_ascii=False, indent=2)
        # Same-directory rename, so readers never observe a partial file.
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a half-written temporary file behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

def build_prompt(claim_id, claim_obj):
    """Assemble the few-shot fact-checking prompt for one claim.

    The prompt consists of, in order: the task instruction with the label
    definitions, every entry of FEW_SHOT_EXAMPLE rendered with its expected
    answer, the output-format request, and finally the target claim with its
    candidate evidences.

    Args:
        claim_id: ID of the claim to classify.
        claim_obj: mapping with "claim_text" and "ranked_evidences", the
            latter being a list of {"id": ..., "text": ...} items.

    Returns:
        The prompt as one newline-joined string.
    """
    def _evidence_row(ev):
        # One candidate evidence rendered as a pseudo-JSON line.
        return f'    {{"{ev["id"]}": "{ev["text"]}"}},'

    # system/user instruction (the triple-quoted part keeps its indentation)
    instruction = (
        "You are a fact-checking assistant. "
        "For the given Claim and Candidate Evidences, determine the correct Claim Label "
        "and list the IDs of those evidences you deem relevant, at least one evidence. "
        "The label is one of [SUPPORTS, REFUTES, NOT_ENOUGH_INFO, DISPUTED]."
        '''
        Classification definitions:
        - SUPPORTS: The evidence directly confirms the claim is true
        - REFUTES: The evidence directly contradicts the claim, showing it's false
        - DISPUTED: The evidence contains conflicting information about the claim
        - NOT_ENOUGH_INFO: The evidence is insufficient to make a determination
        '''
    )
    prompt_lines = [instruction]

    # few-shot block: claim, its candidates, then the expected answer
    for ex_id, ex in FEW_SHOT_EXAMPLE.items():
        prompt_lines.append(f'"{ex_id}": "{ex["claim_text"]}",')
        prompt_lines.append('  "ranked_evidences": [')
        prompt_lines.extend(_evidence_row(ev) for ev in ex["ranked_evidences"])
        expected = json.dumps(
            {"label": ex["claim_label"], "evidences": ex["evidences"]}
        )
        prompt_lines.append(f'  {expected},')
        prompt_lines.append("")  # separator

    # target claim
    prompt_lines.append(
        'Now, please output **only** valid JSON, with exactly these two keys:\n'
        '  "label": string,\n'
        '  "evidences": an array of evidence ID strings (e.g. ["evidence-123","evidence-456"]).'
    )
    prompt_lines.append(f'"{claim_id}": "{claim_obj["claim_text"]}",')
    prompt_lines.append('  "evidences": [')
    prompt_lines.extend(_evidence_row(ev) for ev in claim_obj["ranked_evidences"])
    prompt_lines.extend(["  ]", "label:", "evidences:"])
    return "\n".join(prompt_lines)

### Load Model
Load the 7B model deepseek-ai/DeepSeek-R1-Distill-Qwen-7B with 4-bit quantization; this needs roughly 7 GB of GPU memory.

In [None]:
# Quantization settings: 4-bit NF4 weights with double quantization and a
# bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer and quantized causal LM for MODEL_NAME, placed on the CUDA device.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="cuda",
)

### Predict
Use a checkpoint file to record the last processed claim ID, so that an interrupted run can resume where it stopped.

In [None]:
def _parse_prediction(content):
    """Extract the prediction from the model's raw text.

    Returns {"claim_label": ..., "evidences": ...} or None when no usable
    JSON object with both "label" and "evidences" keys can be found.
    """
    # R1-style models are expected to wrap their reasoning in
    # <think>…</think>; only look at what follows the last closing tag so
    # that braces inside the reasoning cannot corrupt the match. If the tag
    # is absent, the whole text is searched.
    answer = content.rsplit("</think>", 1)[-1]
    m = re.search(r'(\{.*\})', answer, flags=re.DOTALL)
    if m is None:
        return None
    try:
        parsed = json.loads(m.group(1))
        return {
            "claim_label": parsed["label"],
            "evidences":   parsed["evidences"]
        }
    except (json.JSONDecodeError, KeyError, TypeError):
        # Not valid JSON, or valid JSON without the two required keys.
        return None


# ———— Main Loop ————
def main():
    """Generate a label and evidence list for every test claim.

    Reads the claims from TEST_CLAIMS_FILE, prompts the language model once
    per claim, and after each successful prediction rewrites RESULTS_FILE and
    CHECKPOINT_FILE. On start-up, claims up to and including the checkpointed
    "last_id" are skipped. Claims whose output cannot be parsed are reported
    and skipped; they do not appear in the results.
    """
    test_data   = load_json(TEST_CLAIMS_FILE, {})
    results     = load_json(RESULTS_FILE, {})
    checkpoint  = load_json(CHECKPOINT_FILE, {"last_id": None})
    last_id = checkpoint.get("last_id")
    # Only resume from a checkpoint that refers to a claim in this input;
    # a stale ID would otherwise cause every claim to be skipped.
    started = last_id is None or last_id not in test_data

    for cid, claim in tqdm(test_data.items(), desc="Claims"):
        # skip until after last checkpoint
        if not started:
            if cid == last_id:
                started = True
            continue

        prompt = build_prompt(cid, claim)

        messages = [{"role": "user", "content": prompt}]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=True
        )

        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        # conduct text completion
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=32768
        )
        # Drop the prompt tokens, keeping only the newly generated ones.
        output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
        # Decode the whole generated sequence (output_ids[0] would be just
        # the first token).
        content = tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")

        prediction = _parse_prediction(content)
        if prediction is None:
            print(f"[WARN] JSON parse failed for {cid}, skipping.")
            continue
        results[cid] = prediction

        # persist
        save_json(results, RESULTS_FILE)
        checkpoint["last_id"] = cid
        save_json(checkpoint, CHECKPOINT_FILE)

        # pause
        time.sleep(0.5)

    print("All done.")

In [None]:
main()

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

### Compute Recall and F-score to evaluate evidence retrieval output

In [None]:
import json
import argparse
from typing import List, Tuple, Optional

def compute_metrics(
    gt: List[str],
    retrieved: List[str]
) -> Tuple[int, Optional[float], Optional[float]]:
    """
    Compare retrieved evidence IDs against the ground truth for one claim.

    Both inputs are treated as sets, so repeated IDs count once in the hit
    count and in the denominators. (Previously the denominators used the raw
    list lengths, which deflated recall/precision when an ID was repeated.)

    Args:
        gt: ground-truth evidence IDs
        retrieved: retrieved evidence IDs

    Returns:
        tp: number of distinct IDs present in both inputs
        recall: tp / number of distinct ground-truth IDs, or None if gt is empty
        precision: tp / number of distinct retrieved IDs, or None if retrieved is empty
    """
    set_gt = set(gt)
    set_ret = set(retrieved)
    tp = len(set_gt & set_ret)
    recall = tp / len(set_gt) if set_gt else None
    precision = tp / len(set_ret) if set_ret else None
    return tp, recall, precision

def main(train_claims_path: str, top100_path: str):
    """Print average recall, precision and F1 of a retrieval output file.

    Args:
        train_claims_path: labelled claims JSON, ``{claim_id: {"evidences": [...]}}``.
        top100_path: retrieval output JSON. Each entry may be either a bare
            list of evidence IDs or a dict with an "evidences" list (the
            format written by the BM25 stage).

    Claims that have no ground truth or no retrieved evidences contribute 0.0
    to the corresponding average. F1 is computed from the two averages, not
    averaged per claim.
    """
    # read json
    with open(train_claims_path, 'r', encoding='utf-8') as f:
        train_claims = json.load(f)
    with open(top100_path, 'r', encoding='utf-8') as f:
        top100 = json.load(f)

    recalls = []
    precisions = []

    # each claim
    for claim_id, claim_info in train_claims.items():
        gt_list = claim_info.get("evidences", [])
        entry = top100.get(claim_id, [])
        # Unwrap {"evidences": [...]} entries; otherwise the dict's keys
        # would be compared against the evidence IDs and never match.
        if isinstance(entry, dict):
            retrieved_list = entry.get("evidences", [])
        else:
            retrieved_list = entry

        _tp, recall, precision = compute_metrics(gt_list, retrieved_list)
        recalls.append(recall if recall is not None else 0.0)
        precisions.append(precision if precision is not None else 0.0)

    # avg
    avg_recall = sum(recalls) / len(recalls) if recalls else 0.0
    avg_precision = sum(precisions) / len(precisions) if precisions else 0.0
    if (avg_precision + avg_recall) > 0:
        avg_f1 = 2 * avg_precision * avg_recall / (avg_precision + avg_recall)
    else:
        avg_f1 = 0.0
    print("\n=== Overall ===")
    print(f"Average Recall@k   : {avg_recall:.3f}")
    print(f"Average Precision@k: {avg_precision:.3f}")
    print(f"Average F1@k       : {avg_f1:.3f}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="topk Recall Precision F-score"
    )
    # Labelled claims file (ground-truth evidences).
    parser.add_argument(
        "--train_claims",
        type=str,
        default="./data/dev-claims.json",
    )
    # Retrieval output to evaluate.
    parser.add_argument(
        "--top100",
        type=str,
        default="./data/dev-claims-top10-dense.json",
    )
    # When this cell runs inside Jupyter, sys.argv carries the kernel's own
    # options; parse_known_args ignores them, whereas parse_args would abort
    # with "unrecognized arguments".
    args, _unknown = parser.parse_known_args()
    main(args.train_claims, args.top100)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*