This is Notebook 3 of 4. It covers the Stage 2 and Stage 3 experiments on train-claim.json. See the accompanying report for more details.

⚙️ Runtime Notes



    •    Code execution may take about 3 hours: roughly 1 hour for each model's experiment.

📚 Navigation

Use the Table of Contents on the left sidebar to navigate between sections.

🧠 Architecture Overview

🧪 Experiment Section

    •    Experiments conducted for Stages 2 & 3 on the training set.

Thank you 🙂

In [None]:
# Mount Google Drive at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **combine**

In [None]:
import os
import json
import numpy as np
from tqdm.auto import tqdm, trange
import torch
from sentence_transformers import SentenceTransformer, losses, InputExample, CrossEncoder
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import random
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from collections import defaultdict
from dotenv import load_dotenv

# Load environment variables from .env
load_dotenv()

# Read the Hugging Face access token from the environment instead of hard-coding it.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Fail fast with a clear message when the token is missing.
if HF_TOKEN is None:
    raise ValueError("Hugging Face token not found. Please set HF_TOKEN in your .env file.")

# Authenticate this session with the Hugging Face Hub.
login(HF_TOKEN)

# Download the NLTK resources used by preprocess_text: the stopword list and
# WordNet (needed by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
class ImprovedDualEncoderRetrieval:
    def __init__(self,
                 retriever_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
                 reranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
                 batch_size=16,
                 max_seq_length=512,
                 use_pooling="mean"):

        self.retriever_model = retriever_model

        # Retriever
        print(f"✅ Loading retriever model [{retriever_model}] in FP32")
        self.model = SentenceTransformer(retriever_model, device='cuda')


        # Reranker
        print(f"✅ Loading reranker model [{reranker_model}] in FP32")
        self.reranker_model = CrossEncoder(reranker_model, device='cuda')

        self.batch_size = batch_size
        self.max_seq_length = max_seq_length
        self.use_pooling = use_pooling

        self.evidence_data = None
        self.claim_data = None
        self.evidence_embeddings = None
        self.claim_embeddings = None

        self.use_hard_negatives = True
        self.use_preprocessing = False
        self.normalize_embeddings = False
        self.reranking = True

        self.cache_dir = "embeddings_cache"
        # Create cache directory if it doesn't exist
        os.makedirs(self.cache_dir, exist_ok=True)

    def preprocess_text(self, text):
        # Lowercase
        text = text.lower().strip()

        # Remove HTML tags
        text = re.sub(r'<[^>]+>', '', text)

        # Remove punctuation
        text = re.sub(r'[^\w\s]', '', text)

        # Normalize whitespace
        text = ' '.join(text.split())

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        text = ' '.join([word for word in text.split() if word not in stop_words])

        # Lemmatization (preferred over stemming)
        lemmatizer = WordNetLemmatizer()
        text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

        return text

    def load_data(self, evidence_path, claim_path):
        print(f"Loading evidence data from {evidence_path}")
        with open(evidence_path, 'r') as f:
            self.evidence_data = json.load(f)

        print(f"Loading claim data from {claim_path}")
        with open(claim_path, 'r') as f:
            self.claim_data = json.load(f)

        if self.use_preprocessing:
            print("Preprocessing evidence and claim texts...")
            self.evidence_data = {eid: self.preprocess_text(text)
                                  for eid, text in self.evidence_data.items()}

            if "claim_text" in list(self.claim_data.values())[0]:
                for cid in self.claim_data:
                    self.claim_data[cid]["claim_text"] = self.preprocess_text(self.claim_data[cid]["claim_text"])
            else:
                self.claim_data = {cid: self.preprocess_text(text)
                                  for cid, text in self.claim_data.items()}

        print(f"Loaded {len(self.evidence_data)} evidence passages and {len(self.claim_data)} claims")
        return True

    def validate_loaded_data(self):
        """Validate that the loaded evidence and claims match expected counts and integrity."""
        # Check evidence data
        print(f"🔍 Validation Check: Loaded evidence passages = {len(self.evidence_data)}")
        if len(self.evidence_data) < 100000:
            print(f"⚠️ WARNING: Evidence passages seem unusually low ({len(self.evidence_data)}) compared to expected (~1,200,000). Check your loaded file or preprocessing.")

        # Check claims data
        print(f"🔍 Validation Check: Loaded claim samples = {len(self.claim_data)}")
        if len(self.claim_data) < 1000:
            print(f"⚠️ WARNING: Claim samples seem unusually low ({len(self.claim_data)}) compared to expected (~1,000). Check your loaded file or preprocessing.")

    def validate_before_embedding(self):
        """Validate before embedding begins."""
        print(f"🔍 Validation Check: About to embed {len(self.evidence_data)} evidence passages.")
        if len(self.evidence_data) < 100000:
            print(f"⚠️ WARNING: Embedding very few evidence passages ({len(self.evidence_data)}). This is likely wrong.")

    def _batch_encode(self, texts, desc):
        """Encode texts in batches efficiently"""
        embeddings = []
        for i in tqdm(range(0, len(texts), self.batch_size), desc=desc):
            batch_texts = texts[i:i + self.batch_size]
            if not batch_texts:
                continue
            # Stay in GPU, output as torch.Tensor
            batch_embeddings = self.model.encode(batch_texts, convert_to_tensor=True, device='cuda')

            # Normalize if enabled
            if self.normalize_embeddings:
                batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)

            # Move to CPU only when saving or stacking
            embeddings.append(batch_embeddings.cpu())
        return torch.cat(embeddings, dim=0).numpy()

    def embed_evidence(self, save_path="evidence_embeddings_streamed.jsonl", mapping_save_path="evidence_id_mapping.json", overwrite=False):
        """
        Embed evidence texts and save with int ID mapping (required for FAISS).
        Also save the mapping to reverse back later.
        """
        print(f"📦 Streaming embeddings to {save_path}")
        if os.path.exists(save_path) and not overwrite:
            print("✅ Found cached evidence embeddings. Skipping embedding.")
            return

        self.validate_before_embedding()

        evidence_ids = list(self.evidence_data.keys())
        texts = list(self.evidence_data.values())

        id_mapping = {eid: idx for idx, eid in enumerate(evidence_ids)}  # string to int mapping

        with open(save_path, "w") as f:
            for i in tqdm(range(0, len(texts), self.batch_size), desc="Embedding evidence"):
                batch_ids = evidence_ids[i:i+self.batch_size]
                batch_texts = texts[i:i+self.batch_size]
                batch_embeds = self.model.encode(batch_texts, convert_to_tensor=True, device='cuda')
                if self.normalize_embeddings:
                    batch_embeds = torch.nn.functional.normalize(batch_embeds, p=2, dim=1)
                batch_embeds = batch_embeds.cpu().numpy()
                for eid, emb in zip(batch_ids, batch_embeds):
                    f.write(json.dumps({"id": id_mapping[eid], "original_id": eid, "embedding": emb.tolist()}) + "\n")

        with open(mapping_save_path, "w") as f:
            json.dump(id_mapping, f)

        print(f"✅ Done. Embeddings and mapping saved to {save_path} and {mapping_save_path}")

    def embed_claims(self, save_path=None, overwrite=False):
        """Embed claims, with caching for efficiency"""
        if save_path is None:
            save_path = os.path.join(self.cache_dir, f"claim_embeddings_{self.retriever_model.replace('/', '_')}.json")

        # Check if embeddings already exist
        if os.path.exists(save_path) and not overwrite:
            print(f"Loading pre-computed claim embeddings from {save_path}")
            with open(save_path, 'r') as f:
                embedding_dict = json.load(f)
                claim_ids = list(self.claim_data.keys())
                try:
                    self.claim_embeddings = np.array([embedding_dict[cid] for cid in claim_ids])
                    return self.claim_embeddings
                except:
                    print("Error loading cached embeddings. Recomputing...")

        # Embed claims
        claim_ids = list(self.claim_data.keys())

        # Extract claim text based on the format
        if isinstance(self.claim_data[claim_ids[0]], dict) and "claim_text" in self.claim_data[claim_ids[0]]:
            claim_texts = [self.claim_data[cid]["claim_text"] for cid in claim_ids]
        else:
            # For test-unlabelled
            claim_texts = [self.claim_data[cid] for cid in claim_ids]

        print(f"Embedding {len(claim_texts)} claims...")
        self.claim_embeddings = self._batch_encode(claim_texts, "Embedding claims")

        # Save embeddings
        embedding_dict = {cid: self.claim_embeddings[i].tolist() for i, cid in enumerate(claim_ids)}
        with open(save_path, 'w') as f:
            json.dump(embedding_dict, f)
            print(f"Saved claim embeddings to {save_path}")
        return self.claim_embeddings

    def fine_tune_model(self, epochs=1, learning_rate=2e-5):
        print("⚙️ Fine-tuning retriever model...")

        # Limit to 300 claims max
        subset_claims = dict(list(self.claim_data.items())[:300])
        all_evidence_ids = set(self.evidence_data.keys())

        train_examples = []
        for cid, claim in subset_claims.items():
            if "claim_text" not in claim or "evidences" not in claim:
                print(f"❌ Claim ID {cid} missing 'claim_text' or 'evidences'. Skipping.")
                continue

            claim_text = claim["claim_text"]
            positive_ids = claim["evidences"]

            for pid in positive_ids:
                if pid in self.evidence_data:
                    train_examples.append(InputExample(texts=[claim_text, self.evidence_data[pid]], label=1.0))

            # Sample 3 negatives
            negative_ids = list(all_evidence_ids - set(positive_ids))
            sampled_negs = random.sample(negative_ids, min(3, len(negative_ids)))
            for nid in sampled_negs:
                train_examples.append(InputExample(texts=[claim_text, self.evidence_data[nid]], label=0.0))

        train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=self.batch_size)
        train_loss = losses.MultipleNegativesRankingLoss(self.model)

        self.model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=epochs,
            warmup_steps=100,
            show_progress_bar=True
        )

        print("✅ Fine-tuning complete.")

    @staticmethod
    def estimate_safe_buffer_size(claim_embeddings_shape, embedding_dim=768, safety_margin=0.7):
        """
        Estimate buffer size that safely fits in available VRAM together with claims.
        :param claim_embeddings_shape: shape of claim embeddings (num_claims, emb_dim)
        :param embedding_dim: dimension of embeddings (default 768)
        :param safety_margin: fraction of free VRAM to use (default 70%)
        :return: estimated safe buffer size
        """
        # Get free GPU memory in MB
        free_mem = torch.cuda.mem_get_info()[0] / (1024 ** 2)  # Free memory in MB
        target_mem = free_mem * safety_margin  # Use only safety_margin% of free VRAM

        # Estimate memory needed for claim embeddings (already loaded)
        num_claims = claim_embeddings_shape[0]
        claim_mem = num_claims * embedding_dim * 2 / (1024 ** 2)  # float16 (2 bytes per element)

        # Remaining memory for evidence buffer
        usable_mem = target_mem - claim_mem
        if usable_mem <= 0:
            print("❗ Warning: Not enough memory even for claims. Reduce batch size or use smaller model.")
            return 1000  # fallback small buffer

        # Calculate buffer size
        buffer_size = int((usable_mem * (1024 ** 2)) / (embedding_dim * 2))  # float16

        print(f"✅ Estimated safe buffer size: {buffer_size} (VRAM safety margin {safety_margin * 100:.0f}%)")
        return max(1000, buffer_size)

    def _process_buffer_gpu(self, buffer_ids, buffer_embs, claim_embs, results, k):
        # Load buffer to GPU in float16 for efficiency
        buffer_embs = torch.tensor(buffer_embs, dtype=torch.float16).cuda()

        if self.normalize_embeddings:
            buffer_embs = torch.nn.functional.normalize(buffer_embs, p=2, dim=1)

        sims = torch.matmul(claim_embs, buffer_embs.T)

        top_scores, top_indices = torch.topk(sims, k=k, dim=1)
        top_indices = top_indices.cpu().numpy()
        top_scores = top_scores.cpu().numpy()

        claim_keys = list(self.claim_data.keys())
        for i in trange(len(claim_keys), desc="🔄 Merging top-k per claim", leave=False, dynamic_ncols=True):
            cid = claim_keys[i]
            for idx, score in zip(top_indices[i], top_scores[i]):
                results[cid].append((buffer_ids[idx], float(score)))


    def retrieve_top_evidence(self, k=5, embedding_jsonl_path="evidence_embeddings_streamed.jsonl", buffer_size=None):
        if self.claim_embeddings is None:
            raise ValueError("Claim embeddings must be computed first")

        print(f"🔍 Streaming retrieval: Using GPU-accelerated cosine similarity")

        # Prepare claim embeddings on GPU (float16 for efficiency)
        claim_embs = torch.tensor(self.claim_embeddings, dtype=torch.float16).cuda()
        if self.normalize_embeddings:
            claim_embs = torch.nn.functional.normalize(claim_embs, p=2, dim=1)

        results = {cid: [] for cid in self.claim_data}

        # Estimate safe buffer size if not provided
        if buffer_size is None:
            buffer_size = self.estimate_safe_buffer_size(self.claim_embeddings.shape, embedding_dim=claim_embs.shape[1])

        total_lines = sum(1 for _ in open(embedding_jsonl_path))
        with open(embedding_jsonl_path, 'r') as f:
            buffer_ids = []
            buffer_embs = []

            for line in tqdm(f, desc="📥 Streaming evidence", total=total_lines, dynamic_ncols=True):
                obj = json.loads(line)
                buffer_ids.append(obj['original_id'])
                buffer_embs.append(obj['embedding'])

                if len(buffer_ids) >= buffer_size:
                    self._process_buffer_gpu(buffer_ids, buffer_embs, claim_embs, results, k)
                    buffer_ids, buffer_embs = [], []

            if buffer_ids:
                self._process_buffer_gpu(buffer_ids, buffer_embs, claim_embs, results, k)

        for cid in results:
            results[cid] = sorted(results[cid], key=lambda x: x[1], reverse=True)[:k]

        print("✅ Retrieval completed")
        return results

    def _rerank_results(self, initial_results, k=5):
        """Rerank initial results with Cross Encoder."""
        if not self.reranking:
            return initial_results

        print("Applying reranking to initial results (batched)...")
        reranked_results = {}

        batch_size = 16  # Safe default, adjust if OOM

        # Outer progress: per claim
        for claim_id, evidence_list in tqdm(initial_results.items(), desc="Claims reranking"):
            if not evidence_list:
                reranked_results[claim_id] = []
                continue

            claim_text = (
                self.claim_data[claim_id]["claim_text"]
                if isinstance(self.claim_data[claim_id], dict) and "claim_text" in self.claim_data[claim_id]
                else self.claim_data[claim_id]
            )

            evidence_pairs = [(claim_text, self.evidence_data[eid]) for eid, _ in evidence_list]

            rerank_scores = []
            # Inner progress: per batch within the claim
            for i in tqdm(range(0, len(evidence_pairs), batch_size), desc=f"Reranking {claim_id}", leave=False):
                batch_pairs = evidence_pairs[i:i + batch_size]
                rerank_scores.extend(self.reranker_model.predict(batch_pairs))

            reranked = sorted(
                zip(evidence_list, rerank_scores),
                key=lambda x: x[1],
                reverse=True
            )[:k]

            reranked_results[claim_id] = [(eid, float(score)) for ((eid, _), score) in reranked]

        return reranked_results

    def calculate_retrieval_metrics(self, results):
        """Calculate precision, recall, and F1 score for the retrieval results"""
        precisions = []
        recalls = []
        f1_scores = []

        for claim_id, retrieved_evidence in results.items():
          if not isinstance(self.claim_data[claim_id], dict) or 'evidences' not in self.claim_data[claim_id]:
              continue
          ground_truth = set(self.claim_data[claim_id]['evidences'])
          retrieved = set([ev_id for ev_id, _ in retrieved_evidence])
          true_positives = len(ground_truth.intersection(retrieved))
          precision = true_positives / len(retrieved) if len(retrieved) > 0 else 0
          recall = true_positives / len(ground_truth) if len(ground_truth) > 0 else 0
          f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
          precisions.append(precision)
          recalls.append(recall)
          f1_scores.append(f1)

        avg_precision = sum(precisions) / len(precisions) if precisions else 0
        avg_recall = sum(recalls) / len(recalls) if recalls else 0
        avg_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0

        metrics = {
          "precision": avg_precision,
          "recall": avg_recall,
          "f1": avg_f1,
          "num_evaluated": len(precisions)
        }

        print(f"Evaluated on {metrics['num_evaluated']} claims")
        print(f"Average Precision: {metrics['precision']:.4f}")
        print(f"Average Recall: {metrics['recall']:.4f}")
        print(f"Average F1 Score: {metrics['f1']:.4f}")

        return metrics

    def run_pipeline(self, evidence_path, claim_path, k=5,
                     fine_tune=True, epochs=1,
                     evidence_save_path="evidence_embeddings_streamed.jsonl",
                     claim_save_path="claim_embeddings.json",
                     results_path="retrieval_results.json",
                     metrics_path="retrieval_metrics.json",
                     overwrite=False,
                     save_raw_embeddings=True):
        """
        A pipeline with checkpointing, memory-safe streaming, and auto-resume features.
        """
        print("🔁 Starting retrieval pipeline with checkpoints")

        # Step 1: Load data
        print("📥 Loading data...")
        self.load_data(evidence_path, claim_path)
        self.validate_loaded_data()

        # Step 2: Fine-tuning (optional)
        if fine_tune:
            if self.use_hard_negatives:
                print("🔐 Embedding evidence for hard negative mining...")
                self.embed_evidence(save_path=evidence_save_path)
            print("🧠 Fine-tuning model...")
            self.fine_tune_model(epochs=epochs)
        else:
            print("⚠️ Skipping fine-tuning")

        # Step 3: Embed evidence
        if not overwrite and os.path.exists(evidence_save_path):
            print(f"📦 Skipping evidence embedding. Using: {evidence_save_path}")
        else:
            print("📦 Embedding and saving evidence to JSONL...")
            self.embed_evidence(save_path=evidence_save_path, overwrite=overwrite)

        # 🆕 Save raw evidence embeddings to .npy
        if save_raw_embeddings:
            evidence_embs = []
            with open(evidence_save_path, 'r') as f:
                for line in f:
                    obj = json.loads(line)
                    evidence_embs.append(obj['embedding'])
            np.save("evidence_embeddings_raw.npy", np.array(evidence_embs, dtype=np.float32))
            print("✅ Saved raw evidence embeddings to evidence_embeddings_raw.npy")

        # Step 4: Embed claims
        if not overwrite and os.path.exists(claim_save_path):
            print(f"📦 Loading cached claim embeddings from {claim_save_path}")
            with open(claim_save_path) as f:
                loaded = json.load(f)
                self.claim_embeddings = np.array([loaded[cid] for cid in self.claim_data])
        else:
            print("📦 Embedding claims...")
            self.embed_claims(save_path=claim_save_path, overwrite=overwrite)

        # 🆕 Save raw claim embeddings to .npy
        if save_raw_embeddings:
            np.save("claim_embeddings_raw.npy", self.claim_embeddings)
            print("✅ Saved raw claim embeddings to claim_embeddings_raw.npy")

        # Step 5: Retrieval using streaming cosine similarity (no FAISS)
        if results_path and not overwrite and os.path.exists(results_path):
            print(f"📂 Loading existing retrieval results from {results_path}")
            with open(results_path, 'r') as f:
                results = json.load(f)
        else:
            print("🔍 Retrieving top-k evidence using cosine similarity and streaming...")
            results = self.retrieve_top_evidence(k=k, embedding_jsonl_path=evidence_save_path)
            if results_path:
                with open(results_path, 'w') as f:
                    json.dump(results, f)
                print(f"✅ Saved retrieval results to {results_path}")

        # Step 6: Evaluation
        print("📊 Evaluating retrieval...")
        metrics = self.calculate_retrieval_metrics(results)
        if metrics_path:
            with open(metrics_path, 'w') as f:
                json.dump(metrics, f)
            print(f"✅ Saved metrics to {metrics_path}")

        return results, metrics



    def format_evidences(self,evidence_ids, evidence_dict):
        """Render evidence passages as a 1-based numbered list, one per line.

        Ids that are not present in ``evidence_dict`` are rendered as
        ``[MISSING]``.
        """
        numbered_lines = []
        for position, eid in enumerate(evidence_ids, start=1):
            passage = evidence_dict.get(eid, '[MISSING]')
            numbered_lines.append(f"{position}. {passage}")
        return '\n'.join(numbered_lines)

    def build_prompt_cot(self, test_claim, evidence_dict):

        # 1) Header (triple-quoted, same style as before)
        header = """
        Please analyze the relationship between evidence and claim based on the following four examples, each of which shows a chain of thought. Think step by step, writing out your full reasoning and then classify the relationship as one of:
        - SUPPORTS
        - REFUTES
        - NOT ENOUGH INFO
        - DISPUTED
        """

        # 2) Four CoT examples (triple-quoted, numbered as before)
        examples = """
        Example1：
        Claim: Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.
        Evidences:
        - At very high concentrations (100× atmospheric), CO2 is toxic to animals.
        - Plants grow up to 50% faster at 1,000 ppm CO₂ under unlimited nutrients.
        - Higher CO₂ also alters water demand and can create ecological imbalances.
        Reasoning:
        1. CO₂ has both positive (plant growth) and negative (toxicity, water‐stress) effects.
        2. The net ecological impact is therefore debated and not uniformly beneficial.
        Label: DISPUTED

        Example2：
        Claim: El Niño drove record highs in global temperatures suggesting rise may not be down to man-made emissions.
        Evidences:
        - El Niño causes short-term warming spikes.
        - Long-term temperature trends correlate with rising greenhouse gas levels, not El Niño cycles.
        Reasoning:
        1. El Niño is a transient phenomenon, not a sustained climate forcing.
        2. Evidence attributes persistent warming to human CO₂ emissions.
        3. Therefore, attributing record highs solely to El Niño is incorrect.
        Label: REFUTES

        Example3：
        Claim: In 1946, PDO switched to a cool phase.
        Evidences:
        - The PDO shifted to a cool phase in 1945/1946, as recorded by ocean temperature patterns.
        - Historical regime shifts around 1947 match salmon production changes in the North Pacific.
        Reasoning:
        1. Oceanographic data confirm a cool-phase shift around 1946.
        2. Independent ecological records (salmon regimes) align with that timing.
        3. Thus, the claim is supported by multiple lines of evidence.
        Label: SUPPORTS

        Example4：
        Claim: Climate scientists have predicted global temperatures would increase more than one degree Celsius by 2020, but observed temperatures have been only half as high.
        Evidences:
        - In February 2020 the region recorded 18.3 °C, about 0.83 °C above 1880 levels.
        - The last century saw an average rise of ~0.8 °C, most of it in the last three decades.
        - Projections for 2100 anticipate up to +4 °C if emissions remain unchecked.
        Reasoning:
        1. Observed warming (~0.8 °C) matches projections within expected uncertainty.
        2. Regional short-term records do not contradict global model forecasts.
        3. The evidence does not clearly show a two-fold overprediction by models.
        Label: NOT_ENOUGH_INFO
        """

        # 3) Prepare the test claim block, same "Now classify:" style
        test_evi = self.format_evidences(test_claim["evidences"], evidence_dict)
        test_input = f"""
        Now classify:
        Claim: {test_claim['claim_text']}
        Evidences:
        {test_evi}
        Reasoning:
        Label:"""

        return header + examples + test_input
    def build_prompt_5example_cot(self, test_claim, evidence_dict):

        # 1) Header (triple-quoted, same style as before)
        header = """
        Please analyze the relationship between evidence and claim based on the following four examples, each of which shows a chain of thought. Think step by step, writing out your full reasoning and then classify the relationship as one of:
        - SUPPORTS
        - REFUTES
        - NOT ENOUGH INFO
        - DISPUTED
        """

        # 2) Four CoT examples (triple-quoted, numbered as before)
        examples = """
        Example1：
        Claim: Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.
        Evidences:
        - At very high concentrations (100× atmospheric), CO2 is toxic to animals.
        - Plants grow up to 50% faster at 1,000 ppm CO₂ under unlimited nutrients.
        - Higher CO₂ also alters water demand and can create ecological imbalances.
        Reasoning:
        1. CO₂ has both positive (plant growth) and negative (toxicity, water‐stress) effects.
        2. The net ecological impact is therefore debated and not uniformly beneficial.
        Label: DISPUTED

        Example2：
        Claim: El Niño drove record highs in global temperatures suggesting rise may not be down to man-made emissions.
        Evidences:
        - El Niño causes short-term warming spikes.
        - Long-term temperature trends correlate with rising greenhouse gas levels, not El Niño cycles.
        Reasoning:
        1. El Niño is a transient phenomenon, not a sustained climate forcing.
        2. Evidence attributes persistent warming to human CO₂ emissions.
        3. Therefore, attributing record highs solely to El Niño is incorrect.
        Label: REFUTES

        Example3：
        Claim: In 1946, PDO switched to a cool phase.
        Evidences:
        - The PDO shifted to a cool phase in 1945/1946, as recorded by ocean temperature patterns.
        - Historical regime shifts around 1947 match salmon production changes in the North Pacific.
        Reasoning:
        1. Oceanographic data confirm a cool-phase shift around 1946.
        2. Independent ecological records (salmon regimes) align with that timing.
        3. Thus, the claim is supported by multiple lines of evidence.
        Label: SUPPORTS

        Example4：
        Claim: Climate scientists have predicted global temperatures would increase more than one degree Celsius by 2020, but observed temperatures have been only half as high.
        Evidences:
        - In February 2020 the region recorded 18.3 °C, about 0.83 °C above 1880 levels.
        - The last century saw an average rise of ~0.8 °C, most of it in the last three decades.
        - Projections for 2100 anticipate up to +4 °C if emissions remain unchecked.
        Reasoning:
        1. Observed warming (~0.8 °C) matches projections within expected uncertainty.
        2. Regional short-term records do not contradict global model forecasts.
        3. The evidence does not clearly show a two-fold overprediction by models.
        Label: NOT_ENOUGH_INFO

        Example 5：
        Claim: When stomata-derived CO₂ (red) is compared to ice-core-derived CO₂ (blue), the stomata generally show much more variability in the atmospheric CO₂ level and often show levels much higher than the ice cores.
        Evidences:
        - One study using stomata on fossilized leaves reports CO₂ mole fractions above 300 ppm between seven and ten thousand years ago, suggesting greater variability.
        - Other researchers argue these high stomatal values likely stem from calibration or contamination issues rather than true atmospheric fluctuations.
        Reasoning:
        1. The stomatal evidence shows peaks (>300 ppm) not matched by ice-core records, indicating higher measured variability.
        2. While calibration/contamination could inflate some values, the repeated pattern across samples suggests a real effect.
        3. Therefore, even allowing for potential errors, the stomatal data support the claim of greater CO₂ variability than ice cores indicate.
        Answer: SUPPORTS
        """

        # 3) Prepare the test claim block, same "Now classify:" style
        test_evi = self.format_evidences(test_claim["evidences"], evidence_dict)
        test_input = f"""
        Now classify:
        Claim: {test_claim['claim_text']}
        Evidences:
        {test_evi}
        Reasoning:
        Label:"""

        return header + examples + test_input
    def build_prompt_3example_cot(self,test_claim, evidence_dict):
        header = """
        Please analyze the relationship between evidence and claim based on the following four examples, each of which shows a chain of thought. Think step by step, writing out your full reasoning and then classify the relationship as one of:
        - SUPPORTS
        - REFUTES
        - NOT ENOUGH INFO
        - DISPUTED
        """
        examples = """
        Example1：
        Claim: Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.
        Evidences:
        - At very high concentrations (100× atmospheric), CO2 is toxic to animals.
        - Plants grow up to 50% faster at 1,000 ppm CO₂ under unlimited nutrients.
        - Higher CO₂ also alters water demand and can create ecological imbalances.
        Reasoning:
        1. CO₂ has both positive (plant growth) and negative (toxicity, water‐stress) effects.
        2. The net ecological impact is therefore debated and not uniformly beneficial.
        Label: DISPUTED

        Example2：
        Claim: El Niño drove record highs in global temperatures suggesting rise may not be down to man-made emissions.
        Evidences:
        - El Niño causes short-term warming spikes.
        - Long-term temperature trends correlate with rising greenhouse gas levels, not El Niño cycles.
        Reasoning:
        1. El Niño is a transient phenomenon, not a sustained climate forcing.
        2. Evidence attributes persistent warming to human CO₂ emissions.
        3. Therefore, attributing record highs solely to El Niño is incorrect.
        Label: REFUTES

        Example3：
        Claim: In 1946, PDO switched to a cool phase.
        Evidences:
        - The PDO shifted to a cool phase in 1945/1946, as recorded by ocean temperature patterns.
        - Historical regime shifts around 1947 match salmon production changes in the North Pacific.
        Reasoning:
        1. Oceanographic data confirm a cool-phase shift around 1946.
        2. Independent ecological records (salmon regimes) align with that timing.
        3. Thus, the claim is supported by multiple lines of evidence.
        Label: SUPPORTS
        """
        test_evi = self.format_evidences(test_claim["evidences"], evidence_dict)
        test_input = f"""
        Now classify:
        Claim: {test_claim['claim_text']}
        Evidences:
        {test_evi}
        Reasoning:
        Label:"""

        return header + examples + test_input
    def build_prompt_2example_cot(self,test_claim, evidence_dict):
        header = """
        Please analyze the relationship between evidence and claim based on the following four examples, each of which shows a chain of thought. Think step by step, writing out your full reasoning and then classify the relationship as one of:
        - SUPPORTS
        - REFUTES
        - NOT ENOUGH INFO
        - DISPUTED
        """
        examples = """
        Example1：
        Claim: Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.
        Evidences:
        - At very high concentrations (100× atmospheric), CO2 is toxic to animals.
        - Plants grow up to 50% faster at 1,000 ppm CO₂ under unlimited nutrients.
        - Higher CO₂ also alters water demand and can create ecological imbalances.
        Reasoning:
        1. CO₂ has both positive (plant growth) and negative (toxicity, water‐stress) effects.
        2. The net ecological impact is therefore debated and not uniformly beneficial.
        Label: DISPUTED

        Example2：
        Claim: El Niño drove record highs in global temperatures suggesting rise may not be down to man-made emissions.
        Evidences:
        - El Niño causes short-term warming spikes.
        - Long-term temperature trends correlate with rising greenhouse gas levels, not El Niño cycles.
        Reasoning:
        1. El Niño is a transient phenomenon, not a sustained climate forcing.
        2. Evidence attributes persistent warming to human CO₂ emissions.
        3. Therefore, attributing record highs solely to El Niño is incorrect.
        Label: REFUTES
        """
        test_evi = self.format_evidences(test_claim["evidences"], evidence_dict)
        test_input = f"""
        Now classify:
        Claim: {test_claim['claim_text']}
        Evidences:
        {test_evi}
        Reasoning:
        Label:"""

        return header + examples + test_input
    def build_prompt_1example_cot(self,test_claim, evidence_dict):
        header = """
        Please analyze the relationship between evidence and claim based on the following four examples, each of which shows a chain of thought. Think step by step, writing out your full reasoning and then classify the relationship as one of:
        - SUPPORTS
        - REFUTES
        - NOT ENOUGH INFO
        - DISPUTED
        """
        examples = """
        Example1：
        Claim: Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.
        Evidences:
        - At very high concentrations (100× atmospheric), CO2 is toxic to animals.
        - Plants grow up to 50% faster at 1,000 ppm CO₂ under unlimited nutrients.
        - Higher CO₂ also alters water demand and can create ecological imbalances.
        Reasoning:
        1. CO₂ has both positive (plant growth) and negative (toxicity, water‐stress) effects.
        2. The net ecological impact is therefore debated and not uniformly beneficial.
        Label: DISPUTED

        """
        test_evi = self.format_evidences(test_claim["evidences"], evidence_dict)
        test_input = f"""
        Now classify:
        Claim: {test_claim['claim_text']}
        Evidences:
        {test_evi}
        Reasoning:
        Label:"""

        return header + examples + test_input
    def build_prompt(self,test_claim, evidence_dict):
        header = """Please analyze the relationship between evidence and claim based on the following four examples. classify the relationship as one of:
        - SUPPORTS
        - REFUTES
        - NOT ENOUGH INFO
        - DISPUTED
        """
        examples = """
        Example1：
        Claim: Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.
        Evidences:
        At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.
        Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limitation on other nutrients.
        Higher carbon dioxide concentrations will favourably affect plant growth and demand for water.
        Label: DISPUTED
        Example2：
        Claim: El Ni\u00f1o drove record highs in global temperatures suggesting rise may not be down to man-made emissions.
        Evidences:
        While ‘climate change’ can be due to natural forces or human activity, there is now substantial evidence to indicate that human activity – and specifically increased greenhouse gas (GHGs) emissions – is a key factor in the pace and extent of global temperature increases.
        This acceleration is due mostly to human-caused global warming, which is driving thermal expansion of seawater and the melting of land-based ice sheets and glaciers.
        Label: REFUTES
        Example3：
        Claim: In 1946, PDO switched to a cool phase.
        Evidences:
        There is evidence of reversals in the prevailing polarity (meaning changes in cool surface waters versus warm surface waters within the region) of the oscillation occurring around 1925, 1947, and 1977; the last two reversals corresponded with dramatic shifts in salmon production regimes in the North Pacific Ocean.
        1945/1946: The PDO changed to a "cool" phase, the pattern of this regime shift is similar to the 1970s episode with maximum amplitude in the subarctic and subtropical front but with a greater signature near the Japan while the 1970s shift was stronger near the American west coast.
        Label: SUPPORTS
        Example4：
        Claim: Climate scientists have predicted global temperatures would increase more than one degree Celsius by 2020,\" but observed temperatures have been only half as high.
        Evidences:
        In February 2020, the region recorded the highest temperature of 18.3 degree Celsius which was a degree higher than the previous record of 17.5 degrees in March 2015.
        The Earth's average surface temperature has increased by 1.5 °F (0.83 °C) since 1880.
        About a billion years from now, all surface water will have disappeared and the mean global temperature will reach 70 °C (158 °F).
        In the last 100 years, Earth's average surface temperature increased by about 0.8 °C (1.4 °F) with about two thirds of the increase occurring over just the last three decades.
        The 10th Emissions Gap Report issued by the United Nations Environment Programme (UNEP) predicts that if emissions continue to increase at the same rate as they have in 2010–2020, global temperatures would rise by as much as 4° by 2100.
        Label: NOT_ENOUGH_INFO
        """
        test_evi = self.format_evidences(test_claim["evidences"], evidence_dict)
        test_input = f"""
        Now classify:
        Claim: {test_claim["claim_text"]}
        Evidences:
        {test_evi}
        Label:"""
        return header + examples + test_input

    def build_prompt_1example(self,test_claim, evidence_dict):
        header = """Please analyze the relationship between evidence and claim based on the following one examples. classify the relationship as one of:
        - SUPPORTS
        - REFUTES
        - NOT ENOUGH INFO
        - DISPUTED
        """
        examples = """
        Example1：
        Claim: Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.
        Evidences:
        At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.
        Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limitation on other nutrients.
        Higher carbon dioxide concentrations will favourably affect plant growth and demand for water.
        Label: DISPUTED

        """
        test_evi = self.format_evidences(test_claim["evidences"], evidence_dict)
        test_input = f"""
        Now classify:
        Claim: {test_claim["claim_text"]}
        Evidences:
        {test_evi}
        Label:"""
        return header + examples + test_input

    def build_prompt_2example(self,test_claim, evidence_dict):
        header = """Please analyze the relationship between evidence and claim based on the following two examples. classify the relationship as one of:
        - SUPPORTS
        - REFUTES
        - NOT ENOUGH INFO
        - DISPUTED
        """
        examples = """
        Example1：
        Claim: Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.
        Evidences:
        At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.
        Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limitation on other nutrients.
        Higher carbon dioxide concentrations will favourably affect plant growth and demand for water.
        Label: DISPUTED
        Example2：
        Claim: El Ni\u00f1o drove record highs in global temperatures suggesting rise may not be down to man-made emissions.
        Evidences:
        While ‘climate change’ can be due to natural forces or human activity, there is now substantial evidence to indicate that human activity – and specifically increased greenhouse gas (GHGs) emissions – is a key factor in the pace and extent of global temperature increases.
        This acceleration is due mostly to human-caused global warming, which is driving thermal expansion of seawater and the melting of land-based ice sheets and glaciers.
        Label: REFUTES

        """
        test_evi = self.format_evidences(test_claim["evidences"], evidence_dict)
        test_input = f"""
        Now classify:
        Claim: {test_claim["claim_text"]}
        Evidences:
        {test_evi}
        Label:"""
        return header + examples + test_input


    def build_prompt_3example(self,test_claim, evidence_dict):
        header = """Please analyze the relationship between evidence and claim based on the following three examples. classify the relationship as one of:
        - SUPPORTS
        - REFUTES
        - NOT ENOUGH INFO
        - DISPUTED
        """
        examples = """
        Example1：
        Claim: Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.
        Evidences:
        At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.
        Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limitation on other nutrients.
        Higher carbon dioxide concentrations will favourably affect plant growth and demand for water.
        Label: DISPUTED
        Example2：
        Claim: El Ni\u00f1o drove record highs in global temperatures suggesting rise may not be down to man-made emissions.
        Evidences:
        While ‘climate change’ can be due to natural forces or human activity, there is now substantial evidence to indicate that human activity – and specifically increased greenhouse gas (GHGs) emissions – is a key factor in the pace and extent of global temperature increases.
        This acceleration is due mostly to human-caused global warming, which is driving thermal expansion of seawater and the melting of land-based ice sheets and glaciers.
        Label: REFUTES
        Example3：
        Claim: In 1946, PDO switched to a cool phase.
        Evidences:
        There is evidence of reversals in the prevailing polarity (meaning changes in cool surface waters versus warm surface waters within the region) of the oscillation occurring around 1925, 1947, and 1977; the last two reversals corresponded with dramatic shifts in salmon production regimes in the North Pacific Ocean.
        1945/1946: The PDO changed to a "cool" phase, the pattern of this regime shift is similar to the 1970s episode with maximum amplitude in the subarctic and subtropical front but with a greater signature near the Japan while the 1970s shift was stronger near the American west coast.
        Label: SUPPORTS

        """
        test_evi = self.format_evidences(test_claim["evidences"], evidence_dict)
        test_input = f"""
        Now classify:
        Claim: {test_claim["claim_text"]}
        Evidences:
        {test_evi}
        Label:"""
        return header + examples + test_input


    def build_prompt_5example(self,test_claim, evidence_dict):
        header = """Please analyze the relationship between evidence and claim based on the following five examples. classify the relationship as one of:
        - SUPPORTS
        - REFUTES
        - NOT ENOUGH INFO
        - DISPUTED
        """
        examples = """
        Example1：
        Claim: Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.
        Evidences:
        At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.
        Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limitation on other nutrients.
        Higher carbon dioxide concentrations will favourably affect plant growth and demand for water.
        Label: DISPUTED
        Example2：
        Claim: El Ni\u00f1o drove record highs in global temperatures suggesting rise may not be down to man-made emissions.
        Evidences:
        While ‘climate change’ can be due to natural forces or human activity, there is now substantial evidence to indicate that human activity – and specifically increased greenhouse gas (GHGs) emissions – is a key factor in the pace and extent of global temperature increases.
        This acceleration is due mostly to human-caused global warming, which is driving thermal expansion of seawater and the melting of land-based ice sheets and glaciers.
        Label: REFUTES
        Example3：
        Claim: In 1946, PDO switched to a cool phase.
        Evidences:
        There is evidence of reversals in the prevailing polarity (meaning changes in cool surface waters versus warm surface waters within the region) of the oscillation occurring around 1925, 1947, and 1977; the last two reversals corresponded with dramatic shifts in salmon production regimes in the North Pacific Ocean.
        1945/1946: The PDO changed to a "cool" phase, the pattern of this regime shift is similar to the 1970s episode with maximum amplitude in the subarctic and subtropical front but with a greater signature near the Japan while the 1970s shift was stronger near the American west coast.
        Label: SUPPORTS
        Example4：
        Claim: Climate scientists have predicted global temperatures would increase more than one degree Celsius by 2020,\" but observed temperatures have been only half as high.
        Evidences:
        In February 2020, the region recorded the highest temperature of 18.3 degree Celsius which was a degree higher than the previous record of 17.5 degrees in March 2015.
        The Earth's average surface temperature has increased by 1.5 °F (0.83 °C) since 1880.
        About a billion years from now, all surface water will have disappeared and the mean global temperature will reach 70 °C (158 °F).
        In the last 100 years, Earth's average surface temperature increased by about 0.8 °C (1.4 °F) with about two thirds of the increase occurring over just the last three decades.
        The 10th Emissions Gap Report issued by the United Nations Environment Programme (UNEP) predicts that if emissions continue to increase at the same rate as they have in 2010–2020, global temperatures would rise by as much as 4° by 2100.
        Label: NOT_ENOUGH_INFO
        Example5：
        Claim: When stomata-derived CO2 (red) is compared to ice core-derived CO2 (blue), the stomata generally show much more variability in the atmospheric CO2 level and often show levels much higher than the ice cores.
        Evidences: One study using evidence from stomata of fossilized leaves suggests greater variability, with carbon dioxide mole fractions above 300 ppm during the period seven to ten thousand years ago, though others have argued that these findings more likely reflect calibration or contamination problems rather than actual CO 2 variability.
        Label: SUPPORTS
        """
        test_evi = self.format_evidences(test_claim["evidences"], evidence_dict)
        test_input = f"""
        Now classify:
        Claim: {test_claim["claim_text"]}
        Evidences:
        {test_evi}
        Label:"""
        return header + examples + test_input

    def get_predictions_with_ids_cot(self, claim_items, evidence_dict, model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0", max_new_tokens=20, example_n=4):
        """Label claims with a fixed-example chain-of-thought prompt and a local causal LM.

        :param claim_items:    Iterable of (claim_id, claim) pairs. Each claim is a dict
                               with 'claim_text' and 'evidences'; 'claim_label' is optional.
        :param evidence_dict:  Passed to the selected prompt builder with each claim.
        :param model_name:     Hugging Face identifier of the causal LM to load.
        :param max_new_tokens: Maximum number of tokens generated per claim.
        :param example_n:      Number of in-prompt examples (1-5); selects the builder.
        :return: Tuple ``(final_results, predictions, actuals)``. ``final_results`` maps
                 claim_id -> {"claim_label", "evidences"}; ``predictions`` maps claim_id
                 -> predicted label; ``actuals`` maps claim_id -> upper-cased gold label,
                 or None when the claim carries no 'claim_label'.
        :raises ValueError: If ``example_n`` has no matching prompt builder.
        """
        # Resolve the builder once, up front: an unsupported example_n used to
        # surface as an unbound `prompt` variable inside the loop.
        builder_by_count = {
            1: "build_prompt_1example_cot",
            2: "build_prompt_2example_cot",
            3: "build_prompt_3example_cot",
            4: "build_prompt_cot",
            5: "build_prompt_5example_cot",
        }
        if example_n not in builder_by_count:
            raise ValueError(
                f"example_n must be one of {sorted(builder_by_count)}, got {example_n!r}"
            )
        make_prompt = getattr(self, builder_by_count[example_n])

        # Reuse the cached model only when it is the one requested; a cache keyed
        # on attribute existence alone silently ignores a changed model_name.
        # A tokenizer that does not expose name_or_path is trusted as-is.
        cached_tokenizer = getattr(self, "llama_tokenizer", None)
        if cached_tokenizer is None or getattr(cached_tokenizer, "name_or_path", model_name) != model_name:
            # Drop the stale pair first so two models are not held at once.
            self.__dict__.pop("llama_tokenizer", None)
            self.__dict__.pop("llama_model", None)
            tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16,
                device_map="auto"
            )
            model.eval()
            self.llama_tokenizer = tokenizer
            self.llama_model = model

        # Accepts both spellings of the multi-word label, in any letter case.
        label_pattern = re.compile(
            r"\b(NOT[_ ]ENOUGH[_ ]INFO|SUPPORTS|REFUTES|DISPUTED)\b", re.IGNORECASE
        )

        predictions = {}
        actuals = {}
        final_results = {}

        for claim_id, test_claim in claim_items:
            prompt = make_prompt(test_claim, evidence_dict)
            inputs = self.llama_tokenizer(prompt, return_tensors="pt").to(self.llama_model.device)
            with torch.no_grad():
                outputs = self.llama_model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False
                )

            # Decode only the newly generated tokens so label text inside the
            # prompt's own examples can never be mistaken for the answer.
            prompt_len = inputs["input_ids"].shape[1]
            generated = self.llama_tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

            # The answer is expected on the first non-empty generated line.
            answer_line = next((ln for ln in generated.splitlines() if ln.strip()), "")
            match = label_pattern.search(answer_line)
            if match:
                prediction = match.group(1).upper().replace(" ", "_")
            else:
                # Unknown output: keep the first token as before; an empty
                # generation falls back to NOT_ENOUGH_INFO instead of raising.
                tokens = answer_line.split()
                prediction = tokens[0].upper() if tokens else "NOT_ENOUGH_INFO"

            # Gold labels are optional (e.g. unlabeled prediction files).
            gold = test_claim.get("claim_label")
            actual = gold.upper() if isinstance(gold, str) else None

            predictions[claim_id] = prediction
            actuals[claim_id] = actual

            print(f"Claim ID: {claim_id}")
            print(f"Claim: {test_claim['claim_text']}")
            print(f"Predicted: {prediction}, Actual: {actual}\n")

            final_results[claim_id] = {
                "claim_label": prediction,
                "evidences": test_claim['evidences']
            }

        return final_results, predictions, actuals

    def classify_with_cot(self, example_n = 4, model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0", input_path = "/content/drive/MyDrive/predictions_retriever20_reranker3_3.json"):
        """Label every claim in a JSON file using the chain-of-thought prompts.

        :param example_n:  Number of in-prompt examples, forwarded to
                           ``get_predictions_with_ids_cot``.
        :param model_name: Hugging Face identifier of the causal LM to use.
        :param input_path: JSON file mapping claim_id -> claim entry.
        :return: The ``final_results`` mapping produced by
                 ``get_predictions_with_ids_cot``.
        """
        # Explicit UTF-8: claim text contains non-ASCII characters and the
        # platform default encoding is not guaranteed to be UTF-8.
        with open(input_path, "r", encoding="utf-8") as f:
            classify_claims = json.load(f)

        # Only the first element of the returned triple is needed here.
        final_results, _predictions, _actuals = self.get_predictions_with_ids_cot(
            claim_items=list(classify_claims.items()),
            evidence_dict=self.evidence_data,
            model_name=model_name,
            max_new_tokens=20,
            example_n=example_n,
        )
        return final_results

    def classify_with_fix_examples(self, example_n = 4, model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0", input_path = "/content/drive/MyDrive/predictions_retriever20_reranker3_3.json"):
        """Label every claim in a JSON file using the fixed-example (label-only) prompts.

        :param example_n:  Number of in-prompt examples, forwarded to
                           ``get_predictions_with_ids``.
        :param model_name: Hugging Face identifier of the causal LM to use.
        :param input_path: JSON file mapping claim_id -> claim entry.
        :return: The ``final_results`` mapping produced by
                 ``get_predictions_with_ids``.
        """
        # Explicit UTF-8: claim text contains non-ASCII characters and the
        # platform default encoding is not guaranteed to be UTF-8.
        with open(input_path, "r", encoding="utf-8") as f:
            classify_claims = json.load(f)

        # Only the first element of the returned triple is needed here.
        final_results, _predictions, _actuals = self.get_predictions_with_ids(
            claim_items=list(classify_claims.items()),
            evidence_dict=self.evidence_data,
            model_name=model_name,
            max_new_tokens=20,
            example_n=example_n,
        )
        return final_results

    def get_predictions_with_ids(self, claim_items, evidence_dict, model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0", max_new_tokens=20, example_n=4):
        """Label claims with a fixed-example (label-only) prompt and a local causal LM.

        :param claim_items:    Iterable of (claim_id, claim) pairs. Each claim is a dict
                               with 'claim_text' and 'evidences'; 'claim_label' is optional.
        :param evidence_dict:  Passed to the selected prompt builder with each claim.
        :param model_name:     Hugging Face identifier of the causal LM to load.
        :param max_new_tokens: Maximum number of tokens generated per claim.
        :param example_n:      Number of in-prompt examples (1-5); selects the builder.
        :return: Tuple ``(final_results, predictions, actuals)``. ``final_results`` maps
                 claim_id -> {"claim_label", "evidences"}; ``predictions`` maps claim_id
                 -> predicted label; ``actuals`` maps claim_id -> upper-cased gold label,
                 or None when the claim carries no 'claim_label'.
        :raises ValueError: If ``example_n`` has no matching prompt builder.
        """
        # Resolve the builder once, up front: an unsupported example_n used to
        # surface as an unbound `prompt` variable inside the loop.
        builder_by_count = {
            1: "build_prompt_1example",
            2: "build_prompt_2example",
            3: "build_prompt_3example",
            4: "build_prompt",
            5: "build_prompt_5example",
        }
        if example_n not in builder_by_count:
            raise ValueError(
                f"example_n must be one of {sorted(builder_by_count)}, got {example_n!r}"
            )
        make_prompt = getattr(self, builder_by_count[example_n])

        # Reuse the cached model only when it is the one requested; a cache keyed
        # on attribute existence alone silently ignores a changed model_name.
        # A tokenizer that does not expose name_or_path is trusted as-is.
        cached_tokenizer = getattr(self, "llama_tokenizer", None)
        if cached_tokenizer is None or getattr(cached_tokenizer, "name_or_path", model_name) != model_name:
            # Drop the stale pair first so two models are not held at once.
            self.__dict__.pop("llama_tokenizer", None)
            self.__dict__.pop("llama_model", None)
            tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16,
                device_map="auto"
            )
            model.eval()
            self.llama_tokenizer = tokenizer
            self.llama_model = model

        # Accepts both spellings of the multi-word label, in any letter case.
        label_pattern = re.compile(
            r"\b(NOT[_ ]ENOUGH[_ ]INFO|SUPPORTS|REFUTES|DISPUTED)\b", re.IGNORECASE
        )

        predictions = {}
        actuals = {}
        final_results = {}

        for claim_id, test_claim in claim_items:
            prompt = make_prompt(test_claim, evidence_dict)
            inputs = self.llama_tokenizer(prompt, return_tensors="pt").to(self.llama_model.device)
            with torch.no_grad():
                outputs = self.llama_model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False
                )

            # Decode only the newly generated tokens so label text inside the
            # prompt's own examples can never be mistaken for the answer.
            prompt_len = inputs["input_ids"].shape[1]
            generated = self.llama_tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

            # The answer is expected on the first non-empty generated line.
            answer_line = next((ln for ln in generated.splitlines() if ln.strip()), "")
            match = label_pattern.search(answer_line)
            if match:
                prediction = match.group(1).upper().replace(" ", "_")
            else:
                # Unknown output: keep the first token as before; an empty
                # generation falls back to NOT_ENOUGH_INFO instead of raising.
                tokens = answer_line.split()
                prediction = tokens[0].upper() if tokens else "NOT_ENOUGH_INFO"

            # Gold labels are optional (e.g. unlabeled prediction files).
            gold = test_claim.get("claim_label")
            actual = gold.upper() if isinstance(gold, str) else None

            predictions[claim_id] = prediction
            actuals[claim_id] = actual

            print(f"Claim ID: {claim_id}")
            print(f"Claim: {test_claim['claim_text']}")
            print(f"Predicted: {prediction}, Actual: {actual}\n")

            final_results[claim_id] = {
                "claim_label": prediction,
                "evidences": test_claim['evidences']
            }

        return final_results, predictions, actuals




    def classify_with_local_llm(self,
                              predictions_path: str,
                              output_path: str,
                              num_shot: int = 5,
                              model_name: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                              max_new_tokens: int = 20):
        """
        Perform few-shot classification of previously retrieved predictions
        using a local TinyLLaMA model, with progress reporting.

        For each claim, the ``num_shot`` training claims whose embeddings are most
        similar to it are placed in the prompt as solved examples. A non-positive
        ``num_shot`` produces a zero-shot prompt and skips the training-set lookup.

        :param predictions_path: Path to a JSON file mapping claim_id -> {"evidences": [...]}
        :param output_path:       Destination JSON path for final labels + evidences
        :param num_shot:          Number of few-shot examples to include
        :param model_name:        HuggingFace identifier for the TinyLLaMA model
        :param max_new_tokens:    Maximum tokens to generate for each answer
        :return: Dict mapping claim_id -> {"claim_label": label, "evidences": [...]};
                 the same dict is written to ``output_path``. Output that contains
                 no recognisable label is recorded as NOT_ENOUGH_INFO.
        """
        # Accepts both spellings of the multi-word label, in any letter case.
        label_pattern = re.compile(
            r"\b(NOT[_ ]ENOUGH[_ ]INFO|SUPPORTS|REFUTES|DISPUTED)\b", re.IGNORECASE
        )
        # `seq[-0:]` is the whole sequence, so num_shot == 0 must be special-cased
        # or it would select every training claim instead of none.
        use_examples = num_shot > 0

        # 1) Load the labeled training claims for few-shot examples
        if use_examples and not hasattr(self, 'train_claims'):
            with open(TRAIN_CLAIMS_PATH, 'r', encoding='utf-8') as f:
                train_claims = json.load(f)
            train_ids = list(train_claims.keys())
            train_texts = [train_claims[cid]['claim_text'] for cid in train_ids]
            train_embs = self.model.encode(train_texts, convert_to_tensor=False)
            # Publish the cache only once every piece exists, so a failed
            # encode cannot leave a half-populated cache behind.
            self.train_ids = train_ids
            self.train_texts = train_texts
            self.train_embs = train_embs
            self.train_claims = train_claims

        # 2) Load the local TinyLLaMA tokenizer & model.
        # Reuse the cached pair only when it is the model requested; a tokenizer
        # that does not expose name_or_path is trusted as-is.
        cached_tokenizer = getattr(self, "llama_tokenizer", None)
        if cached_tokenizer is None or getattr(cached_tokenizer, "name_or_path", model_name) != model_name:
            # Drop the stale pair first so two models are not held at once.
            self.__dict__.pop("llama_tokenizer", None)
            self.__dict__.pop("llama_model", None)
            tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16,
                device_map="auto"
            )
            model.eval()
            self.llama_tokenizer = tokenizer
            self.llama_model = model

        # 3) Read the previously saved predictions (claim_id -> evidences)
        with open(predictions_path, 'r', encoding='utf-8') as f:
            preds = json.load(f)

        total = len(preds)
        print(f"🔄 Starting classification of {total} claims with TinyLLaMA…")

        final_results = {}

        # 4) Loop over each claim with progress updates
        for idx, (cid, entry) in enumerate(preds.items(), start=1):
            if idx == 1 or idx % 20 == 0 or idx == total:
                print(f"  → Progress: {idx}/{total} (claim_id={cid})")

            # 4.1 Retrieve the claim text (entries may be dicts or bare strings)
            raw = self.claim_data[cid]
            claim_text = raw['claim_text'] if isinstance(raw, dict) else raw

            # 4.2 Retrieve the top-k evidence texts
            ev_texts = [self.evidence_data[eid] for eid in entry['evidences']]

            # 4.3 Select few-shot examples by embedding similarity
            examples = []
            if use_examples:
                target_emb = self.model.encode([claim_text], convert_to_tensor=False)[0]
                sims = cosine_similarity([target_emb], self.train_embs)[0]
                # Highest-similarity training claims first.
                chosen_idxs = np.argsort(sims)[-num_shot:][::-1]
                for i in chosen_idxs:
                    train_entry = self.train_claims[self.train_ids[i]]
                    examples.append({
                        'claim': train_entry['claim_text'],
                        'evidences': [self.evidence_data[eid] for eid in train_entry['evidences']],
                        'label': train_entry.get('claim_label', 'NOT_ENOUGH_INFO'),
                    })

            # 4.4 Build the few-shot prompt
            prompt = (
                "You are an expert fact-checker.\n"
                "For each claim below, assess how relevant and valid each evidence snippet is, "
                "and then respond with exactly one of the following labels:\n"
                "  SUPPORTS, REFUTES, NOT_ENOUGH_INFO, or DISPUTED.\n\n"
            )
            for ex in examples:
                prompt += f"Q: {ex['claim']}\n"
                prompt += "Evidence:\n" + "\n".join(f"- {s}" for s in ex['evidences']) + "\n"
                prompt += f"A: {ex['label']}\n\n"

            prompt += f"Q: {claim_text}\n"
            prompt += "Evidence:\n" + "\n".join(f"- {s}" for s in ev_texts) + "\nA:"

            # 4.5 Generate the answer (greedy decoding)
            inputs = self.llama_tokenizer(prompt, return_tensors="pt").to(self.llama_model.device)
            with torch.no_grad():
                outputs = self.llama_model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False
                )

            # 4.6 Extract the label from the newly generated tokens only, so
            # "A:" markers or labels inside the prompt cannot be picked up.
            prompt_len = inputs["input_ids"].shape[1]
            generated = self.llama_tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
            # The answer is expected on the first non-empty generated line; an
            # empty or unrecognised answer falls back to NOT_ENOUGH_INFO.
            answer_line = next((ln for ln in generated.splitlines() if ln.strip()), "")
            match = label_pattern.search(answer_line)
            label = match.group(1).upper().replace(" ", "_") if match else "NOT_ENOUGH_INFO"

            final_results[cid] = {
                "claim_label": label,
                "evidences": entry['evidences']
            }

        # 5) Write the final results to JSON
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(final_results, f, indent=2, ensure_ascii=False)

        print(f"✅ Done! Processed {total} claims. Results saved to {output_path}")
        return final_results



    def classify_with_few_shot_new(self, ranked_result, k=3, num_shot=5, model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0", temperature=0.0,max_new_tokens = 20):
        """
        Few-shot in-context classification using a Chain-of-Thought style prompt.

        For every claim in ``ranked_result`` the ``num_shot`` most similar training
        claims (cosine similarity of ``self.model`` embeddings) are rendered as
        worked examples, followed by the target claim and its evidence. The causal
        LM continues the prompt with greedy decoding and the label is read from the
        first ``Conclusion: <LABEL>`` in the *generated* continuation only, so the
        gold labels of the few-shot examples can never be returned as a prediction.

        Args:
            ranked_result: dict mapping claim id -> dict with an ``'evidences'``
                list of evidence ids (keys of ``self.evidence_data``).
            k: unused; kept for backward compatibility.
            num_shot: number of similar training claims placed in the prompt.
                Values <= 0 produce a zero-shot prompt.
            model_name: Hugging Face model id. The model is loaded on the first
                call and cached on ``self``; later calls reuse the cached model
                regardless of this argument.
            temperature: unused; decoding is greedy (``do_sample=False``).
            max_new_tokens: generation budget for the continuation.
                NOTE(review): the worked examples spell out two reasoning lines
                before the conclusion, so a small budget may end before the model
                writes its conclusion — consider passing a larger value.

        Returns:
            dict mapping claim id -> {'claim_label': str, 'evidences': list}.
            The label falls back to 'SUPPORTS' when no conclusion is generated.

        Notes:
            Reads the module-level ``TRAIN_CLAIMS_PATH`` on first use, and relies
            on ``self.model``, ``self.claim_data`` and ``self.evidence_data``.
        """
        # 1. Load and embed training claims if not already loaded
        if not hasattr(self, 'train_claims'):
            with open(TRAIN_CLAIMS_PATH, 'r', encoding='utf-8') as f:
                self.train_claims = json.load(f)
            self.train_ids = list(self.train_claims.keys())
            self.train_texts = [self.train_claims[cid]['claim_text'] for cid in self.train_ids]
            self.train_embs = self.model.encode(self.train_texts)

        # 2. Load the causal LM once and cache it on the instance
        if not hasattr(self, 'llama_tokenizer'):
            self.llama_tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
            self.llama_model     = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16,
                device_map="auto"
            )
            self.llama_model.eval()

        conclusion_pattern = re.compile(r"Conclusion:\s*(SUPPORTS|REFUTES|NOT_ENOUGH_INFO|DISPUTED)")

        results = {}
        for cid, candidates in ranked_result.items():
            # Claim text and the evidence already ranked for it
            raw = self.claim_data[cid]
            claim_text = raw['claim_text'] if isinstance(raw, dict) else raw
            ev_ids = list(candidates['evidences'])
            ev_texts = [self.evidence_data[eid] for eid in ev_ids]

            # 3. Auto-select few-shot examples by similarity, most similar first
            target_emb = self.model.encode([claim_text])[0]
            sims = cosine_similarity([target_emb], self.train_embs)[0]
            examples = []
            for i in np.argsort(sims)[::-1]:
                # Checking the quota first also makes num_shot <= 0 yield no examples
                # (a plain [-num_shot:] slice would select every training claim for 0).
                if len(examples) >= num_shot:
                    break
                tcid = self.train_ids[i]
                # Never show the target claim its own gold label when the claims being
                # classified come from the training set.
                if tcid == cid:
                    continue
                example = self.train_claims[tcid]
                examples.append({
                    'claim': example['claim_text'],
                    'evidences': example['evidences'],
                    'label': example['claim_label'],
                })

            # 4. Build CoT-style few-shot prompt
            prompt = (
                "You are an expert fact-checker. Given a claim and its supporting evidence passages, think carefully step by step and determine whether the claim is one of: SUPPORTS, REFUTES, NOT_ENOUGH_INFO, or DISPUTED.\n\n"
            )
            for ex in examples:
                txts = [self.evidence_data[eid] for eid in ex['evidences']]
                prompt += f"Claim: {ex['claim']}\n"
                prompt += f"Evidence: {' '.join(txts)}\n"
                prompt += "Let's think step by step:\n"
                prompt += "- Review the evidence carefully.\n"
                prompt += "- Identify if the evidence directly supports, refutes, provides insufficient information, or shows conflicting information.\n"
                prompt += f"- Conclusion: {ex['label']}\n\n"
            prompt += f"Claim: {claim_text}\n"
            prompt += f"Evidence: {' '.join(ev_texts)}\n"
            prompt += "Let's think step by step:\n"

            # 5. Generate the continuation with greedy decoding
            inputs = self.llama_tokenizer(prompt, return_tensors="pt").to(self.llama_model.device)
            with torch.no_grad():
                outputs = self.llama_model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False
                )

            # 6. Extract the label from the generated tokens only. The returned sequence
            #    starts with the prompt tokens, and the prompt already contains one
            #    "Conclusion: <gold label>" per few-shot example.
            prompt_len = inputs["input_ids"].shape[1]
            generated = self.llama_tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
            # The first conclusion belongs to the target claim; anything after it is the
            # model inventing further examples.
            match = conclusion_pattern.search(generated)
            label = match.group(1) if match else 'SUPPORTS'

            # Format result
            results[cid] = {
                'claim_label': label,
                'evidences': ev_ids
            }

        return results




    def weighted_majority_vote(self, pred_dicts, weights):
        """
        Merge several prediction dicts by weighted voting on the claim label.

        Args:
            pred_dicts: list of dicts, each mapping claim id ->
                {'claim_label': ..., 'evidences': ...}.
            weights: list of floats, one per entry of ``pred_dicts``.

        Returns:
            dict mapping claim id -> {'claim_label', 'evidences'}. The label is the
            one with the largest summed weight (on a tie, the label that received a
            vote first wins); the evidences are those of the first prediction dict
            that voted for the winning label.
        """
        assert len(pred_dicts) == len(weights), "Each prediction dict must have a corresponding weight."

        tally = {}            # claim id -> label -> accumulated weight
        first_evidences = {}  # claim id -> label -> evidences of the first voter

        for predictions, vote_weight in zip(pred_dicts, weights):
            for cid, entry in predictions.items():
                voted_label = entry['claim_label']
                voted_evidences = entry['evidences']
                per_label = tally.setdefault(cid, {})
                per_label[voted_label] = per_label.get(voted_label, 0.0) + vote_weight
                # setdefault keeps whatever the earliest voter for this label supplied
                first_evidences.setdefault(cid, {}).setdefault(voted_label, voted_evidences)

        merged = {}
        for cid, per_label in tally.items():
            # max() returns the first maximal key in insertion order
            winner = max(per_label, key=per_label.get)
            merged[cid] = {
                'claim_label': winner,
                'evidences': first_evidences[cid][winner]
            }
        return merged


    def combine_method(self, predictions_path = "/content/drive/MyDrive/predictions_retriever20_reranker3_2.json", output_path = '/content/drive/MyDrive/outout1.json', model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0", weights = [0.4,0.3,0.3]):
        with open(predictions_path, "r") as f:
          classify_claims = json.load(f)
        print('first model')
        final_result1 = self.classify_with_local_llm(num_shot=3,model_name = model_name,predictions_path = predictions_path,output_path=output_path)
        print('second model')
        final_result2 = self.classify_with_cot(example_n = 2, model_name = model_name, input_path = predictions_path)
        print('third model')
        final_result3 = self.classify_with_fix_examples(example_n = 4, model_name = model_name, input_path = predictions_path)
        print('finish, start combine')
        pred_dicts = [final_result1, final_result2, final_result3]
        final_result = self.weighted_majority_vote(pred_dicts, weights)
        return final_result


# **Load Data**

In [None]:
retriever = ImprovedDualEncoderRetrieval(
    batch_size=16,
    reranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
    retriever_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
)

# Project root on the mounted Drive and its data folder (kept with a trailing '/')
PROJECT_DIR = '/content/drive/MyDrive/COMP90042_2025'
DATA_DIR = PROJECT_DIR + '/data/'

# Create the data folder when it is missing
os.makedirs(DATA_DIR, exist_ok=True)

# Input files
EVIDENCE_PATH = DATA_DIR + 'evidence.json'
TRAIN_CLAIMS_PATH = DATA_DIR + 'train-claims.json'
DEV_CLAIMS_PATH = DATA_DIR + 'dev-claims.json'
TEST_CLAIMS_PATH = DATA_DIR + 'test-claims-unlabelled.json'

# Output files, stored directly under the project root
EVIDENCE_EMBEDDINGS_PATH = PROJECT_DIR + '/evidence_embeddings_streamed.jsonl'
CLAIM_EMBEDDINGS_PATH = PROJECT_DIR + '/claim_embeddings.json'
RESULTS_PATH = PROJECT_DIR + '/retrieval_results.json'
METRICS_PATH = PROJECT_DIR + '/retrieval_metrics.json'


✅ Loading retriever model [sentence-transformers/multi-qa-mpnet-base-dot-v1] in FP32


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Loading reranker model [cross-encoder/ms-marco-MiniLM-L-12-v2] in FP32


config.json:   0%|          | 0.00/791 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

In [None]:
# Load the evidence corpus and the training claims into the retriever.
# This cell needs `retriever` and the path constants from the "Load Data" cell above;
# running it on a fresh kernel before that cell raises NameError.
# NOTE(review): load_data is defined outside this cell; presumably it populates
# retriever.evidence_data / retriever.claim_data, which the classifiers read — confirm.
retriever.load_data(EVIDENCE_PATH, TRAIN_CLAIMS_PATH)

NameError: name 'retriever' is not defined

In [None]:
# Gold claims file: supplies claim_text and the reference claim_label used for evaluation.
# This notebook predicts on the training set, so it points at train-claims.json.
original_file = "/content/drive/MyDrive/COMP90042_2025/data/train-claims.json"
# Output file of stage 1 (retrieved evidence ids per claim).
retrieved_file = "/content/drive/MyDrive/predictions_retriever20_reranker3.json"

# Read explicitly as UTF-8: the claims contain non-ASCII characters and the merged
# file below is written as UTF-8, so the locale default must not be relied on.
with open(original_file, "r", encoding="utf-8") as f:
    claim_data = json.load(f)

with open(retrieved_file, "r", encoding="utf-8") as f:
    New_claim_data = json.load(f)

# The stage 1 output only has evidences and claim_label, but stage 2 also needs
# claim_text, so the retrieved evidences are overlaid onto the gold records.
missing_ids = []
for cid in claim_data:
    if cid in New_claim_data and 'evidences' in New_claim_data[cid]:
        claim_data[cid]['evidences'] = New_claim_data[cid]['evidences']
    else:
        missing_ids.append(cid)

# Claims without retrieved evidence keep the evidences from the gold file; make that
# visible instead of letting it pass silently.
if missing_ids:
    print(f"Warning: {len(missing_ids)} claims have no retrieved evidences in {retrieved_file}; "
          f"their original evidences were kept (e.g. {missing_ids[:5]}).")

# Input file for stage 2.
predictions_path = "/content/drive/MyDrive/predictions_retriever20_reranker3_3.json"

with open(predictions_path, "w", encoding="utf-8") as f:
    json.dump(claim_data, f, ensure_ascii=False, indent=4)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

def evaluate_with_ids(predictions_dict, actuals_dict):
    """
    Print accuracy and a per-class report for the claims present in both dicts.

    Args:
        predictions_dict: claim id -> dict holding the predicted ``"claim_label"``.
        actuals_dict: claim id -> dict holding the reference ``"claim_label"``.

    Returns:
        None. The metrics are printed; claim ids found in only one dict are ignored.
    """
    common_ids = list(set(predictions_dict.keys()) & set(actuals_dict.keys()))

    print("\n=== Evaluation Metrics ===")
    if not common_ids:
        # Nothing to score; the metric functions are not meaningful on empty input.
        print("No claim ids are shared between predictions and actuals; nothing to evaluate.")
        return

    y_pred = [predictions_dict[cid]["claim_label"] for cid in common_ids]
    y_true = [actuals_dict[cid]["claim_label"] for cid in common_ids]

    print(f"Accuracy: {accuracy_score(y_true, y_pred):.2%}")
    print("\nClassification Report:")
    # zero_division=0 reports the same 0.0 for classes that are never predicted but
    # does not emit one UndefinedMetricWarning per metric for each of them.
    print(classification_report(y_true, y_pred, digits=4, zero_division=0))

In [None]:
# Reference labels and evidences for every training claim, keyed by claim id.
# Read explicitly as UTF-8 because the claims contain non-ASCII characters.
with open(TRAIN_CLAIMS_PATH, 'r', encoding='utf-8') as f:
    claim_data = json.load(f)

actuals_dict = {
    claim_id: {
        "claim_label": info["claim_label"],
        "evidences": info["evidences"],
    }
    for claim_id, info in claim_data.items()
}

# **First model**

In [None]:
# First model: run classify_with_local_llm once per shot count from 1 to 5.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
predictions_path = '/content/drive/MyDrive/predictions_retriever20_reranker3_3.json'
# The same output file is passed to every run; the per-run results are kept in
# final_result_list.
output_path = '/content/drive/MyDrive/outout1.json'

final_result_list = []
for i in (1, 2, 3, 4, 5):
    f1 = retriever.classify_with_local_llm(
        num_shot=i,
        model_name=model_name,
        predictions_path=predictions_path,
        output_path=output_path,
    )
    final_result_list += [f1]



tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

🔄 Starting classification of 1228 claims with TinyLLaMA…
  → Progress: 1/1228 (claim_id=claim-1937)
  → Progress: 20/1228 (claim_id=claim-60)
  → Progress: 40/1228 (claim_id=claim-2257)
  → Progress: 60/1228 (claim_id=claim-1227)
  → Progress: 80/1228 (claim_id=claim-740)
  → Progress: 100/1228 (claim_id=claim-1796)
  → Progress: 120/1228 (claim_id=claim-1620)
  → Progress: 140/1228 (claim_id=claim-1667)
  → Progress: 160/1228 (claim_id=claim-663)
  → Progress: 180/1228 (claim_id=claim-1376)
  → Progress: 200/1228 (claim_id=claim-1665)
  → Progress: 220/1228 (claim_id=claim-2506)
  → Progress: 240/1228 (claim_id=claim-622)
  → Progress: 260/1228 (claim_id=claim-344)
  → Progress: 280/1228 (claim_id=claim-2032)
  → Progress: 300/1228 (claim_id=claim-2262)
  → Progress: 320/1228 (claim_id=claim-194)
  → Progress: 340/1228 (claim_id=claim-3034)
  → Progress: 360/1228 (claim_id=claim-108)
  → Progress: 380/1228 (claim_id=claim-2114)
  → Progress: 400/1228 (claim_id=claim-2903)
  → Progress

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


  → Progress: 1160/1228 (claim_id=claim-2111)
  → Progress: 1180/1228 (claim_id=claim-2726)
  → Progress: 1200/1228 (claim_id=claim-2570)
  → Progress: 1220/1228 (claim_id=claim-2508)
  → Progress: 1228/1228 (claim_id=claim-3093)
✅ Done! Processed 1228 claims. Results saved to /content/drive/MyDrive/outout1.json


In [None]:
# Score every run collected in final_result_list against the reference labels.
for predictions_dict in final_result_list:
    print('--------------------------------', 'Result:', sep='\n')
    evaluate_with_ids(predictions_dict=predictions_dict, actuals_dict=actuals_dict)

--------------------------------
Result:

=== Evaluation Metrics ===
Accuracy: 56.03%

Classification Report:
                 precision    recall  f1-score   support

       DISPUTED     0.2581    0.1935    0.2212       124
NOT_ENOUGH_INFO     0.9630    0.6736    0.7927       386
        REFUTES     0.0000    0.0000    0.0000       199
       SUPPORTS     0.5394    0.7784    0.6372       519

       accuracy                         0.5603      1228
      macro avg     0.4401    0.4114    0.4128      1228
   weighted avg     0.5567    0.5603    0.5408      1228

--------------------------------
Result:

=== Evaluation Metrics ===
Accuracy: 52.28%

Classification Report:
                 precision    recall  f1-score   support

       DISPUTED     0.1415    0.2419    0.1786       124
NOT_ENOUGH_INFO     0.9447    0.6192    0.7480       386
        REFUTES     0.5385    0.3166    0.3987       199
       SUPPORTS     0.4799    0.5973    0.5322       519

       accuracy                   

In [None]:
# Persist each run as data_for_model1_<n>.json, with n counting from 1 in run order.
for i, predictions_dict in enumerate(final_result_list, start=1):
    with open(f'/content/drive/MyDrive/data_for_model1_{i}.json', 'w', encoding='utf-8') as f:
        json.dump(predictions_dict, f, ensure_ascii=False, indent=4)

# **Second model**

In [None]:
# Second model: run classify_with_cot once per example count from 1 to 5 and save
# each run to its own cot_<n>.json right away.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
predictions_path = '/content/drive/MyDrive/predictions_retriever20_reranker3_3.json'

final_result_list = []
for i in range(1, 6):
    cot_path = '/content/drive/MyDrive/cot_' + str(i) + '.json'
    f2 = retriever.classify_with_cot(
        example_n=i,
        model_name=model_name,
        input_path=predictions_path,
    )
    final_result_list += [f2]
    with open(cot_path, 'w', encoding='utf-8') as f:
        json.dump(f2, f, indent=2, ensure_ascii=False)
    print(f"✅ Done! Results saved to {cot_path}")


[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
Predicted: DISPUTED, Actual: SUPPORTS

Claim ID: claim-2657
Claim: Currently, humans are emitting around 29 billion tonnes of carbon dioxide into the atmosphere per year.
Predicted: SUPPORTS, Actual: SUPPORTS

Claim ID: claim-625
Claim: I note particularly that sea-level rise is not affected by the warming; it continues at the same rate, 1.8 millimeters a year, according to a 1990 review by Andrew S. Trupin and John Wahr.
Predicted: SUPPORTS, Actual: REFUTES

Claim ID: claim-1619
Claim: Hansen predicted in 1988 the West Side Highway would be underwater in 20 years.
Predicted: SUPPORTS, Actual: NOT_ENOUGH_INFO

Claim ID: claim-2921
Claim: The tax-payer funded National Oceanic and Atmospheric Administration  (NOAA) has become mired in fresh global warming data scandal involving  numbers for the Great Lakes region that substantially ramp up averages."
Predicted: DISPUTED, Actual: NOT_ENOUGH_INFO

Claim ID: claim-1146
Claim: “It’s far too early to t

In [None]:
# Score every chain-of-thought run collected in final_result_list.
for predictions_dict in final_result_list:
    print('--------------------------------', 'Result:', sep='\n')
    evaluate_with_ids(predictions_dict=predictions_dict, actuals_dict=actuals_dict)

--------------------------------
Result:

=== Evaluation Metrics ===
Accuracy: 42.18%

Classification Report:
                 precision    recall  f1-score   support

       DISPUTED     0.5000    0.0081    0.0159       124
            NOT     0.0000    0.0000    0.0000         0
NOT_ENOUGH_INFO     0.0000    0.0000    0.0000       386
        REFUTES     0.0000    0.0000    0.0000       199
       SUPPORTS     0.4224    0.9961    0.5932       519

       accuracy                         0.4218      1228
      macro avg     0.1845    0.2008    0.1218      1228
   weighted avg     0.2290    0.4218    0.2523      1228

--------------------------------
Result:

=== Evaluation Metrics ===
Accuracy: 42.26%

Classification Report:
                 precision    recall  f1-score   support

       DISPUTED     0.0000    0.0000    0.0000       124
NOT_ENOUGH_INFO     0.0000    0.0000    0.0000       386
        REFUTES     0.0000    0.0000    0.0000       199
       SUPPORTS     0.4226    1.000

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [None]:
# Persist each run as data_for_model2_<n>.json, with n counting from 1 in run order.
for i, predictions_dict in enumerate(final_result_list, start=1):
    with open(f'/content/drive/MyDrive/data_for_model2_{i}.json', 'w', encoding='utf-8') as f:
        json.dump(predictions_dict, f, ensure_ascii=False, indent=4)

# **Third model**

In [None]:
# Third model: run classify_with_fix_examples once per example count from 1 to 5.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
predictions_path = '/content/drive/MyDrive/predictions_retriever20_reranker3_3.json'

final_result_list = []
for i in (1, 2, 3, 4, 5):
    f3 = retriever.classify_with_fix_examples(
        example_n=i,
        model_name=model_name,
        input_path=predictions_path,
    )
    final_result_list += [f3]




tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
Claim ID: claim-775
Claim: Instead of negotiating over climate change policies and trying to make them more market-oriented, some political conservatives have taken the approach of blocking them by trying to undermine the science.
Predicted: DISPUTED, Actual: SUPPORTS

Claim ID: claim-2657
Claim: Currently, humans are emitting around 29 billion tonnes of carbon dioxide into the atmosphere per year.
Predicted: SUPPORTS, Actual: SUPPORTS

Claim ID: claim-625
Claim: I note particularly that sea-level rise is not affected by the warming; it continues at the same rate, 1.8 millimeters a year, according to a 1990 review by Andrew S. Trupin and John Wahr.
Predicted: DISPUTED, Actual: REFUTES

Claim ID: claim-1619
Claim: Hansen predicted in 1988 the West Side Highway would be underwater in 20 years.
Predicted: DISPUTED, Actual: NOT_ENOUGH_INFO

Claim ID: claim-2921
Claim: The tax-payer funded National Oceanic and Atmospheric Administration  (NOAA) has b

In [None]:
# Score every fixed-example run collected in final_result_list.
for predictions_dict in final_result_list:
    print('--------------------------------', 'Result:', sep='\n')
    evaluate_with_ids(predictions_dict=predictions_dict, actuals_dict=actuals_dict)

--------------------------------
Result:

=== Evaluation Metrics ===
Accuracy: 42.26%

Classification Report:
                 precision    recall  f1-score   support

       DISPUTED     0.0000    0.0000    0.0000       124
NOT_ENOUGH_INFO     0.0000    0.0000    0.0000       386
        REFUTES     0.0000    0.0000    0.0000       199
       SUPPORTS     0.4226    1.0000    0.5942       519

       accuracy                         0.4226      1228
      macro avg     0.1057    0.2500    0.1485      1228
   weighted avg     0.1786    0.4226    0.2511      1228

--------------------------------
Result:

=== Evaluation Metrics ===
Accuracy: 42.26%

Classification Report:
                 precision    recall  f1-score   support

       DISPUTED     0.0000    0.0000    0.0000       124
NOT_ENOUGH_INFO     0.0000    0.0000    0.0000       386
        REFUTES     0.0000    0.0000    0.0000       199
       SUPPORTS     0.4226    1.0000    0.5942       519

       accuracy                   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [None]:
# Persist each run as data_for_model3_<n>.json, with n counting from 1 in run order.
for i, predictions_dict in enumerate(final_result_list, start=1):
    with open(f'/content/drive/MyDrive/data_for_model3_{i}.json', 'w', encoding='utf-8') as f:
        json.dump(predictions_dict, f, ensure_ascii=False, indent=4)

# **Combine 3 models**

In [None]:
# End-to-end ensemble: run all three classifiers and merge them by weighted vote.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
predictions_path = "/content/drive/MyDrive/predictions_retriever20_reranker3_3.json"
output_path = '/content/drive/MyDrive/outout1.json'
# One weight per classifier, in the order combine_method runs them
weights = [0.4, 0.3, 0.3]

final = retriever.combine_method(
    predictions_path=predictions_path,
    output_path=output_path,
    model_name=model_name,
    weights=weights,
)

first model
🔄 Starting classification of 1228 claims with TinyLLaMA…
  → Progress: 1/1228 (claim_id=claim-1937)
  → Progress: 20/1228 (claim_id=claim-60)
  → Progress: 40/1228 (claim_id=claim-2257)
  → Progress: 60/1228 (claim_id=claim-1227)
  → Progress: 80/1228 (claim_id=claim-740)
  → Progress: 100/1228 (claim_id=claim-1796)
  → Progress: 120/1228 (claim_id=claim-1620)
  → Progress: 140/1228 (claim_id=claim-1667)
  → Progress: 160/1228 (claim_id=claim-663)
  → Progress: 180/1228 (claim_id=claim-1376)
  → Progress: 200/1228 (claim_id=claim-1665)
  → Progress: 220/1228 (claim_id=claim-2506)
  → Progress: 240/1228 (claim_id=claim-622)
  → Progress: 260/1228 (claim_id=claim-344)
  → Progress: 280/1228 (claim_id=claim-2032)
  → Progress: 300/1228 (claim_id=claim-2262)
  → Progress: 320/1228 (claim_id=claim-194)
  → Progress: 340/1228 (claim_id=claim-3034)
  → Progress: 360/1228 (claim_id=claim-108)
  → Progress: 380/1228 (claim_id=claim-2114)
  → Progress: 400/1228 (claim_id=claim-2903)


KeyboardInterrupt: 

# Experiment Stage 3

In [None]:
# Reload the saved run number 3 of each model so the vote does not need re-inference.
# The files were written as UTF-8 with ensure_ascii=False, so read them as UTF-8 too.
with open('/content/drive/MyDrive/cot_3.json', 'r', encoding='utf-8') as f:
    f2 = json.load(f)
with open('/content/drive/MyDrive/data_for_model3_3.json', 'r', encoding='utf-8') as f:
    f3 = json.load(f)
with open('/content/drive/MyDrive/data_for_model1_3.json', 'r', encoding='utf-8') as f:
    f1 = json.load(f)

In [None]:
# Experiment 1: the first model carries the largest weight.
file_name = "/content/drive/MyDrive/final_combined_result0001.json"
weights = [0.4, 0.3, 0.3]
f5 = retriever.weighted_majority_vote(pred_dicts=[f1, f2, f3], weights=weights)
evaluate_with_ids(predictions_dict=f5, actuals_dict=actuals_dict)

# Write the combined predictions to a JSON file
with open(file_name, mode="w", encoding="utf-8") as f:
    json.dump(f5, f, indent=4, ensure_ascii=False)


=== Evaluation Metrics ===
Accuracy: 50.41%

Classification Report:
                 precision    recall  f1-score   support

       DISPUTED     0.2222    0.4355    0.2943       124
NOT_ENOUGH_INFO     0.8834    0.3731    0.5246       386
        REFUTES     0.7500    0.2714    0.3985       199
       SUPPORTS     0.4893    0.7071    0.5784       519

       accuracy                         0.5041      1228
      macro avg     0.5862    0.4468    0.4490      1228
   weighted avg     0.6285    0.5041    0.5037      1228



In [None]:
# Experiment 2: the second (chain-of-thought) model carries the largest weight.
file_name = "/content/drive/MyDrive/final_combined_result0002.json"
weights = [0.3, 0.4, 0.3]
f5 = retriever.weighted_majority_vote(pred_dicts=[f1, f2, f3], weights=weights)
evaluate_with_ids(predictions_dict=f5, actuals_dict=actuals_dict)

# Write the combined predictions to a JSON file
with open(file_name, mode="w", encoding="utf-8") as f:
    json.dump(f5, f, indent=4, ensure_ascii=False)


=== Evaluation Metrics ===
Accuracy: 34.28%

Classification Report:
                 precision    recall  f1-score   support

       DISPUTED     0.1130    0.4355    0.1794       124
NOT_ENOUGH_INFO     0.0000    0.0000    0.0000       386
        REFUTES     0.0000    0.0000    0.0000       199
       SUPPORTS     0.4893    0.7071    0.5784       519

       accuracy                         0.3428      1228
      macro avg     0.1506    0.2857    0.1895      1228
   weighted avg     0.2182    0.3428    0.2626      1228



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Experiment 3: the third (fixed-example) model carries the largest weight.
file_name = "/content/drive/MyDrive/final_combined_result0003.json"
weights = [0.3, 0.3, 0.4]
f5 = retriever.weighted_majority_vote(pred_dicts=[f1, f2, f3], weights=weights)
evaluate_with_ids(predictions_dict=f5, actuals_dict=actuals_dict)

# Write the combined predictions to a JSON file
with open(file_name, mode="w", encoding="utf-8") as f:
    json.dump(f5, f, indent=4, ensure_ascii=False)


=== Evaluation Metrics ===
Accuracy: 36.56%

Classification Report:
                 precision    recall  f1-score   support

       DISPUTED     0.2222    0.4355    0.2943       124
NOT_ENOUGH_INFO     0.0000    0.0000    0.0000       386
        REFUTES     0.0000    0.0000    0.0000       199
       SUPPORTS     0.4010    0.7611    0.5253       519

       accuracy                         0.3656      1228
      macro avg     0.1558    0.2991    0.2049      1228
   weighted avg     0.1919    0.3656    0.2517      1228



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
