In [1]:
from pathlib import Path #Imports Path for filesystem operations
import os, sys #Imports OS and system utilities

PROJECT_ROOT = Path.cwd().resolve() #Starts from the current working directory
for _ in range(3): #Tries to find the project root by moving up the directory tree
    if (PROJECT_ROOT / "src").exists() and (PROJECT_ROOT / "configs").exists(): #Checks for project structure
        break #Stops if project root is found
    PROJECT_ROOT = PROJECT_ROOT.parent #Moves one directory up

os.chdir(PROJECT_ROOT) #Changes the working directory to the project root

SRC = PROJECT_ROOT / "src" #Defines the src directory path
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC)) #Adds src to Python path if missing

print("CWD:", Path.cwd()) #Prints current working directory
print("SRC in sys.path:", str(SRC) in sys.path) #Checks if src is in Python path


CWD: C:\Users\chiar\Desktop\NLP Project\0. File potenzialmente finale - Copy
SRC in sys.path: True


In [2]:
from ragtrust.config import load_config
cfg = load_config("configs/with_training.yaml")  # o zero_training.yaml


In [3]:
from ragtrust.utils import set_seed #Imports utility to fix random seeds
from ragtrust.data.fever import load_fever_examples, extract_gold_titles, build_fever_corpus #Imports FEVER data helpers
from ragtrust.data.hotpotqa import load_hotpot_examples, build_hotpot_corpus #Imports HotpotQA data helpers
from ragtrust.experiment import RAGTrustPipeline #Imports the main RAG pipeline
from ragtrust.evaluation.run_experiment import run_fever, run_hotpot #Imports evaluation runners
from dataclasses import replace #Imports replace to safely modify configs

#CORE RUN (generation: disabled)
cfg = replace(cfg, generation=replace(cfg.generation, enabled=False, num_samples=1)) #Disables generation for core evaluation
print("CORE mode: generation disabled ->", cfg.generation.enabled) #Prints generation status
#Fine New9

set_seed(cfg.seed) #Fixes random seed for reproducibility

results = {} #Initializes dictionary to store results

#FEVER
if cfg.data.fever.enabled: #Checks if FEVER is enabled
    fever_ex = load_fever_examples(cfg.data.fever.train_jsonl, cfg.data.fever.max_examples) #Loads FEVER examples
    fever_ex = [ex for ex in fever_ex if ex.label in ("SUPPORTS", "REFUTES")] #Keeps only labeled examples

    wanted = set() #Initializes set of required Wikipedia titles
    for ex in fever_ex:
        wanted |= extract_gold_titles(ex.evidence) #Collects gold evidence titles

    corpus_fever = build_fever_corpus( #Builds FEVER corpus
        cfg.data.fever.wiki_pages_dir, #Wikipedia pages directory
        wanted_titles=wanted,  #Speeds up corpus construction by filtering pages
        max_sentences=cfg.retrieval.max_corpus_sentences, #Limits corpus size
    )

    rates = [0.0, 0.1, 0.2]   #Defines poisoning rates (0.0 is clean baseline)

    # --- BASELINE (no poison): cfg + pipeline coerenti ---
    cfg_base = replace(cfg, poisoning=replace(cfg.poisoning, enabled=False, rate=0.0)) #Creates clean baseline config
    pipeline_fever_base = RAGTrustPipeline(cfg_base, corpus_fever) #Builds baseline FEVER pipeline

    # Sanity check poisoning (baseline)
    ex0 = fever_ex[0] #Selects one example for sanity check
    out0 = pipeline_fever_base.run_one(ex0.claim, verifier_mode="single") #Runs pipeline on one claim

    n_total = len(out0["poisoned"]) #Counts total retrieved passages
    n_poison = sum(1 for p in out0["poisoned"] if p.get("is_poison") == "1") #Counts poisoned passages

    print("Poison enabled:", cfg_base.poisoning.enabled, "rate:", cfg_base.poisoning.rate) #Prints poisoning status
    print("Retrieved:", len(out0["retrieved"]), "Poisoned list:", n_total, "Injected poison:", n_poison) #Prints sanity check info

    fever_res_base = run_fever(cfg_base, pipeline_fever_base, fever_ex) #Runs FEVER baseline evaluation
    results["fever_base"] = fever_res_base #Stores baseline results
    print("FEVER BASE summary:", fever_res_base["summary"]) #Prints baseline summary

    #CORE RUN: clean vs poisoned (each run uses its own coherent cfg + pipeline)
    for r in rates: #Loops over poisoning rates
        cfg_r = replace(cfg, poisoning=replace(cfg.poisoning, enabled=(r > 0.0), rate=r)) #Creates config for this rate
        print("\n=== CORE RUN | poison enabled:", cfg_r.poisoning.enabled, "rate:", cfg_r.poisoning.rate, "===") #Prints run info

        pipeline_r = RAGTrustPipeline(cfg_r, corpus_fever) #Builds pipeline for this run
        res_r = run_fever(cfg_r, pipeline_r, fever_ex) #Runs FEVER evaluation

        results[f"fever_poison_{r}"] = res_r #Stores results for this rate
        print("FEVER summary:", res_r["summary"]) #Prints summary

#HotpotQA
if cfg.data.hotpotqa.enabled: #Checks if HotpotQA is enabled
    hotpot_ex = load_hotpot_examples(cfg.data.hotpotqa.train_json, cfg.data.hotpotqa.max_examples) #Loads HotpotQA examples

    corpus_hotpot = build_hotpot_corpus(hotpot_ex) #Builds HotpotQA corpus
    pipeline_hotpot = RAGTrustPipeline(cfg, corpus_hotpot) #Builds HotpotQA pipeline

    hotpot_res = run_hotpot(cfg, pipeline_hotpot, hotpot_ex) #Runs HotpotQA evaluation
    results["hotpotqa"] = hotpot_res #Stores HotpotQA results
    print("HotpotQA summary:", hotpot_res["summary"]) #Prints HotpotQA summary


  from .autonotebook import tqdm as notebook_tqdm



CORE mode: generation disabled -> False


Batches: 100%|██████████| 51/51 [00:36<00:00,  1.41it/s]
Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Poison enabled: False rate: 0.0
Retrieved: 20 Poisoned list: 20 Injected poison: 0
FEVER BASE summary: {'acc': 0.25, 'hallucination': 0.4342105263157895}

=== CORE RUN | poison enabled: False rate: 0.0 ===


Batches: 100%|██████████| 51/51 [00:36<00:00,  1.41it/s]
Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FEVER summary: {'acc': 0.25, 'hallucination': 0.4342105263157895}

=== CORE RUN | poison enabled: True rate: 0.1 ===


Batches: 100%|██████████| 51/51 [00:49<00:00,  1.03it/s]
Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FEVER summary: {'acc': 0.23684210526315788, 'hallucination': 0.40789473684210525}

=== CORE RUN | poison enabled: True rate: 0.2 ===


Batches: 100%|██████████| 51/51 [00:26<00:00,  1.94it/s]
Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FEVER summary: {'acc': 0.23684210526315788, 'hallucination': 0.40789473684210525}


Batches: 100%|██████████| 7/7 [00:00<00:00, 29.92it/s]
Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HotpotQA summary: {'acc': 0.0, 'hallucination': 0.005}


In [4]:
#Running a decision rule ablation experiment
from ragtrust.ablation.ablation import run_decision_rule_ablation, DECISION_RULES #Imports ablation function and rules

ablation_results = run_decision_rule_ablation( #Runs the decision rule ablation
    cfg=cfg_base, #Uses the clean baseline configuration
    corpus=corpus_fever, #Uses the FEVER corpus
    examples=fever_ex, #Uses FEVER evaluation examples
    run_fn=run_fever, #Uses the FEVER evaluation function
    rules=DECISION_RULES, #Applies all predefined decision rules
    n=200 #Limits the number of examples for the ablation
)



=== ABLATION RULE: strict ===


Batches: 100%|██████████| 51/51 [00:30<00:00,  1.67it/s]
Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Summary: {'acc': 0.2565789473684211, 'hallucination': 0.5263157894736842}

=== ABLATION RULE: balanced ===


Batches: 100%|██████████| 51/51 [00:27<00:00,  1.83it/s]
Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Summary: {'acc': 0.26973684210526316, 'hallucination': 0.5789473684210527}

=== ABLATION RULE: conservative ===


Batches: 100%|██████████| 51/51 [00:29<00:00,  1.75it/s]
Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Summary: {'acc': 0.2565789473684211, 'hallucination': 0.4934210526315789}


In [5]:
#Enabling answer generation and self-consistency evaluation.
cfg_gen = replace( #Creates a modified configuration for generation mode
    cfg,
    generation=replace(cfg.generation, enabled=True, num_samples=10), #Enables generation and sets number of samples
    evaluation=replace(cfg.evaluation, compute_self_consistency=True), #Enables self-consistency evaluation
)
print("GEN mode: generation enabled ->", cfg_gen.generation.enabled, "num_samples ->", cfg_gen.generation.num_samples) #Prints generation status


GEN mode: generation enabled -> True num_samples -> 10


In [6]:
#Running generation-based experiments on FEVER.
# It compares clean vs poisoned settings while generation
# and self-consistency are enabled.
rates = [0.0, 0.2] #Defines poisoning rates for generation runs

for r in rates: #Loops over the selected poisoning rates
    cfg_r = replace(cfg_gen, poisoning=replace(cfg_gen.poisoning, enabled=(r > 0.0), rate=r)) #Creates config for this rate
    print("\n=== GEN RUN | poison enabled:", cfg_r.poisoning.enabled, "rate:", cfg_r.poisoning.rate, "===") #Prints run info

    pipeline_fever = RAGTrustPipeline(cfg_r, corpus_fever) #Builds the pipeline for this generation run
    res = run_fever(cfg_r, pipeline_fever, fever_ex) #Runs FEVER evaluation with generation enabled
    print("FEVER summary:", res["summary"]) #Prints summary results



=== GEN RUN | poison enabled: False rate: 0.0 ===


Batches: 100%|██████████| 51/51 [00:33<00:00,  1.54it/s]
Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FEVER summary: {'acc': 0.25, 'hallucination': 0.4342105263157895, 'self_consistency': 0.3085526315789474}

=== GEN RUN | poison enabled: True rate: 0.2 ===


Batches: 100%|██████████| 51/51 [00:32<00:00,  1.59it/s]
Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FEVER summary: {'acc': 0.23684210526315788, 'hallucination': 0.40789473684210525, 'self_consistency': 0.3414473684210526}


In [8]:
from ragtrust.evaluation.diagnostics import evidence_recall_at_k #Imports recall@k diagnostic metric

N = 200 #Sets number of examples used for diagnostics
print("Recall@10:", evidence_recall_at_k(pipeline_fever_base, fever_ex[:N], k=10)) #Computes and prints recall@10

Recall@10: 0.9671052631578947


In [9]:
from tqdm import tqdm #Imports progress bar utility

conflicts = 0 #Counts how many claims show evidence conflict
N = min(len(fever_ex), 200) #Limits analysis to at most 200 examples

for ex in tqdm(fever_ex[:N], desc="Conflict analysis", leave=False): #Loops over examples
    out = pipeline_fever_base.run_one(ex.claim) #Runs pipeline on one claim
    if out["verification"].get("conflict", False): #Checks if verifier detected conflicting evidences
        conflicts += 1 #Increments conflict counter

print("Conflict %:", conflicts / N) #Prints percentage of conflicting claims

                                                                    

Conflict %: 0.3881578947368421


