# Run Fact Checking Benchmark
A notebook for running the full fact-checking benchmark via contrastive knowledge assessment.
The goal is to probe whether a model assigns a higher probability to each factual statement than to a given counterfactual, across all items in the [CalibraGPT/Fact_Checking](https://huggingface.co/datasets/CalibraGPT/Fact_Checking) dataset. You can use models including `Flan-T5`, `BERT`, and `GPT-2` (among many others; see the [repo](https://github.com/daniel-furman/Capstone#model-families-tested) for more).

<a target="_blank" href="https://colab.research.google.com/github/daniel-furman/Capstone/blob/main/notebooks/fact_checking_notebooks/fact-checking-run-main.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>


## Dependencies

In [None]:
# clone the project repository into the current working directory
!git clone https://github.com/daniel-furman/Capstone.git
# install the packages listed in the cloned repo's requirements file; the
# absolute path only resolves if the clone above ran from /content
# (presumably the Colab default working directory — confirm when running elsewhere)
!pip install -r /content/Capstone/requirements.txt

## Imports

In [None]:
import os
import glob
import torch
from argparse import Namespace
from transformers import set_seed
from datasets import load_dataset

# switch into the cloned repo's scripts directory before importing; presumably
# this is what makes the local `compare_models` module importable (assumes the
# repo was cloned to /content/Capstone — TODO confirm)
os.chdir('/content/Capstone/src/fact_checking_scripts')
from compare_models import compare_models

## Setup

In [None]:
# benchmark settings consumed by the Run cell:
#   model    - model identifier to score (checked against the supported families)
#   language - dataset language, given as a full name or two-letter code
args = Namespace(model="google/flan-t5-xl", language="fr")

# echo the settings as the cell output
args

In [None]:
# ensure GPU access: stop early with a clear instruction when PyTorch cannot
# see a CUDA device. RuntimeError is a subclass of Exception, so any handler
# written against the previous bare Exception still catches it.
if not torch.cuda.is_available():
    raise RuntimeError("Change runtime type to include a GPU.")

In [None]:
# set warning level: TF_CPP_MIN_LOG_LEVEL is exported to the process
# environment with the value "2" (presumably this hides TensorFlow's INFO and
# WARNING output — confirm against the TensorFlow docs)
os.environ.update({"TF_CPP_MIN_LOG_LEVEL": "2"})

In [None]:
# optionally attach drive: uncomment both lines to mount Google Drive at
# /content/drive (the commented-out log-saving cell at the end of this
# notebook copies to a path under that mount point)
#from google.colab import drive
#drive.mount('/content/drive')

## Run

In [None]:
# run the fact checking benchmark
print("Running the fact_checking benchmark...")

# seed the random number generators through transformers.set_seed
set_seed(42)

# map every accepted spelling of the language (full name or two-letter code,
# case-insensitive) to the matching split of the CalibraGPT/Fact_Checking dataset
language_to_split = {
    "english": "English",
    "en": "English",
    "french": "French",
    "fr": "French",
    "spanish": "Spanish",
    "es": "Spanish",
}
split_name = language_to_split.get(args.language.lower())
if split_name is None:
    # ValueError subclasses Exception, so existing handlers keep working
    raise ValueError("Language not supported.")

# check the input model is compatible; this runs before the dataset is
# fetched so that an unsupported model fails fast instead of after a download
compatible_model_prefixes = [
    "flan",
    "t5",
    "pythia",
    "gpt",
    "opt",
    "llama",
    "roberta",
    "bert",
    "bloom",
]

# despite the "prefix" name, each entry is matched as a substring anywhere in
# the lower-cased model identifier
model_name = args.model.lower()
model_supported = any(
    model_prefix in model_name for model_prefix in compatible_model_prefixes
)

if not model_supported:
    raise ValueError("Model not supported.")

# load in the dataset split corresponding to the input language
dataset = load_dataset("CalibraGPT/Fact_Checking", split=split_name)

# create a config for running the pipeline
config = {
    "models": [args.model],
    "input_information": dataset,
    "verbosity": False,
}

# run the contrastive knowledge assessment function
# logs saved at './content/logging/'
# NOTE(review): the optional log-saving cell at the end of this notebook reads
# from '/content/logging/' (no leading dot) — confirm which path is correct
score_dicts = compare_models(
    config["models"], config["input_information"], config["verbosity"]
)

# print the summary results (the element at index 1 of the returned value)
print(f"\nScore dict summary:\n{score_dicts[1]}")

In [None]:
# optionally save logs to drive after running
# (uncomment the lines below; the destination is under /content/drive, so
# Google Drive must already be mounted there)

# step up three directory levels from the current working directory
#os.chdir('../../../')
# take the first JSON log matched in /content/logging (only one file is copied)
#log = glob.glob('/content/logging/*json')[0]
# keep just the file name to build the destination path
#log_name = log.split('/')[-1]
#log_new_path = f'/content/drive/MyDrive/Colab Files/cka_benchmark_logs/{log_name}'
# copy the log into the Drive folder via a notebook shell command
#!cp {log} '{log_new_path}'
