# Run Fact Completion Benchmark
This notebook contains the code to run the CalibraGPT fact completion benchmark. Fact completion is assessed by probing whether a model assigns a higher probability to factual statements than to their paired counterfactual statements. We will be using the [CalibraGPT/Fact-Completion](https://huggingface.co/datasets/CalibraGPT/Fact-Completion) dataset. See the repo's [README](https://github.com/daniel-furman/Polyglot-or-Not) for compatible models and more information.

<a target="_blank" href="https://colab.research.google.com/github/daniel-furman/Polyglot-or-Not/blob/main/notebooks/fact_completion_notebooks/fact-completion-full-benchmark.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>


## Colab + Drive setup

In [None]:
# Mount Google Drive inside the Colab VM; the results log is copied
# to a folder under this mount point at the end of the notebook.
from google.colab import drive

drive.mount(mountpoint="/content/drive")

In [None]:
# Destination folder (on the mounted Drive) that the benchmark result log is
# copied into. Global: read by the final "save result logs" cell.
DRIVE_FOLDER_OUT = "/content/drive/MyDrive/Colab Files/cka_benchmark_logs/"

## Dependencies

In [None]:
!git clone https://github.com/daniel-furman/Capstone.git
!pip install -r /content/Capstone/requirements.txt
#!pip install -r /content/Capstone/requirements_llama.txt

## Imports

In [None]:
import glob
import os
import shutil
from argparse import Namespace

import torch
from datasets import load_dataset
from transformers import set_seed

# Switch the working directory to the cloned repo's script folder so that the
# local `compare_models` module can be imported on the next line.
# Side effect: relative paths used later in the notebook resolve against this directory.
os.chdir("/content/Capstone/src/fact_completion_scripts")
from compare_models import compare_models

## Configure Args

In [None]:
# Benchmark settings: which model to evaluate and which language split to use.
# `language` may be a full name ("english") or a two-letter code ("en").
args = Namespace()
args.model = "bert-base-multilingual-cased"
args.language = "en"

args

## Setup

In [None]:
# Stop early when the runtime exposes no CUDA device.
cuda_ready = torch.cuda.is_available()
if not cuda_ready:
    raise Exception("Change runtime type to include a GPU.")

In [None]:
# Set the TensorFlow C++ log-level environment variable to "2" for this process
# (presumably to quiet INFO/WARNING messages; confirm against the TensorFlow docs).
os.environ.update({"TF_CPP_MIN_LOG_LEVEL": "2"})

## Run

In [None]:
# Fact-completion benchmark: validate the requested language and model,
# load the matching dataset split, then score the model.
print("Running the fact_completion benchmark...")

# Seed the RNGs through transformers.set_seed before any work is done.
set_seed(42)

# (language name, two-letter code) pairs. The capitalized name is used as the
# dataset split, so either form of `args.language` maps to the same split.
supported_languages = [
    ("english", "en"),
    ("french", "fr"),
    ("spanish", "es"),
    ("german", "de"),
    ("ukrainian", "uk"),
    ("romanian", "ro"),
    ("bulgarian", "bg"),
    ("catalan", "ca"),
    ("danish", "da"),
    ("croatian", "hr"),
    ("hungarian", "hu"),
    ("italian", "it"),
    ("dutch", "nl"),
    ("polish", "pl"),
    ("portuguese", "pt"),
    ("russian", "ru"),
    ("slovenian", "sl"),
    ("serbian", "sr"),
    ("swedish", "sv"),
    ("czech", "cs"),
]

# Resolve the requested language (case-insensitive) to its full name.
requested_language = args.language.lower()
matched_language = next(
    (
        name
        for name, code in supported_languages
        if requested_language in (name, code)
    ),
    None,
)
if matched_language is None:
    raise Exception("Language not supported.")

dataset = load_dataset(
    "CalibraGPT/Fact-Completion", split=matched_language.capitalize()
)

# Model families the benchmark accepts. Despite the name, each entry is matched
# as a substring anywhere in the lower-cased model identifier, not only at its start.
compatible_model_prefixes = [
    "t5",
    "pythia",
    "gpt",
    "opt",
    "llama",
    "roberta",
    "bert",
    "bloom",
]

requested_model = args.model.lower()
if not any(prefix in requested_model for prefix in compatible_model_prefixes):
    raise Exception("Model not supported.")

# Bundle the pipeline inputs in one place.
config = {
    "models": [args.model],
    "input_information": dataset,
    "verbosity": False,
}

# Run the contrastive knowledge assessment. The original note says logs are
# saved at './content/logging/' (not verifiable from this notebook); the
# returned `log_fpath` is used by the next cell to locate the log file.
score_dicts, log_fpath = compare_models(
    config["models"], config["input_information"], config["verbosity"]
)

# Show the summary entry of the returned scores.
print(f"\nScore dict summary:\n{score_dicts[1]}")

In [None]:
# Save the result log to Drive.

# Locate the log written by the benchmark run; `log_fpath` comes from compare_models.
log_matches = glob.glob(f"/content/Capstone/src/fact_completion_scripts/{log_fpath}")
if not log_matches:
    # Fail with a descriptive error instead of a bare IndexError on `[0]`.
    raise FileNotFoundError(f"No benchmark log found for log_fpath={log_fpath!r}.")
log = log_matches[0]

# Prefix the file name with the language so logs for different languages don't collide.
log_name = args.language + "-" + os.path.basename(log)
log_new_path = os.path.join(DRIVE_FOLDER_OUT, log_name)

# Create the destination folder on first use; the copy would otherwise fail.
os.makedirs(DRIVE_FOLDER_OUT, exist_ok=True)

# shutil.copy raises on failure and needs no shell quoting, unlike `!cp`.
shutil.copy(log, log_new_path)

# Display where the log was written.
log_new_path