In [1]:
from typing import Any, Generator, Protocol, List, Tuple
import pandas as pd
# import nltk
# nltk.download('punkt_tab')
from sklearn.metrics import balanced_accuracy_score, f1_score, precision_score, recall_score
from minicheck.minicheck import MiniCheck
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def MinicheckEval(filename, model_name, col_name = None, update=True, batch_size=20, outputfile=None):
    """Label every (source, summary) row of a CSV with a MiniCheck model and save the result.

    The CSV is read as UTF-8 with missing values replaced by empty strings.
    Rows are sent to ``MiniCheck.score`` in slices of ``batch_size``; the first
    element of each returned tuple (the predicted labels) is collected and
    written to ``col_name``. The remaining returned elements are discarded.

    Args:
        filename: Path of the input CSV. It must contain the columns
            ``source`` (passed as ``docs``) and ``summary`` (passed as ``claims``).
        model_name: Name handed to ``MiniCheck(model_name=...)``; also used to
            build the default column name.
        col_name: Column that receives the labels. Defaults to
            ``f'minicheck-{model_name}'`` when falsy.
        update: When False and ``col_name`` already exists in the CSV, the
            function returns immediately without loading a model or writing.
        batch_size: Number of rows passed to ``model.score`` per call; must be >= 1.
        outputfile: Destination CSV. Defaults to ``filename`` (the input file
            is overwritten).

    Returns:
        None. The labelled table is written to disk and a status line is printed.

    Raises:
        KeyError: If the ``source`` or ``summary`` column is missing.
        ValueError: If ``batch_size`` is smaller than 1.
    """
    df = pd.read_csv(filename, encoding='utf-8').fillna('')
    if not col_name:
        col_name = f'minicheck-{model_name}'
    if (not update) and (col_name in df):  # labels already stored; nothing to recompute
        return

    # Validate cheap preconditions before paying for the model load.
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size!r}')
    missing = [required for required in ('source', 'summary') if required not in df]
    if missing:
        raise KeyError(f'{filename} is missing required column(s): {missing}')

    # Materialize both columns once instead of rebuilding the lists per batch.
    docs = df['source'].tolist()
    claims = df['summary'].tolist()

    model = MiniCheck(model_name=model_name, cache_dir='./ckpts')
    scores = []
    for start in range(0, len(docs), batch_size):
        stop = start + batch_size
        # Only the predicted labels are kept; the other three outputs are unused.
        pred_label, _, _, _ = model.score(docs=docs[start:stop], claims=claims[start:stop])
        scores.extend(pred_label)

    # Plain assignment overwrites an existing column in place and appends a new
    # one as the last column, which covers both cases.
    df[col_name] = scores
    if not outputfile:
        outputfile = filename
    df.to_csv(outputfile, mode='w', index=False, header=True)
    print(f'Minicheck {model_name} Scores have been saved')


In [3]:
# MiniCheck backbones evaluated in this run; the 7B variant is currently disabled.
minicheck_models = [
    'roberta-large',
    'deberta-v3-large',
    'flan-t5-large',
    # 'Bespoke-MiniCheck-7B',
]

# Disabled sweep over the per-LLM result files, kept for reference:
# selected_models = [
#     "openai/GPT-3.5-Turbo",
#     "openai/gpt-4o",
#     "Qwen/Qwen2.5-7B-Instruct",
#     "microsoft/Phi-3-mini-4k-instruct",
#     "cohere/command-r-08-2024",
#     "meta-llama/Meta-Llama-3.1-8B-Instruct",
#     "meta-llama/Meta-Llama-3.1-70B-Instruct",
#     "google/gemini-1.5-flash-001",
#     "Anthropic/claude-3-5-sonnet-20240620",
#     "mistralai/Mistral-7B-Instruct-v0.3",
# ]
# for idx, minichecker in enumerate(minicheck_models):
#     print(f"Run model {str(idx)}: {minichecker} ......")
#     for model_name in selected_models:
#         filename = f"../backup_data_with_detector_results/{model_name}.csv"
#         print(f"Processing P{model_name}")
#
#         MinicheckEval(filename, minichecker, update=False)

# Every backbone labels the same annotation file; update=False skips a backbone
# whose column is already present in that file.
filename = "../assign/examples_to_annotate.csv"
for idx, minichecker in enumerate(minicheck_models):
    print(f"Run model {idx}: {minichecker} ......")
    MinicheckEval(filename, minichecker, update=False)


Run model 0: roberta-large ......


Evaluating: 100%|██████████| 20/20 [00:00<00:00, 27.86it/s]
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 19.26it/s]
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 18.94it/s]
Evaluating: 100%|██████████| 20/20 [00:00<00:00, 21.50it/s]
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 19.69it/s]
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 17.32it/s]
Evaluating: 100%|██████████| 20/20 [00:00<00:00, 35.31it/s]
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 19.64it/s]
Evaluating: 100%|██████████| 20/20 [00:00<00:00, 20.82it/s]
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 18.53it/s]
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 18.83it/s]
Evaluating: 100%|██████████| 20/20 [00:00<00:00, 23.51it/s]
Evaluating: 100%|██████████| 20/20 [00:00<00:00, 29.44it/s]
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 18.73it/s]
Evaluating: 100%|██████████| 20/20 [00:00<00:00, 22.06it/s]
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 18.34it/s]
Evaluating: 100%|██████████| 20/20 [00:0

Minicheck roberta-large Scores have been saved
Run model 1: deberta-v3-large ......


Evaluating: 100%|██████████| 20/20 [00:15<00:00,  1.30it/s]
Evaluating: 100%|██████████| 20/20 [00:49<00:00,  2.48s/it]
Evaluating: 100%|██████████| 20/20 [00:48<00:00,  2.43s/it]
Evaluating: 100%|██████████| 20/20 [00:40<00:00,  2.01s/it]
Evaluating: 100%|██████████| 20/20 [00:45<00:00,  2.27s/it]
Evaluating: 100%|██████████| 20/20 [00:53<00:00,  2.66s/it]
Evaluating: 100%|██████████| 20/20 [00:17<00:00,  1.14it/s]
Evaluating: 100%|██████████| 20/20 [00:46<00:00,  2.32s/it]
Evaluating: 100%|██████████| 20/20 [00:41<00:00,  2.07s/it]
Evaluating: 100%|██████████| 20/20 [00:48<00:00,  2.41s/it]
Evaluating: 100%|██████████| 20/20 [00:45<00:00,  2.25s/it]
Evaluating: 100%|██████████| 20/20 [00:37<00:00,  1.85s/it]
Evaluating: 100%|██████████| 20/20 [00:28<00:00,  1.40s/it]
Evaluating: 100%|██████████| 20/20 [00:51<00:00,  2.60s/it]
Evaluating: 100%|██████████| 20/20 [00:39<00:00,  1.98s/it]
Evaluating: 100%|██████████| 20/20 [00:51<00:00,  2.57s/it]
Evaluating: 100%|██████████| 20/20 [00:5

Minicheck deberta-v3-large Scores have been saved
Run model 2: flan-t5-large ......


Evaluating: 100%|██████████| 20/20 [00:01<00:00, 14.26it/s]
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  9.64it/s]
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  9.18it/s]
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 10.40it/s]
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 10.35it/s]
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  8.29it/s]
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 14.57it/s]
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 10.03it/s]
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  9.93it/s]
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  9.26it/s]
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 10.03it/s]
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 10.28it/s]
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 12.68it/s]
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  9.16it/s]
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 10.29it/s]
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  9.13it/s]
Evaluating: 100%|██████████| 20/20 [00:0

Minicheck flan-t5-large Scores have been saved
