In [1]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, f1_score
import pandas as pd
import json
import pickle
import os
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from importlib import reload
import sys
sys.path.append("/private/home/ziweiji/Hallu_Det/src/")
sys.path.append("/home/ziweiji/Hallu_Det/src/")
import binary_threshold_utils
reload(binary_threshold_utils)
from binary_threshold_utils import load_data


In [2]:
def train_test(X_train, y_train, X_val, y_val, val_refusal=None, return_all=False):
    """Fit a min-max-scaled logistic regression and score it on validation data.

    The scaler is fitted on ``X_train`` only and then applied to ``X_val``, so
    no validation statistics leak into the preprocessing step.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_val: Validation feature matrix.
        y_val: Validation labels.
        val_refusal: Optional selector (boolean mask or integer indices, given
            as a list or an array) applied positionally to the validation
            predictions, probabilities and labels before scoring. ``None`` or
            an empty selector means "score every validation row".
        return_all: When true, also return the fitted model and the
            (possibly filtered) predictions and probabilities.

    Returns:
        ``(auroc, accuracy, f1, precision, recall)``, each multiplied by 100;
        F1, precision and recall are macro-averaged. With ``return_all=True``
        the tuple is extended with ``(model, y_pred, y_prob)``.
    """
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Create and train the logistic regression model
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Hard predictions plus the score taken from the second predict_proba column
    y_pred = model.predict(X_val)
    y_prob = model.predict_proba(X_val)[:, 1]

    # Compare against None explicitly: truth-testing a multi-element numpy
    # array raises ValueError, so `if val_refusal:` only worked for lists.
    if val_refusal is not None:
        selector = np.asarray(val_refusal)
        # An empty selector keeps the previous "no filtering" behaviour.
        if selector.size:
            y_pred = y_pred[selector]
            y_prob = y_prob[selector]
            # y_val may be a plain list; index it positionally like the others.
            y_val = np.asarray(y_val)[selector]
    print("y_pred", np.mean(y_pred), len(y_pred))
    print("y_val", np.mean(y_val), len(y_val))

    # Evaluate the model (all metrics reported on a 0-100 scale)
    auroc = roc_auc_score(y_val, y_prob, average='macro') * 100
    accuracy = accuracy_score(y_val, y_pred) * 100
    f1 = f1_score(y_val, y_pred, average='macro') * 100
    precision = precision_score(y_val, y_pred, average='macro') * 100
    recall = recall_score(y_val, y_pred, average='macro') * 100

    if return_all:
        return auroc, accuracy, f1, precision, recall, model, y_pred, y_prob
    return auroc, accuracy, f1, precision, recall

In [6]:
# ---- Experiment configuration ----
prompt_type = 'sentence'
label_name = 'label'
filter_refusal = False
use_predicted_test = False
train_split = 'train'
# Other backbones that can be swapped in:
#   "Qwen2.5-7B-Instruct"
#   "Mistral-7B-Instruct-v0.3"
model_name = 'Meta-Llama-3.1-8B-Instruct'

# One "auroc<TAB>accuracy" line is collected per (dataset, feature set) run.
outputs = []
# Each entry is one feature combination handed to load_data; uncomment a line
# to add that combination to the sweep.
FEATURES = [
    # ['sentence_semantic_entropy'],
    # # ['sentence_eigen'],
    # ['ling_uncertainty'],
    ['ling_uncertainty', f'{prompt_type}_semantic_entropy'],
    # ['ling_uncertainty', f'{prompt_type}_eigen'],
]

for dataset in ['trivia_qa', 'nq_open', 'pop_qa']:
    # Train and evaluate on the same dataset.
    test_dataset = dataset
    out_dir = f"LR_outputs/{dataset}/{model_name}/"
    os.makedirs(out_dir, exist_ok=True)
    print(f"Training on {dataset}")
    for feature in FEATURES:
        X_train, y_train = load_data(
            model_name, dataset, train_split, feature, label_name, prompt_type,
            filter_refusal=filter_refusal,
        )
        X_val, y_val = load_data(
            model_name, test_dataset, 'test', feature, label_name, prompt_type,
            use_predicted_test=use_predicted_test,
            filter_refusal=filter_refusal,
        )
        auroc, accuracy, f1, precision, recall, model, y_pred, y_prob = train_test(
            X_train, y_train, X_val, y_val, return_all=True
        )
        outputs.append(f"{auroc}\t{accuracy}")

        # File name is the feature names joined by "_", with a marker suffix
        # when the predicted test split was used.
        output_path = out_dir + "_".join(feature) + (
            "_use_predicted_test.json" if use_predicted_test else ".json"
        )

        # Persist the metrics together with per-example predictions.
        with open(output_path, 'w') as f:
            data = {
                "auroc": auroc,
                "accuracy": accuracy,
                "f1": f1,
                "precision": precision,
                "recall": recall,
                "y_pred": y_pred.tolist(),
                "y_prob": y_prob.tolist(),
                "y_val": y_val.tolist(),
            }
            json.dump(data, f)

# Print the collected summary lines, one per run.
for o in outputs:
    print(o)


Training on trivia_qa
label hallucinated rate 0.2041


y_pred 0.043 1000
y_val 0.199 1000
Training on nq_open
label hallucinated rate 0.2941
y_pred 0.087 1000
y_val 0.288 1000
Training on pop_qa
label hallucinated rate 0.1867
y_pred 0.031 1000
y_val 0.19 1000
79.7116042133263	80.80000000000001
66.02415925405742	70.3
74.30116959064328	81.10000000000001


In [None]:
# ---- Feature ablation on a single dataset ----
# Reuses `model_name` from the configuration cell above.
dataset = 'nq_open'
test_dataset = 'nq_open'
prompt_type = 'sentence'
label_name = 'accuracy'  # alternative target: 'label'
filter_refusal = False
train_split = 'train'
# Feature combinations to compare; commented entries are the no_refuse_* variants.
FEATURES = [
    ['sentence_semantic_entropy'],
    ['sentence_eigen'],
    # ['no_refuse_sentence_semantic_entropy'],
    # ['no_refuse_sentence_eigen'],
    ['word_semantic_entropy'],
    ['word_eigen'],
    ['ling_uncertainty'],
    ['ling_uncertainty', f'{prompt_type}_semantic_entropy', f'{prompt_type}_eigen'],
    ['ling_uncertainty', f'{prompt_type}_semantic_entropy'],
    ['ling_uncertainty', f'{prompt_type}_eigen'],
    # ['ling_uncertainty', f'no_refuse_{prompt_type}_semantic_entropy', f'{prompt_type}_eigen'],
    # ['ling_uncertainty', f'no_refuse_{prompt_type}_semantic_entropy'],
    # ['ling_uncertainty', f'no_refuse_{prompt_type}_eigen'],
]

# Run the whole sweep twice: once per use_predicted_test setting.
for use_predicted_test in (False, True):
    print('use_predicted_test', use_predicted_test)
    outputs = []
    for feature in FEATURES:
        X_train, y_train = load_data(
            model_name, dataset, train_split, feature, label_name, prompt_type,
            filter_refusal=filter_refusal,
        )
        X_val, y_val = load_data(
            model_name, test_dataset, 'test', feature, label_name, prompt_type,
            use_predicted_test=use_predicted_test,
            filter_refusal=filter_refusal,
        )
        auroc, accuracy, f1, precision, recall = train_test(X_train, y_train, X_val, y_val)
        # One tab-separated row per feature set, metrics rounded to 2 decimals.
        outputs.append(
            "\t".join(
                str(round(metric, 2))
                for metric in (auroc, accuracy, f1, precision, recall)
            )
        )

    # Print the rows for this setting.
    for o in outputs:
        print(o)
