In [1]:
# Change the kernel's working directory so the relative 'results/...' paths
# passed to calc_deltas below resolve from here.
# NOTE(review): presumably ~/RATER-D is the repository root — confirm.
%cd ~/RATER-D

/home/daved/RATER-D


In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score, f1_score
from netcal.metrics import ECE

from pprint import pprint

In [3]:
def _binary_metrics(results_csv, threshold, scale_min, scale_max, n_bins):
    """Compute AUC, macro-F1 and ECE for a single results CSV.

    The CSV must provide a 'target' column (used as ground truth) and a
    'pred' column (a numeric rating). The rating is binarised with
    ``pred >= threshold`` for F1 and rescaled linearly with
    ``(pred - scale_min) / (scale_max - scale_min)`` to serve as a
    score for AUC and ECE.
    # NOTE(review): assumes 'target' holds binary 0/1 labels and 'pred' lies
    # within [scale_min, scale_max] — confirm against the result files.

    Returns:
        dict with keys 'auc', 'macro_f1' and 'ece'.
    """
    df = pd.read_csv(results_csv)

    y_true = df['target']
    ratings = df['pred']

    hard_preds = np.where(ratings >= threshold, 1, 0)
    probs = (ratings - scale_min) / (scale_max - scale_min)

    return {
        'auc': roc_auc_score(y_true, probs),
        'macro_f1': f1_score(y_true, hard_preds, average = 'macro'),
        'ece': ECE(bins = n_bins).measure(np.array(probs), np.array(y_true)),
    }


def calc_deltas (model_name, results_ft, results_noft, *,
                 threshold = 5, scale_min = 1, scale_max = 7, n_bins = 7):
    """Print the change in AUC, macro-F1 and ECE between two result files.

    Each delta is ``metric(results_ft) - metric(results_noft)``, rounded to
    three decimals. The result is pretty-printed as a dict; nothing is
    returned.

    Args:
        model_name: Label stored under the 'model' key of the printed dict.
        results_ft: Path to the first results CSV (minuend of each delta).
        results_noft: Path to the second results CSV (subtrahend).
        threshold: Ratings ``>= threshold`` count as the positive class.
        scale_min: Rating mapped to a score of 0.
        scale_max: Rating mapped to a score of 1.
        n_bins: Number of bins passed to the ECE estimator.

    Raises:
        ValueError: If ``scale_max`` is not greater than ``scale_min``.
    """
    if scale_max <= scale_min:
        raise ValueError('scale_max must be greater than scale_min')

    # Same pipeline for both files, so the two sides cannot drift apart.
    metrics_ft = _binary_metrics(results_ft, threshold, scale_min, scale_max, n_bins)
    metrics_noft = _binary_metrics(results_noft, threshold, scale_min, scale_max, n_bins)

    metrics = {'model': model_name}
    for key in ('auc', 'macro_f1', 'ece'):
        # Cast to a built-in float: NumPy >= 2 reprs np.float64 scalars as
        # 'np.float64(...)', which would clutter the pprint output.
        metrics[key] = float(np.round(metrics_ft[key] - metrics_noft[key], 3))

    pprint(metrics, sort_dicts = False)

In [4]:
# Deltas between the ft and noft result files for 'gpt4o'.
model_name, results_ft, results_noft = (
    'gpt4o',
    'results/ft/gpt4o.csv',
    'results/noft/gpt4o-noft.csv',
)

calc_deltas(
    model_name = model_name,
    results_ft = results_ft,
    results_noft = results_noft,
)

{'model': 'gpt4o', 'auc': 0.017, 'macro_f1': -0.017, 'ece': 0.157}


In [5]:
# Deltas between the ft and noft result files for 'Qwen2.5 32B'.
model_name, results_ft, results_noft = (
    'Qwen2.5 32B',
    'results/ft/unsloth_Qwen2.5-32B-Instruct-bnb-4bit.csv',
    'results/noft/unsloth_Qwen2.5-32B-Instruct-bnb-4bit.csv',
)

calc_deltas(
    model_name = model_name,
    results_ft = results_ft,
    results_noft = results_noft,
)

{'model': 'Qwen2.5 32B', 'auc': 0.05, 'macro_f1': 0.093, 'ece': 0.067}


In [6]:
# Deltas between the ft and noft result files for 'Llama 3 70B'.
model_name, results_ft, results_noft = (
    'Llama 3 70B',
    'results/ft/unsloth_llama-3-70b-Instruct-bnb-4bit.csv',
    'results/noft/unsloth_llama-3-70b-Instruct-bnb-4bit.csv',
)

calc_deltas(
    model_name = model_name,
    results_ft = results_ft,
    results_noft = results_noft,
)

{'model': 'Llama 3 70B', 'auc': 0.038, 'macro_f1': 0.061, 'ece': 0.019}


In [7]:
# Deltas between the ft and noft result files for 'Llama 3.1 70B'.
model_name, results_ft, results_noft = (
    'Llama 3.1 70B',
    'results/ft/unsloth_Meta-Llama-3.1-70B-Instruct-bnb-4bit.csv',
    'results/noft/unsloth_Meta-Llama-3.1-70B-Instruct-bnb-4bit.csv',
)

calc_deltas(
    model_name = model_name,
    results_ft = results_ft,
    results_noft = results_noft,
)

{'model': 'Llama 3.1 70B', 'auc': 0.041, 'macro_f1': 0.012, 'ece': 0.072}


In [8]:
# Deltas between the ft and noft result files for 'gpt4omini'.
model_name, results_ft, results_noft = (
    'gpt4omini',
    'results/ft/gpt4omini.csv',
    'results/noft/gpt4omini-noft.csv',
)

calc_deltas(
    model_name = model_name,
    results_ft = results_ft,
    results_noft = results_noft,
)

{'model': 'gpt4omini', 'auc': 0.039, 'macro_f1': 0.012, 'ece': 0.045}


In [9]:
# Deltas between the ft and noft result files for 'gpt35turbo'.
model_name, results_ft, results_noft = (
    'gpt35turbo',
    'results/ft/gpt35turbo.csv',
    'results/noft/gpt35turbo-noft.csv',
)

calc_deltas(
    model_name = model_name,
    results_ft = results_ft,
    results_noft = results_noft,
)

{'model': 'gpt35turbo', 'auc': 0.203, 'macro_f1': 0.057, 'ece': -0.105}
