In [None]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import os
import json

import warnings
warnings.filterwarnings('ignore')

def extract_task_pred(iat_triplet_json_str, task):
    """Pull one task's prediction out of a JSON-bearing response string.

    The text between the first '{' and the last '}' (inclusive) is parsed as
    JSON, so any wrapper text around the object is discarded. When either
    brace is absent the whole string is handed to the JSON parser unchanged.

    Args:
        iat_triplet_json_str: String containing a JSON object.
        task: Key to look up in the parsed object.

    Returns:
        The value stored under ``task``, except that the sentinel
        "UNCERTAIN" is reported as 'NONE'.

    Raises:
        json.JSONDecodeError: If the selected text is not valid JSON.
        KeyError: If ``task`` is not a key of the parsed object.
    """
    raw = iat_triplet_json_str
    first_brace, last_brace = raw.find('{'), raw.rfind('}')
    missing_brace = first_brace == -1 or last_brace == -1
    payload = raw if missing_brace else raw[first_brace:last_brace + 1]
    value = json.loads(payload)[task]
    return 'NONE' if value == "UNCERTAIN" else value

def format_gpt_iat_preds(iat_preds_path):
    """Load a predictions CSV and expand its 'iat_triplet' column per task.

    For each of 'instrument', 'action' and 'tissue', a '<task>_pred' column
    is appended holding the value extract_task_pred returns for that row's
    'iat_triplet' string. The 'iat_triplet' column itself is then removed.

    Args:
        iat_preds_path: Path of a CSV file that has an 'iat_triplet' column.

    Returns:
        DataFrame with the CSV's other columns followed by
        'instrument_pred', 'action_pred' and 'tissue_pred'.
    """
    parsed = pd.read_csv(iat_preds_path)
    triplet_strings = parsed['iat_triplet']
    for task_name in ('instrument', 'action', 'tissue'):
        parsed[f'{task_name}_pred'] = triplet_strings.apply(
            lambda triplet, task_name=task_name: extract_task_pred(triplet, task_name)
        )
    return parsed.drop(columns=['iat_triplet'])

def compute_metrics(df):
    """Score the per-task predictions in ``df`` against its label columns.

    For each task in ('instrument', 'action', 'tissue') the column ``<task>``
    is read as ground truth and ``<task>_pred`` as the prediction. Both are
    integer-encoded over the union of values seen in either column.

    Metrics per task:
        * AUROC  - weighted average, computed on one-hot encodings of the
          labels and of the hard predictions (no probabilities are used).
        * F1, precision, recall - macro average over the encoded classes.

    Args:
        df: DataFrame with columns 'instrument', 'action', 'tissue' and the
            matching '<task>_pred' columns.

    Returns:
        Dict with keys 'auc_scores', 'f1_scores', 'precision_scores' and
        'recall_scores', each mapping task name to that task's score.
    """
    tasks = ['instrument', 'action', 'tissue']

    # Pass 1: integer-encode labels and predictions for every task.
    encoded = {}
    for task in tasks:
        pred_column = df[f'{task}_pred']
        raw_labels = list(df[task].values)
        raw_preds = list(pred_column.values)

        # Fit on the union so a value seen only in predictions (or only in
        # labels) can still be transformed.
        vocabulary = list(set(raw_labels) | set(raw_preds))
        encoder = LabelEncoder().fit(vocabulary)

        encoded[task] = (
            np.array(encoder.transform(raw_labels)),
            np.array(encoder.transform(pred_column)),
            len(encoder.classes_),
        )

    auc_scores, f1_scores, precision_scores, recall_scores = {}, {}, {}, {}

    # Pass 2: score each task.
    for task in tasks:
        y_true, y_pred, n_classes = encoded[task]

        # One-hot rows: column j is 1.0 where the encoded id equals j.
        class_ids = np.arange(n_classes)
        true_onehot = (y_true[:, None] == class_ids).astype(float)
        pred_onehot = (y_pred[:, None] == class_ids).astype(float)

        # NOTE(review): y_true is passed as a one-hot indicator matrix;
        # sklearn documents `multi_class` as applying to multiclass targets,
        # so confirm that 'ovo' has the intended effect here.
        auc_scores[task] = roc_auc_score(
            true_onehot, pred_onehot, multi_class='ovo', average='weighted'
        )
        f1_scores[task] = f1_score(y_true, y_pred, average='macro')
        precision_scores[task] = precision_score(y_true, y_pred, average='macro')
        recall_scores[task] = recall_score(y_true, y_pred, average='macro')

    return {
        'auc_scores': auc_scores,
        'f1_scores': f1_scores,
        'precision_scores': precision_scores,
        'recall_scores': recall_scores
    }


In [25]:
# Parse the raw prediction CSV into per-task '<task>_pred' columns and save
# the result under the "final" predictions directory.
iat_preds_path = '../../outputs/iat_predictions/gpt4o-vision+procedure+task--raw.csv'
df = format_gpt_iat_preds(iat_preds_path)
new_iat_preds_path = '../../outputs/iat_predictions_final/gpt4o-vision+procedure+task.csv'
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before writing (no-op when it already does).
os.makedirs(os.path.dirname(new_iat_preds_path), exist_ok=True)
df.to_csv(new_iat_preds_path, index=False)

In [45]:
# Echo which predictions file the scores below belong to.
print(new_iat_preds_path)

# Score the formatted predictions held in `df` (set by the preceding cell);
# compute_metrics returns a dict of per-task AUROC / F1 / precision / recall.
# NOTE(review): the output recorded with this cell shows AUROC close to 0.5
# and macro-F1 below 0.07 for every task — confirm that label and prediction
# strings use the same vocabulary/formatting before relying on these numbers.
metrics = compute_metrics(df)
# Bare trailing expression so the notebook displays the dict.
metrics

../../outputs/iat_predictions_final/gpt4o-vision+procedure+task.csv


{'auc_scores': {'instrument': 0.5034504590740607,
  'action': 0.5073372738123906,
  'tissue': 0.5180532346146944},
 'f1_scores': {'instrument': 0.0035574204141202224,
  'action': 0.009707163110371415,
  'tissue': 0.061029444703963265},
 'precision_scores': {'instrument': 0.04276315789473684,
  'action': 0.03419972438418738,
  'tissue': 0.07035232965751236},
 'recall_scores': {'instrument': 0.0018559055867516895,
  'action': 0.061274867672821635,
  'tissue': 0.11742182938004714}}