### Evaluation of Phi4 Local

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score as f1_func

In [3]:
ground_truth_df = pd.read_csv('mimic-cxr-2.1.0-test-set-labeled.csv')

results_df = pd.read_json('phi4_output.json')

labels_df = pd.json_normalize(results_df['labels'])
results_df = pd.concat([results_df.drop(columns=['labels']), labels_df], axis = 1)

results_df = results_df.replace({
        'Yes' : 1,
        'No' : 0,
        'Maybe' : -1,
        'Undefined' : np.nan
    }
)

results_df['patient_id'] = results_df["patient_id"].str.replace("p", "").astype(int)
results_df['report_name'] = results_df['report_name'].str.replace('s', '').str.replace('.txt', '').astype(int)
results_df = results_df.rename(columns = {
    'patient_id' : 'subject_id',
    'report_name' : 'study_id'
})


ground_truth_df = ground_truth_df.drop('No Finding', axis = 1)
results_df = results_df.drop('subject_id', axis = 1)

ground_truth_df = ground_truth_df.fillna(0)
results_df = results_df.fillna(0)

ground_truth_df = ground_truth_df.rename(columns = {'Airspace Opacity' : 'Lung Opacity'})
reordered_columns = [col for col in ground_truth_df.columns]
results_df = results_df[reordered_columns]

ground_truth_ids = {id for id in ground_truth_df['study_id']}
results_ids = {id for id in results_df['study_id']}
missing_ids_in_ground_truth = {id for id in ground_truth_ids if id not in results_ids}
ground_truth_df = ground_truth_df.set_index('study_id').drop(missing_ids_in_ground_truth).reset_index()

  results_df = results_df.replace({


### Evaluation Metrics

In [5]:
# List of all relevant category columns (excluding 'study_id')
categories = [col for col in ground_truth_df.columns if col != "study_id"]

# Container to hold metric information for each category
metric_results = []

for cat in categories:
    # Reset indices to ensure proper alignment
    y_true = ground_truth_df[cat].reset_index(drop=True)
    y_pred = results_df[cat].reset_index(drop=True)

    total_valid = len(y_true)

    if total_valid == 0:
        matches = 0
        accuracy = np.nan
        precision = np.nan
        recall = np.nan
        f1_val = np.nan
    else:
        matches = (y_true == y_pred).sum()
        accuracy = (matches / total_valid) * 100  # convert to percentage
        # Compute metrics using 'macro' averaging for multiclass (-1, 0, 1) and convert to percentage
        precision = precision_score(y_true, y_pred, average='macro', zero_division=0) * 100
        recall = recall_score(y_true, y_pred, average='macro', zero_division=0) * 100
        f1_val = f1_func(y_true, y_pred, average='macro', zero_division=0) * 100

    metric_results.append({
        "Category": cat,
        "Total Valid": total_valid,
        "Matches": matches,
        "Accuracy (%)": accuracy,
        "Precision (%)": precision,
        "Recall (%)": recall,
        "F1-Score (%)": f1_val
    })

# Create and display the dataframe with percentages
metrics_df = pd.DataFrame(metric_results)
metrics_df

Unnamed: 0,Category,Total Valid,Matches,Accuracy (%),Precision (%),Recall (%),F1-Score (%)
0,Enlarged Cardiomediastinum,685,531,77.518248,37.998429,39.006563,36.660902
1,Cardiomegaly,685,368,53.722628,37.22434,39.384506,37.280316
2,Lung Lesion,685,510,74.452555,34.038969,35.540929,33.43523
3,Lung Opacity,685,340,49.635036,34.24104,30.71801,31.377384
4,Edema,685,391,57.080292,32.507317,32.383966,32.371611
5,Consolidation,685,449,65.547445,35.180081,36.893752,34.009349
6,Pneumonia,685,426,62.189781,34.6337,33.681371,33.777363
7,Atelectasis,685,293,42.773723,29.494364,28.531922,28.629542
8,Pneumothorax,685,585,85.40146,30.805687,30.612245,30.708661
9,Pleural Effusion,685,300,43.79562,32.828844,33.620781,32.487542


### Evaluation Metrics based on 1's and 0's

In [7]:
# List of all relevant category columns (excluding 'study_id')
categories = [col for col in ground_truth_df.columns if col != "study_id"]

# Container to hold metric information for each category
metric_results = []

for cat in categories:
    # Filter rows where the ground truth is not -1
    valid_mask = ground_truth_df[cat] != -1
    y_true = ground_truth_df.loc[valid_mask, cat].reset_index(drop=True)
    y_pred = results_df.loc[valid_mask, cat].reset_index(drop=True)

    total_valid = len(y_true)

    if total_valid == 0:
        matches = 0
        accuracy = np.nan
        precision = np.nan
        recall = np.nan
        f1_val = np.nan
    else:
        matches = (y_true == y_pred).sum()
        accuracy = (matches / total_valid) * 100  # Convert to percentage

        # Compute metrics using 'macro' averaging for classes 0 and 1 (since -1 is discarded)
        precision = precision_score(y_true, y_pred, average='macro', zero_division=0) * 100
        recall = recall_score(y_true, y_pred, average='macro', zero_division=0) * 100
        f1_val = f1_func(y_true, y_pred, average='macro', zero_division=0) * 100

    metric_results.append({
        "Category": cat,
        "Total Valid": total_valid,
        "Matches": matches,
        "Accuracy (%)": accuracy,
        "Precision (%)": precision,
        "Recall (%)": recall,
        "F1-Score (%)": f1_val
    })

# Create a dataframe from the results for a nice visual summary
metrics_df = pd.DataFrame(metric_results)
metrics_df

Unnamed: 0,Category,Total Valid,Matches,Accuracy (%),Precision (%),Recall (%),F1-Score (%)
0,Enlarged Cardiomediastinum,647,521,80.525502,36.159364,30.234633,32.256944
1,Cardiomegaly,646,359,55.572755,35.514806,31.692198,33.287891
2,Lung Lesion,676,509,75.295858,33.648688,31.837225,32.297646
3,Lung Opacity,685,340,49.635036,34.24104,30.71801,31.377384
4,Edema,644,388,60.248447,32.808968,29.944942,31.309777
5,Consolidation,659,444,67.37481,34.892822,30.483495,32.053659
6,Pneumonia,597,411,68.844221,35.282407,27.999553,31.116364
7,Atelectasis,645,292,45.271318,30.78469,27.698589,28.836382
8,Pneumothorax,675,585,86.666667,31.300161,30.612245,30.952381
9,Pleural Effusion,662,297,44.864048,32.384179,29.272955,30.745215


### Evaluation Metrics based on -1's

In [9]:
maybe_counts = (ground_truth_df.drop(['study_id'], axis=1) == -1).sum()

maybe_percentages = (maybe_counts / len(ground_truth_df)) * 100

maybe_summary_df = pd.DataFrame({
    '-1 Count': maybe_counts,
    '-1 Percentage': maybe_percentages
})

maybe_summary_df

Unnamed: 0,-1 Count,-1 Percentage
Enlarged Cardiomediastinum,38,5.547445
Cardiomegaly,39,5.693431
Lung Lesion,9,1.313869
Lung Opacity,0,0.0
Edema,41,5.985401
Consolidation,26,3.79562
Pneumonia,88,12.846715
Atelectasis,40,5.839416
Pneumothorax,10,1.459854
Pleural Effusion,23,3.357664


In [10]:
# List of relevant category columns (excluding 'study_id')
categories = [col for col in ground_truth_df.columns if col != "study_id"]

metric_results = []

for cat in categories:
    # Select only the rows where the ground truth equals -1
    valid_mask = ground_truth_df[cat] == -1
    y_true = ground_truth_df.loc[valid_mask, cat].reset_index(drop=True)
    y_pred = results_df.loc[valid_mask, cat].reset_index(drop=True)

    total_valid = len(y_true)

    if total_valid == 0:
        matches = 0
        accuracy = np.nan
        precision = np.nan
        recall = np.nan
        f1_val = np.nan
    else:
        # Accuracy: proportion of predictions that are exactly -1 (matching the ground truth)
        matches = (y_true == y_pred).sum()
        accuracy = (matches / total_valid) * 100

        # To use binary metrics, convert y_true and y_pred to binary:
        # For all samples, since ground truth is -1, we map that to 1 (the positive class).
        y_true_bin = np.ones(total_valid, dtype=int)
        # For predictions, mark as 1 if predicted equals -1, else 0.
        y_pred_bin = (y_pred == -1).astype(int)

        precision = precision_score(y_true_bin, y_pred_bin, pos_label=1, zero_division=0) * 100
        recall = recall_score(y_true_bin, y_pred_bin, pos_label=1, zero_division=0) * 100
        f1_val = f1_func(y_true_bin, y_pred_bin, pos_label=1, zero_division=0) * 100

    metric_results.append({
        "Category": cat,
        "Total Valid": total_valid,
        "Matches": matches,
        "Accuracy (%)": accuracy,
        "Precision (%)": precision,
        "Recall (%)": recall,
        "F1-Score (%)": f1_val
    })

metrics_df = pd.DataFrame(metric_results)
metrics_df

Unnamed: 0,Category,Total Valid,Matches,Accuracy (%),Precision (%),Recall (%),F1-Score (%)
0,Enlarged Cardiomediastinum,38,10,26.315789,100.0,26.315789,41.666667
1,Cardiomegaly,39,9,23.076923,100.0,23.076923,37.5
2,Lung Lesion,9,1,11.111111,100.0,11.111111,20.0
3,Lung Opacity,0,0,,,,
4,Edema,41,3,7.317073,100.0,7.317073,13.636364
5,Consolidation,26,5,19.230769,100.0,19.230769,32.258065
6,Pneumonia,88,15,17.045455,100.0,17.045455,29.126214
7,Atelectasis,40,1,2.5,100.0,2.5,4.878049
8,Pneumothorax,10,0,0.0,0.0,0.0,0.0
9,Pleural Effusion,23,3,13.043478,100.0,13.043478,23.076923
