### Evaluation of DeepSeek R1 Distill Local

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score as f1_func

In [2]:
ground_truth_df = pd.read_csv('mimic-cxr-2.1.0-test-set-labeled.csv')

results_df = pd.read_json('deepseek_r1_distill_local_output.json')

labels_df = pd.json_normalize(results_df['labels'])
results_df = pd.concat([results_df.drop(columns=['labels']), labels_df], axis = 1)

results_df = results_df.replace({
        'Yes' : 1,
        'No' : 0,
        'Maybe' : -1,
        'Undefined' : np.nan
    }
)

results_df['patient_id'] = results_df["patient_id"].str.replace("p", "").astype(int)
results_df['report_name'] = results_df['report_name'].str.replace('s', '').str.replace('.txt', '').astype(int)
results_df = results_df.rename(columns = {
    'patient_id' : 'subject_id',
    'report_name' : 'study_id'
})


ground_truth_df = ground_truth_df.drop('No Finding', axis = 1)
results_df = results_df.drop('subject_id', axis = 1)

ground_truth_df = ground_truth_df.fillna(0)
results_df = results_df.fillna(0)

ground_truth_df = ground_truth_df.rename(columns = {'Airspace Opacity' : 'Lung Opacity'})
reordered_columns = [col for col in ground_truth_df.columns]
results_df = results_df[reordered_columns]

ground_truth_ids = {id for id in ground_truth_df['study_id']}
results_ids = {id for id in results_df['study_id']}
missing_ids_in_ground_truth = {id for id in ground_truth_ids if id not in results_ids}
ground_truth_df = ground_truth_df.set_index('study_id').drop(missing_ids_in_ground_truth).reset_index()

  results_df = results_df.replace({


### Evaluation Metrics

In [4]:
# List of all relevant category columns (excluding 'study_id')
categories = [col for col in ground_truth_df.columns if col != "study_id"]

# Container to hold metric information for each category
metric_results = []

for cat in categories:
    # Reset indices to ensure proper alignment
    y_true = ground_truth_df[cat].reset_index(drop=True)
    y_pred = results_df[cat].reset_index(drop=True)

    total_valid = len(y_true)

    if total_valid == 0:
        matches = 0
        accuracy = np.nan
        precision = np.nan
        recall = np.nan
        f1_val = np.nan
    else:
        matches = (y_true == y_pred).sum()
        accuracy = (matches / total_valid) * 100  # convert to percentage
        # Compute metrics using 'macro' averaging for multiclass (-1, 0, 1) and convert to percentage
        precision = precision_score(y_true, y_pred, average='macro', zero_division=0) * 100
        recall = recall_score(y_true, y_pred, average='macro', zero_division=0) * 100
        f1_val = f1_func(y_true, y_pred, average='macro', zero_division=0) * 100

    metric_results.append({
        "Category": cat,
        "Total Valid": total_valid,
        "Matches": matches,
        "Accuracy (%)": accuracy,
        "Precision (%)": precision,
        "Recall (%)": recall,
        "F1-Score (%)": f1_val
    })

# Create and display the dataframe with percentages
metrics_df = pd.DataFrame(metric_results)
metrics_df

Unnamed: 0,Category,Total Valid,Matches,Accuracy (%),Precision (%),Recall (%),F1-Score (%)
0,Enlarged Cardiomediastinum,685,488,71.240876,39.000252,37.807443,37.844031
1,Cardiomegaly,685,297,43.357664,36.079683,36.985752,32.587969
2,Lung Lesion,685,427,62.335766,32.676106,27.449852,29.137223
3,Lung Opacity,685,235,34.306569,32.5579,32.158693,22.895749
4,Edema,685,359,52.408759,34.842029,35.840212,34.243476
5,Consolidation,685,371,54.160584,35.404772,43.52353,30.317367
6,Pneumonia,685,385,56.20438,31.430309,30.388013,30.255225
7,Atelectasis,685,244,35.620438,30.887333,30.479288,25.855564
8,Pneumothorax,685,602,87.883212,30.871795,31.501832,31.183631
9,Pleural Effusion,685,281,41.021898,34.52983,39.628033,32.84726


### Evaluation Metrics based on 1's and 0's

In [6]:
# List of all relevant category columns (excluding 'study_id')
categories = [col for col in ground_truth_df.columns if col != "study_id"]

# Container to hold metric information for each category
metric_results = []

for cat in categories:
    # Filter rows where the ground truth is not -1
    valid_mask = ground_truth_df[cat] != -1
    y_true = ground_truth_df.loc[valid_mask, cat].reset_index(drop=True)
    y_pred = results_df.loc[valid_mask, cat].reset_index(drop=True)

    total_valid = len(y_true)

    if total_valid == 0:
        matches = 0
        accuracy = np.nan
        precision = np.nan
        recall = np.nan
        f1_val = np.nan
    else:
        matches = (y_true == y_pred).sum()
        accuracy = (matches / total_valid) * 100  # Convert to percentage

        # Compute metrics using 'macro' averaging for classes 0 and 1 (since -1 is discarded)
        precision = precision_score(y_true, y_pred, average='macro', zero_division=0) * 100
        recall = recall_score(y_true, y_pred, average='macro', zero_division=0) * 100
        f1_val = f1_func(y_true, y_pred, average='macro', zero_division=0) * 100

    metric_results.append({
        "Category": cat,
        "Total Valid": total_valid,
        "Matches": matches,
        "Accuracy (%)": accuracy,
        "Precision (%)": precision,
        "Recall (%)": recall,
        "F1-Score (%)": f1_val
    })

# Create a dataframe from the results for a nice visual summary
metrics_df = pd.DataFrame(metric_results)
metrics_df

Unnamed: 0,Category,Total Valid,Matches,Accuracy (%),Precision (%),Recall (%),F1-Score (%)
0,Enlarged Cardiomediastinum,647,479,74.034003,32.523602,29.912706,30.607346
1,Cardiomegaly,646,284,43.962848,35.856815,25.87464,29.688055
2,Lung Lesion,676,427,63.16568,33.228301,27.449852,29.350036
3,Lung Opacity,685,235,34.306569,32.5579,32.158693,22.895749
4,Edema,644,351,54.503106,33.984569,29.336147,31.185218
5,Consolidation,659,354,53.717754,33.850129,21.728658,26.467574
6,Pneumonia,597,369,61.809045,32.990438,24.327407,27.985588
7,Atelectasis,645,243,37.674419,30.8241,29.645954,25.573031
8,Pneumothorax,675,602,89.185185,31.354167,31.501832,31.427826
9,Pleural Effusion,662,272,41.087613,33.274643,26.584555,29.554162


### Evaluation Metrics based on -1's

In [8]:
maybe_counts = (ground_truth_df.drop(['study_id'], axis=1) == -1).sum()

maybe_percentages = (maybe_counts / len(ground_truth_df)) * 100

maybe_summary_df = pd.DataFrame({
    '-1 Count': maybe_counts,
    '-1 Percentage': maybe_percentages
})

maybe_summary_df

Unnamed: 0,-1 Count,-1 Percentage
Enlarged Cardiomediastinum,38,5.547445
Cardiomegaly,39,5.693431
Lung Lesion,9,1.313869
Lung Opacity,0,0.0
Edema,41,5.985401
Consolidation,26,3.79562
Pneumonia,88,12.846715
Atelectasis,40,5.839416
Pneumothorax,10,1.459854
Pleural Effusion,23,3.357664


In [9]:
# List of relevant category columns (excluding 'study_id')
categories = [col for col in ground_truth_df.columns if col != "study_id"]

metric_results = []

for cat in categories:
    # Select only the rows where the ground truth equals -1
    valid_mask = ground_truth_df[cat] == -1
    y_true = ground_truth_df.loc[valid_mask, cat].reset_index(drop=True)
    y_pred = results_df.loc[valid_mask, cat].reset_index(drop=True)

    total_valid = len(y_true)

    if total_valid == 0:
        matches = 0
        accuracy = np.nan
        precision = np.nan
        recall = np.nan
        f1_val = np.nan
    else:
        # Accuracy: proportion of predictions that are exactly -1 (matching the ground truth)
        matches = (y_true == y_pred).sum()
        accuracy = (matches / total_valid) * 100

        # To use binary metrics, convert y_true and y_pred to binary:
        # For all samples, since ground truth is -1, we map that to 1 (the positive class).
        y_true_bin = np.ones(total_valid, dtype=int)
        # For predictions, mark as 1 if predicted equals -1, else 0.
        y_pred_bin = (y_pred == -1).astype(int)

        precision = precision_score(y_true_bin, y_pred_bin, pos_label=1, zero_division=0) * 100
        recall = recall_score(y_true_bin, y_pred_bin, pos_label=1, zero_division=0) * 100
        f1_val = f1_func(y_true_bin, y_pred_bin, pos_label=1, zero_division=0) * 100

    metric_results.append({
        "Category": cat,
        "Total Valid": total_valid,
        "Matches": matches,
        "Accuracy (%)": accuracy,
        "Precision (%)": precision,
        "Recall (%)": recall,
        "F1-Score (%)": f1_val
    })

metrics_df = pd.DataFrame(metric_results)
metrics_df

Unnamed: 0,Category,Total Valid,Matches,Accuracy (%),Precision (%),Recall (%),F1-Score (%)
0,Enlarged Cardiomediastinum,38,9,23.684211,100.0,23.684211,38.297872
1,Cardiomegaly,39,13,33.333333,100.0,33.333333,50.0
2,Lung Lesion,9,0,0.0,0.0,0.0,0.0
3,Lung Opacity,0,0,,,,
4,Edema,41,8,19.512195,100.0,19.512195,32.653061
5,Consolidation,26,17,65.384615,100.0,65.384615,79.069767
6,Pneumonia,88,16,18.181818,100.0,18.181818,30.769231
7,Atelectasis,40,1,2.5,100.0,2.5,4.878049
8,Pneumothorax,10,0,0.0,0.0,0.0,0.0
9,Pleural Effusion,23,9,39.130435,100.0,39.130435,56.25
