# This file includes:
#### Calculating evaluation metrics for Gemini API & CheXPert
##### • Accuracy, Precision, Recall, F1 Score
#### Calculating correlation between Gemini API & CheXPert
------------------------------------------------------------
##### Last Update: 30.01.25

In [1]:
import json
import os

import numpy as np
import pandas as pd

In [2]:
# Base directory that holds every input file read by this notebook.
# The original machine-specific location remains the default; set the
# CXR_EVAL_DATA_DIR environment variable to point the notebook at another
# copy of the data without editing this cell.
file_path = os.environ.get(
    "CXR_EVAL_DATA_DIR",
    "C:/Users/emred/OneDrive/Masaüstü/Docs/3rd Year/Spring/CENG404 - Special Topics in CENG",
)

### Loading Data

In [4]:
# Ground-truth labels for the test set.
ground_truth_df = pd.read_csv(file_path + '/Doktor Raporları/data' + '/mimic-cxr-2.1.0-test-set-labeled.csv')

# Model results: one JSON record per report, with the per-category answers
# nested under 'labels'. Flatten that dict into one column per category.
results_df = pd.read_json(file_path + '/merged.json')
labels_df = pd.json_normalize(results_df['labels'])
results_df = pd.concat([results_df.drop(columns=['labels']), labels_df], axis=1)

# Translate the textual answers into the numeric coding used by every
# comparison below: 1 = Yes, 0 = No, -1 = Maybe, NaN = Undefined.
results_df = results_df.replace({
    'Yes': 1,
    'No': 0,
    'Maybe': -1,
    'Undefined': np.nan,
})

# Turn the 'p…' patient ids and 's….txt' report names into plain integers.
# regex=False makes the replacements literal on every pandas version, so the
# '.' in '.txt' can never act as a wildcard.
results_df['patient_id'] = (
    results_df['patient_id']
    .str.replace('p', '', regex=False)
    .astype(int)
)
results_df['report_name'] = (
    results_df['report_name']
    .str.replace('s', '', regex=False)
    .str.replace('.txt', '', regex=False)
    .astype(int)
)
results_df = results_df.rename(columns={
    'patient_id': 'subject_id',
    'report_name': 'study_id',
})

# CheXpert results to compare against.
chexpert_results_df = pd.read_csv(file_path + '/comparison_relevant.csv')

# 'No Finding' is not evaluated, and 'subject_id' is not used as a join key.
ground_truth_df = ground_truth_df.drop('No Finding', axis=1)
results_df = results_df.drop('subject_id', axis=1)
chexpert_results_df = chexpert_results_df.drop('No Finding', axis=1)

# Fill NaN values w/ 0 (a missing / undefined label is treated as 0 from here on).
ground_truth_df = ground_truth_df.fillna(0)
results_df = results_df.fillna(0)
chexpert_results_df = chexpert_results_df.fillna(0)

# Give all three frames the same column names in the same order, using the
# ground truth as the reference after renaming its 'Airspace Opacity' column.
ground_truth_df = ground_truth_df.rename(columns={'Airspace Opacity': 'Lung Opacity'})
reordered_columns = list(ground_truth_df.columns)
results_df = results_df[reordered_columns]
chexpert_results_df = chexpert_results_df[reordered_columns]


### Find which reports are missing in results_df, and save them as a json file.

In [6]:
# Keep one row per study; when a study appears several times the last row wins.
print(f'unfiltered results_df len = {len(results_df)}')
results_df = results_df.drop_duplicates(subset=['study_id'], keep='last')
print(f'filtered results_df len = {len(results_df)}')

# Study ids present in the ground truth but absent from the results.
# Plain set arithmetic avoids shadowing the builtin `id` in a comprehension.
ground_truth_ids = set(ground_truth_df['study_id'])
results_ids = set(results_df['study_id'])
missing_ids = ground_truth_ids - results_ids

# Explicit UTF-8 so the read does not depend on the platform's default encoding.
with open("relevant_reports.json", "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# The JSON stores study ids as 's<digits>' strings; compare them as integers.
filtered_data = [entry for entry in data if int(entry['study_id'].replace("s", "")) in missing_ids]

output_file = "filtered_data.json"
with open(output_file, "w", encoding="utf-8") as json_file:
    json.dump(filtered_data, json_file, indent=4)

print(f'File {output_file} is saved.')

unfiltered results_df len = 820
filtered results_df len = 685
File filtered_data.json is saved.


### 1. Correlation Between Gemini API Methodology & CheXPert
#### (Including 1's, 0's and -1's.)

In [8]:
# Pair each result row with the ground-truth row of the same study.
# 'study_id' is the only join key: 'subject_id' was dropped from results_df
# when the data was loaded.
matched_df = pd.merge(
    results_df,
    ground_truth_df,
    on=['study_id'],
    suffixes=('_results', '_ground_truth'),
)

In [9]:
# Put the Gemini and CheXpert labels of the same study side by side.
merged_df = results_df.merge(chexpert_results_df, on='study_id', suffixes=('_gemini', '_chexpert'))

# Every label column; 'study_id' is only the join key.
columns_to_compare = list(results_df.columns.drop('study_id'))

# One {'Category', 'Correlation'} record per label column.
correlation_results = []
for category in columns_to_compare:
    gemini_labels = merged_df[f"{category}_gemini"]
    chexpert_labels = merged_df[f"{category}_chexpert"]

    # Only rows where both sides carry a value take part in the correlation.
    both_present = gemini_labels.notna() & chexpert_labels.notna()
    gemini_labels = gemini_labels[both_present]
    chexpert_labels = chexpert_labels[both_present]

    # Series.corr defaults to Pearson; with fewer than two rows record None.
    if len(gemini_labels) > 1:
        pearson_r = gemini_labels.corr(chexpert_labels)
    else:
        pearson_r = None

    correlation_results.append({'Category': category, 'Correlation': pearson_r})

# Tabulate and display the per-category correlations.
correlation_df = pd.DataFrame(correlation_results)
correlation_df

Unnamed: 0,Category,Correlation
0,Enlarged Cardiomediastinum,0.059602
1,Cardiomegaly,0.513145
2,Lung Lesion,0.487517
3,Lung Opacity,0.759691
4,Edema,0.566107
5,Consolidation,0.448026
6,Pneumonia,0.182475
7,Atelectasis,0.786729
8,Pneumothorax,0.412915
9,Pleural Effusion,0.661374


### 2. Evaluation Metrics for Gemini API

In [10]:
# Line up every prediction with the ground-truth label of the same study.
merged = results_df.merge(ground_truth_df, on=['study_id'], suffixes=('_results', '_ground_truth'))

# Per-category metric rows, collected in column order.
results = []

# Every label column; 'study_id' is only the join key.
columns_to_compare = [name for name in results_df.columns if name != 'study_id']

for category in columns_to_compare:
    predicted = merged[f"{category}_results"]
    actual = merged[f"{category}_ground_truth"]

    # Drop rows with a NaN on either side. The loading cell already fills
    # NaN with 0, so this only acts as a safeguard.
    usable = predicted.notna() & actual.notna()
    predicted, actual = predicted[usable], actual[usable]

    total_valid = len(actual)

    # Accuracy: share of rows where both values are identical (so -1 must match -1).
    matches = (predicted == actual).sum()
    if total_valid > 0:
        accuracy = (matches / total_valid) * 100
    else:
        accuracy = 0

    # Confusion counts for the positive class (1); 0 and -1 both count as "not 1".
    predicted_yes = predicted == 1
    actual_yes = actual == 1
    tp = (predicted_yes & actual_yes).sum()
    fp = (predicted_yes & ~actual_yes).sum()
    fn = (~predicted_yes & actual_yes).sum()

    # Precision / recall / F1 in percent, falling back to 0 on an empty denominator.
    if (tp + fp) > 0:
        precision = tp / (tp + fp) * 100
    else:
        precision = 0
    if (tp + fn) > 0:
        recall = tp / (tp + fn) * 100
    else:
        recall = 0
    if (precision + recall) > 0:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0

    results.append({
        'Category': category,
        'Total Valid': total_valid,
        'Matches': matches,
        'Accuracy': f"{accuracy:.2f}%",
        'Precision': f"{precision:.2f}%",
        'Recall': f"{recall:.2f}%",
        'F1-Score': f"{f1_score:.2f}%"
    })

# Tabulate and display the metrics.
accuracy_df = pd.DataFrame(results)
accuracy_df

Unnamed: 0,Category,Total Valid,Matches,Accuracy,Precision,Recall,F1-Score
0,Enlarged Cardiomediastinum,685,601,87.74%,54.17%,28.26%,37.14%
1,Cardiomegaly,685,594,86.72%,74.87%,92.72%,82.84%
2,Lung Lesion,685,653,95.33%,71.21%,88.68%,78.99%
3,Lung Opacity,685,567,82.77%,62.40%,92.31%,74.46%
4,Edema,685,615,89.78%,82.78%,92.59%,87.41%
5,Consolidation,685,658,96.06%,72.00%,96.43%,82.44%
6,Pneumonia,685,650,94.89%,84.91%,81.82%,83.33%
7,Atelectasis,685,612,89.34%,77.37%,95.43%,85.45%
8,Pneumothorax,685,674,98.39%,91.43%,84.21%,87.67%
9,Pleural Effusion,685,623,90.95%,91.48%,90.48%,90.98%


### 3. Evaluation Metrics of Gemini API (Considering only 1's and 0's)

In [13]:
# Work on a copy of the merged prediction / ground-truth frame.
copied_merged = merged.copy()

results = []
# Recover the category names from the suffixed prediction columns.
categories = [name.replace("_results", "") for name in merged.columns if "_results" in name]

for category in categories:
    predicted = copied_merged[f"{category}_results"]
    actual = copied_merged[f"{category}_ground_truth"]

    # Keep only rows where both sides are a definite 0 or 1; any other value
    # (such as -1) removes the row from this category.
    definite = predicted.isin([0, 1]) & actual.isin([0, 1])
    predicted, actual = predicted[definite], actual[definite]

    total_valid = len(actual)

    # Accuracy over the remaining rows.
    matches = (predicted == actual).sum()
    if total_valid > 0:
        accuracy = (matches / total_valid) * 100
    else:
        accuracy = 0

    # Confusion counts for the positive class.
    tp = ((predicted == 1) & (actual == 1)).sum()
    fp = ((predicted == 1) & (actual == 0)).sum()
    fn = ((predicted == 0) & (actual == 1)).sum()

    # Precision / recall / F1 in percent, falling back to 0 on an empty denominator.
    if (tp + fp) > 0:
        precision = tp / (tp + fp) * 100
    else:
        precision = 0
    if (tp + fn) > 0:
        recall = tp / (tp + fn) * 100
    else:
        recall = 0
    if (precision + recall) > 0:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0

    results.append({
        'Category': category,
        'Total Valid': total_valid,
        'Matches': matches,
        'Accuracy': f"{accuracy:.2f}%",
        'Precision': f"{precision:.2f}%",
        'Recall': f"{recall:.2f}%",
        'F1-Score': f"{f1_score:.2f}%"
    })

# Tabulate and display the metrics.
filtered_metrics_df = pd.DataFrame(results)
filtered_metrics_df

Unnamed: 0,Category,Total Valid,Matches,Accuracy,Precision,Recall,F1-Score
0,Enlarged Cardiomediastinum,629,598,95.07%,56.52%,38.24%,45.61%
1,Cardiomegaly,629,583,92.69%,76.50%,97.90%,85.89%
2,Lung Lesion,667,649,97.30%,75.81%,94.00%,83.93%
3,Lung Opacity,672,567,84.38%,62.40%,93.41%,74.82%
4,Edema,627,603,96.17%,89.29%,93.28%,91.24%
5,Consolidation,658,642,97.57%,79.41%,96.43%,87.10%
6,Pneumonia,580,578,99.66%,97.83%,97.83%,97.83%
7,Atelectasis,631,589,93.34%,82.82%,98.43%,89.95%
8,Pneumothorax,672,667,99.26%,96.97%,88.89%,92.75%
9,Pleural Effusion,628,608,96.82%,94.27%,98.02%,96.11%


### 4. Correlation Between Gemini API & CheXPert (considering only 1's and 0's)

In [14]:
# Put the Gemini and CheXpert labels of the same study side by side.
merged_df = results_df.merge(chexpert_results_df, on='study_id', suffixes=('_results', '_chexpert'))

# One {'Category', 'Correlation'} record per label column.
correlation_results = []

# Every label column; 'study_id' is only the join key.
columns_to_compare = [name for name in results_df.columns if name != 'study_id']

for category in columns_to_compare:
    gemini_labels = merged_df[f"{category}_results"]
    chexpert_labels = merged_df[f"{category}_chexpert"]

    # Keep only rows where both sides are a definite 0 or 1.
    definite = gemini_labels.isin([0, 1]) & chexpert_labels.isin([0, 1])
    gemini_labels = gemini_labels[definite]
    chexpert_labels = chexpert_labels[definite]

    # Series.corr defaults to Pearson; with fewer than two rows record None.
    if len(gemini_labels) > 1:
        pearson_r = gemini_labels.corr(chexpert_labels)
    else:
        pearson_r = None

    correlation_results.append({'Category': category, 'Correlation': pearson_r})

# Tabulate and display the per-category correlations.
correlation_df = pd.DataFrame(correlation_results)
correlation_df

Unnamed: 0,Category,Correlation
0,Enlarged Cardiomediastinum,0.361223
1,Cardiomegaly,0.832818
2,Lung Lesion,0.752613
3,Lung Opacity,0.887886
4,Edema,0.845382
5,Consolidation,0.934651
6,Pneumonia,0.725025
7,Atelectasis,0.962614
8,Pneumothorax,0.758836
9,Pleural Effusion,0.973438


In [15]:
# Row count of the ground-truth frame.
ground_truth_df.shape[0]

687

In [16]:
# Per label column, count the ground-truth cells that hold -1.
maybe_counts = ground_truth_df.drop(columns=['study_id']).eq(-1).sum()

# The same counts as a percentage of all ground-truth rows.
maybe_percentages = maybe_counts / len(ground_truth_df) * 100

# Side-by-side summary, indexed by label column.
maybe_summary_df = pd.DataFrame({'-1 Count': maybe_counts, '-1 Percentage': maybe_percentages})

maybe_summary_df

Unnamed: 0,-1 Count,-1 Percentage
Enlarged Cardiomediastinum,38,5.531295
Cardiomegaly,39,5.676856
Lung Lesion,9,1.310044
Lung Opacity,0,0.0
Edema,41,5.967977
Consolidation,26,3.784571
Pneumonia,89,12.954876
Atelectasis,41,5.967977
Pneumothorax,10,1.455604
Pleural Effusion,23,3.347889


In [17]:
# Row count of the de-duplicated results frame.
results_df.shape[0]

685

In [18]:
# Row count of the ground-truth frame, for comparison with results_df.
ground_truth_df.shape[0]

687

In [19]:
# Number of ground-truth study ids that have no matching row in results_df.
len(missing_ids)

2

In [20]:
# Number of ground-truth study ids that have no matching row in results_df.
len(missing_ids)

2

In [21]:
# The ground-truth study ids that have no matching row in results_df.
missing_ids

{54107786, 58592606}

In [22]:
# Row count of the de-duplicated results frame.
results_df.shape[0]

685

In [23]:
# Row count of the ground-truth frame.
ground_truth_df.shape[0]

687

### 5. Evaluation Metrics for CheXPert Labeler

In [25]:
# Line up every CheXpert label with the ground-truth label of the same study.
merged = chexpert_results_df.merge(
    ground_truth_df,
    on=['study_id'],
    suffixes=('_chexpert_results', '_ground_truth'),
)

# Per-category metric rows, collected in column order.
results = []

# Every label column; 'study_id' is only the join key.
columns_to_compare = [name for name in chexpert_results_df.columns if name != 'study_id']

for category in columns_to_compare:
    predicted = merged[f"{category}_chexpert_results"]
    actual = merged[f"{category}_ground_truth"]

    # Drop rows with a NaN on either side. The loading cell already fills
    # NaN with 0, so this only acts as a safeguard.
    usable = predicted.notna() & actual.notna()
    predicted, actual = predicted[usable], actual[usable]

    total_valid = len(actual)

    # Accuracy: share of rows where both values are identical (so -1 must match -1).
    matches = (predicted == actual).sum()
    if total_valid > 0:
        accuracy = (matches / total_valid) * 100
    else:
        accuracy = 0

    # Confusion counts for the positive class (1); 0 and -1 both count as "not 1".
    predicted_yes = predicted == 1
    actual_yes = actual == 1
    tp = (predicted_yes & actual_yes).sum()
    fp = (predicted_yes & ~actual_yes).sum()
    fn = (~predicted_yes & actual_yes).sum()

    # Precision / recall / F1 in percent, falling back to 0 on an empty denominator.
    if (tp + fp) > 0:
        precision = tp / (tp + fp) * 100
    else:
        precision = 0
    if (tp + fn) > 0:
        recall = tp / (tp + fn) * 100
    else:
        recall = 0
    if (precision + recall) > 0:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0

    results.append({
        'Category': category,
        'Total Valid': total_valid,
        'Matches': matches,
        'Accuracy': f"{accuracy:.2f}%",
        'Precision': f"{precision:.2f}%",
        'Recall': f"{recall:.2f}%",
        'F1-Score': f"{f1_score:.2f}%"
    })

# Tabulate and display the metrics.
accuracy_df = pd.DataFrame(results)
accuracy_df

Unnamed: 0,Category,Total Valid,Matches,Accuracy,Precision,Recall,F1-Score
0,Enlarged Cardiomediastinum,685,565,82.48%,30.34%,58.70%,40.00%
1,Cardiomegaly,685,569,83.07%,58.97%,91.39%,71.69%
2,Lung Lesion,685,652,95.18%,69.84%,83.02%,75.86%
3,Lung Opacity,685,550,80.29%,60.08%,93.49%,73.15%
4,Edema,685,637,92.99%,85.92%,90.37%,88.09%
5,Consolidation,685,645,94.16%,65.82%,92.86%,77.04%
6,Pneumonia,685,585,85.40%,43.82%,70.91%,54.17%
7,Atelectasis,685,623,90.95%,79.49%,94.42%,86.31%
8,Pneumothorax,685,649,94.74%,56.67%,89.47%,69.39%
9,Pleural Effusion,685,620,90.51%,88.93%,91.21%,90.05%


In [26]:
# Work on a copy of the merged CheXpert / ground-truth frame.
copied_merged = merged.copy()

results = []
# Recover the category names from the suffixed CheXpert columns.
categories = [name.replace("_chexpert_results", "") for name in merged.columns if "_chexpert_results" in name]

for category in categories:
    predicted = copied_merged[f"{category}_chexpert_results"]
    actual = copied_merged[f"{category}_ground_truth"]

    # Keep only rows where both sides are a definite 0 or 1; any other value
    # (such as -1) removes the row from this category.
    definite = predicted.isin([0, 1]) & actual.isin([0, 1])
    predicted, actual = predicted[definite], actual[definite]

    total_valid = len(actual)

    # Accuracy over the remaining rows.
    matches = (predicted == actual).sum()
    if total_valid > 0:
        accuracy = (matches / total_valid) * 100
    else:
        accuracy = 0

    # Confusion counts for the positive class.
    tp = ((predicted == 1) & (actual == 1)).sum()
    fp = ((predicted == 1) & (actual == 0)).sum()
    fn = ((predicted == 0) & (actual == 1)).sum()

    # Precision / recall / F1 in percent, falling back to 0 on an empty denominator.
    if (tp + fp) > 0:
        precision = tp / (tp + fp) * 100
    else:
        precision = 0
    if (tp + fn) > 0:
        recall = tp / (tp + fn) * 100
    else:
        recall = 0
    if (precision + recall) > 0:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0

    results.append({
        'Category': category,
        'Total Valid': total_valid,
        'Matches': matches,
        'Accuracy': f"{accuracy:.2f}%",
        'Precision': f"{precision:.2f}%",
        'Recall': f"{recall:.2f}%",
        'F1-Score': f"{f1_score:.2f}%"
    })

# Tabulate and display the metrics.
filtered_metrics_df = pd.DataFrame(results)
filtered_metrics_df

Unnamed: 0,Category,Total Valid,Matches,Accuracy,Precision,Recall,F1-Score
0,Enlarged Cardiomediastinum,610,541,88.69%,33.75%,62.79%,43.90%
1,Cardiomegaly,635,562,88.50%,67.65%,95.17%,79.08%
2,Lung Lesion,672,651,96.88%,74.58%,88.00%,80.73%
3,Lung Opacity,663,550,82.96%,60.08%,95.18%,73.66%
4,Edema,625,605,96.80%,90.37%,94.57%,92.42%
5,Consolidation,649,638,98.31%,86.67%,94.55%,90.43%
6,Pneumonia,555,529,95.32%,60.94%,97.50%,75.00%
7,Atelectasis,638,594,93.10%,83.41%,96.37%,89.42%
8,Pneumothorax,668,647,96.86%,65.38%,91.89%,76.40%
9,Pleural Effusion,638,609,95.45%,92.91%,96.14%,94.50%
