In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
# Root folder that the CSV/JSON inputs read in the loading cell are resolved against.
# NOTE(review): absolute, machine-specific path; it must be edited before the
# notebook can run on another machine.
file_path = "C:/Users/emred/OneDrive/Masaüstü/Docs/3rd Year/Spring/CENG404 - Special Topics in CENG"

### Loading Data

In [4]:
# --- Ground-truth labels ---
ground_truth_df = pd.read_csv(file_path + '/Doktor Raporları/data' + '/mimic-cxr-2.1.0-test-set-labeled.csv')


# --- Model predictions ---
# The 'labels' column is expanded into one column per key with json_normalize
# (presumably one dict of finding -> answer per row; verify against merged.json).
results_df = pd.read_json(file_path + '/merged.json')
labels_df = pd.json_normalize(results_df['labels'])
results_df = pd.concat([results_df.drop(columns=['labels']), labels_df], axis = 1)


# Encode the textual answers numerically: 1 = Yes, 0 = No, -1 = Maybe,
# NaN = Undefined.
results_df = results_df.replace({
        'Yes' : 1,
        'No' : 0,
        'Maybe' : -1,
        'Undefined' : np.nan
    }
)


# Turn the string identifiers into integers by stripping the "p" / "s" markers
# and the ".txt" extension. regex=False forces literal matching: the default of
# `regex` differs between pandas versions, and under regex mode the "." in
# ".txt" would match any character.
results_df['patient_id'] = results_df["patient_id"].str.replace("p", "", regex=False).astype(int)
results_df['report_name'] = (
    results_df['report_name']
    .str.replace('s', '', regex=False)
    .str.replace('.txt', '', regex=False)
    .astype(int)
)
results_df = results_df.rename(columns = {
    'patient_id' : 'subject_id',
    'report_name' : 'study_id'
})


# --- Second set of predictions, plus removal of columns not compared ---
chexpert_results_df = pd.read_csv(file_path + '/comparison_relevant.csv')
ground_truth_df = ground_truth_df.drop('No Finding', axis = 1)
results_df = results_df.drop('subject_id', axis = 1)
chexpert_results_df = chexpert_results_df.drop('No Finding', axis = 1)


# Fill NaN values w/ 0.
# This also turns the NaN produced for 'Undefined' above into 0, so no NaN
# remains in any of the three frames after this point.
ground_truth_df = ground_truth_df.fillna(0)
results_df = results_df.fillna(0)
chexpert_results_df = chexpert_results_df.fillna(0)


# Use the ground-truth column set and order for both prediction frames. The
# rename happens first so that 'Lung Opacity' is the name selected from them.
ground_truth_df = ground_truth_df.rename(columns = {'Airspace Opacity' : 'Lung Opacity'})
reordered_columns = [col for col in ground_truth_df.columns]
results_df = results_df[reordered_columns]
chexpert_results_df = chexpert_results_df[reordered_columns]

  results_df = results_df.replace({


### Find which reports are missing in results_df, and save them as a json file.

In [6]:
# drop_duplicates() without `subset` removes only rows that are identical in
# every column; rows that share a study_id but differ in any label are kept.
results_df = results_df.drop_duplicates()

ground_truth_ids = set(ground_truth_df['study_id'])
results_ids = set(results_df['study_id'])

# Study ids that have a ground-truth row but no row in results_df.
missing_ids = ground_truth_ids - results_ids

# Read and write the JSON files as UTF-8 explicitly instead of relying on the
# platform's locale encoding.
with open("relevant_reports.json", "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# The "s" marker is stripped from each entry's study_id so it can be compared
# with the integer ids collected above.
filtered_data = [entry for entry in data if int(entry['study_id'].replace("s", "")) in missing_ids]

output_file = "filtered_data.json"
with open(output_file, "w", encoding="utf-8") as json_file:
    json.dump(filtered_data, json_file, indent=4)

print(f'File {output_file} is saved.')

File filtered_data.json is saved.


### Element-wise comparison

In [8]:
# subject_id was dropped from results_df during loading, so study_id is the
# only key available for the join.
matched_df = pd.merge(
    results_df,
    ground_truth_df,
    on='study_id',
    suffixes=('_results', '_ground_truth'),
)

In [9]:
# Pair each prediction row with its ground-truth row (inner join on study_id).
merged = pd.merge(results_df, ground_truth_df, on=['study_id'], suffixes=('_results', '_ground_truth'))

results = []

columns_to_compare = [col for col in results_df.columns if col not in ['study_id']]
# Matches are counted over the joined rows, so the denominator has to be the
# joined row count too: an inner join can drop unmatched rows or repeat rows
# for duplicated keys, making len(results_df) the wrong total.
total = len(merged)

for col in columns_to_compare:
    matches = (merged[f"{col}_results"] == merged[f"{col}_ground_truth"]).sum()
    # Guard against an empty join so the cell reports 0.00% instead of dividing by zero.
    accuracy = matches / total * 100 if total > 0 else 0
    results.append({'Category': col, 'Total': total, 'Matches': matches, 'Accuracy': f"{accuracy:.2f}%"})

accuracy_df = pd.DataFrame(results)

accuracy_df

Unnamed: 0,Category,Total,Matches,Accuracy
0,Enlarged Cardiomediastinum,558,498,89.25%
1,Cardiomegaly,558,483,86.56%
2,Lung Lesion,558,531,95.16%
3,Lung Opacity,558,466,83.51%
4,Edema,558,503,90.14%
5,Consolidation,558,539,96.59%
6,Pneumonia,558,529,94.80%
7,Atelectasis,558,497,89.07%
8,Pneumothorax,558,547,98.03%
9,Pleural Effusion,558,508,91.04%


In [10]:
# Join predictions with ground truth on study_id; overlapping label columns
# get the '_results' / '_ground_truth' suffixes.
merged = pd.merge(results_df, ground_truth_df, on=['study_id'], suffixes=('_results', '_ground_truth'))

# Every column except the join key is a category to score.
columns_to_compare = [name for name in results_df.columns if name != 'study_id']

results = []
for col in columns_to_compare:
    predicted_all = merged[f"{col}_results"]
    actual_all = merged[f"{col}_ground_truth"]

    # Keep only the rows where both the prediction and the label are non-NaN.
    both_present = predicted_all.notna() & actual_all.notna()
    y_pred = predicted_all[both_present]
    y_true = actual_all[both_present]

    total_valid = len(y_true)

    # Accuracy: share of rows whose prediction equals the label exactly.
    matches = (y_pred == y_true).sum()
    if total_valid > 0:
        accuracy = (matches / total_valid) * 100
    else:
        accuracy = 0

    # Confusion counts for the positive class (value 1); any other value,
    # including -1, counts as "not positive".
    predicted_yes = y_pred == 1
    actual_yes = y_true == 1
    tp = (predicted_yes & actual_yes).sum()
    fp = (predicted_yes & ~actual_yes).sum()
    fn = (~predicted_yes & actual_yes).sum()

    # Precision / recall / F1 in percent, falling back to 0 on an empty denominator.
    if (tp + fp) > 0:
        precision = tp / (tp + fp) * 100
    else:
        precision = 0
    if (tp + fn) > 0:
        recall = tp / (tp + fn) * 100
    else:
        recall = 0
    if (precision + recall) > 0:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0

    row = {
        'Category': col,
        'Total Valid': total_valid,
        'Matches': matches,
        'Accuracy': f"{accuracy:.2f}%",
        'Precision': f"{precision:.2f}%",
        'Recall': f"{recall:.2f}%",
        'F1-Score': f"{f1_score:.2f}%"
    }
    results.append(row)

# One row per category.
accuracy_df = pd.DataFrame(results)

accuracy_df

Unnamed: 0,Category,Total Valid,Matches,Accuracy,Precision,Recall,F1-Score
0,Enlarged Cardiomediastinum,558,498,89.25%,68.42%,31.71%,43.33%
1,Cardiomegaly,558,483,86.56%,75.54%,90.52%,82.35%
2,Lung Lesion,558,531,95.16%,71.43%,90.91%,80.00%
3,Lung Opacity,558,466,83.51%,64.29%,92.65%,75.90%
4,Edema,558,503,90.14%,84.07%,89.62%,86.76%
5,Consolidation,558,539,96.59%,77.05%,95.92%,85.45%
6,Pneumonia,558,529,94.80%,86.05%,78.72%,82.22%
7,Atelectasis,558,497,89.07%,76.80%,94.30%,84.66%
8,Pneumothorax,558,547,98.03%,91.67%,82.50%,86.84%
9,Pleural Effusion,558,508,91.04%,90.95%,90.13%,90.54%


In [11]:
# Merge the two DataFrames on 'study_id'
merged_df = pd.merge(results_df, chexpert_results_df, on='study_id', suffixes=('_gemini', '_chexpert'))

# Initialize a list to store correlation results
correlation_results = []

# Columns to compare (excluding 'study_id')
columns_to_compare = [col for col in results_df.drop('study_id', axis = 1).columns];

# Iterate through each category to calculate correlation
for col in columns_to_compare:
    # Get predictions from both models for the current category
    model1_preds = merged_df[f"{col}_gemini"]
    model2_preds = merged_df[f"{col}_chexpert"]

    # Handle NaN values (e.g., exclude rows with NaN in either model)
    valid_indices = (~model1_preds.isna()) & (~model2_preds.isna())
    model1_valid = model1_preds[valid_indices]
    model2_valid = model2_preds[valid_indices]

    # Calculate correlation (Pearson)
    if len(model1_valid) > 1:  # Ensure there are enough data points
        correlation = model1_valid.corr(model2_valid)
    else:
        correlation = None  # Not enough data to calculate correlation

    # Append results for the current category
    correlation_results.append({
        'Category': col,
        'Correlation': correlation
    })

# Create a DataFrame to display the results
correlation_df = pd.DataFrame(correlation_results)

# Display the resulting DataFrame
correlation_df

Unnamed: 0,Category,Correlation
0,Enlarged Cardiomediastinum,0.075756
1,Cardiomegaly,0.490634
2,Lung Lesion,0.473895
3,Lung Opacity,0.757803
4,Edema,0.593614
5,Consolidation,0.508017
6,Pneumonia,0.123933
7,Atelectasis,0.758218
8,Pneumothorax,0.43496
9,Pleural Effusion,0.626461


In [12]:
# Pair each prediction row with its ground-truth row (inner join on study_id).
merged = pd.merge(results_df, ground_truth_df, on=['study_id'], suffixes=('_results', '_ground_truth'))

results = []

columns_to_compare = [col for col in results_df.columns if col not in ['study_id']]
# The matches below are counted on `merged`, so the total must be its row
# count; len(results_df) differs whenever the inner join drops or repeats rows.
total = len(merged)

for col in columns_to_compare:
    matches = (merged[f"{col}_results"] == merged[f"{col}_ground_truth"]).sum()
    # Report 0.00% rather than dividing by zero when the join is empty.
    accuracy = matches / total * 100 if total > 0 else 0
    results.append({'Category': col, 'Total': total, 'Matches': matches, 'Accuracy': f"{accuracy:.2f}%"})

accuracy_df = pd.DataFrame(results)

In [13]:
# Display the per-category accuracy table built in the previous cell.
accuracy_df

Unnamed: 0,Category,Total,Matches,Accuracy
0,Enlarged Cardiomediastinum,558,498,89.25%
1,Cardiomegaly,558,483,86.56%
2,Lung Lesion,558,531,95.16%
3,Lung Opacity,558,466,83.51%
4,Edema,558,503,90.14%
5,Consolidation,558,539,96.59%
6,Pneumonia,558,529,94.80%
7,Atelectasis,558,497,89.07%
8,Pneumothorax,558,547,98.03%
9,Pleural Effusion,558,508,91.04%


In [14]:
# Inner-join predictions and ground truth on study_id. The `merged` frame
# produced here is also read by the later 0/1-only evaluation cell.
merged = pd.merge(
    results_df,
    ground_truth_df,
    on=['study_id'],
    suffixes=('_results', '_ground_truth'),
)

results = []

# Categories to score: all prediction columns apart from the join key.
columns_to_compare = [c for c in results_df.columns if c not in ['study_id']]

for col in columns_to_compare:
    # Restrict both series to rows where neither value is NaN.
    usable = ~(merged[f"{col}_results"].isna() | merged[f"{col}_ground_truth"].isna())
    y_pred = merged[f"{col}_results"][usable]
    y_true = merged[f"{col}_ground_truth"][usable]

    total_valid = len(y_true)

    # Exact-match accuracy in percent (0 when no row is usable).
    matches = (y_pred == y_true).sum()
    accuracy = 0 if total_valid <= 0 else (matches / total_valid) * 100

    # Positive-class ("Yes" == 1) confusion counts; everything that is not 1
    # is treated as negative here.
    tp = ((y_true == 1) & (y_pred == 1)).sum()
    fp = ((y_true != 1) & (y_pred == 1)).sum()
    fn = ((y_true == 1) & (y_pred != 1)).sum()

    # Percent-scaled precision, recall and F1, each 0 when its denominator is 0.
    predicted_positive = tp + fp
    actual_positive = tp + fn
    precision = (tp / predicted_positive * 100) if predicted_positive > 0 else 0
    recall = (tp / actual_positive * 100) if actual_positive > 0 else 0
    pr_sum = precision + recall
    f1_score = (2 * precision * recall / pr_sum) if pr_sum > 0 else 0

    results.append({
        'Category': col,
        'Total Valid': total_valid,
        'Matches': matches,
        'Accuracy': f"{accuracy:.2f}%",
        'Precision': f"{precision:.2f}%",
        'Recall': f"{recall:.2f}%",
        'F1-Score': f"{f1_score:.2f}%"
    })

# Collect the per-category rows into a table and display it.
accuracy_df = pd.DataFrame(results)

accuracy_df

Unnamed: 0,Category,Total Valid,Matches,Accuracy,Precision,Recall,F1-Score
0,Enlarged Cardiomediastinum,558,498,89.25%,68.42%,31.71%,43.33%
1,Cardiomegaly,558,483,86.56%,75.54%,90.52%,82.35%
2,Lung Lesion,558,531,95.16%,71.43%,90.91%,80.00%
3,Lung Opacity,558,466,83.51%,64.29%,92.65%,75.90%
4,Edema,558,503,90.14%,84.07%,89.62%,86.76%
5,Consolidation,558,539,96.59%,77.05%,95.92%,85.45%
6,Pneumonia,558,529,94.80%,86.05%,78.72%,82.22%
7,Atelectasis,558,497,89.07%,76.80%,94.30%,84.66%
8,Pneumothorax,558,547,98.03%,91.67%,82.50%,86.84%
9,Pleural Effusion,558,508,91.04%,90.95%,90.13%,90.54%


In [15]:
# Pair the two models' predictions by study_id.
merged_df = pd.merge(
    results_df,
    chexpert_results_df,
    on='study_id',
    suffixes=('_gemini', '_chexpert'),
)

correlation_results = []

# Every results_df column except the join key.
columns_to_compare = results_df.drop('study_id', axis = 1).columns.tolist()

for col in columns_to_compare:
    left = merged_df[f"{col}_gemini"]
    right = merged_df[f"{col}_chexpert"]

    # Drop rows where either side is NaN before correlating.
    mask = ~(left.isna() | right.isna())
    left_valid = left[mask]
    right_valid = right[mask]

    # Correlation via Series.corr (Pearson by default); None when there are
    # not at least two rows to correlate.
    if len(left_valid) > 1:
        correlation = left_valid.corr(right_valid)
    else:
        correlation = None

    entry = {
        'Category': col,
        'Correlation': correlation
    }
    correlation_results.append(entry)

# Tabulate and display.
correlation_df = pd.DataFrame(correlation_results)

correlation_df

Unnamed: 0,Category,Correlation
0,Enlarged Cardiomediastinum,0.075756
1,Cardiomegaly,0.490634
2,Lung Lesion,0.473895
3,Lung Opacity,0.757803
4,Edema,0.593614
5,Consolidation,0.508017
6,Pneumonia,0.123933
7,Atelectasis,0.758218
8,Pneumothorax,0.43496
9,Pleural Effusion,0.626461


### This time, evaluate using only the definite labels (1s and 0s)

In [17]:
# Work on a copy of the merged predictions/ground-truth frame from the earlier cell.
copied_merged = merged.copy()

# Category names are recovered from the '_results'-suffixed columns.
categories = [col.replace("_results", "") for col in merged.columns if "_results" in col]

results = []
for category in categories:
    predicted_all = copied_merged[f"{category}_results"]
    actual_all = copied_merged[f"{category}_ground_truth"]

    # Score only rows where both values are a definite 0 or 1; anything else
    # (e.g. -1) is excluded for this category.
    definite = predicted_all.isin([0, 1]) & actual_all.isin([0, 1])
    y_pred = predicted_all[definite]
    y_true = actual_all[definite]

    total_valid = len(y_true)

    # Exact-match accuracy in percent.
    matches = (y_pred == y_true).sum()
    if total_valid > 0:
        accuracy = (matches / total_valid) * 100
    else:
        accuracy = 0

    # Confusion counts for the positive class.
    tp = ((y_true == 1) & (y_pred == 1)).sum()
    fp = ((y_true == 0) & (y_pred == 1)).sum()
    fn = ((y_true == 1) & (y_pred == 0)).sum()

    # Percent-scaled precision, recall and F1 with 0 as the empty-denominator fallback.
    if (tp + fp) > 0:
        precision = tp / (tp + fp) * 100
    else:
        precision = 0
    if (tp + fn) > 0:
        recall = tp / (tp + fn) * 100
    else:
        recall = 0
    if (precision + recall) > 0:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0

    results.append({
        'Category': category,
        'Total Valid': total_valid,
        'Matches': matches,
        'Accuracy': f"{accuracy:.2f}%",
        'Precision': f"{precision:.2f}%",
        'Recall': f"{recall:.2f}%",
        'F1-Score': f"{f1_score:.2f}%"
    })

# One row per category.
filtered_metrics_df = pd.DataFrame(results)

filtered_metrics_df

Unnamed: 0,Category,Total Valid,Matches,Accuracy,Precision,Recall,F1-Score
0,Enlarged Cardiomediastinum,519,495,95.38%,68.42%,41.94%,52.00%
1,Cardiomegaly,509,475,93.32%,77.21%,97.22%,86.07%
2,Lung Lesion,542,526,97.05%,75.47%,93.02%,83.33%
3,Lung Opacity,545,466,85.50%,64.29%,93.33%,76.13%
4,Edema,514,494,96.11%,89.62%,91.35%,90.48%
5,Consolidation,537,527,98.14%,85.45%,95.92%,90.38%
6,Pneumonia,478,476,99.58%,97.37%,97.37%,97.37%
7,Atelectasis,513,480,93.57%,82.78%,98.68%,90.03%
8,Pneumothorax,546,541,99.08%,97.06%,89.19%,92.96%
9,Pleural Effusion,513,496,96.69%,94.37%,97.57%,95.94%


In [18]:
# Pair the two models' predictions by study_id.
merged_df = results_df.merge(chexpert_results_df, on='study_id', suffixes=('_results', '_chexpert'))

# Label columns only (the join key is excluded).
columns_to_compare = [name for name in results_df.columns if name != 'study_id']

correlation_results = []
for col in columns_to_compare:
    ours = merged_df[f"{col}_results"]
    theirs = merged_df[f"{col}_chexpert"]

    # Keep rows where both models gave a definite 0 or 1.
    definite = ours.isin([0, 1]) & theirs.isin([0, 1])
    ours_valid = ours[definite]
    theirs_valid = theirs[definite]

    # Series.corr defaults to Pearson; None when fewer than two rows remain.
    correlation = ours_valid.corr(theirs_valid) if len(ours_valid) > 1 else None

    correlation_results.append({'Category': col, 'Correlation': correlation})

# One row per category.
correlation_df = pd.DataFrame(correlation_results)

correlation_df

Unnamed: 0,Category,Correlation
0,Enlarged Cardiomediastinum,0.350916
1,Cardiomegaly,0.837424
2,Lung Lesion,0.749318
3,Lung Opacity,0.887408
4,Edema,0.846068
5,Consolidation,0.970276
6,Pneumonia,0.718809
7,Atelectasis,0.957743
8,Pneumothorax,0.768349
9,Pleural Effusion,0.967379


In [19]:
# Number of rows in the ground-truth table.
len(ground_truth_df)

687

In [20]:
# Per-category count of ground-truth cells equal to -1, and the same figure as
# a percentage of all ground-truth rows.
label_columns = ground_truth_df.drop(columns=['study_id'])
maybe_counts = label_columns.eq(-1).sum()

maybe_percentages = maybe_counts.div(len(ground_truth_df)).mul(100)

maybe_summary_df = pd.DataFrame({'-1 Count': maybe_counts, '-1 Percentage': maybe_percentages})

maybe_summary_df

Unnamed: 0,-1 Count,-1 Percentage
Enlarged Cardiomediastinum,38,5.531295
Cardiomegaly,39,5.676856
Lung Lesion,9,1.310044
Lung Opacity,0,0.0
Edema,41,5.967977
Consolidation,26,3.784571
Pneumonia,89,12.954876
Atelectasis,41,5.967977
Pneumothorax,10,1.455604
Pleural Effusion,23,3.347889


In [21]:
# Number of rows in results_df.
len(results_df)

558

In [22]:
# Number of rows in the ground-truth table.
len(ground_truth_df)

687

In [23]:
# Number of ground-truth study ids that have no row in results_df.
# NOTE(review): the recorded outputs show 687 ground-truth rows, 558 result
# rows and 140 missing ids, yet 687 - 558 = 129. Presumably study_id is not
# unique in at least one of the frames (or the outputs come from different
# runs); verify, since repeated ids would be counted more than once in the
# merged metrics.
len(missing_ids)

140