In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch, os
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import torch.nn as nn
from PIL import Image

from scipy.stats import iqr, gaussian_kde, kruskal
from scikit_posthocs import posthoc_dunn

import nltk, string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud

In [None]:
# one-time setup: uncomment to download the NLTK tokeniser, stopword and WordNet resources used for report preprocessing

#nltk.download('punkt')
#nltk.download('punkt_tab')
#nltk.download('stopwords')
#nltk.download('wordnet')

# Uncertain and NaN LLM Pneumonia Labels in MIMIC-CXR

In [2]:
def processed_file_name(img_path):
    """
    Return the path of the processed PNG counterpart of an image.

    The result is in the same directory as `img_path`. Its file name is the
    original name with the final extension replaced by '.png' (names already
    ending in '.png' are kept as they are) and prefixed with 'PROCESSED_'.

    img_path: path to the original image
    """
    directory, name = os.path.split(img_path)

    if not name.endswith('.png'):
        # splitext strips only the final extension, so a stem that itself
        # contains dots (e.g. 'scan.v2.jpg') is not cut at the first dot
        name = os.path.splitext(name)[0] + '.png'

    return os.path.join(directory, 'PROCESSED_' + name)

In [3]:
# per-image metadata table, looked up by 'dicom_id' in mimic_include below
mimic_meta = pd.read_csv('/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-metadata-augmented-trimmed.csv')

# only retain high-quality, frontal images
def mimic_include(dicom_id):
    """
    Decide whether an image should be kept for analysis.

    dicom_id: value matched against the 'dicom_id' column of mimic_meta

    Returns True only when the first matching metadata row has
    CorrectedView == 'FRONTAL' and falsy QualityIssue and GenderMismatch flags.
    Returns False when there is no metadata row for `dicom_id`.
    """
    matches = mimic_meta[mimic_meta['dicom_id'] == dicom_id]
    if matches.empty:
        # without metadata the view and quality cannot be verified, so exclude
        # the image instead of failing on .iloc[0]
        return False

    row = matches.iloc[0]
    return bool(row['CorrectedView'] == 'FRONTAL' and not row['QualityIssue'] and not row['GenderMismatch'])

In [None]:
mimic_llm = pd.read_csv('/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-pneumonia-full-wreports_llm_complete.csv')

# keep rows whose LLM label is -1 or missing; .copy() makes the result an independent
# frame so the 'Keep' column below is not written onto a slice of the full table
mimic_llm = mimic_llm[(mimic_llm.LLM_class == -1) | (mimic_llm.LLM_class.isna())].copy()

# flag images that pass the view/quality filter, then drop the rest
mimic_llm['Keep'] = mimic_llm['dicom_id'].apply(mimic_include).astype(bool)
mimic_llm = mimic_llm[mimic_llm['Keep']]

mimic_llm
#mimic_llm.to_csv('/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-pneumonia-full-wreports_llm_uncertain+nan_frontal.csv', index=False)

In [None]:
# after processing: reload the filtered table and keep only the rows whose
# processed image is present on disk

uncertain_nan_csv = '/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-pneumonia-full-wreports_llm_uncertain+nan_frontal.csv'
mimic_llm = pd.read_csv(uncertain_nan_csv)

processed_paths = mimic_llm['Path'].apply(processed_file_name)
mimic_llm = mimic_llm.assign(
    Processed_Path=processed_paths,
    Processed_Exists=processed_paths.apply(os.path.isfile),
)
mimic_llm = mimic_llm.loc[mimic_llm['Processed_Exists'] == True]

mimic_llm

In [None]:
# columns needed for inference, mapped to the names PneumoniaDataset expects
inference_columns = {'Processed_Path': 'Path', 'LLM_class': 'Label'}
selected = list(inference_columns)

# LLM label of -1
mimic_uncertain = mimic_llm.loc[mimic_llm.LLM_class == -1, selected].rename(columns=inference_columns)
# missing LLM label
mimic_nan = mimic_llm.loc[mimic_llm.LLM_class.isna(), selected].rename(columns=inference_columns)

mimic_uncertain

# Positive & Negative Test Set Labels

In [None]:
# test-set table; its 'Label' column separates positive (1) from negative (0) cases
test_csv = '/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/test_llm.csv'
mimic_test = pd.read_csv(test_csv)
mimic_test

In [None]:
# independent copies rather than slices of mimic_test, so that columns can be added
# to them later without triggering SettingWithCopyWarning
mimic_positive = mimic_test[mimic_test['Label'] == 1].copy()
mimic_negative = mimic_test[mimic_test['Label'] == 0].copy()

# Inference

In [7]:
class PneumoniaDataset(Dataset):
    """
    Dataset of chest X-ray images described by a DataFrame.

    Each item is an (image, label) pair: the image at the row's 'Path',
    opened as RGB and passed through `transform` when one is given, and the
    row's 'Label' value unchanged.
    """

    def __init__(self, df, transform=None):
        """
        df: DataFrame with columns 'Path' (to CXRs) and 'Label' (pneumonia +/-)
        transform: transforms to apply to CXRs
        """
        # positional index 0..n-1 regardless of how the frame was filtered
        self.df = df.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        path = record['Path']
        target = record['Label']

        picture = Image.open(path).convert('RGB')

        if not self.transform:
            return picture, target

        return self.transform(picture), target

def evaluate(model, loader):
    """
    Run inference and collect the predicted probability of class 1
    (pneumonia-positive) for every image.

    model: network whose output is softmaxed over dim 1; column 1 is taken
           as the pneumonia-positive class
    loader: iterable of (images, labels) batches; the labels are ignored

    Returns a list with one NumPy scalar probability (0-1) per image, in the
    order the loader yields them. The model is left in eval mode.
    """
    model.eval()

    # use the device the model already lives on instead of a global `device`,
    # which may not be defined yet or may not match the model
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    all_probs = []

    with torch.no_grad():
        for images, _ in loader:
            if model_device is not None:
                images = images.to(model_device)

            outputs = model(images)
            probs = F.softmax(outputs, dim=1)[:, 1] # probabilities for pneumonia-positive class

            all_probs.extend(probs.cpu().numpy())

    return all_probs

In [8]:
# ImageNet vals for per-channel normalisation
imagenet_mean = [.485, .456, .406]
imagenet_std = [.229, .224, .225]

# fixed 480x480 resize, tensor conversion, then normalisation
test_transforms = transforms.Compose([
    transforms.Resize((480, 480)),
    transforms.ToTensor(),
    transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
])

In [9]:
def _make_loader(df):
    """Wrap a Path/Label DataFrame in a PneumoniaDataset and an unshuffled DataLoader."""
    dataset = PneumoniaDataset(df, transform=test_transforms)
    # shuffle=False keeps predictions in the same order as the DataFrame rows
    return dataset, DataLoader(dataset, batch_size=32, shuffle=False, num_workers=4)

positive_data, positive_loader = _make_loader(mimic_positive)
negative_data, negative_loader = _make_loader(mimic_negative)
uncertain_data, uncertain_loader = _make_loader(mimic_uncertain)
nan_data, nan_loader = _make_loader(mimic_nan)

In [10]:
# GPU when available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# DenseNet-121 without pretrained weights; the weights come from the checkpoint below
model = models.densenet121(weights=None)

# two-output classification head; it has to match the layout stored in the checkpoint
model.classifier = nn.Sequential(
    nn.Linear(model.classifier.in_features, 512),
    nn.ReLU(),
    nn.BatchNorm1d(512),
    nn.Dropout(p=.3),
    nn.Linear(512, 2)
)

# map_location lets a checkpoint that was saved from GPU tensors load on a CPU-only machine
state_dict = torch.load('/Data/mimic_llm_model.pth', map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model = model.to(device)

In [22]:
# pneumonia probabilities for each class in the MIMIC-CXR test set
# (the loaders are evaluated one after another, in the listed order)

positive_preds, negative_preds, uncertain_preds, nan_preds = (
    evaluate(model, group_loader)
    for group_loader in (positive_loader, negative_loader, uncertain_loader, nan_loader)
)

In [24]:
# convert to %

def _to_percent(probs):
    """
    Scale probabilities in [0, 1] to percentages and return them as a list.

    Raises ValueError when any value exceeds 1: a probability cannot, so the
    values have already been converted and running this cell again would
    multiply them by 100 a second time.
    """
    values = np.array(probs)
    if values.size and values.max() > 1:
        raise ValueError('predictions exceed 1 and look like percentages already; '
                         're-run inference before converting again')
    return list(values * 100)

positive_preds = _to_percent(positive_preds)
negative_preds = _to_percent(negative_preds)
uncertain_preds = _to_percent(uncertain_preds)
nan_preds = _to_percent(nan_preds)

In [None]:
# (label name, predictions) pairs in the order they are stacked in the table
labelled_preds = [
    ('Positive', positive_preds),
    ('Negative', negative_preds),
    ('Uncertain', uncertain_preds),
    ('NoLabel', nan_preds),
]

# long format: one row per image holding its probability and its label group
dic = {
    'Prob': [prob for _, preds in labelled_preds for prob in preds],
    'Label': [name for name, preds in labelled_preds for _ in range(len(preds))],
}

data = pd.DataFrame(dic)
data

#data.to_csv('/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/test_label_confidence.csv', index=False)

# Visualisation

In [None]:
# per-image probabilities with their label group ('Prob' and 'Label' columns are used below)
confidence_csv = '/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/test_label_confidence.csv'
data = pd.read_csv(confidence_csv)
data

In [None]:
# one Series of probabilities per label group; each keeps the row index of `data`
positive_preds = data.loc[data.Label == 'Positive', 'Prob']
negative_preds = data.loc[data.Label == 'Negative', 'Prob']
uncertain_preds = data.loc[data.Label == 'Uncertain', 'Prob']
nan_preds = data.loc[data.Label == 'NoLabel', 'Prob']

In [8]:
# ridgeline plot of predicted pneumonia probabilities for each class using KDE curves

def ridge_plot(data, overlap=0, stats=True, fill=[], labels=[], title=''):
    """
    Draw a ridgeline plot: one Gaussian KDE curve per group, stacked vertically.

    data: sequence of 1-D collections of values, one per group; the first
          group is drawn at the bottom
    overlap: baselines of consecutive groups are spaced 1 - overlap apart
    stats: if True, mark each group's median at the peak height of its curve
           with horizontal error bars of +/- one IQR, clipped to 0-100
    fill: optional colours, one per group, used for the filled area and the
          median marker; the marker is black when omitted
    labels: optional names, one per group, used as y-tick labels and in the
            median marker labels; 'Group <i>' is used in the marker labels
            when omitted
    title: optional axes title

    Draws on a newly created figure and returns None. The default lists are
    only read, never modified.
    """
    n_points = 5000
    fig, ax = plt.subplots(nrows=1, figsize=(8,5))

    # shared x grid spanning the pooled range of all groups
    pooled = np.concatenate(data)
    xx = np.linspace(np.min(pooled), np.max(pooled), n_points)
    ys = []

    for i,d in enumerate(data):
        pdf = gaussian_kde(d)
        y = i*(1-overlap)
        ys.append(y)
        ridge = pdf(xx) + y

        # decreasing zorder: lower ridges are drawn over the ones above them
        layer = len(data)-i+1
        colour = fill[i] if fill else 'k'

        ax.plot(xx, ridge, c='k', zorder=layer)

        if fill:
            ax.fill_between(xx, np.full(n_points, y), ridge, zorder=layer, color=fill[i], alpha=.5)
        if stats:
            med = np.median(d)
            iqr_val = iqr(d)

            # asymmetric error bars such that they don't go beyond 0-100%
            left_err = med - max(med - iqr_val, 0)
            right_err = min(med + iqr_val, 100) - med

            peak = ridge.max()
            # labels is optional, so fall back to the group position instead of indexing an empty list
            name = labels[i] if labels else f'Group {i}'

            ax.scatter(med, peak, color=colour, label=f'{name} median: {round(med,3)}')
            ax.errorbar(med, peak, xerr=[[left_err], [right_err]], color=colour, capsize=3)

    if labels:
        ax.set_yticks(ys, labels)

    if title:
        ax.set_title(title)

In [None]:
plt.style.use('ggplot')

# groups listed bottom-to-top, each with its fill colour and tick label
ridge_groups = [negative_preds, nan_preds, uncertain_preds, positive_preds]
ridge_colours = ['#019959', '#eb4fa9', '#ffce50', '#0296dc']
ridge_labels = [
    f'Negative\n(N={len(negative_preds)})',
    f'N/A\n(N={len(nan_preds):,})',
    f'Uncertain\n(N={len(uncertain_preds):,})',
    f'Positive\n(N={len(positive_preds)})',
]

ridge_plot(
    ridge_groups,
    overlap=.97,
    fill=ridge_colours,
    labels=ridge_labels,
    title='Model Confidence in Pneumonia Stratified by Ground Truth',
)

plt.savefig('/Data/Figures/ridge_plot.svg')

In [None]:
# median and IQR of the predicted probabilities for each label group
summary_groups = (
    ('Positive', positive_preds),
    ('Negative', negative_preds),
    ('Uncertain', uncertain_preds),
    ('N/A', nan_preds),
)
for group_name, group_preds in summary_groups:
    print(f'{group_name} median, IQR: {np.median(group_preds):.2f}, {iqr(group_preds):.2f}')

# Significant Differences in Confidence

In [None]:
# the same four groups feed both the omnibus test and the pairwise comparison
confidence_groups = [positive_preds, negative_preds, uncertain_preds, nan_preds]

# Kruskal-Wallis: is there at least one sig diff between the groups?
_, p_value = kruskal(*confidence_groups)
print(f'P = {p_value}')

# pairwise Dunn's test with Bonferroni adjustment, run only when P < .01
if p_value < .01:
    posthoc_results = posthoc_dunn(confidence_groups, p_adjust='bonferroni')
    print(posthoc_results)

# Error Analysis

In [15]:
# preprocess radiology report text

def preprocess_text(text):
    """
    Normalise a report for n-gram analysis.

    Lower-cases the text, strips punctuation, tokenises, removes English
    stopwords and lemmatises the remaining tokens.

    text: report text; any non-string value (e.g. NaN for a missing report)
          is treated as an empty report

    Returns the processed tokens joined by single spaces ('' for non-string input).
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation)) # remove punctuation

    tokens = word_tokenize(text) # tokenisation

    # load the stopword list once per call (not once per token) and use a set for fast membership tests
    english_stopwords = set(stopwords.words('english'))
    filtered_tokens = [t for t in tokens if t not in english_stopwords] # remove stopwords

    lemmatiser = WordNetLemmatizer() # lemmatisation
    lemmatised_tokens = [lemmatiser.lemmatize(t) for t in filtered_tokens]

    return ' '.join(lemmatised_tokens)

In [None]:
# reports
mimic_llm = pd.read_csv('/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-pneumonia-full-wreports_llm_complete.csv')
mimic_llm['Processed_Path'] = mimic_llm.Path.apply(processed_file_name)

# attach the predictions by position: they were produced in row order (unshuffled loader).
# Converting to a plain array first matters because positive_preds may be a Series whose
# index differs from mimic_positive's, in which case a direct column assignment would
# align on index labels and put probabilities on the wrong rows (or leave NaN).
positive_probs = np.asarray(positive_preds)
if len(positive_probs) != len(mimic_positive):
    raise ValueError(f'{len(positive_probs)} predictions for {len(mimic_positive)} positive cases')
mimic_positive = mimic_positive.assign(Prob=positive_probs)

# separate pneumonia-positive cases into true-positives and false-negatives, assuming the default classification threshold of 50%
false_negative_paths = mimic_positive[mimic_positive.Prob < 50].Path.tolist()
false_negative_reports = mimic_llm[mimic_llm.Processed_Path.isin(false_negative_paths)].Report

true_positive_paths = mimic_positive[mimic_positive.Prob >= 50].Path.tolist()
true_positive_reports = mimic_llm[mimic_llm.Processed_Path.isin(true_positive_paths)].Report

In [None]:
# n-gram analysis

# note: "negative" here means false-negative reports and "positive" means true-positive reports
processed_negative_reports = [preprocess_text(report) for report in false_negative_reports]
processed_positive_reports = [preprocess_text(report) for report in true_positive_reports]

vectoriser = CountVectorizer(ngram_range=(2, 4))  # bigrams, trigrams, and quadrigrams
X = vectoriser.fit_transform(processed_negative_reports)

words = vectoriser.get_feature_names_out()
# column totals taken directly on the sparse matrix, so the full
# documents x n-grams array is never materialised as a dense array
word_freq = np.asarray(X.sum(axis=0)).ravel()

freq_dict = dict(zip(words, word_freq))

sorted(freq_dict.items(), key=lambda x : x[1], reverse=True)

In [None]:
# wordcloud of false-negative n-grams, sized by their counts in freq_dict

wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(freq_dict)

fig, ax = plt.subplots(figsize=(15, 7.5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')

In [None]:
# combine collections for vectorisation: false-negative reports first, then true-positive reports
all_reports = processed_negative_reports + processed_positive_reports

tfidf = TfidfVectorizer(ngram_range=(2, 4), stop_words='english')
tfidf_matrix = tfidf.fit_transform(all_reports)
feature_names = tfidf.get_feature_names_out()

# average TF-IDF for false-negatives and true-positives
# the first n_false_negative rows are the false-negatives and the rest are the true-positives,
# so both slices must be cut at the number of false-negative reports
n_false_negative = len(processed_negative_reports)
false_negative_tfidf = np.asarray(tfidf_matrix[:n_false_negative].mean(axis=0)).ravel()
true_positive_tfidf = np.asarray(tfidf_matrix[n_false_negative:].mean(axis=0)).ravel()

# terms more common in false-negative reports than true-positive
diff = false_negative_tfidf - true_positive_tfidf
important_terms_indices = np.argsort(diff)[-10:]  # top 10, in ascending order of difference
important_terms = [feature_names[i] for i in important_terms_indices]
print('False negative terms:', important_terms)

# Test Set Performance per Dataset

In [None]:
# model performance for all 6 datasets across 9 runs
# columns: Dataset, Metric, Val (% Metric value)
# NOTE(review): Metric was documented as Precision/Recall/F1/AUC, but the boxplot palette
# is keyed by 'Sensitivity' — confirm which name the CSV actually uses

performance_csv = '/Data/dataset_performances.csv'
df = pd.read_csv(performance_csv)
df

In [None]:
fig, ax = plt.subplots(nrows=1, figsize=(10,6))

# one colour per metric. 'Recall' shares the 'Sensitivity' colour because the CSV is documented
# as using 'Recall' while this palette was keyed by 'Sensitivity'; seaborn raises an error when
# a hue level has no palette entry — NOTE(review): confirm which name the CSV uses
metric_colours = {'Precision': '#019959', 'Sensitivity': '#eb4fa9', 'Recall': '#eb4fa9', 'F1': '#ffce50', 'AUC': '#0296dc'}
# pass seaborn only the entries for metrics that are present in the data
present_metrics = set(df['Metric'])
palette = {metric: colour for metric, colour in metric_colours.items() if metric in present_metrics}

sns.boxplot(data=df, x='Dataset', y='Val', hue='Metric', palette=palette, saturation=1, ax=ax)

ax.set_ylabel('% Performance')
ax.legend(title='Metric', facecolor='white')

ax.set_title('Model Performance Stratified by Dataset')

plt.savefig('/Data/Figures/dataset_performance.svg')