In [2]:
import pandas as pd, numpy as np
import ast, os, cv2, torch, shutil, pydicom
from torchvision import models, transforms
from PIL import Image
import matplotlib.pyplot as plt

# Data

In [None]:
# Label CSV of every chest X-ray dataset used in this notebook, keyed by the
# name of the DataFrame it is loaded into.
_LABEL_CSVS = {
    'rsna': '/Data/RSNA_Data/stage_2_train_labels.csv',
    'nih': '/Data/NIH_Data/Data_Entry_2017.csv',
    'chex': '/Data/CheX_Data/chexpertchestxrays-u20210408/train_cheXbert.csv',  # recommended labels
    'pad': '/Data/BIMCV-PadChest-FULL/PADCHEST_chest_x_ray_images_labels_160K_01.02.19.csv',
    'mimic': '/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-chexpert.csv',
    # NOTE: this merged file is written by the "Merging VinDR Sets" section
    # further down, so that section must have been run once before this cell.
    'vindr': '/Data/VinDR_Data/physionet.org/files/vindr-cxr/1.0.0/annotations/image_labels_merged.csv',
    'mimic_meta': '/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-metadata.csv',
}

rsna = pd.read_csv(_LABEL_CSVS['rsna'])
nih = pd.read_csv(_LABEL_CSVS['nih'])
chex = pd.read_csv(_LABEL_CSVS['chex'])
pad = pd.read_csv(_LABEL_CSVS['pad'], encoding='utf-8')
mimic = pd.read_csv(_LABEL_CSVS['mimic'])
vindr = pd.read_csv(_LABEL_CSVS['vindr'])

mimic_meta = pd.read_csv(_LABEL_CSVS['mimic_meta'])

In [None]:
# samples
#chex.sample(n=10_000).to_csv('/Data/cheXbert_10K_sample.csv', index=False)
#nih.sample(n=10_000).to_csv('/Data/nih_10K_sample.csv', index=False)
#pad.sample(n=10_000).to_csv('/Data/pad_10K_sample.csv', index=False)

# Merging VinDR Sets

In [None]:
# Raw VinDR-CXR training annotations. The rows are grouped by image_id further
# down, so an image presumably appears on several rows here — TODO confirm.
vindr_train = pd.read_csv('/Data/VinDR_Data/physionet.org/files/vindr-cxr/1.0.0/annotations/image_labels_train.csv')

# collapse the rows of one image id to the row carrying the most common Pneumonia label
def majority_vote(group):
    """Return the first row of `group` whose Pneumonia value is the group's mode.

    When several values are equally common, `Series.mode` returns them in
    sorted order and the first (smallest) one is used.
    """
    pneumonia_votes = group['Pneumonia']
    winning_label = pneumonia_votes.mode().iloc[0]
    matching_rows = group.loc[pneumonia_votes == winning_label]
    return matching_rows.iloc[0]

# one row per image: keep the row picked by majority_vote, then discard the group index
vindr_train = (
    vindr_train
    .groupby('image_id')
    .apply(majority_vote)
    .reset_index(drop=True)
)

# a scalar is broadcast to every row, tagging the whole frame with its split
vindr_train['Set'] = 'train'
vindr_train

In [None]:
# Test-split annotations, tagged with their split name before being stacked
# underneath the training rows.
vindr_test = pd.read_csv('/Data/VinDR_Data/physionet.org/files/vindr-cxr/1.0.0/annotations/image_labels_test.csv')
vindr_test['Set'] = 'test'

# ignore_index renumbers the combined rows 0..n-1
vindr_merged = pd.concat([vindr_train, vindr_test], ignore_index=True)
vindr_merged

In [77]:
# Persist the merged train+test table; the Data section at the top of the
# notebook reads this same file back into `vindr`.
vindr_merged.to_csv('/Data/VinDR_Data/physionet.org/files/vindr-cxr/1.0.0/annotations/image_labels_merged.csv', index=False)

# Sorting MIMIC data by Image

In [None]:
# The filter to rows labelled exactly 0 or 1 is deliberately left disabled, so
# rows with any other Pneumonia value stay in the table for LLM relabelling.
#mimic = mimic[mimic.Pneumonia.isin((0,1))] # do not include for LLM relabelling
mimic

In [None]:
# Expand the study-level MIMIC labels into one row per JPEG image on disk.
new_rows = []
# itertuples avoids building a Series per row and hands back plain scalars,
# so the ids are converted with int() rather than a numpy-only .astype().
for record in mimic.itertuples(index=False):
    patient, study = int(record.subject_id), int(record.study_id)
    pneumonia_status = record.Pneumonia

    # directory layout: files/p<first two digits of patient>/p<patient>/s<study>/
    study_path = f'/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/files/p{str(patient)[:2]}/p{patient}/s{study}/'
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the exported CSV reproducible between runs
    images = sorted(file for file in os.listdir(study_path) if file.endswith('.jpg'))

    # every image of a study inherits the study's label
    new_rows.extend(
        {'Path': study_path + image, 'Patient': patient, 'Pneumonia': pneumonia_status}
        for image in images
    )

df = pd.DataFrame(new_rows)
df

In [None]:
# Per-image label distribution (value_counts leaves out NaN labels by default).
df.Pneumonia.value_counts()

In [27]:
# Two export targets: the commented-out file is for a run restricted to 0/1
# labels, the active "-full" file receives the table as built above.
#df.to_csv('/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-pneumonia.csv', index=False) # only (0,1) pneumonia labels
df.to_csv('/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-pneumonia-full.csv', index=False)

# PadChest Localisations

In [None]:
# Count which localisations PadChest reports in the same sentence as pneumonia.
pad_pneumonia = pad[pad.Labels.str.contains('pneumonia', na=False)]
pneumonia_locs = []

# Each cell is parsed as a Python literal: a list of sentences, each sentence a
# list of strings (presumably labels plus 'loc ...' entries — verify against the CSV).
# dropna() skips rows without a localisation string, which literal_eval cannot parse.
for raw_locs in pad_pneumonia.LabelsLocalizationsBySentence.dropna():
    for sentence in ast.literal_eval(raw_locs):
        if 'pneumonia' in sentence:
            # drop the first four characters ('loc' plus separator) and capitalise for display
            pneumonia_locs.extend(x[4:].capitalize() for x in sentence if x.startswith('loc'))

plt.style.use('fivethirtyeight')

_, ax = plt.subplots(1, figsize=(10, 5))

# value_counts is sorted by frequency, so head(20) is the 20 most common localisations
pd.Series(pneumonia_locs).value_counts().head(20).plot(kind='bar', ax=ax)
ax.set_xlabel('Localisation')
ax.set_ylabel('Frequency')
ax.set_title('Top 20 Frequent Pneumonia Localisations - PadChest Dataset')

ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha='right')

plt.tight_layout()

#plt.savefig('pneumonia_localisations.svg')

# Accessing Images

In [26]:
def get_chex_img_path(cheXbert_path:str) -> str:
    """Translate a path from the CheXbert label CSV into the image's path on disk.

    The patient number is parsed from characters 27-31 of `cheXbert_path` and
    selects the batch folder that holds the image; any number outside
    1..64540 is looked up in the validation folder. The part of
    `cheXbert_path` from character 20 onwards is appended to that folder.
    """
    root = '/Data/CheX_Data/chexpertchestxrays-u20210408/CheXpert-v1.0'
    patient_num = int(cheXbert_path[27:32])
    relative_part = cheXbert_path[20:]

    # (highest patient number in the batch, folder suffix), in ascending order
    train_batches = (
        (21513, ' batch 2 (train 1)/'),
        (43017, ' batch 3 (train 2)/'),
        (64540, ' batch 4 (train 3)/'),
    )
    if patient_num >= 1:
        for last_patient, suffix in train_batches:
            if patient_num <= last_patient:
                return root + suffix + relative_part

    return root + ' batch 1 (validate & csv)/valid/' + relative_part

def get_nih_img_path(nih_img_idx:str) -> str:
    """Return the on-disk path of an NIH image given its 'Image Index' file name.

    The images are spread over folders images_001 .. images_012. The file name
    without its last four characters is compared, as a string, against the
    first and last name assigned to each folder; a name matching none of the
    listed ranges is placed in images_012.
    """
    # (first name, last name, folder) for folders 001-011
    # NOTE(review): names strictly between '00024717_000' and '00024718_000'
    # match no range and therefore resolve to images_012 — confirm none exist.
    archive_ranges = (
        ('00000001_000', '00001335_006', 'images_001'),
        ('00001336_000', '00003923_013', 'images_002'),
        ('00003923_014', '00006585_006', 'images_003'),
        ('00006585_007', '00009232_003', 'images_004'),
        ('00009232_004', '00011558_007', 'images_005'),
        ('00011558_008', '00013774_025', 'images_006'),
        ('00013774_026', '00016051_009', 'images_007'),
        ('00016051_010', '00018387_034', 'images_008'),
        ('00018387_035', '00020945_049', 'images_009'),
        ('00020945_050', '00024717_000', 'images_010'),
        ('00024718_000', '00028173_002', 'images_011'),
    )

    img = nih_img_idx[:-4]
    folder = 'images_012'
    for first_name, last_name, candidate in archive_ranges:
        if first_name <= img <= last_name:
            folder = candidate
            break

    return f'/Data/NIH_Data/{folder}/images/{nih_img_idx}'

def get_pad_img_path(pad_row):
    """Build a PadChest image path from the ImageDir and ImageID fields of a label row."""
    image_dir = pad_row["ImageDir"]
    image_id = pad_row["ImageID"]
    return f'/Data/BIMCV-PadChest-FULL/{image_dir}/{image_id}'

def get_vindr_img_path(vindr_row):
    """Build a VinDR-CXR DICOM path from the Set (split folder) and image_id fields of a label row."""
    split_dir = vindr_row["Set"]
    image_id = vindr_row["image_id"]
    return f'/Data/VinDR_Data/physionet.org/files/vindr-cxr/1.0.0/{split_dir}/{image_id}.dicom'

def get_rsna_img_path(patientId):
    """Build the path of the RSNA training DICOM named after `patientId`."""
    return '/Data/RSNA_Data/stage_2_train_images/' + f'{patientId}.dcm'

# Frontal vs. Lateral Images

In [None]:
# run on the GPU when CUDA is usable, otherwise on the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

In [None]:
# ResNet-50 with a two-class head, loaded from a checkpoint trained to
# distinguish frontal vs lateral CXRs
resnet50 = models.resnet50()
resnet50.fc = torch.nn.Linear(resnet50.fc.in_features, 2)

# NOTE: despite its name, `state_dict` holds the checkpoint *path*
state_dict = '/Data/jacky_models/resnet50_frontal_vs_lateral.pth'
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU still loads when this notebook runs on a CPU-only machine
resnet50.load_state_dict(torch.load(state_dict, weights_only=True, map_location=device))

resnet50 = resnet50.to(device)

# inference mode: fixes batch-norm statistics and disables dropout
resnet50.eval()

In [217]:
# Preprocessing applied to every image before it reaches the view classifier:
# resize to a fixed square, then convert the PIL image to a tensor.
_VIEW_INPUT_SIZE = (256, 256)

transform = transforms.Compose(
    [transforms.Resize(_VIEW_INPUT_SIZE), transforms.ToTensor()]
)

def _predict_view(image):
    """Run the frontal/lateral ResNet-50 on one PIL RGB image.

    Returns the index (0 or 1) of the highest-scoring output class as a
    Python int.
    """
    batch = transform(image).unsqueeze(0).to(device)  # add a batch dimension of 1

    with torch.no_grad():
        output = resnet50(batch)
        _, predicted = torch.max(output, 1)

    return predicted.item()


def _to_8bit_rgb(pixels):
    """Min-max scale a pixel array to 0..255 and return it as an RGB PIL image."""
    scaled = cv2.normalize(pixels, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
    return Image.fromarray(scaled, mode='L').convert('RGB')


def _first_float(value):
    """Convert a DICOM numeric value to float, using the first item if it is multi-valued."""
    try:
        return float(value)
    except TypeError:
        # WindowCenter / WindowWidth may hold several values; float() rejects the sequence
        return float(value[0])


def _window_bounds(dicom):
    """Return (lower, upper) of the display window stored in `dicom`, or None if it has none."""
    center = getattr(dicom, 'WindowCenter', None)
    width = getattr(dicom, 'WindowWidth', None)
    if center in (None, '') or width in (None, ''):
        return None

    window_center = _first_float(center)
    half_width = _first_float(width) / 2
    return window_center - half_width, window_center + half_width


def _dicom_to_rgb(image_path):
    """Read a DICOM file and return its pixels as an 8-bit RGB PIL image.

    Steps: clip to the display window (when the file defines one), invert
    MONOCHROME1 images so that larger values are brighter, then min-max scale
    to 8 bits.
    NOTE(review): RescaleSlope/RescaleIntercept are not applied — confirm the
    window values of these datasets refer to the stored pixel values.
    """
    dicom = pydicom.dcmread(image_path)
    image_np = dicom.pixel_array

    # The window describes the values as stored, so it has to be applied before
    # a MONOCHROME1 image is flipped onto the inverted scale.
    bounds = _window_bounds(dicom)
    if bounds is not None:
        image_np = np.clip(image_np, bounds[0], bounds[1])

    # convert to standard grayscale
    if dicom.PhotometricInterpretation == 'MONOCHROME1':
        image_np = image_np.max() - image_np

    return _to_8bit_rgb(image_np)


def classify_chex_quality(cheXbert_path):
    """Predict the view class of a CheXpert image, given its path from the CheXbert CSV.

    Returns the classifier's class index; later cells treat a non-zero value
    as 'Lateral' and zero as 'Frontal'.
    """
    # the context manager closes the file handle once the pixels are converted
    with Image.open(get_chex_img_path(cheXbert_path)) as image_file:
        image = image_file.convert('RGB')
    return _predict_view(image)


def classify_nih_quality(nih_img_idx):
    """Predict the view class of an NIH image, given its 'Image Index' file name.

    Returns the classifier's class index (see classify_chex_quality).
    """
    with Image.open(get_nih_img_path(nih_img_idx)) as image_file:
        image = image_file.convert('RGB')
    return _predict_view(image)


def classify_pad_quality(pad_row):
    """Predict the view class of a PadChest image, given its label row.

    The pixels are read as 16-bit values and scaled to 8 bits before
    classification. Returns the classifier's class index.
    """
    with Image.open(get_pad_img_path(pad_row)) as image_file:
        image_np = np.array(image_file, dtype=np.uint16)
    return _predict_view(_to_8bit_rgb(image_np))  # scale to 8-bit


def classify_vindr_quality(vindr_row):
    """Predict the view class of a VinDR-CXR DICOM, given its label row.

    Returns the classifier's class index.
    """
    return _predict_view(_dicom_to_rgb(get_vindr_img_path(vindr_row)))


def classify_rsna_quality(rsna_path):
    """Predict the view class of an RSNA DICOM, given its patient id.

    Uses the same DICOM preprocessing as classify_vindr_quality, including the
    scaling to 8 bits. Returns the classifier's class index.
    """
    return _predict_view(_dicom_to_rgb(get_rsna_img_path(rsna_path)))

In [22]:
# Run the view classifier once per row (one image read and one forward pass
# each). Only the PadChest run is active; the CheXpert and NIH runs are kept
# commented out.
#chex['Pred_View'] = chex.Path.apply(classify_chex_quality)
#nih['Pred_View'] = nih['Image Index'].apply(classify_nih_quality)
pad['Pred_View'] = pad.apply(classify_pad_quality, axis=1)

# Disagreements Between Reported/Predicted Views

In [None]:
#chex['Pred_View'] = chex['Pred_View'].apply(lambda val : 'Lateral' if val else 'Frontal')
#chex['ViewAgreement'] = chex.apply(lambda row : row['Frontal/Lateral'] == row['Pred_View'], axis=1)

#nih['Pred_View'] = nih['Pred_View'].apply(lambda val : 'Lateral' if val else 'Frontal')
#nih['ViewAgreement'] = nih['Pred_View'].apply(lambda val : val == 'Frontal')

# Turn the classifier's class index into a view name. Only the raw 0/1 values
# are translated and anything else passes through unchanged, so re-running this
# cell keeps names that were already converted (mapping on truthiness would
# turn every name into 'Lateral' on a second run).
# assumes class 1 is the lateral class — TODO confirm against the model's training labels
VIEW_NAMES = {0: 'Frontal', 1: 'Lateral'}
pad['Pred_View'] = pad['Pred_View'].map(lambda val: VIEW_NAMES.get(val, val))

# reported view: Projection 'L' counts as lateral, every other value as frontal
pad['Frontal/Lateral'] = np.where(pad['Projection'] == 'L', 'Lateral', 'Frontal')

# True where the reported and the predicted view agree
pad['ViewAgreement'] = pad['Frontal/Lateral'] == pad['Pred_View']

pad

In [54]:
#chex.to_csv('/Data/CheX_Data/chexpertchestxrays-u20210408/train_cheXbert_w_views.csv', index=False)
#nih.to_csv('/Data/NIH_Data/Data_Entry_2017_w_views.csv', index=False)

In [None]:
#disagree = chex[chex.ViewAgreement == False]
#disagree = nih[nih.ViewAgreement == False]
# rows whose predicted view contradicts the reported one
view_mismatch = pad['ViewAgreement'].eq(False)
disagree = pad.loc[view_mismatch]

disagree

In [None]:
#disagree_paths = disagree.Path.apply(get_chex_img_path).tolist()
#disagree_paths = disagree['Image Index'].apply(get_nih_img_path).tolist()
# Resolve every disagreeing row to its image path. A comprehension is used
# rather than DataFrame.apply(axis=1) because apply on an empty frame hands
# back a DataFrame, which has no .tolist(); this form yields [] instead.
disagree_paths = [get_pad_img_path(row) for _, row in disagree.iterrows()]

disagree_paths[:10]

In [87]:
#target_dir = '/Data/CheX_view_disagreement/'
target_dir = '/Data/Pad_view_disagreement/'

# shutil.copy does not create missing directories, so make sure the target exists
os.makedirs(target_dir, exist_ok=True)

for image_path in disagree_paths:
    # The names of the two enclosing folders are folded into the file name so
    # that equally named files from different folders cannot overwrite each
    # other. (For PadChest paths these are simply the dataset root and ImageDir.)
    parent_dir = os.path.dirname(image_path)
    study_dir = os.path.basename(parent_dir)
    patient_dir = os.path.basename(os.path.dirname(parent_dir))

    original_filename = os.path.basename(image_path)
    new_filename = f'{patient_dir}_{study_dir}_{original_filename}'

    dest_path = os.path.join(target_dir, new_filename)

    shutil.copy(image_path, dest_path)