# Set up for ChestX-ray14 Analysis

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

2025-01-01 15:21:13.377022: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-01 15:21:13.398229: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-01 15:21:13.405012: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-01 15:21:13.420984: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Global Variables

In [2]:
# Number of disease labels kept for the analysis.
num_labels = 14

# Directory that holds the X-ray images.
local_image_directory = 'nih_xrays_320'

# Target image dimensions in pixels.
target_w = 320
target_h = 320

# NOTE(review): both tuples are ordered (width, height). That is harmless while
# the image is square, but confirm the order the downstream consumers expect.
target_size = (target_w, target_h)
# Same two dimensions followed by a trailing dimension of 3.
input_shape = target_size + (3,)

# Number of rows taken from the shuffled label table.
dataset_size = 112120

## Set computer to local time

In [3]:
# Shell escape: asks timedatectl (with sudo) to set the timezone to US/Mountain.
# NOTE(review): this looks like a machine-wide setting rather than something
# scoped to this kernel, and it needs sudo rights — confirm that is intended.
!sudo timedatectl set-timezone US/Mountain

## Set seeds for reproducibility

In [4]:
import os
import random

# Single seed value reused by every random number generator in this notebook.
random_state=17

# Seed Python's `random`, NumPy's global RNG and TensorFlow one after another.
random.seed(random_state)
np.random.seed(random_state)
tf.random.set_seed(random_state)
# NOTE(review): Keras documents set_random_seed as seeding Python, NumPy and the
# backend together, so the three calls above look redundant — confirm before removing.
tf.keras.utils.set_random_seed(random_state)
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# assigning it here presumably only affects child processes — confirm.
os.environ['PYTHONHASHSEED'] = str(random_state)

# Load csv file

In [5]:
# Read the label table (one row per image) into a DataFrame.
csv_path = 'NIH_data.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)

## select the most-common diseases

In [6]:
# Everything except the two identifier columns is a disease indicator column.
disease_df = df.drop(columns=['Image', 'PatientId'])

# Positive count per disease, most frequent first.
disease_counts = disease_df.sum().sort_values(ascending=False)
print("Disease Frequency")
print("-----------------")
display(disease_counts)

diseases_in_order = list(disease_counts.index)
print(f"\n{diseases_in_order = }")

# Keep the `num_labels` most frequent diseases as the label set.
label_text = diseases_in_order[:num_labels]
print(f"\n{num_labels = }")
print(f"{label_text = }\n")

# Narrow the table to the identifiers plus the chosen label columns.
df = df[['Image', 'PatientId', *label_text]]
print(f"{df.shape = }")
display(df.head())

Disease Frequency
-----------------


Infiltration          19894
Effusion              13317
Atelectasis           11559
Nodule                 6331
Mass                   5782
Pneumothorax           5302
Consolidation          4667
Pleural_Thickening     3385
Cardiomegaly           2776
Emphysema              2516
Edema                  2303
Fibrosis               1686
Pneumonia              1431
Hernia                  227
dtype: int64


diseases_in_order = ['Infiltration', 'Effusion', 'Atelectasis', 'Nodule', 'Mass', 'Pneumothorax', 'Consolidation', 'Pleural_Thickening', 'Cardiomegaly', 'Emphysema', 'Edema', 'Fibrosis', 'Pneumonia', 'Hernia']

num_labels = 14
label_text = ['Infiltration', 'Effusion', 'Atelectasis', 'Nodule', 'Mass', 'Pneumothorax', 'Consolidation', 'Pleural_Thickening', 'Cardiomegaly', 'Emphysema', 'Edema', 'Fibrosis', 'Pneumonia', 'Hernia']

df.shape = (112120, 16)


Unnamed: 0,Image,PatientId,Infiltration,Effusion,Atelectasis,Nodule,Mass,Pneumothorax,Consolidation,Pleural_Thickening,Cardiomegaly,Emphysema,Edema,Fibrosis,Pneumonia,Hernia
0,00000001_000.png,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,00000001_001.png,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0
2,00000001_002.png,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0
3,00000002_000.png,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,00000003_001.png,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1


## Shuffle and pick how many samples to use

In [7]:
# Shuffle every row reproducibly, then renumber the index from 0.
df_shuffled = (
    df.sample(frac=1, random_state=random_state)
      .reset_index(drop=True)
)

# Keep the first `dataset_size` rows of the shuffled table.
df_subset = df_shuffled.head(dataset_size)

print(f"{df_subset.shape = }")
display(df_subset.head())

df_subset.shape = (112120, 16)


Unnamed: 0,Image,PatientId,Infiltration,Effusion,Atelectasis,Nodule,Mass,Pneumothorax,Consolidation,Pleural_Thickening,Cardiomegaly,Emphysema,Edema,Fibrosis,Pneumonia,Hernia
0,00001249_010.png,1249,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,00018521_004.png,18521,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,00029173_000.png,29173,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,00019508_014.png,19508,1,0,0,0,0,0,0,0,0,0,1,0,0,0
4,00008858_002.png,8858,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Split into train, val and test; keep each PatientId in exactly one split

In [9]:
from sklearn.model_selection import train_test_split

# Step 5: Split into Train, Validation, and Test Sets
# Ensure no patient appears in more than one set
def _collapse_rare_strata(strata, min_members=2):
    """Coarsen stratification labels until every stratum has at least ``min_members`` rows.

    The highest label is repeatedly folded into the next highest one. Labels
    are returned unchanged when every stratum is already large enough. Returns
    None when no usable stratification exists (empty input, or a single
    stratum that is itself too small).
    """
    strata = strata.copy()
    while True:
        sizes = strata.value_counts()
        if len(sizes) > 0 and sizes.min() >= min_members:
            return strata
        if len(sizes) <= 1:
            return None
        # Fold the largest label into the second largest one and re-check.
        second_highest = sorted(sizes.index)[-2]
        strata = strata.clip(upper=second_highest)


def split_by_patient_id(df, test_size=0.05, val_size=0.05, random_state=random_state):
    """Split ``df`` into train/val/test frames so that no patient spans two splits.

    Patients (not images) are split, stratified by the number of distinct
    diseases each patient has across all of their images.

    Parameters:
        df: frame with a 'PatientId' column and the ``label_text`` columns.
        test_size: fraction of patients assigned to the test split.
        val_size: fraction of patients assigned to the validation split.
        random_state: seed forwarded to both ``train_test_split`` calls.

    Returns:
        (df_train, df_val, df_test): row subsets of ``df``.
    """
    # One row per patient: a label is 1 if any of the patient's images has it.
    patients_df = df.groupby('PatientId')[label_text].max().reset_index()

    # Simple stratification target: number of distinct diseases per patient.
    patients_df['n_diseases'] = patients_df[label_text].sum(axis=1)

    # train_test_split rejects strata with a single member, which is likely for
    # the rare high disease counts on small subsets, so rare strata are merged
    # first. When every stratum is large enough the labels are left untouched.
    train_patients, test_patients = train_test_split(
        patients_df,
        test_size=test_size,
        random_state=random_state,
        stratify=_collapse_rare_strata(patients_df['n_diseases']),
    )

    # val_size is a fraction of all patients, so rescale it to the share of
    # patients that remain after the test split was removed.
    train_patients, val_patients = train_test_split(
        train_patients,
        test_size=val_size/(1-test_size),
        random_state=random_state,
        stratify=_collapse_rare_strata(train_patients['n_diseases']),
    )

    # Map the patient-level assignment back onto the image rows.
    df_train = df[df['PatientId'].isin(train_patients['PatientId'])]
    df_val = df[df['PatientId'].isin(val_patients['PatientId'])]
    df_test = df[df['PatientId'].isin(test_patients['PatientId'])]

    return df_train, df_val, df_test

# Patient-level split of the working subset.
(df_train, df_val, df_test) = split_by_patient_id(df_subset)

# Report the size of each split, then preview the first training rows.
print(
    f"{df_train.shape = }",
    f"{df_val.shape   = }",
    f"{df_test.shape  = }\n",
    sep="\n",
)

display(df_train.head(2))

df_train.shape = (101136, 16)
df_val.shape   = (5429, 16)
df_test.shape  = (5555, 16)



Unnamed: 0,Image,PatientId,Infiltration,Effusion,Atelectasis,Nodule,Mass,Pneumothorax,Consolidation,Pleural_Thickening,Cardiomegaly,Emphysema,Edema,Fibrosis,Pneumonia,Hernia
0,00001249_010.png,1249,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,00029173_000.png,29173,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Disease distribution across sets

In [10]:
def check_distribution(df_train, df_val, df_test, diseases):
    """Print, per disease, the percentage of positive rows in each split.

    Parameters:
        df_train, df_val, df_test: frames containing one column per disease.
        diseases: iterable of column names to report.

    The name column is sized to the longest disease name (never narrower than
    12 characters) so the numeric columns stay aligned for long names.
    """
    diseases = list(diseases)
    name_width = max([12] + [len(str(disease)) for disease in diseases])

    print("Disease prevalence (%):")
    print(f"{'Disease':{name_width}} {'Train':>8} {'Val':>8} {'Test':>8}")
    # 28 extra characters keep the 40-character rule used with the 12-wide layout.
    print("-" * (name_width + 28))
    for disease in diseases:
        train_prev = df_train[disease].mean() * 100
        val_prev = df_val[disease].mean() * 100
        test_prev = df_test[disease].mean() * 100
        print(f"{disease:{name_width}} {train_prev:8.2f} {val_prev:8.2f} {test_prev:8.2f}")

check_distribution(df_train, df_val, df_test, label_text)

# Check multi-label distribution
print("\nNumber of conditions per image:")
# The loop variable is deliberately not called `df`: at notebook scope that
# would rebind the full label table loaded from the CSV to the last split.
for name, split_df in [('Train', df_train), ('Val', df_val), ('Test', df_test)]:
    # Row-wise sum of the label columns = number of conditions on each image.
    conditions = split_df[label_text].sum(axis=1)
    print(f"\n{name} set:")
    print(conditions.value_counts().sort_index())

Disease prevalence (%):
Disease         Train      Val     Test
----------------------------------------
Infiltration    17.78    15.99    18.76
Effusion        11.91    11.11    12.12
Atelectasis     10.27    10.24    11.14
Nodule           5.61     6.69     5.36
Mass             5.10     6.63     4.77
Pneumothorax     4.71     5.62     4.23
Consolidation     4.11     4.13     5.24
Pleural_Thickening     3.02     3.15     2.79
Cardiomegaly     2.53     1.86     2.07
Emphysema        2.22     2.67     2.30
Edema            2.07     1.69     2.18
Fibrosis         1.51     1.31     1.60
Pneumonia        1.28     1.05     1.39
Hernia           0.19     0.39     0.31

Number of conditions per image:

Train set:
0    54465
1    27957
2    12891
3     4362
4     1113
5      272
6       60
7       13
8        1
9        2
Name: count, dtype: int64

Val set:
0    2956
1    1454
2     696
3     231
4      69
5      18
6       3
7       2
Name: count, dtype: int64

Test set:
0    2940
1    1552


### Patient Characteristics

In [11]:
def check_patient_stats(df, name):
    """Print how many distinct patients ``df`` holds and how its rows spread over them.

    Parameters:
        df: frame with a 'PatientId' column, one row per image.
        name: label of the split, used in the printed heading.
    """
    per_patient = df.groupby('PatientId').size()
    n_patients = len(per_patient)
    fewest = per_patient.min()
    most = per_patient.max()
    average = per_patient.mean()

    print(f"\n{name} set patient statistics:")
    print(f"Unique patients: {n_patients}")
    print(f"Images per patient: min={fewest}, max={most}, avg={average:.2f}")

# Named `split_df` rather than `df` so this notebook-level loop does not
# overwrite the full label table with the last split.
for name, split_df in [('Train', df_train), ('Val', df_val), ('Test', df_test)]:
    check_patient_stats(split_df, name)


Train set patient statistics:
Unique patients: 27723
Images per patient: min=1, max=184, avg=3.65

Val set patient statistics:
Unique patients: 1541
Images per patient: min=1, max=81, avg=3.52

Test set patient statistics:
Unique patients: 1541
Images per patient: min=1, max=173, avg=3.60


In [12]:
# Distinct patient IDs present in each split.
train_patients = set(df_train['PatientId'])
val_patients = set(df_val['PatientId'])
test_patients = set(df_test['PatientId'])

# Pairwise intersections; a non-empty set means a patient sits in two splits.
train_val_overlap = train_patients & val_patients
train_test_overlap = train_patients & test_patients
val_test_overlap = val_patients & test_patients

print("Checking for patient overlap between datasets:")

print(f"\nTrain-Val overlap: {len(train_val_overlap)} patients")
if train_val_overlap:
    print("Patients in both train and val:", train_val_overlap)

print(f"\nTrain-Test overlap: {len(train_test_overlap)} patients")
if train_test_overlap:
    print("Patients in both train and test:", train_test_overlap)

print(f"\nVal-Test overlap: {len(val_test_overlap)} patients")
if val_test_overlap:
    print("Patients in both val and test:", val_test_overlap)

# Patient totals per split, for reference.
print("\nTotal unique patients in each set:")
print(f"Train: {len(train_patients)}")
print(f"Val: {len(val_patients)}")
print(f"Test: {len(test_patients)}")

Checking for patient overlap between datasets:

Train-Val overlap: 0 patients

Train-Test overlap: 0 patients

Val-Test overlap: 0 patients

Total unique patients in each set:
Train: 27723
Val: 1541
Test: 1541


# Save variables

In [13]:
import os 
import pickle

def clear_files_keep_folders(root_dir):
    """Remove every file below ``root_dir`` while leaving all directories in place.

    Each removal is reported on stdout. A file that cannot be removed is
    reported and skipped rather than raising.
    """
    # Lazily enumerate file paths in os.walk order.
    file_paths = (
        os.path.join(folder, file_name)
        for folder, _subfolders, file_names in os.walk(root_dir)
        for file_name in file_names
    )
    for file_path in file_paths:
        try:
            os.remove(file_path)
            print(f"Deleted: {file_path}")
        except Exception as e:
            print(f"Error deleting {file_path}: {e}")

In [14]:
# Prediction cut-off saved with the other settings.
# NOTE(review): the `_gt` suffix suggests scores strictly greater than this
# value count as positive — confirm with the code that consumes it.
y_pred_threshold_gt = 0.5

In [15]:
# Settings and data splits persisted together in a single pickle.
data_files = {
    "target_w":              target_w,
    "target_h":              target_h,
    "input_shape":           input_shape,
    "target_size":           target_size,
    "dataset_size":          dataset_size,
    "local_image_directory": local_image_directory,
    "num_labels":            num_labels,
    "label_text":            label_text,
    "y_pred_threshold_gt":   y_pred_threshold_gt,
    "df_train":              df_train,
    "df_val":                df_val,
    "df_test":               df_test
}

# Clean up existing directories.
# WARNING: this removes every file below `variable_dir`, not only data_files.pkl.
variable_dir = 'saved_variables'
clear_files_keep_folders(variable_dir)
os.makedirs(variable_dir, exist_ok=True)

# Build the path from `variable_dir` so the pickle always lands in the directory
# that was just prepared, even if the directory name is changed later.
data_files_path = os.path.join(variable_dir, 'data_files.pkl')
with open(data_files_path, 'wb') as f:
    pickle.dump(data_files, f)
print("Saved 'data_files'")

Deleted: saved_variables/data_files.pkl
Saved 'data_files'


# Load the saved variables to verify the round trip

In [16]:
import pickle
import numpy as np

# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('saved_variables/data_files.pkl', 'rb') as f:
    data_files = pickle.load(f)

# Print a compact summary of every restored entry.
for key, value in data_files.items():
    if key.startswith('df_'):
        # For dataframes, show the shape and the first two rows instead of the full frame
        print(f"\n{key}:")
        print(value.shape)
        display(value.head(2))
    elif key == 'y_true' and isinstance(value, (list, tuple, np.ndarray)):
        # np.shape also handles lists and tuples, which have no .shape attribute
        print(f"\n{key:<25} shape = {np.shape(value)}")
    else:
        # For other variables, print normally
        print(f"{key:<25} = {value}")

# Re-create every saved entry as a notebook-level variable of the same name.
globals().update(data_files)

target_w                  = 320
target_h                  = 320
input_shape               = (320, 320, 3)
target_size               = (320, 320)
dataset_size              = 112120
local_image_directory     = nih_xrays_320
num_labels                = 14
label_text                = ['Infiltration', 'Effusion', 'Atelectasis', 'Nodule', 'Mass', 'Pneumothorax', 'Consolidation', 'Pleural_Thickening', 'Cardiomegaly', 'Emphysema', 'Edema', 'Fibrosis', 'Pneumonia', 'Hernia']
y_pred_threshold_gt       = 0.5

df_train:
(101136, 16))


Unnamed: 0,Image,PatientId,Infiltration,Effusion,Atelectasis,Nodule,Mass,Pneumothorax,Consolidation,Pleural_Thickening,Cardiomegaly,Emphysema,Edema,Fibrosis,Pneumonia,Hernia
0,00001249_010.png,1249,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,00029173_000.png,29173,0,0,0,0,0,0,0,0,0,0,0,0,0,0



df_val:
(5429, 16))


Unnamed: 0,Image,PatientId,Infiltration,Effusion,Atelectasis,Nodule,Mass,Pneumothorax,Consolidation,Pleural_Thickening,Cardiomegaly,Emphysema,Edema,Fibrosis,Pneumonia,Hernia
1,00018521_004.png,18521,0,0,0,0,0,0,0,0,0,0,0,0,0,0
72,00016429_012.png,16429,0,0,0,0,0,1,0,0,0,0,0,0,0,0



df_test:
(5555, 16))


Unnamed: 0,Image,PatientId,Infiltration,Effusion,Atelectasis,Nodule,Mass,Pneumothorax,Consolidation,Pleural_Thickening,Cardiomegaly,Emphysema,Edema,Fibrosis,Pneumonia,Hernia
15,00020057_000.png,20057,0,0,0,0,0,0,0,0,0,1,0,0,0,0
40,00025085_018.png,25085,0,0,0,0,0,0,0,0,0,0,0,0,0,0
