# NIH Dataset EDA

In [None]:
# pip install -r requirements.txt

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')  # For SCC compatibility (no display)
import matplotlib.pyplot as plt
import seaborn as sns

# Deep Learning Libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import models, transforms
import torchvision.datasets as datasets

# Evaluation Metrics
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score

# For reproducibility
import random


In [2]:
# Root folder of the NIH chest X-ray dataset on the cluster
data_path = "/projectnb/dl4ds/projects/dca_project/nih_data"

# Show every column and allow wide rows so the head() previews are not truncated
for option_name, option_value in (('display.max_columns', None), ('display.width', 1000)):
    pd.set_option(option_name, option_value)

# Read the bounding-box annotations and the per-image metadata table
bbox_csv_path = os.path.join(data_path, "BBox_List_2017.csv")
data_entry_csv_path = os.path.join(data_path, "Data_Entry_2017.csv")
bbox_df = pd.read_csv(bbox_csv_path)
data_entry_df = pd.read_csv(data_entry_csv_path)

# Preview the first rows of each table under its own banner line
previews = (
    ("Bounding Box Data:--------------------------------------------------------------", bbox_df),
    ("\nImage Metadata:---------------------------------------------------------------", data_entry_df),
)
for banner, frame in previews:
    print(banner)
    print(frame.head())



Bounding Box Data:--------------------------------------------------------------
        Image Index Finding Label     Bbox [x           y           w          h]  Unnamed: 6  Unnamed: 7  Unnamed: 8
0  00013118_008.png   Atelectasis  225.084746  547.019217   86.779661   79.186441         NaN         NaN         NaN
1  00014716_007.png   Atelectasis  686.101695  131.543498  185.491525  313.491525         NaN         NaN         NaN
2  00029817_009.png   Atelectasis  221.830508  317.053115  155.118644  216.949153         NaN         NaN         NaN
3  00014687_001.png   Atelectasis  726.237288  494.951420  141.016949   55.322034         NaN         NaN         NaN
4  00017877_001.png   Atelectasis  660.067797  569.780787  200.677966   78.101695         NaN         NaN         NaN

Image Metadata:---------------------------------------------------------------
        Image Index          Finding Labels  Follow-up #  Patient ID  Patient Age Patient Gender View Position  OriginalImage[Width

In [None]:
from PIL import Image
from IPython.display import display  # Rich rendering of PIL images in the notebook

# Folder holding the first shard of the NIH image files
images_path = os.path.join(data_path, "images_001/images")

# Number of images to preview; kept at 1 so the notebook output stays small
NUM_PREVIEW_IMAGES = 1

# os.listdir() returns entries in arbitrary order, so sort them to make the
# preview reproducible, and keep only PNG files so stray entries (hidden
# folders, checkpoints) are never handed to Image.open().
image_filenames = sorted(
    entry for entry in os.listdir(images_path) if entry.lower().endswith(".png")
)[:NUM_PREVIEW_IMAGES]
print("Images found:", image_filenames)

# Display images directly in Jupyter Notebook
for img_filename in image_filenames:
    img_path = os.path.join(images_path, img_filename)
    # The context manager closes the underlying file once the image is rendered
    with Image.open(img_path) as img:
        display(img)


# Code for similar project: https://github.com/thibaultwillmann/CheXNet-Pytorch/blob/master/CheXnet.ipynb

# Find the number of unique labels
# Find the percentage of images that are labeled as 'No Finding'

In [10]:
import os
import pandas as pd

# Directory on the cluster that holds the NIH CSV metadata
data_path = "/projectnb/dl4ds/projects/dca_project/nih_data"

# Read the per-image metadata table
data_entry_df = pd.read_csv(os.path.join(data_path, "Data_Entry_2017.csv"))

# Denominator for the percentages below: every row of the table
total_cases = len(data_entry_df)

# --- Part 1: distinct individual finding labels ---
# One cell can carry several labels separated by "|"; flatten all non-missing
# cells into a single set of whitespace-trimmed label names.
all_individual_labels = {
    piece.strip()
    for cell in data_entry_df['Finding Labels']
    if pd.notna(cell)
    for piece in cell.split('|')
}

# Alphabetical order keeps the printed list stable between runs
unique_individual_labels = sorted(all_individual_labels)

print(f"Number of unique individual finding labels: {len(unique_individual_labels)}")
print("Unique individual finding labels:")
for label_name in unique_individual_labels:
    print(label_name)

# --- Part 2: distinct label combinations and how often each occurs ---
# Each raw "Finding Labels" string (missing values dropped) is one combination
label_combinations_counts = data_entry_df['Finding Labels'].dropna().value_counts()

# (combination, count) pairs ordered from most to least frequent
sorted_combinations = sorted(
    label_combinations_counts.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Summary lines followed by the table header
header_lines = (
    f"\nNumber of unique possible outcomes in 'Finding Labels': {len(sorted_combinations)}",
    f"Total number of cases: {total_cases}",
    "Unique combinations of finding labels (sorted by number of cases):",
    "Combination | Number of Cases | Percentage of Total Cases",
    "-" * 60,
)
for header_line in header_lines:
    print(header_line)

# One row per combination: name, absolute count, share of all rows
for combo, count in sorted_combinations:
    percentage = (count / total_cases) * 100
    print(f"{combo:<40} | {count:<15} | {percentage:.2f}%")

Number of unique individual finding labels: 15
Unique individual finding labels:
Atelectasis
Cardiomegaly
Consolidation
Edema
Effusion
Emphysema
Fibrosis
Hernia
Infiltration
Mass
No Finding
Nodule
Pleural_Thickening
Pneumonia
Pneumothorax

Number of unique possible outcomes in 'Finding Labels': 836
Total number of cases: 112120
Unique combinations of finding labels (sorted by number of cases):
Combination | Number of Cases | Percentage of Total Cases
------------------------------------------------------------
No Finding                               | 60361           | 53.84%
Infiltration                             | 9547            | 8.51%
Atelectasis                              | 4215            | 3.76%
Effusion                                 | 3955            | 3.53%
Nodule                                   | 2705            | 2.41%
Pneumothorax                             | 2194            | 1.96%
Mass                                     | 2139            | 1.91%
Effusion|Infil

# Number of pneumonia cases

In [1]:
import os
import pandas as pd

# Directory on the cluster that holds the NIH CSV metadata
data_path = "/projectnb/dl4ds/projects/dca_project/nih_data"

# Read the per-image metadata table
data_entry_csv = os.path.join(data_path, "Data_Entry_2017.csv")
data_entry_df = pd.read_csv(data_entry_csv)

# True for every row whose "Finding Labels" text mentions "Pneumonia";
# rows with a missing label are treated as non-matches.
has_pneumonia = data_entry_df['Finding Labels'].str.contains('Pneumonia', na=False)
pneumonia_count = has_pneumonia.sum()

# Size of the whole dataset, used as the denominator
total_xrays = len(data_entry_df)

# Share of the dataset that mentions pneumonia, in percent
pneumonia_percentage = (pneumonia_count / total_xrays) * 100

# Report the totals
for report_line in (
    f"Total number of X-rays in the dataset: {total_xrays}",
    f"Number of X-rays with 'Pneumonia': {pneumonia_count}",
    f"Percentage of X-rays with 'Pneumonia': {pneumonia_percentage:.2f}%",
):
    print(report_line)

Total number of X-rays in the dataset: 112120
Number of X-rays with 'Pneumonia': 1431
Percentage of X-rays with 'Pneumonia': 1.28%
