In [5]:
from PIL import Image

In [3]:
import pandas as pd
import os
import shutil
import uuid
import csv

# Build a one-label-per-row image subset from train.csv:
#   * keep frontal PA rows that have at least one target column equal to 1.0,
#   * copy the image into `output_dir` once per positive target column,
#   * write one row per copy to `label_file`.
df = pd.read_csv("train.csv")

# Column name in train.csv -> integer class id written to the label file.
target_labels = {
    "No Finding": 0,
    "Cardiomegaly": 1,
    "Lung Lesion": 2,         # equivalent to Nodule/Mass
    "Lung Opacity": 3,
    "Pleural Effusion": 4,
    "Consolidation": 5
}

# Frontal PA images only
df = df[(df["Frontal/Lateral"] == "Frontal") & (df["AP/PA"] == "PA")]

# Keep only rows with at least one *positive* (== 1.0) target label.
# A plain truthiness test (`.any`) would also keep rows whose only non-zero
# values are something other than 1.0 (e.g. -1.0, if the CSV uses it --
# presumably an "uncertain" marker; confirm against the dataset docs). Those
# rows never produce output below, so they are filtered out up front.
positive_mask = (
    df[list(target_labels)].fillna(0).astype(float).eq(1.0).any(axis=1)
)
df_filtered = df[positive_mask]

output_dir = "chexpert_subset"
os.makedirs(output_dir, exist_ok=True)
label_file = "chexpert_subset.csv"

# Create a CSV file to store the labels
with open(label_file, mode='w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["image_id", "class_name", "class_id", "label"])

    count = 0           # image-label pairs copied and written
    missing_count = 0   # rows skipped because the source image is not on disk

    for _, row in df_filtered.iterrows():
        # Paths in the CSV carry a dataset-root prefix; strip it so the path
        # is resolved relative to the current working directory.
        original_path = row["Path"].replace("CheXpert-v1.0-small/", "")
        if not os.path.exists(original_path):
            missing_count += 1
            continue

        ext = os.path.splitext(original_path)[1]

        for class_name, class_id in target_labels.items():
            if float(row[class_name]) != 1.0:
                continue

            # Deterministic id derived from (source path, class): re-running
            # this cell overwrites the same files instead of piling up new
            # randomly named copies that the label file no longer references.
            image_id = uuid.uuid5(
                uuid.NAMESPACE_URL, f"{original_path}#{class_name}"
            ).hex
            new_filename = f"{image_id}{ext}"
            new_path = os.path.join(output_dir, new_filename)
            shutil.copy(original_path, new_path)

            # `label` intentionally mirrors `class_id`.
            writer.writerow([image_id, class_name, class_id, class_id])
            count += 1

if missing_count:
    print(f"Skipped {missing_count} rows whose source image was not found")
print(f"Copied and labeled {count} image-label pairs to '{output_dir}' and '{label_file}'")


Copied and labeled 29842 image-label pairs to 'chexpert_subset' and 'chexpert_subset.csv'


In [6]:

# Convert every file found in `image_folder` to a 224x224 grayscale image and
# save it under the same file name in `processed_folder`. Files that cannot be
# processed are reported and skipped so one bad image does not abort the run.
image_folder = "chexpert_subset"
processed_folder = "chexpert_resized"
os.makedirs(processed_folder, exist_ok=True)


for img_name in os.listdir(image_folder):
    img_path = os.path.join(image_folder, img_name)
    save_path = os.path.join(processed_folder, img_name)

    try:
        # The context manager guarantees the source file handle is released
        # even when decoding fails part-way through.
        with Image.open(img_path) as source_img:
            # grayscale, then resize; both calls return new in-memory images,
            # so the result stays valid after the source is closed.
            img = source_img.convert("L").resize((224, 224))
        img.save(save_path)
    except Exception as e:
        # Deliberate best-effort: log the failure and continue with the rest.
        print(f"Error processing {img_name}: {e}")

print(f"Resized images saved to '{processed_folder}'")

Resized images saved to 'chexpert_resized'


In [7]:
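## One-off fix: an earlier export was missing the `label` column, so it was backfilled from `class_id` with the code below (kept commented out for reference; the export cell writes both columns).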

# df = pd.read_csv("chexpert_subset.csv")
# df["label"] = df["class_id"]
# df.to_csv("chexpert_subset.csv", index=False)