In [None]:
import pandas as pd

## Generate Splits

In [None]:
# code from builder.py

import torch
import os
import random

# Build `final_dicoms`: the dicom ids whose study has fully certain labels and
# whose ViewPosition is one of `views`.
#
# make sure to do both! (run this cell once per label_type)

# label_type = "negbio"
label_type = "chexpert"

# ViewPosition values to keep.
# NOTE(review): the isin() match below is exact and case-sensitive — confirm
# these strings agree with the values stored in the metadata's ViewPosition
# column.
views = ["PA", "lateral"]

# Raw prefix prepended to every file name below (no separator is inserted),
# so a non-empty value must end with a path separator, e.g. "data/".
data_dir = ""

labels_df = pd.read_csv(f"{data_dir}{label_type}.csv")

# change NaNs to 0, then drop every row that contains a -1.0
cleaned_labels_df = labels_df.fillna(0).replace(-1.0, pd.NA).dropna()

# One float32 row per remaining study. The first two columns (subject_id and
# study_id) are identifiers rather than labels, so they are skipped.
# Converting the whole block at once replaces a slow iterrows() loop and also
# works when no rows survive the cleaning step.
labels = torch.tensor(
    cleaned_labels_df.iloc[:, 2:].to_numpy(dtype="float32"), dtype=torch.float
)

# study_id -> label row (iterating a 2-D tensor yields its rows)
filtered_study_id_to_labels = dict(zip(cleaned_labels_df.study_id, labels))

# Same prefix convention as the labels file; the previous
# os.path.join(data_dir, f"{data_dir}...") applied data_dir twice.
metadata_df = pd.read_csv(f"{data_dir}metadata.csv")

# only choose the filtered metadata based on the filtered labels;
# .copy() gives an independent frame so the column assignment below does not
# write into a slice of metadata_df
filtered_metadata_df = metadata_df[
    metadata_df["study_id"].isin(list(filtered_study_id_to_labels.keys()))
].copy()

# insert the labels
filtered_metadata_df["labels"] = filtered_metadata_df["study_id"].map(
    filtered_study_id_to_labels
)

# only the desired views
final_metadata_df = filtered_metadata_df[
    filtered_metadata_df["ViewPosition"].isin(views)
]

final_dicoms = final_metadata_df.dicom_id.to_list()

subset = True
# Seed for drawing the subset, so the same ids are selected on every run.
subset_seed = 42

if subset:
    num = 19659
    # A private, seeded generator keeps the draw reproducible without
    # touching the global `random` state.
    random.Random(subset_seed).shuffle(final_dicoms)
    final_dicoms = final_dicoms[:num]

In [None]:
# Show only the first few ids; displaying the whole list would put thousands
# of lines into the cell output.
final_dicoms[:10]

In [None]:
# Number of dicom ids that will be split (at most `num` when `subset` is on).
len(final_dicoms)

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of the dicom ids held out for the test split.
test_p = 0.2
# Passed as random_state so the shuffle is repeatable for a given input order.
seed = 42

# NOTE(review): the split is drawn over individual dicom ids; if one study or
# subject contributes several images they may land on both sides of the
# split — confirm this is acceptable.
train, test = train_test_split(final_dicoms, test_size=test_p, random_state=seed, shuffle=True)

In [None]:
# Size of the training partition.
len(train)

In [None]:
# Size of the test partition.
len(test)

In [None]:
def save_splits_csv(train_dicoms, test_dicoms, label_type, views):
    """Write the train/test assignment of dicom ids to a CSV file.

    The file has one row per dicom id with the columns ``dicom_id`` and
    ``split`` ("train" or "test"), train rows first. The DataFrame index is
    written as well (``to_csv`` default). The file name is built from
    ``label_type`` and the dash-joined ``views`` and is relative to the
    current working directory.

    Note: besides its arguments, this function reads the notebook-level
    variables ``subset`` and ``num``; when ``subset`` is truthy, ``num`` is
    appended to the file name.

    Args:
        train_dicoms: dicom ids assigned to the training split.
        test_dicoms: dicom ids assigned to the test split.
        label_type: label source name used in the file name.
        views: non-empty sequence of view names used in the file name.

    Raises:
        AssertionError: if ``views`` is empty.
    """
    assert len(views) >= 1

    # Build one tagged frame per split, then stack them with a fresh index.
    tagged_frames = [
        pd.DataFrame(dicoms, columns=["dicom_id"]).assign(split=split_name)
        for split_name, dicoms in (("train", train_dicoms), ("test", test_dicoms))
    ]
    all_df = pd.concat(tagged_frames, ignore_index=True, axis=0)

    # `num` is only looked up when a subset was drawn.
    filename = (
        f"mimic-cxr-2.0.0-split-{label_type}-{'-'.join(views)}-{num}.csv"
        if subset
        else f"mimic-cxr-2.0.0-split-{label_type}-{'-'.join(views)}.csv"
    )
    all_df.to_csv(filename)

In [None]:
# Write the split assignment to a CSV in the current working directory.
save_splits_csv(train, test, label_type, views)

## Labels

In [None]:
# Load one label file for inspection; swap the commented line to look at the
# CheXpert labels instead of the NegBio ones.
# NOTE: this rebinds `labels`, a name the "Generate Splits" section also assigns.
# labels = pd.read_csv("data/mimic-cxr-2.0.0-chexpert.csv")
labels = pd.read_csv("data/mimic-cxr-2.0.0-negbio.csv")

labels

In [None]:
# Load the per-image metadata table (it is filtered on ViewPosition and joined
# on study_id in the following cells).
metadata = pd.read_csv("data/mimic-cxr-2.0.0-metadata.csv")

metadata

In [None]:
# Keep only the metadata rows whose ViewPosition is exactly "PA".
# NOTE(review): the variable is named `only_lateral` although the filter keeps
# "PA" rows — confirm which of the two is intended.
only_lateral = metadata.loc[metadata["ViewPosition"].eq("PA")]

# Left join on study_id: every selected image row is kept, with missing label
# values where no label row matches.
joined = only_lateral.merge(labels, on="study_id", how="left")

joined

In [None]:
# Preview the first 15 joined rows.
joined.head(15)

In [None]:
# Label columns selected from the joined frame.
CLASSES = [
    "Atelectasis",
    "Cardiomegaly",
    "Consolidation",
    "Edema",
    "Enlarged Cardiomediastinum",
    "Fracture",
    "Lung Lesion",
    "Lung Opacity",
    "Pleural Effusion",
    "Pneumonia",
    "Pneumothorax",
    "Pleural Other",
    "Support Devices",
    "No Finding",
]

# Identifier/view columns followed by one column per class.
# "subject_id_x" is presumably the metadata-side subject_id (pandas' default
# left-hand suffix after the merge) — verify against `joined.columns`.
filtered = joined.loc[:, ["subject_id_x", "study_id", "ViewPosition", *CLASSES]]

# Missing values become 0; any row that contains a -1.0 is then discarded.
dataset = (
    filtered
    .fillna(0)
    .replace(-1.0, pd.NA)
    .dropna()
)

dataset

In [None]:
# Number of distinct study ids left after the cleaning step.
dataset["study_id"].nunique()