In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [None]:
# Parent directory that contains the folder 'CheXpert-v1.0'.
# Set the CHEXPERT_IMG_DIR environment variable to point somewhere else; when
# it is unset the original location is used. Joining with "" guarantees the
# trailing separator that the string concatenations in later cells rely on.
img_data_dir = os.path.join(
    os.environ.get(
        "CHEXPERT_IMG_DIR", "/Users/felixkrones/python_projects/data/ChestXpert/"
    ),
    "",
)
split_file = "valid.csv"  # "train.csv" or "valid.csv"
img_size = 128  # images are resized to img_size x img_size during pre-processing
to_filter = False  # True: keep only the race/ethnicity groups selected in the "Filter" section

# Study population

## Merge data, then rename and label columns

In [None]:
# Folder holding the demographics spreadsheet and the split CSV files.
data_dir = "../datafiles/chexpert/"

# Spreadsheet headers -> column names used in the rest of the notebook.
demo_column_names = {
    "PRIMARY_RACE": "race",
    "PATIENT": "patient_id",
    "GENDER": "sex",
    "AGE_AT_CXR": "age",
    "ETHNICITY": "ethnicity",
}

df_demo = (
    pd.read_excel(data_dir + "CHEXPERT DEMO.xlsx", engine="openpyxl")
    .rename(columns=demo_column_names)
    # sex and age are removed from the demographics table; the image table's
    # own "Sex"/"Age" columns are renamed to these names in a later cell.
    .drop(["sex", "age"], axis=1)
)
df_demo.head(2)

In [None]:
# Read the per-image table of the configured split and report its size.
split_csv_path = data_dir + split_file
df_img_data = pd.read_csv(split_csv_path)
print(df_img_data.shape)

In [None]:
# Only the training CSV is combined with the separate split-assignment table.
if split_file == "train.csv":
    split_table_path = data_dir + "chexpert_split_2021_08_20.csv"
    df_data_split = pd.read_csv(split_table_path).set_index("index")

    # Column-wise concat aligns both tables on their index; rows left without
    # a split assignment are discarded.
    df_img_data = pd.concat([df_img_data, df_data_split], axis=1)
    has_split = df_img_data["split"].notna()
    df_img_data = df_img_data[has_split]

In [None]:
# patient_id is the third "/"-separated component of each image path.
path_parts = df_img_data["Path"].str.split("/", expand=True)
df_img_data["patient_id"] = path_parts[2]

# Lower-case the age/sex headers so they match the naming used elsewhere.
df_img_data = df_img_data.rename(columns={"Age": "age", "Sex": "sex"})
print(df_img_data.shape)

In [None]:
# Attach the demographics to the image table via patient_id.
if split_file == "train.csv":
    # Default (inner) merge: only patients present in both tables are kept.
    df_cxr = df_demo.merge(df_img_data, on="patient_id")
elif split_file == "valid.csv":
    # Left merge: every image row is kept even without a demographics match.
    df_cxr = pd.merge(df_img_data, df_demo, on="patient_id", how="left")
else:
    raise ValueError(f"split_file must be train.csv or valid.csv, not {split_file}")

# Report the size of the merged frame (the original printed the pre-merge
# df_img_data again, which hid any rows gained or lost in the merge).
print(df_cxr.shape)

In [None]:
# Canonical race group names used for normalisation, labels and subsets.
white, asian, black = "White", "Asian", "Black"

In [None]:
# Collapse every race string that contains one of the canonical names onto
# that name. Missing values never match (na=False). The order Black -> White
# -> Asian is kept from the original cell because each mask is computed on the
# column as already updated by the previous step.
for race_name in (black, white, asian):
    mask = df_cxr["race"].str.contains(race_name, na=False)
    df_cxr.loc[mask, "race"] = race_name

# Show the remaining categories. A bare expression in the middle of a cell is
# not displayed, so it has to be printed explicitly.
print(df_cxr["race"].unique())
print(df_cxr.shape)

In [None]:
# Keep frontal views only. The explicit copy makes df_cxr an independent frame
# so that the column assignments in the following cells do not operate on a
# slice of the merged table (which triggers SettingWithCopyWarning).
df_cxr = df_cxr[df_cxr["Frontal/Lateral"] == "Frontal"].copy()
print(df_cxr.shape)

In [None]:
# Integer code per race group: White -> 0, Asian -> 1, Black -> 2.
# Any other value in the column is left as it is.
df_cxr["race_label"] = df_cxr["race"]
for race_code, race_name in enumerate((white, asian, black)):
    is_group = df_cxr["race_label"] == race_name
    df_cxr.loc[is_group, "race_label"] = race_code

In [None]:
# Integer code per sex: Male -> 0, Female -> 1. Other values stay unchanged.
df_cxr["sex_label"] = df_cxr["sex"]
for sex_code, sex_name in enumerate(("Male", "Female")):
    is_sex = df_cxr["sex_label"] == sex_name
    df_cxr.loc[is_sex, "sex_label"] = sex_code

In [None]:
# Three-way disease category. Start from the fallback "Other" so the column is
# a string column from the beginning; filling a float NaN column with strings
# is an upcast that recent pandas versions deprecate.
# Precedence is unchanged: "Pleural Effusion" overrides "No Finding".
df_cxr["disease"] = "Other"
df_cxr.loc[df_cxr["No Finding"] == 1, "disease"] = "No Finding"
df_cxr.loc[df_cxr["Pleural Effusion"] == 1, "disease"] = "Pleural Effusion"

# Integer code per category; every row holds one of the three keys, so the
# mapping leaves no missing values.
disease_codes = {"No Finding": 0, "Pleural Effusion": 1, "Other": 2}
df_cxr["disease_label"] = df_cxr["disease"].map(disease_codes)

## Filter

In [None]:
# Optional restriction of the study population (controlled by to_filter).
print(f"Shape : {df_cxr.shape}")
if to_filter:
    keep_race = df_cxr["race"].isin([asian, black, white])
    keep_ethnicity = df_cxr["ethnicity"].isin(["Non-Hispanic/Non-Latino", "Not Hispanic"])
    # A row survives only when it passes both criteria.
    df_cxr = df_cxr[keep_race & keep_ethnicity]
    print(f"Shape after filter: {df_cxr.shape}")

In [None]:
# Preview the first five rows of the labelled table.
df_cxr.head(5)

In [None]:
# Preview the last five rows of the labelled table.
df_cxr.tail(5)

### Pre-processing of imaging dataset (this may take a few hours, needed only once)

In [None]:
from skimage.io import imread
from skimage.io import imsave
from skimage.transform import resize

# Fresh 0..n-1 index so the collected paths line up with the rows.
df_cxr = df_cxr.reset_index(drop=True)

# Output folder name encodes the target size and the number of images.
preproc_dir = f"preproc_{img_size}x{img_size}_len_{len(df_cxr)}/"
out_dir = img_data_dir

# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(out_dir + preproc_dir, exist_ok=True)

# Collect the relative output paths and assign the column once after the loop
# instead of doing one scalar .loc write per image.
preproc_paths = []
for p in tqdm(df_cxr["Path"]):
    split = p.split("/")
    # <patient>_<study>_<view file>: path components 2, 3 and 4.
    preproc_filename = split[2] + "_" + split[3] + "_" + split[4]
    preproc_paths.append(preproc_dir + preproc_filename)
    out_path = out_dir + preproc_dir + preproc_filename

    # Images that were already written by an earlier run are skipped.
    if not os.path.exists(out_path):
        image = imread(img_data_dir + p)
        # preserve_range keeps the original intensity scale so the cast to
        # uint8 below does not operate on values rescaled to [0, 1].
        image = resize(image, output_shape=(img_size, img_size), preserve_range=True)
        imsave(out_path, image.astype(np.uint8))

df_cxr["path_preproc"] = preproc_paths

### Save full sample

In [None]:
# Check the last five rows, now including the path_preproc column.
df_cxr.tail(5)

In [None]:
# Write the full sample; the file name encodes image size, source split and
# whether the population filter was applied.
split_name = split_file.split(".")[0]
sample_csv_path = data_dir + f"chexpert.sample_{img_size}_from_{split_name}_filtered_{to_filter}.csv"
df_cxr.to_csv(sample_csv_path)

## Creating subsets

## Train-val-test

In [None]:
# Reload the saved sample so the subset creation can start from the file.
split_name = split_file.split(".")[0]
sample_csv_path = data_dir + f"chexpert.sample_{img_size}_from_{split_name}_filtered_{to_filter}.csv"
df_cxr = pd.read_csv(sample_csv_path)

In [None]:
# Reject unsupported split files up front.
if split_file not in ("train.csv", "valid.csv"):
    raise ValueError(f"split_file must be train.csv or valid.csv, not {split_file}")

if split_file == "train.csv":
    # Partition by the split assignment attached earlier.
    split_column = df_cxr["split"]
    df_train = df_cxr[split_column == "train"]
    df_val = df_cxr[split_column == "validate"]
    df_test = df_cxr[split_column == "test"]
else:
    # valid.csv: the whole sample is used as the test set.
    df_test = df_cxr


In [None]:
# Common file-name stem shared by all partition files.
partition_prefix = data_dir + f"chexpert.sample_{img_size}_from_{split_file.split('.')[0]}_filtered_{to_filter}"

# Train and validation partitions exist only for train.csv.
if split_file == "train.csv":
    df_train.to_csv(partition_prefix + ".train.csv")
    df_val.to_csv(partition_prefix + ".val.csv")
df_test.to_csv(partition_prefix + ".test.csv")

## Additional subsets (the train/val parts need split_file = "train.csv")

In [None]:
# Per-race views of the full sample and of the test partition. These exist for
# both split files.
df_cxr_white = df_cxr[df_cxr["race"] == white]
df_cxr_asian = df_cxr[df_cxr["race"] == asian]
df_cxr_black = df_cxr[df_cxr["race"] == black]

df_test_white = df_test[df_test["race"] == white]
df_test_asian = df_test[df_test["race"] == asian]
df_test_black = df_test[df_test["race"] == black]

# df_train / df_val are only defined for train.csv. Without this guard the
# cell raises NameError when the notebook is run top to bottom on valid.csv.
if split_file == "train.csv":
    df_train_white = df_train[df_train["race"] == white]
    df_val_white = df_val[df_val["race"] == white]
    df_train_white.to_csv(data_dir + f"chexpert.sample_{img_size}_from_{split_file.split('.')[0]}_filtered_{to_filter}.train.white.csv")
    df_val_white.to_csv(data_dir + f"chexpert.sample_{img_size}_from_{split_file.split('.')[0]}_filtered_{to_filter}.val.white.csv")

    df_train_asian = df_train[df_train["race"] == asian]
    df_train_black = df_train[df_train["race"] == black]

    df_val_asian = df_val[df_val["race"] == asian]
    df_val_black = df_val[df_val["race"] == black]

In [None]:
# Male-only train/validation subsets. df_train / df_val are only defined for
# train.csv, so the cell is skipped otherwise instead of raising NameError.
if split_file == "train.csv":
    df_train_male = df_train[df_train["sex"] == "Male"]
    df_val_male = df_val[df_val["sex"] == "Male"]
    df_train_male.to_csv(data_dir + f"chexpert.sample_{img_size}_from_{split_file.split('.')[0]}_filtered_{to_filter}.train.male.csv")
    df_val_male.to_csv(data_dir + f"chexpert.sample_{img_size}_from_{split_file.split('.')[0]}_filtered_{to_filter}.val.male.csv")

## Plots

In [None]:
# Shorten the disease names so they fit as tick labels in the plots below.
disease_short_names = {
    "Pleural Effusion": "Pleur. Eff.",
    "No Finding": "No Find.",
}
df_cxr["disease"] = df_cxr["disease"].replace(disease_short_names)

In [None]:
# Age by sex, split by race (box plot). fontsize is reused by later plot cells.
fontsize = 16
for tick_axis in ("xtick", "ytick"):
    plt.rc(tick_axis, labelsize=fontsize)

fig = sns.catplot(
    data=df_cxr,
    kind="box",
    x="sex",
    y="age",
    hue="race",
    order=["Male", "Female"],
    hue_order=["White", "Asian", "Black"],
)
# Drop the legend created by catplot and draw one with plt.legend instead.
fig._legend.remove()
plt.xlabel("", fontsize=fontsize)
plt.ylabel("Age (years)", fontsize=fontsize)
plt.ylim(0, 100)
plt.legend(fontsize=fontsize, loc="lower center", ncol=3)
plt.show()
# fig.savefig("CheXpert-age-sex-race.png", bbox_inches='tight', dpi=300)

In [None]:
# Age by disease category, split by race (box plot).
fig = sns.catplot(
    data=df_cxr,
    kind="box",
    x="disease",
    y="age",
    hue="race",
    order=["Other", "Pleur. Eff.", "No Find."],
    hue_order=["White", "Asian", "Black"],
)
# Drop the legend created by catplot and draw one with plt.legend instead.
fig._legend.remove()
plt.xlabel("", fontsize=fontsize)
plt.ylabel("Age (years)", fontsize=fontsize)
plt.ylim(0, 100)
plt.legend(fontsize=fontsize, loc="lower center", ncol=3)
plt.show()
# fig.savefig("CheXpert-age-pathology-race.png", bbox_inches='tight', dpi=300)

In [None]:
# Image counts per disease category, split by sex.
fig = sns.catplot(
    data=df_cxr,
    kind="count",
    x="disease",
    hue="sex",
    order=["Other", "Pleur. Eff.", "No Find."],
    hue_order=["Male", "Female"],
)
# Drop the legend created by catplot and draw one with plt.legend instead.
fig._legend.remove()
plt.xlabel("", fontsize=fontsize)
plt.ylabel("Count", fontsize=fontsize)
plt.legend(fontsize=fontsize)
plt.show()
# fig.savefig("CheXpert-sex-pathology.png", bbox_inches='tight', dpi=300)

In [None]:
# Image counts per disease category, split by race.
fig = sns.catplot(
    data=df_cxr,
    kind="count",
    x="disease",
    hue="race",
    order=["Other", "Pleur. Eff.", "No Find."],
    hue_order=["White", "Asian", "Black"],
)
# Drop the legend created by catplot and draw one with plt.legend instead.
fig._legend.remove()
plt.xlabel("", fontsize=fontsize)
plt.ylabel("Count", fontsize=fontsize)
plt.legend(fontsize=fontsize)
plt.show()
# fig.savefig("CheXpert-race-pathology.png", bbox_inches='tight', dpi=300)

In [None]:
# Age by disease category (box plot, no grouping).
fig = sns.catplot(
    data=df_cxr,
    kind="box",
    x="disease",
    y="age",
    order=["Other", "Pleur. Eff.", "No Find."],
)
plt.xlabel("", fontsize=fontsize)
plt.ylabel("Age (years)", fontsize=fontsize)
plt.ylim(0, 100)
plt.show()
# fig.savefig("CheXpert-age-pathology.png", bbox_inches='tight', dpi=300)

In [None]:
# Age by sex: box plot with the individual points overlaid very faintly.
fig = sns.catplot(
    data=df_cxr,
    kind="box",
    x="sex",
    y="age",
    order=["Male", "Female"],
)
sns.stripplot(
    data=df_cxr,
    x="sex",
    y="age",
    order=["Male", "Female"],
    color="k",
    alpha=0.01,
    ax=fig.ax,  # draw onto the axes created by catplot
)
plt.xlabel("", fontsize=fontsize)
plt.ylabel("Age (years)", fontsize=fontsize)
plt.ylim(0, 100)
plt.show()
# fig.savefig("CheXpert-age-sex.png", bbox_inches='tight', dpi=300)

In [None]:
# Age by race: box plot with the individual points overlaid very faintly.
fig = sns.catplot(
    data=df_cxr,
    kind="box",
    x="race",
    y="age",
    order=["White", "Asian", "Black"],
)
sns.stripplot(
    data=df_cxr,
    x="race",
    y="age",
    order=["White", "Asian", "Black"],
    color="k",
    alpha=0.01,
    ax=fig.ax,  # draw onto the axes created by catplot
)
plt.xlabel("", fontsize=fontsize)
plt.ylabel("Age (years)", fontsize=fontsize)
plt.ylim(0, 100)
plt.show()
# fig.savefig("CheXpert-age-race.png", bbox_inches='tight', dpi=300)

In [None]:
# Image counts per race group.
fig = sns.catplot(
    data=df_cxr,
    kind="count",
    x="race",
    order=["White", "Asian", "Black"],
)
plt.xlabel("", fontsize=fontsize)
plt.ylabel("Count", fontsize=fontsize)
plt.show()
# fig.savefig("CheXpert-race.png", bbox_inches='tight', dpi=300)

In [None]:
# Image counts per sex.
fig = sns.catplot(
    data=df_cxr,
    kind="count",
    x="sex",
    order=["Male", "Female"],
)
plt.xlabel("", fontsize=fontsize)
plt.ylabel("Count", fontsize=fontsize)
plt.show()
# fig.savefig("CheXpert-sex.png", bbox_inches='tight', dpi=300)

In [None]:
# Image counts per disease category.
fig = sns.catplot(
    data=df_cxr,
    kind="count",
    x="disease",
    order=["Other", "Pleur. Eff.", "No Find."],
)
plt.xlabel("", fontsize=fontsize)
plt.ylabel("Count", fontsize=fontsize)
plt.show()
# fig.savefig("CheXpert-pathology.png", bbox_inches='tight', dpi=300)

In [None]:
# Image counts per race group, split by sex.
fig = sns.catplot(
    data=df_cxr,
    kind="count",
    x="race",
    hue="sex",
    order=["White", "Asian", "Black"],
    hue_order=["Male", "Female"],
)
# Drop the legend created by catplot and draw one with plt.legend instead.
fig._legend.remove()
plt.xlabel("", fontsize=fontsize)
plt.ylabel("Count", fontsize=fontsize)
plt.legend(fontsize=fontsize)
plt.show()
# fig.savefig("CheXpert-sex-race.png", bbox_inches='tight', dpi=300)

In [None]:
# Image counts per sex, split by race.
fig = sns.catplot(
    data=df_cxr,
    kind="count",
    x="sex",
    hue="race",
    order=["Male", "Female"],
    hue_order=["White", "Asian", "Black"],
)
# Drop the legend created by catplot and draw one with plt.legend instead.
fig._legend.remove()
plt.xlabel("", fontsize=fontsize)
plt.ylabel("Count", fontsize=fontsize)
plt.legend(fontsize=fontsize)
plt.show()
# fig.savefig("CheXpert-race-sex.png", bbox_inches='tight', dpi=300)