In [None]:
import pandas as pd
import random
import os
import shutil

# VinDr-CXR Preparation

In this notebook, I bring the [Kaggle version of the VinDr-CXR dataset](https://www.kaggle.com/c/vinbigdata-chest-xray-abnormalities-detection) into the same format as the [official dataset from PhysioNet](https://physionet.org/content/vindr-cxr/1.0.0/).

Download and unzip the data and enter its path into `data_path`.

In [None]:
# Directory containing the downloaded and unzipped Kaggle data; fill this in before
# running the rest of the notebook. Other cells build paths from it by plain string
# concatenation (e.g. data_path + "/train.csv").
data_path = ""

Like in the *Many Tasks Make Light Work* paper, I now create a training set consisting of 4000 healthy scans and a test set consisting of 1000 healthy and 1000 unhealthy scans.

First, let's have a look at the dataset:

In [None]:
# Read the Kaggle annotation table, then order its rows by image ID.
annotations_table = pd.read_csv(data_path + "/train.csv")
df = annotations_table.sort_values(by="image_id")
df

The possible labels are the following:

In [4]:
# Maps each class index to its class name; the index is the name's position in this list.
labels_dict = dict(enumerate([
    "Aortic enlargement",
    "Atelectasis",
    "Calcification",
    "Cardiomegaly",
    "Consolidation",
    "ILD",
    "Infiltration",
    "Lung Opacity",
    "Nodule/Mass",
    "Other lesion",
    "Pleural effusion",
    "Pleural thickening",
    "Pneumothorax",
    "Pulmonary fibrosis",
    "No finding",
]))

To create the training set, I extract the rows from the dataframe which belong to the scans that were labeled as healthy (`"No finding"`) by all three radiologists and store them in the dataframe `df_healthy`.

In [None]:
# Images that have exactly three annotation rows in the table.
ids_three_occurences = df["image_id"].value_counts()[lambda x: x == 3].index

# Counting rows alone is not enough: also require that *every* row of the image is
# "No finding". Otherwise an image with three rows of which only some are
# "No finding" would be treated as healthy.
is_no_finding = df["class_name"] == "No finding"
all_rows_no_finding = is_no_finding.groupby(df["image_id"]).transform("all")

# Boolean masks keep the rows in the same order as `df`.
df_healthy = df[df["image_id"].isin(ids_three_occurences) & all_rows_no_finding]
df_healthy

Next, I extract the rows belonging to the scans which were not labeled as healthy by any of the three radiologists into the dataframe `df_unhealthy`.

In [7]:
def _no_row_is_no_finding(group):
    """Return True when none of the group's rows has class_name "No finding"."""
    return not group["class_name"].eq("No finding").any()


# Keep only the images for which no annotation row says "No finding".
df_unhealthy = df.groupby("image_id").filter(_no_row_is_no_finding)

Now, 4000 healthy scans and a further 1000 healthy and 1000 unhealthy scans are chosen randomly to form the training and the test set, respectively. The `image_id`s of these scans are stored in the dictionary `train_test_ids`.

In [8]:
# Fixed seed so that the same scans are drawn on every run.
random.seed(2005)

# The healthy pool is sampled first and the unhealthy one second; this order
# determines which IDs the seeded generator produces.
image_ids_healthy = df_healthy["image_id"].drop_duplicates().tolist()
image_ids_healthy_5000 = random.sample(image_ids_healthy, k=5000)
image_ids_unhealthy = df_unhealthy["image_id"].drop_duplicates().tolist()
image_ids_unhealthy_1000 = random.sample(image_ids_unhealthy, k=1000)

# First 4000 sampled healthy IDs -> train; the remaining 1000 healthy IDs followed by
# the 1000 unhealthy IDs -> test.
train_test_ids = {
    "train": image_ids_healthy_5000[:4000],
    "test": image_ids_healthy_5000[4000:] + image_ids_unhealthy_1000,
}

The annotations and labels of the training data are stored in the files `annotations_train.csv` and `image_labels_train.csv` respectively.

In [9]:
# All annotation rows whose image belongs to the training split.
in_train_split = df["image_id"].isin(train_test_ids["train"])
df_annotations_train = df[in_train_split]
df_annotations_train.to_csv("annotations_train.csv", index=False)

# One label row per annotation row (image_id, rad_id): every class column is 0
# except "No finding", which is 1.
train_label_values = {label: 0 for label in labels_dict.values()}
train_label_values["No finding"] = 1
df_image_labels_train = df_annotations_train[["image_id", "rad_id"]].assign(**train_label_values)
df_image_labels_train.to_csv("image_labels_train.csv", index=False)

The annotations of the original test data are a consensus of various radiologists and are therefore not associated with an individual radiologist. As the Kaggle dataset does not contain these consensus annotations, I simply keep only the first row per (`"image_id"`, `"class_name"`) pair and remove the column `rad_id` containing the ID of the radiologist.

In [12]:
# Rows of the test-split images, reduced to the first row per (image_id, class_name)
# pair and stripped of the radiologist ID column.
in_test_split = df["image_id"].isin(train_test_ids["test"])
df_annotations_test = (
    df[in_test_split]
    .drop_duplicates(subset=["image_id", "class_name"], keep="first")
    .drop(columns="rad_id")
)

The annotations and labels of the test data are stored in the files `annotations_test.csv` and `image_labels_test.csv` respectively.

In [17]:
df_annotations_test.to_csv("annotations_test.csv", index=False)

label_names = list(labels_dict.values())

# Build the image-by-class 0/1 table in one vectorised pass instead of scanning the
# whole label frame once per annotation row:
#   * crosstab counts the rows per (image_id, class_name),
#   * clip turns the counts into 0/1 presence flags,
#   * reindex restores the row order of train_test_ids["test"] and the column order
#     of labels_dict, filling classes that never occur with 0.
label_presence = (
    pd.crosstab(df_annotations_test["image_id"], df_annotations_test["class_name"])
    .clip(upper=1)
    .reindex(index=train_test_ids["test"], columns=label_names, fill_value=0)
)

# Turn the image_id index back into a regular first column and drop the leftover
# "class_name" name of the column axis.
df_image_labels_test = (
    label_presence
    .rename_axis(index="image_id", columns=None)
    .reset_index()
)

df_image_labels_test.to_csv("image_labels_test.csv", index=False)

In [136]:
# Create the three target folders below data_path; exist_ok=True means folders that
# already exist do not raise an error.
for folder_name in ("train", "test", "annotations"):
    os.makedirs(data_path + "/" + folder_name, exist_ok=True)

Now, I move the created CSV files to `<data_path>/annotations` and the scans forming the training and the test set to `<data_path>/train` and `<data_path>/test`, and delete the remaining files at `data_path`.

In [None]:
# The four CSV files are given as bare file names (relative paths); move them into
# the annotations folder.
annotations_dir = data_path + "/annotations/"
for csv_name in ("annotations_test.csv", "annotations_train.csv",
                 "image_labels_test.csv", "image_labels_train.csv"):
    shutil.move(csv_name, annotations_dir + csv_name)

# Move every selected scan from data_path into the folder of its split.
for mode in ("train", "test"):
    split_dir = data_path + "/" + mode + "/"
    for image_id in train_test_ids[mode]:
        dicom_name = image_id + ".dicom"
        shutil.move(data_path + "/" + dicom_name, split_dir + dicom_name)

# Delete whatever is still a regular file directly under data_path; directories
# (including the ones filled above) are left alone.
for entry in os.listdir(data_path):
    leftover_path = data_path + "/" + entry
    if os.path.isfile(leftover_path):
        os.remove(leftover_path)