In [1]:
import os
import random

import pandas as pd
import pydicom
from tqdm.notebook import tqdm

In [2]:
# Root directory of the competition data; every input path below is derived from it.
DATA_ROOT = "/content/drive/MyDrive/rsna-intracranial-hemorrhage-detection/"
# Directories holding the .dcm files referenced by the train / submission tables.
TRAIN_IMG_DIR_PATH = DATA_ROOT + "stage_2_train/"
TEST_IMG_DIR_PATH = DATA_ROOT + "stage_2_test/"
# Original label table and sample submission table.
TRAIN_DATAFRAME_PATH = DATA_ROOT + "stage_2_train.csv"
SUBMISSION_DATAFRAME_PATH = DATA_ROOT + "stage_2_sample_submission.csv"

# Destination directory of every .csv file written by this notebook.
OUTPUT_DIR = "/content/dataframes/"
# DataFrame.to_csv does not create missing directories, so make sure the
# destination exists now rather than failing after the long-running cells below.
os.makedirs(OUTPUT_DIR, exist_ok=True)

The structure of the original .csv files needs to be modified to simplify their use during neural network training. Five new files will be created:
* modified train dataframe with additional columns including image ID, hemorrhage type and label, study ID and vertical position in the CT sequence
* dataframe with images for the experiments
* dataframe with images intended for training the final network
* dataframe with images intended for testing the final network
* modified submission dataframe with separated image IDs and hemorrhage types

### Modifying stage_2_train.csv

In [3]:
# Load the original training labels; each row pairs a combined
# "<image ID>_<hemorrhage type>" identifier with its label.
stage_2_train = pd.read_csv(filepath_or_buffer=TRAIN_DATAFRAME_PATH)
stage_2_train.head(n=6)

Unnamed: 0,ID,Label
0,ID_12cadc6af_epidural,0
1,ID_12cadc6af_intraparenchymal,0
2,ID_12cadc6af_intraventricular,0
3,ID_12cadc6af_subarachnoid,0
4,ID_12cadc6af_subdural,0
5,ID_12cadc6af_any,0


In [4]:
# Break the combined identifier into at most three '_'-separated parts, then
# keep the first two parts as the image ID and the last part as the type.
id_parts = stage_2_train['ID'].str.split('_', n=2, expand=False)
stage_2_train['Type'] = id_parts.map(lambda parts: parts[-1])
stage_2_train['ID'] = id_parts.map(lambda parts: '_'.join(parts[:2]))
stage_2_train.head(n=6)

Unnamed: 0,ID,Label,Type
0,ID_12cadc6af,0,epidural
1,ID_12cadc6af,0,intraparenchymal
2,ID_12cadc6af,0,intraventricular
3,ID_12cadc6af,0,subarachnoid
4,ID_12cadc6af,0,subdural
5,ID_12cadc6af,0,any


Drop the rows that refer to the corrupted image file (`ID_6431af929`).

In [5]:
# Collect the index labels of every row belonging to the corrupted image and
# remove those rows from the frame in place.
corrupted_rows = stage_2_train.index[stage_2_train['ID'] == "ID_6431af929"]
stage_2_train.drop(index=corrupted_rows, inplace=True)

In [6]:
# Attach the study identifier and the slice position to every row.
# The DICOM header is read once per distinct image ID and the values are then
# mapped back onto the rows by ID, so the result does not depend on each image
# occupying exactly six consecutive rows of the dataframe.
study_by_id = {}
position_by_id = {}
for img_id in tqdm(stage_2_train['ID'].unique()):
    # Only header fields are needed, so stop reading before the pixel data.
    img = pydicom.dcmread(TRAIN_IMG_DIR_PATH + img_id + ".dcm", stop_before_pixels=True)
    study_by_id[img_id] = str(img.StudyInstanceUID)
    # Last component of ImagePositionPatient, used as the vertical position of the slice.
    position_by_id[img_id] = float(img.ImagePositionPatient[-1])
stage_2_train['Study'] = stage_2_train['ID'].map(study_by_id)
stage_2_train['Position'] = stage_2_train['ID'].map(position_by_id)

  0%|          | 0/752806 [00:00<?, ?it/s]

In [7]:
# Preview the first six rows, now including the Study and Position columns.
stage_2_train.head(n=6)

Unnamed: 0,ID,Label,Type,Study,Position
0,ID_12cadc6af,0,epidural,ID_6dec708c74,71.652
1,ID_12cadc6af,0,intraparenchymal,ID_6dec708c74,71.652
2,ID_12cadc6af,0,intraventricular,ID_6dec708c74,71.652
3,ID_12cadc6af,0,subarachnoid,ID_6dec708c74,71.652
4,ID_12cadc6af,0,subdural,ID_6dec708c74,71.652
5,ID_12cadc6af,0,any,ID_6dec708c74,71.652


In [8]:
# Persist the enriched training table; the dataframe index is not written.
train_mod_path = OUTPUT_DIR + "stage_2_train_mod.csv"
stage_2_train.to_csv(train_mod_path, index=False)

### Creating dataframe for the experiments

In [9]:
random.seed(123)
# Base sample size: number of distinct studies, integer-divided by 20 and then by 11.
samples_per_type = len(stage_2_train["Study"].unique()) // 20 // 11

experiment_studies = []
studies_with_ich = []
used = []

study_col = stage_2_train['Study']
positive_rows = stage_2_train['Label'] == 1

# Positive studies: for each subtype, take the studies with at least one
# positive image of that subtype that were not claimed by an earlier subtype,
# and draw twice the base sample size from them.
for ich_type in ["epidural", "intraventricular", "subarachnoid", "intraparenchymal", "subdural"]:
    candidate_rows = (stage_2_train['Type'] == ich_type) & positive_rows & ~study_col.isin(used)
    candidates = study_col[candidate_rows].unique()
    studies_with_ich.extend(candidates)
    experiment_studies.extend(random.sample(list(candidates), samples_per_type * 2))
    used.extend(candidates)

# Negative studies: studies with an 'any' label of 0 that never appeared among
# the positive studies collected above; draw one base sample from them.
negative_rows = (
    (stage_2_train['Type'] == 'any')
    & (stage_2_train['Label'] == 0)
    & ~study_col.isin(studies_with_ich)
)
negatives = study_col[negative_rows].unique()
experiment_studies.extend(random.sample(list(negatives), samples_per_type))

# Keep every row (all images, all types) of the selected studies.
experiments_df = stage_2_train.loc[study_col.isin(experiment_studies)]
experiments_df.head(n=6)

Unnamed: 0,ID,Label,Type,Study,Position
54,ID_ff7125125,0,epidural,ID_d512af590a,184.0
55,ID_ff7125125,0,intraparenchymal,ID_d512af590a,184.0
56,ID_ff7125125,0,intraventricular,ID_d512af590a,184.0
57,ID_ff7125125,1,subarachnoid,ID_d512af590a,184.0
58,ID_ff7125125,0,subdural,ID_d512af590a,184.0
59,ID_ff7125125,1,any,ID_d512af590a,184.0


In [10]:
# Write the experiments subset to disk; the dataframe index is not written.
experiments_path = OUTPUT_DIR + "experiments.csv"
experiments_df.to_csv(experiments_path, index=False)

### Creating dataframe for the final network testing

In [11]:
# NOTE: the generator is not re-seeded here; sampling continues from the state
# left by the random calls made since random.seed(123) in the experiments cell,
# so the selection depends on that cell having been run first.
test_studies = []
used = []

study_col = stage_2_train['Study']
# Loop-invariant: rows whose study was not selected for the experiments dataframe.
outside_experiments = ~study_col.isin(experiment_studies)

# Positive studies: per subtype, sample one twentieth of the studies that have
# a positive image of that subtype, were not claimed by an earlier subtype and
# are not part of the experiments dataframe.
for ich_type in ["epidural", "intraventricular", "subarachnoid", "intraparenchymal", "subdural"]:
    eligible_rows = (
        (stage_2_train['Type'] == ich_type)
        & (stage_2_train['Label'] == 1)
        & ~study_col.isin(used)
        & outside_experiments
    )
    candidates = study_col[eligible_rows].unique()
    test_studies.extend(random.sample(list(candidates), len(candidates) // 20))
    used.extend(candidates)

# Negative studies: 'any' label of 0, not used above, not in the experiments
# dataframe and not among the studies with a hemorrhage; sample one twentieth.
healthy_rows = (
    (stage_2_train['Type'] == 'any')
    & (stage_2_train['Label'] == 0)
    & ~study_col.isin(used)
    & outside_experiments
    & ~study_col.isin(studies_with_ich)
)
healthy = study_col[healthy_rows].unique()
test_studies.extend(random.sample(list(healthy), len(healthy) // 20))

# Keep every row (all images, all types) of the selected studies.
test_df = stage_2_train.loc[study_col.isin(test_studies)]
test_df.head(n=6)

Unnamed: 0,ID,Label,Type,Study,Position
6,ID_38fd7baa0,0,epidural,ID_0a9ac70962,226.174
7,ID_38fd7baa0,0,intraparenchymal,ID_0a9ac70962,226.174
8,ID_38fd7baa0,0,intraventricular,ID_0a9ac70962,226.174
9,ID_38fd7baa0,0,subarachnoid,ID_0a9ac70962,226.174
10,ID_38fd7baa0,0,subdural,ID_0a9ac70962,226.174
11,ID_38fd7baa0,0,any,ID_0a9ac70962,226.174


In [12]:
# Write the final test split to disk; the dataframe index is not written.
test_final_path = OUTPUT_DIR + "test_final.csv"
test_df.to_csv(test_final_path, index=False)

### Creating dataframe for the final network training

In [13]:
# The final training split is every row whose study is absent from the test split.
held_out_studies = test_df['Study'].unique()
in_test_split = stage_2_train['Study'].isin(held_out_studies)
train_df = stage_2_train.loc[~in_test_split]
train_df.head(n=6)

Unnamed: 0,ID,Label,Type,Study,Position
0,ID_12cadc6af,0,epidural,ID_6dec708c74,71.652
1,ID_12cadc6af,0,intraparenchymal,ID_6dec708c74,71.652
2,ID_12cadc6af,0,intraventricular,ID_6dec708c74,71.652
3,ID_12cadc6af,0,subarachnoid,ID_6dec708c74,71.652
4,ID_12cadc6af,0,subdural,ID_6dec708c74,71.652
5,ID_12cadc6af,0,any,ID_6dec708c74,71.652


In [14]:
# Write the final training split to disk; the dataframe index is not written.
train_final_path = OUTPUT_DIR + "train_final.csv"
train_df.to_csv(train_final_path, index=False)

### Modifying stage_2_sample_submission.csv

In [15]:
# Load the sample submission; each row pairs a combined
# "<image ID>_<hemorrhage type>" identifier with a placeholder label.
stage_2_sample_submission = pd.read_csv(filepath_or_buffer=SUBMISSION_DATAFRAME_PATH)
stage_2_sample_submission.head(n=6)

Unnamed: 0,ID,Label
0,ID_0fbf6a978_epidural,0.5
1,ID_0fbf6a978_intraparenchymal,0.5
2,ID_0fbf6a978_intraventricular,0.5
3,ID_0fbf6a978_subarachnoid,0.5
4,ID_0fbf6a978_subdural,0.5
5,ID_0fbf6a978_any,0.5


In [16]:
# Break the combined identifier into at most three '_'-separated parts, then
# keep the first two parts as the image ID and the last part as the type.
id_parts = stage_2_sample_submission['ID'].str.split('_', n=2, expand=False)
stage_2_sample_submission['Type'] = id_parts.map(lambda parts: parts[-1])
stage_2_sample_submission['ID'] = id_parts.map(lambda parts: '_'.join(parts[:2]))
stage_2_sample_submission.head(n=6)

Unnamed: 0,ID,Label,Type
0,ID_0fbf6a978,0.5,epidural
1,ID_0fbf6a978,0.5,intraparenchymal
2,ID_0fbf6a978,0.5,intraventricular
3,ID_0fbf6a978,0.5,subarachnoid
4,ID_0fbf6a978,0.5,subdural
5,ID_0fbf6a978,0.5,any


In [17]:
# Attach the study identifier and the slice position to every submission row.
# The DICOM header is read once per distinct image ID and the values are then
# mapped back onto the rows by ID, so the result does not depend on each image
# occupying exactly six consecutive rows of the dataframe.
study_by_id = {}
position_by_id = {}
for img_id in tqdm(stage_2_sample_submission['ID'].unique()):
    # Only header fields are needed, so stop reading before the pixel data.
    img = pydicom.dcmread(TEST_IMG_DIR_PATH + img_id + ".dcm", stop_before_pixels=True)
    study_by_id[img_id] = str(img.StudyInstanceUID)
    # Last component of ImagePositionPatient, used as the vertical position of the slice.
    position_by_id[img_id] = float(img.ImagePositionPatient[-1])
stage_2_sample_submission['Study'] = stage_2_sample_submission['ID'].map(study_by_id)
stage_2_sample_submission['Position'] = stage_2_sample_submission['ID'].map(position_by_id)

  0%|          | 0/121232 [00:00<?, ?it/s]

In [18]:
# Preview the first six rows, now including the Study and Position columns.
stage_2_sample_submission.head(n=6)

Unnamed: 0,ID,Label,Type,Study,Position
0,ID_0fbf6a978,0.5,epidural,ID_3710aff1c5,37.690857
1,ID_0fbf6a978,0.5,intraparenchymal,ID_3710aff1c5,37.690857
2,ID_0fbf6a978,0.5,intraventricular,ID_3710aff1c5,37.690857
3,ID_0fbf6a978,0.5,subarachnoid,ID_3710aff1c5,37.690857
4,ID_0fbf6a978,0.5,subdural,ID_3710aff1c5,37.690857
5,ID_0fbf6a978,0.5,any,ID_3710aff1c5,37.690857


In [19]:
# Persist the enriched submission table; the dataframe index is not written.
submission_mod_path = OUTPUT_DIR + "stage_2_sample_submission_mod.csv"
stage_2_sample_submission.to_csv(submission_mod_path, index=False)