In [None]:

import re
import os
import pydicom
import operator
import pandas as pd
import numpy as np


In [None]:
# Dataset root directory. Set the RSNA_ICH_ROOT environment variable to point
# the notebook at a different location; when it is unset or empty, the
# original hard-coded download path is used, so existing runs are unaffected.
ROOT_DIR = (
    os.environ.get('RSNA_ICH_ROOT')
    or "C:/Users/evbruh/Downloads/rsna-intracranial-hemorrhage-detection/"
)
# Every path below is built by plain string concatenation, so ROOT_DIR must
# end with a path separator.
if not ROOT_DIR.endswith(('/', '\\')):
    ROOT_DIR += '/'

# Image directories under the dataset root.
TRAIN_DIR = ROOT_DIR + 'stage_1_train_images'
TEST_DIR = ROOT_DIR + 'stage_1_test_images'

# Pickle file locations for the DataFrames (ALL_DF_PATH is written by the
# label-table cell further down).
ALL_DF_PATH = ROOT_DIR + 'all_df.p'
TRAIN_DF_PATH = ROOT_DIR + 'train_df.p'
VAL_DF_PATH = ROOT_DIR + 'val_df.p'

batch_size = 32


In [None]:

def fix_id(img_id, img_dir=TRAIN_DIR):
    """Normalise an image id to the ``ID_<token>`` form.

    Args:
        img_id: Identifier string to check.
        img_dir: Accepted for signature compatibility; not used here.

    Returns:
        ``img_id`` unchanged when it already starts with ``ID_`` followed by
        at least one lowercase letter or digit. Otherwise ``ID_`` plus the
        first run of lowercase letters/digits found anywhere in ``img_id``.
        If no such run exists, ``img_id`` is printed and returned unchanged.
    """
    # Already in the expected shape (anchored at the start only).
    if re.match(r'ID_[a-z0-9]+', img_id):
        return img_id

    token = re.search(r'[a-z0-9]+', img_id)
    if token is None:
        # Nothing usable to rebuild the id from: surface it and pass it through.
        print(img_id)
        return img_id

    return f'ID_{token.group(0)}'


def id_to_filepath(img_id, img_dir=TRAIN_DIR):
    """Map an image id to the path of its ``.dcm`` file.

    Args:
        img_id: Image identifier; the file name is ``<img_id>.dcm``.
        img_dir: Directory expected to contain the file.

    Returns:
        ``'<img_dir>/<img_id>.dcm'`` when that path exists on disk, otherwise
        the sentinel string ``'DNE'``.
    """
    # Kept as a plain string: pydicom doesn't play nice with Path objects.
    candidate = f'{img_dir}/{img_id}.dcm'
    return candidate if os.path.exists(candidate) else 'DNE'


def get_patient_data(filepath):
    """Read the patient/study/series identifiers from a DICOM file.

    Args:
        filepath: Path to a ``.dcm`` file, or the sentinel ``'DNE'``.

    Returns:
        A ``(PatientID, StudyInstanceUID, SeriesInstanceUID)`` tuple, or
        ``None`` when ``filepath`` is ``'DNE'``.
    """
    if filepath == 'DNE':
        return None

    # stop_before_pixels=True: only the attributes ahead of the pixel data are
    # needed here, so pydicom is asked not to read the image itself.
    header = pydicom.dcmread(filepath, stop_before_pixels=True)
    return header.PatientID, header.StudyInstanceUID, header.SeriesInstanceUID


In [None]:

# Load the label CSV from the dataset root.
all_df = pd.read_csv(ROOT_DIR + 'stage_1_train.csv')

# Split every raw 'ID' on its LAST underscore: the left part stays the image
# id, the right part becomes the 'Subtype' column. Ids are then passed through
# fix_id.
id_parts = all_df['ID'].str.rsplit(pat='_', n=1, expand=True)
all_df[['ID', 'Subtype']] = id_parts
all_df['ID'] = all_df['ID'].apply(fix_id)

# Long -> wide: one row per image id, one column per subtype. No aggfunc is
# given, so pandas' default aggregation is applied to repeated (ID, Subtype)
# pairs. The result has two-level columns, e.g. ('Label', 'any').
all_df = (
    all_df
    .pivot_table(index='ID', columns='Subtype')
    .reset_index()
)

# 'none' is derived from 'any' via (x + 1) % 2, which maps 0 -> 1 and 1 -> 0.
any_flags = all_df[('Label', 'any')].values
all_df[('Label', 'none')] = (any_flags + 1) % 2

# Path of each image's .dcm file under TRAIN_DIR ('DNE' when it is missing).
all_df['filepath'] = all_df['ID'].apply(id_to_filepath)

# Keep only the id, the label columns in a fixed order (('Label', 'any') is
# dropped here), and the file path.
label_subtypes = [
    'none',
    'epidural',
    'intraparenchymal',
    'intraventricular',
    'subarachnoid',
    'subdural',
]
ordered_columns = (
    [('ID', '')]
    + [('Label', subtype) for subtype in label_subtypes]
    + [('filepath', '')]
)
all_df = all_df[ordered_columns]

all_df.to_pickle(ALL_DF_PATH)
