In [None]:
# Target size (in pixels) of every exported image.
RESIZED_WIDTH = 256
RESIZED_HEIGHT = 256
# File extension used for the saved images.
OUTPUT_FORMAT = "png"
# Root folder that receives the exported images.
OUTPUT_DIR = "output"

In [None]:
import glob
import joblib
import numpy as np
import PIL
import pydicom
import tqdm

In [None]:
# Root input folder for the dataset; the DICOM sub-folders are resolved
# relative to it. (The path looks like a Kaggle input mount - confirm when
# running elsewhere.)
data_dir = "../input/rsna-intracranial-hemorrhage-detection/rsna-intracranial-hemorrhage-detection"
# List the folder contents as a quick sanity check.
!ls {data_dir}

In [None]:
# Sub-folder (relative to data_dir) that holds the training DICOM files.
train_dir = "stage_2_train"
# glob returns matches in arbitrary, filesystem-dependent order. Sort them so
# that positional slices such as train_paths[300000:] pick the same files on
# every run / kernel restart.
train_paths = sorted(glob.glob(f"{data_dir}/{train_dir}/*.dcm"))
# Number of training files found.
len(train_paths)

In [None]:
def get_first_of_dicom_field_as_int(x):
    """Return a DICOM field value as an ``int``.

    Args:
        x: A scalar value, or a multi-valued field (any non-string sequence,
            e.g. a pydicom ``MultiValue``, list or tuple).

    Returns:
        ``int(x[0])`` for a multi-valued field, otherwise ``int(x)``.

    Raises:
        IndexError: if a multi-valued field is empty.
        ValueError / TypeError: if the value cannot be converted by ``int``.
    """
    from collections.abc import Sequence

    # isinstance (rather than an exact type comparison) also accepts
    # subclasses and plain lists/tuples; strings are sequences too but must
    # be converted as a whole.
    if isinstance(x, Sequence) and not isinstance(x, (str, bytes)):
        return int(x[0])
    return int(x)

def get_id(img_dicom):
    """Return the ``SOPInstanceUID`` attribute of ``img_dicom`` as a ``str``."""
    sop_uid = img_dicom.SOPInstanceUID
    return str(sop_uid)

def get_Pid(img_dicom):
    """Return the ``PatientID`` attribute of ``img_dicom`` as a ``str``."""
    patient_id = img_dicom.PatientID
    return str(patient_id)

def get_metadata_from_dicom(img_dicom):
    """Extract the windowing and rescale parameters of a DICOM dataset.

    Returns:
        dict with the keys ``window_center``, ``window_width``, ``intercept``
        and ``slope`` (in that order); every value has been passed through
        ``get_first_of_dicom_field_as_int``.
    """
    keys = ("window_center", "window_width", "intercept", "slope")
    # Read every attribute first, then convert, so a missing attribute is
    # reported before any conversion is attempted.
    raw_values = (
        img_dicom.WindowCenter,
        img_dicom.WindowWidth,
        img_dicom.RescaleIntercept,
        img_dicom.RescaleSlope,
    )
    converted = {}
    for key, raw in zip(keys, raw_values):
        converted[key] = get_first_of_dicom_field_as_int(raw)
    return converted

def window_image(img, window_center, window_width, intercept, slope):
    """Apply the linear rescale and clip the result to an intensity window.

    Args:
        img: Array of raw pixel values. It is not modified.
        window_center: Centre of the window, in rescaled units.
        window_width: Full width of the window; half of it (floor division)
            is taken on each side of the centre.
        intercept: Value added after scaling.
        slope: Factor the raw values are multiplied by.

    Returns:
        A new array holding ``img * slope + intercept`` limited to
        ``[window_center - window_width // 2, window_center + window_width // 2]``.
        Integer input is widened to at least 32 bits first, so the result
        dtype may be wider than the input dtype.
    """
    img = np.asarray(img)
    # Raw pixel data is typically stored in narrow integers. Doing the rescale
    # in that dtype can wrap around, and on NumPy 2 adding a negative Python
    # int to an unsigned array raises OverflowError. Widen before computing.
    if np.issubdtype(img.dtype, np.integer):
        img = img.astype(np.result_type(img.dtype, np.int32))
    img = img * slope + intercept
    half_width = window_width // 2
    return np.clip(img, window_center - half_width, window_center + half_width)

def resize(img, new_w, new_h):
    """Convert an array of 0..255 intensities to an 8-bit greyscale image and resize it.

    Args:
        img: Array of pixel intensities expected to lie in ``[0, 255]``;
            values outside that range are clipped to it.
        new_w: Output width in pixels.
        new_h: Output height in pixels.

    Returns:
        A PIL image in mode ``"L"`` of size ``(new_w, new_h)``, resampled
        with bicubic interpolation.
    """
    # `import PIL` on its own does not load the Image submodule; import it
    # explicitly so the function does not depend on another package having
    # done so as a side effect.
    import PIL.Image

    # Mode "L" is unsigned 8-bit. Casting 0..255 values to int8 is out of
    # range for everything above 127 (platform-dependent result), so clip
    # and cast to uint8 instead.
    pixels = np.clip(img, 0, 255).astype(np.uint8)
    pil_img = PIL.Image.fromarray(pixels, mode="L")
    return pil_img.resize((new_w, new_h), resample=PIL.Image.BICUBIC)

def save_img(img_pil, subfolder, name):
    """Save ``img_pil`` as ``<OUTPUT_DIR>/<subfolder>/<name>.<OUTPUT_FORMAT>``."""
    target_path = f"{OUTPUT_DIR}/{subfolder}/{name}.{OUTPUT_FORMAT}"
    img_pil.save(target_path)

def normalize_minmax(img):
    """Linearly rescale an array so its minimum maps to 0 and its maximum to 1.

    Args:
        img: Array exposing ``min()`` and ``max()``.

    Returns:
        A floating-point array of the same shape. A constant image (max equal
        to min) yields all zeros rather than the NaNs a division by zero would
        produce.
    """
    mi, ma = img.min(), img.max()
    span = ma - mi
    shifted = img - mi
    if span == 0:
        # Constant image: every pixel sits at the minimum. Multiplying by 0.0
        # keeps the shape and yields a float result, like the division below.
        return shifted * 0.0
    return shifted / span

def prepare_image(img_path):
    """Load one DICOM file and turn it into a resized 8-bit greyscale image.

    The pixel data is rescaled and windowed using the file's own window and
    rescale tags, min-max normalised to 0..255, and resized to
    ``RESIZED_WIDTH`` x ``RESIZED_HEIGHT``.

    Args:
        img_path: Path of the DICOM file.

    Returns:
        Tuple ``(img_id, img_pil)``: the file's SOP instance UID as a string
        and the resulting PIL image.
    """
    # dcmread is the supported reader; read_file is a deprecated alias that
    # newer pydicom releases no longer provide.
    img_dicom = pydicom.dcmread(img_path)
    img_id = get_id(img_dicom)
    metadata = get_metadata_from_dicom(img_dicom)
    img = window_image(img_dicom.pixel_array, **metadata)
    img = normalize_minmax(img) * 255
    img_pil = resize(img, RESIZED_WIDTH, RESIZED_HEIGHT)
    return img_id, img_pil

def prepare_and_save(img_path, subfolder):
    """Convert one DICOM file and save the result, skipping files that fail.

    Args:
        img_path: Path of the DICOM file to convert.
        subfolder: Sub-folder of ``OUTPUT_DIR`` the image is written to.

    Any failure while converting or saving is logged together with its
    traceback and then swallowed, so a single bad file does not abort a long
    batch run.
    """
    try:
        img_id, img_pil = prepare_image(img_path)
        save_img(img_pil, subfolder, img_id)
    except Exception:
        # `Exception` does not cover KeyboardInterrupt / SystemExit, so the
        # cell can still be interrupted without shutting down the kernel.
        # `exception()` logs at ERROR level and appends the traceback.
        l.exception(f"Error processing the image: {img_path}")

def prepare_images(imgs_path, subfolder):
    """Convert and save every file in ``imgs_path`` one after another.

    Progress is shown with a tqdm bar; each path is handed to
    ``prepare_and_save`` together with ``subfolder``.
    """
    progress = tqdm.tqdm(imgs_path)
    for dicom_path in progress:
        prepare_and_save(dicom_path, subfolder)
import logging as l
def prepare_images_njobs(img_paths, subfolder, n_jobs=-1):
    """Convert and save every file in ``img_paths`` using joblib workers.

    Args:
        img_paths: Iterable of DICOM file paths.
        subfolder: Sub-folder of ``OUTPUT_DIR`` passed on to ``prepare_and_save``.
        n_jobs: Forwarded to ``joblib.Parallel`` (default ``-1``).
    """
    runner = joblib.Parallel(n_jobs=n_jobs)
    # The tqdm bar advances as tasks are handed to the runner.
    tasks = (
        joblib.delayed(prepare_and_save)(dicom_path, subfolder)
        for dicom_path in tqdm.tqdm(img_paths)
    )
    runner(tasks)

In [None]:
# Create the destination folder for the converted training images.
# `-p` also creates missing parents and does not fail if the folder exists.
!mkdir -p {OUTPUT_DIR}/{train_dir}

In [None]:
# Full run over every training file (currently disabled):
# prepare_images_njobs(train_paths, train_dir)
# Convert only the files from index 300000 onwards.
# NOTE(review): presumably resumes an earlier partial run - confirm; the
# selected subset depends on the ordering of train_paths.
prepare_images_njobs(train_paths[300000:], train_dir)

Patient ID

In [None]:
import os
import gc
import pydicom # For accessing DICOM files
# Dataset root (with trailing slash) and the training sub-folder beneath it.
KAGGLE_DIR = '../input/rsna-intracranial-hemorrhage-detection/rsna-intracranial-hemorrhage-detection/'
IMG_PATH_TRAIN = f'{KAGGLE_DIR}stage_2_train/'

# Names (not full paths) of all entries in the training folder.
train_images = os.listdir(IMG_PATH_TRAIN)

# DICOM attributes collected for every training file.
meta_cols = [
    'BitsAllocated',
    'BitsStored',
    'Columns',
    'HighBit',
    'Modality',
    'PatientID',
    'PhotometricInterpretation',
    'PixelRepresentation',
    'RescaleIntercept',
    'RescaleSlope',
    'Rows',
    'SOPInstanceUID',
    'SamplesPerPixel',
    'SeriesInstanceUID',
    'StudyID',
    'StudyInstanceUID',
    'ImagePositionPatient',
    'ImageOrientationPatient',
    'PixelSpacing',
]

# One independent, initially empty list of values per attribute.
col_dict_train = dict((name, []) for name in meta_cols)

In [None]:
def _read_patient_and_image_id(IMG_PATH_TRAIN, img):
    """Read one DICOM header and return ``(patient_id, image_id)`` as strings."""
    # Only header tags are needed, so skip loading the pixel data.
    dicom_object = pydicom.dcmread(IMG_PATH_TRAIN + img, stop_before_pixels=True)
    return get_Pid(dicom_object), get_id(dicom_object)


def prepare_PID(IMG_PATH_TRAIN,PIDlst,ImgIDlst,img):
    """Append the patient ID and image ID of one DICOM file to the given lists.

    Args:
        IMG_PATH_TRAIN: Folder prefix (including trailing separator) that is
            concatenated with ``img``.
        PIDlst: List that receives the patient ID.
        ImgIDlst: List that receives the SOP instance UID.
        img: File name inside ``IMG_PATH_TRAIN``.

    Returns:
        Tuple ``(patient_id, image_id)`` - the values that were appended.
    """
    patient_id, image_id = _read_patient_and_image_id(IMG_PATH_TRAIN, img)
    PIDlst.append(patient_id)
    ImgIDlst.append(image_id)
    return patient_id, image_id


def prepare_PID_njobs(IMG_PATH_TRAIN,PIDlst,ImgIDlst,train_images, n_jobs=-1):
    """Collect patient IDs and image IDs for many files using joblib workers.

    Args:
        IMG_PATH_TRAIN: Folder prefix (including trailing separator).
        PIDlst: List extended with one patient ID per file.
        ImgIDlst: List extended with one SOP instance UID per file.
        train_images: Iterable of file names inside ``IMG_PATH_TRAIN``.
        n_jobs: Forwarded to ``joblib.Parallel`` (default ``-1``).

    Both lists are extended in the order of ``train_images``.
    """
    # Process-based workers receive pickled copies of their arguments, so
    # appending to the lists inside a worker never reaches the caller. Have
    # the workers return the IDs and append them here instead.
    id_pairs = joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(_read_patient_and_image_id)(IMG_PATH_TRAIN, img)
        for img in tqdm.tqdm(train_images)
    )
    for patient_id, image_id in id_pairs:
        PIDlst.append(patient_id)
        ImgIDlst.append(image_id)

In [None]:
from multiprocessing import Process, Manager

# Try the parallel collector on the first ten files only.
PIDlst = []
ImgIDlst = []
prepare_PID_njobs(IMG_PATH_TRAIN, PIDlst, ImgIDlst, train_images[:10])

In [None]:
# Collect the patient ID and SOP instance UID of every training file,
# sequentially and in the order of train_images.
PIDlst = []
ImgIDlst = []
for img in tqdm.tqdm(train_images):
    # Only two header tags are read, so skip loading the pixel data.
    dicom_object = pydicom.dcmread(IMG_PATH_TRAIN + img, stop_before_pixels=True)
    PIDlst.append(get_Pid(dicom_object))
    ImgIDlst.append(get_id(dicom_object))

In [None]:
# PIDlst = list()
# ImgIDlst = list()
# prepare_PID_njobs(IMG_PATH_TRAIN,PIDlst,ImgIDlst,train_images)

#meta_df_train = pd.DataFrame(col_dict_train)

In [None]:
# Preview the first five collected patient IDs.
PIDlst[:5]

In [None]:
import pandas as pd

# Pair every image ID with its patient ID and persist the table as CSV
# (without the index column).
id_columns = dict(ID=ImgIDlst, PatientID=PIDlst)
metadf = pd.DataFrame(id_columns)
metadf.to_csv('stage_2_train_with_metadata.csv', index=False)


In [None]:
# Start from empty columns on every execution: this cell deletes
# col_dict_train at the end, so relying on the dict created in another cell
# would fail on a re-run (or duplicate rows after an interrupted run).
col_dict_train = {col: [] for col in meta_cols}

for img in tqdm.tqdm(train_images):
    # Every attribute in meta_cols is a header tag, so skip the pixel data.
    dicom_object = pydicom.dcmread(IMG_PATH_TRAIN + img, stop_before_pixels=True)
    for col in meta_cols:
        # Values are stored as strings; a file lacking one of the attributes
        # raises AttributeError here.
        col_dict_train[col].append(str(getattr(dicom_object, col)))

# Store all information in a DataFrame
meta_df_train = pd.DataFrame(col_dict_train)
# Drop the intermediate lists and reclaim their memory.
del col_dict_train
gc.collect()

In [None]:
# from zipfile import ZipFile
# import os

# zipObj = ZipFile('zippng.zip', 'w')

# for filename in os.listdir("/kaggle/working/output/stage_2_train/"):
#     zipObj.write(filename)
#     print(filename)
    
# zipObj.close()
# output_dir = "/kaggle/working/output/stage_2_train"
# !ls {output_dir}

# !zip -m images.zip * .png

In [None]:
from pathlib import Path

import zipfile

# Bundle every entry of the export folder into one uncompressed (stored) zip
# archive; each entry keeps the path it was given as its archive name.
img_root = Path('/kaggle/working/output/stage_2_train')
with zipfile.ZipFile(file='image_png_256_2.zip', mode='w', compression=zipfile.ZIP_STORED) as z:
    for img_name in img_root.iterdir():
        z.write(filename=img_name)


In [None]:
import os 
import pandas as pd
import pandas_profiling as pp

In [None]:
from google.cloud import storage
# Google Cloud Storage client bound to the given project.
# NOTE(review): no credentials are passed here, so they presumably come from
# the environment - confirm.
storage_client = storage.Client(project='chl7001groupproject')

In [None]:
# def upload_files(bucket_name, source_folder):
#     bucket = storage_client.get_bucket(bucket_name)
#     for filename in os.listdir(source_folder):
#         blob = bucket.blob(filename)
#         blob.upload_from_filename(source_folder + filename)
#     print("done")

In [None]:
# bucket_name = 'chl7001gp_bucket_png_128'         
# local_data = './output/stage_2_train/'
# upload_files(bucket_name, local_data)

In [None]:
# import os 
# for filename in os.listdir('./'):
#     print(filename)

In [None]:
# Upload the zip archive from the working directory to the storage bucket.
# NOTE(review): the bucket name mentions "128" while the archive name mentions
# "256" - confirm this is the intended destination.
bucket_name = 'chl7001gp_bucket_png_128'  
filename = 'image_png_256_2.zip'
source_folder = './'
# Look up the bucket, then create a blob handle named after the file.
bucket = storage_client.get_bucket(bucket_name)      
blob = bucket.blob(filename)
# Send the local file's contents to that blob.
blob.upload_from_filename(source_folder + filename)