In [None]:
# https://www.kaggle.com/code/deannahedges/mammography-challenge-dicom-to-png
# results: https://www.kaggle.com/datasets/deannahedges/mammography-challenge-pngs

# Sources:
    # To go from Dicom -> PNG:
        # https://www.kaggle.com/code/radek1/how-to-process-dicom-images-to-pngs/notebook?scriptVersionId=113529850
    # To load the data, configure for performance, and build model in keras:
        # https://www.tensorflow.org/tutorials/load_data/images#:~:text=This%20tutorial%20shows%20how%20to%20load%20and%20preprocess,from%20the%20large%20catalog%20available%20in%20TensorFlow%20Datasets.
    # To augment the data:
        # https://www.tensorflow.org/tutorials/images/data_augmentation
    # To make the submission notebook:
        # https://www.kaggle.com/code/radek1/fast-ai-starter-pack-train-inference/notebook
        

import numpy as np
import pandas as pd

## Exploring training csv with labels

In [None]:
# Read the competition's train.csv into a DataFrame and preview its first rows.
train_file = pd.read_csv("/kaggle/input/rsna-breast-cancer-detection/train.csv")
train_file.head()

In [None]:
# Cast the 'cancer' label column to float32 (overwriting the column in place),
# then print the frame summary so the new dtype can be checked.
train_file['cancer'] = train_file['cancer'].astype('float32')
train_file.info()

In [None]:
# Count how many rows carry each distinct 'cancer' label value.
train_file['cancer'].value_counts()

## Creating a function to categorize images as "positive" or "negative" based on file path

In [None]:
def pos_or_neg(img_directory):
    """Return the class folder name ("positive" or "negative") for an image file.

    The image id is taken from the file name (without its extension) and
    looked up in the module-level ``train_file`` DataFrame.

    Args:
        img_directory: Path (string or path-like) of an image file whose base
            name is the integer ``image_id``, e.g. ``.../123456.dcm``.

    Returns:
        "negative" when the matching ``cancer`` value equals 0, otherwise
        "positive".

    Raises:
        ValueError: If the file name without its extension is not an integer.
        IndexError: If no row of ``train_file`` has that ``image_id``.
    """
    file_name = str(img_directory).split('/')[-1]
    # Drop only the final extension rather than assuming it is exactly four
    # characters long (".dcm"), so other or missing extensions also work.
    img_id = int(file_name.rsplit('.', 1)[0])

    matches = train_file.loc[train_file['image_id'] == img_id, 'cancer'].values
    if len(matches) == 0:
        # Same exception type the bare `.values[0]` raised, but with context.
        raise IndexError(f"image_id {img_id} not found in train_file")

    return "negative" if matches[0] == 0 else "positive"

In [None]:
!pip install dicomsdl

## Transforming images from DICOM format to PNG and sorting them into a "positive" and "negative" folder

In [None]:
import pydicom
import cv2
import os
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
from pathlib import Path
from pydicom.pixel_data_handlers.util import apply_voi_lut
import dicomsdl
import sys
import time

RESIZE_TO = (256, 256)

In [None]:
%%time

# Create one output folder per class; like `mkdir -p`, missing parents are
# created and already-existing folders are not an error.
for class_name in ("positive", "negative"):
    os.makedirs(
        f"/kaggle/working/train_images_processed_cv2_dicomsdl_{RESIZE_TO[0]}/{class_name}/",
        exist_ok=True,
    )

# https://www.kaggle.com/code/tanlikesmath/brain-tumor-radiogenomic-classification-eda/notebook
def dicom_file_to_ary(path):
    """Decode one DICOM file into a resized 8-bit grayscale array.

    The pixel data is min-max scaled to [0, 1], inverted when the file's
    photometric interpretation is MONOCHROME1, resized to ``RESIZE_TO`` with
    ``cv2.resize`` and finally scaled to 0-255 and cast to ``uint8``.

    Args:
        path: Location of the DICOM file; it is passed through ``str()``
            before being handed to ``dicomsdl.open``.

    Returns:
        A ``np.uint8`` array. An image whose pixels all share one value is
        returned as an all-zero array of the resized shape.
    """
    dcm_file = dicomsdl.open(str(path))
    # Work in float64 from the start so the subtraction below cannot wrap
    # around when the stored pixel type is a narrow (signed) integer.
    data = dcm_file.pixelData().astype(np.float64)

    lowest = data.min()
    value_range = data.max() - lowest
    if value_range == 0:
        # A constant image would divide by zero and feed NaNs into the uint8
        # cast. cv2 sizes are (width, height); array shapes are (rows, cols).
        return np.zeros((RESIZE_TO[1], RESIZE_TO[0]), dtype=np.uint8)

    # In-place arithmetic avoids allocating extra full-resolution temporaries.
    data -= lowest
    data /= value_range

    if dcm_file.getPixelDataInfo()['PhotometricInterpretation'] == "MONOCHROME1":
        # Flip the normalized intensities for MONOCHROME1 files.
        data = 1 - data

    data = cv2.resize(data, RESIZE_TO)
    data = (data * 255).astype(np.uint8)
    return data

# Walk two levels below train_images/: every entry there is iterated as a
# directory and each item found inside it is collected.
train_images_root = Path('/kaggle/input/rsna-breast-cancer-detection/train_images/')
image_directories = [
    pic_dir
    for patient_dir in train_images_root.iterdir()
    for pic_dir in patient_dir.iterdir()
]
print(len(image_directories))

def process_directory(directory_path):
    """Convert one DICOM file to PNG and store it in its class folder.

    The class ("positive" or "negative") comes from ``pos_or_neg`` and the
    pixels from ``dicom_file_to_ary``. The PNG keeps the source file's stem
    as its name.

    Args:
        directory_path: ``pathlib.Path`` of a single DICOM file (despite the
            parameter name, this is a file, not a directory).

    Raises:
        IOError: If OpenCV reports that the PNG could not be written.
    """
    parent_directory = pos_or_neg(directory_path)

    processed_ary = dicom_file_to_ary(directory_path)

    # Absolute path matching the folders created with `mkdir -p` earlier in
    # this cell, so the output location does not depend on the working directory.
    output_path = (
        f'/kaggle/working/train_images_processed_cv2_dicomsdl_{RESIZE_TO[0]}'
        f'/{parent_directory}/{directory_path.stem}.png'
    )

    # cv2.imwrite signals failure through its boolean return value, so check
    # it explicitly; otherwise images would go missing without any error.
    if not cv2.imwrite(output_path, processed_ary):
        raise IOError(f"cv2.imwrite failed for {output_path}")
pos_dir = Path("/kaggle/working/train_images_processed_cv2_dicomsdl_256/positive/")

import multiprocessing as mp

# Convert every collected file using 64 worker processes; Pool.map blocks
# until all paths have been handled.
with mp.Pool(processes=64) as worker_pool:
    worker_pool.map(process_directory, image_directories)

## Ensuring that the final number of images matches the original number

In [None]:
from pathlib import Path

# Derive the output root from RESIZE_TO so it stays in sync with the
# conversion cell if the target size is changed.
data_dir = Path(f"/kaggle/working/train_images_processed_cv2_dicomsdl_{RESIZE_TO[0]}/")
# Glob once and reuse the listing instead of walking the tree a second time.
done_paths = list(data_dir.glob('*/*.png'))
image_count = len(done_paths)
print(image_count)

# Every source DICOM collected in image_directories should have produced
# exactly one PNG; fail loudly if any are missing.
assert image_count == len(image_directories), (
    f"expected {len(image_directories)} PNGs, found {image_count}"
)