# DOWNLOAD AND PREPARE KAGGLE IMAGES

In [1]:
# Requirements for reading dicom files
! pip install python-gdcm
! pip install pylibjpeg
! conda install -c conda-forge gdcm

/bin/bash: /home/marco/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
/bin/bash: /home/marco/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
/bin/bash: /home/marco/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [1]:
import os
import opendatasets as od
import zipfile
import pydicom
from PIL import Image
import numpy as np
from multiprocessing import Pool, cpu_count

## 1. Download

This step requires a Kaggle account and the API token file for that account (`kaggle.json`) saved in the directory this notebook is run from.

In [5]:
# URL of the Kaggle competition whose data files are fetched
download_link = 'https://www.kaggle.com/competitions/rsna-breast-cancer-detection/data'

# Per the recorded output, files land in ./rsna-breast-cancer-detection and the
# call is skipped when they are already present (force=True re-downloads)
od.download(download_link)

Skipping, found downloaded files in "./rsna-breast-cancer-detection" (use force=True to force download)


## 2. Unzip

In [6]:
# Unpack the downloaded competition archive into ./DATA/
archive_path = 'rsna-breast-cancer-detection/rsna-breast-cancer-detection.zip'
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall('./DATA/')

## 3. Preprocess images

In [3]:
# DICOM reading helper, adapted FROM https://www.kaggle.com/code/raddar/convert-dicom-to-np-array-the-correct-way

def read_dicom(path, voi_lut=True):
    """Load the pixel data of a DICOM file as a NumPy array.

    Parameters
    ----------
    path : path-like
        Location of the DICOM file, passed straight to ``pydicom.dcmread``.
    voi_lut : bool, optional
        NOTE(review): accepted for compatibility but not used anywhere in
        this function; no VOI LUT is applied regardless of its value.

    Returns
    -------
    numpy.ndarray
        ``pixel_array`` of the file. When the file's photometric
        interpretation is ``MONOCHROME1`` the values are inverted
        (``max - value``) so that such images do not look inverted.
    """
    dataset = pydicom.dcmread(path)
    pixels = dataset.pixel_array

    # Anything other than MONOCHROME1 is returned untouched
    if dataset.PhotometricInterpretation != "MONOCHROME1":
        return pixels

    # MONOCHROME1 may look inverted - flip the intensity scale
    return np.max(pixels) - pixels


# Read each image in our whole data directory

# Directory that is walked for the original files
data_dir = './DATA/train_images/'
# Directory the converted PNGs are written to (one subfolder per patient id)
output_dir = './DATA/train_images_resized/'

# Set the new size: target size handed to PIL's Image.resize
new_size = (1024, 1024)


# Function to read and preprocess the images
def preprocess(file, subdir):
    """Convert one DICOM file into a cropped, resized 8-bit PNG.

    Files whose extension is not ``.dcm`` are ignored, and files whose PNG
    already exists are skipped, so the function can be re-run safely.

    Parameters
    ----------
    file : str
        File name (without directory) as yielded by ``os.walk``.
    subdir : str
        Directory containing ``file``. Its last path component is used as
        the patient id and becomes the name of the output subfolder.

    Returns
    -------
    None
        The image is written to ``<output_dir>/<patient_id>/<name>.png``.
    """
    # splitext copes with names that contain no dot or several dots; unpacking
    # the result of split('.') raised ValueError for those.
    filename, extension = os.path.splitext(file)

    # Only preprocess if it's a dcm file
    if extension != '.dcm':
        return

    patient_id = os.path.basename(subdir)
    patient_dir = os.path.join(output_dir, patient_id)
    new_file = os.path.join(patient_dir, filename + '.png')

    # Already converted on a previous run
    if os.path.isfile(new_file):
        return

    # Open dicom file and extract pixel array. Work in float64 so that the
    # subtraction below cannot overflow an integer pixel dtype.
    img = np.asarray(read_dicom(os.path.join(subdir, file)), dtype=np.float64)

    lowest, highest = np.min(img), np.max(img)
    if highest == lowest:
        # Constant image: scaling would divide by zero and fill the array with
        # NaN. There is nothing to crop either, so keep it as a blank frame.
        img = np.zeros(img.shape)
    else:
        # Scale 0-1
        img = (img - lowest) / (highest - lowest)

        # If the left-most column is entirely zero, mirror the image horizontally
        if img[:, 0].sum() == 0:
            img = np.fliplr(img)

        # Remove empty space: drop every all-zero column, then every all-zero row
        img = img[:, np.any(img, axis=0)]
        img = img[np.any(img, axis=1), :]

    # Resize
    png = Image.fromarray(img * 255).convert('L')
    png = png.resize(new_size)

    # Save (can only save as 0-255 array, will convert again later when feeding into the model).
    # exist_ok avoids the race between workers that handle files of the same patient.
    os.makedirs(patient_dir, exist_ok=True)
    png.save(new_file)

# Iterate over the entire train folder and subfolders, collecting
# (file name, containing directory) pairs
file_iter = [(file, subdir) for subdir, _, files in os.walk(data_dir) for file in files]

# Run it in parallel to be faster. Leave one core free, but never request
# zero workers (cpu_count() - 1 is 0 on a single-core machine).
n_workers = max(1, cpu_count() - 1)
with Pool(n_workers) as pool:
    # starmap blocks until every file has been handled and re-raises any
    # exception from a worker; the async variant returned immediately and
    # silently dropped failures because its result was never collected.
    pool.starmap(preprocess, file_iter)