# import library

In [None]:
!pip install pydicom

In [None]:
from pathlib import Path
import os, zipfile, shutil
import numpy as np
import torch
import pydicom

# Helper method
def rm_any(pth: Path):
    """Delete `pth`, whether it is a single file or a whole directory tree.

    A file is unlinked directly. For anything else, every entry returned by
    `iterdir()` is removed recursively first, because `rmdir()` only works
    on an empty directory.
    """
    if not pth.is_file():
        for entry in pth.iterdir():
            # Files hit the unlink branch of the recursive call.
            rm_any(entry)
        return pth.rmdir()
    return pth.unlink()

class metric():
    """Small registry that groups values (here: patient ids) under a key."""

    def __init__(self):
        # key -> list of patients, in the order they were registered
        self.data = dict()

    def update(self, key, patient):
        """Append `patient` to the list stored under `key`, creating the list on first use."""
        self.data.setdefault(key, []).append(patient)

    def __str__(self):
        """Return the string form of the underlying dictionary."""
        return str(self.data)


## Pathlib
- `Path.rmdir(dir)` # directory must be empty
- `Path.is_dir()` or `Path.is_file()`
- `Path.iterdir()`
- `Path.unlink(missing_ok=True)` # remove file - True: rm -f
- `Path.exists()` # file or dir

# Unzip and remove useless files

- `path`: path of the folder that contains the zip files of the dataset.
- `path_new`: path of the folder that receives the extracted, preprocessed files.

In [None]:
path = Path('/content/drive/My Drive/Tumor_src/Datasets/zip/3Dircadb1')
path_new = Path('/content/drive/My Drive/Tumor_src/Datasets/3Dircad/Origin')
path_new.mkdir(exist_ok=True)

# For every mask folder name that survives the clean-up, the patients having it.
lst_organs = metric()

# Mask folders that are renamed so the names are homogeneous across patients.
mask_renames = {'portalvein1': 'portalvein', 'venacava': 'venoussystem'}
# Besides the exact name 'liver', folders starting with these prefixes are kept.
kept_prefixes = ('venous', 'portalvein', 'artery')

for i in range(1, 21):
    patient = path / ('3Dircadb1.' + str(i))
    patient_processed = path_new / ('3Dircadb1.' + str(i))
    mask_folder = patient_processed / 'MASKS_DICOM'

    # Extract PATIENT_DICOM.zip and MASKS_DICOM.zip (skipped when already extracted).
    if not os.path.exists(patient_processed / 'PATIENT_DICOM'):
        try:
            for archive_name in ('PATIENT_DICOM.zip', 'MASKS_DICOM.zip'):
                # The context manager closes the archive handle even on failure.
                with zipfile.ZipFile(patient / archive_name) as archive:
                    archive.extractall(patient_processed)
        except (OSError, zipfile.BadZipFile) as err:
            # Only I/O and archive errors are reported; anything else
            # (e.g. KeyboardInterrupt) propagates instead of being swallowed.
            print('File PATIENT.zip or MASKS.zip not found in patient-', str(i))
            print('\t', err)

    # Without the mask folder none of the steps below can run; skip this
    # patient explicitly instead of failing inside os.listdir.
    if not mask_folder.is_dir():
        print('Patient #{} skipped: {} is missing'.format(i, mask_folder))
        continue

    # Rename mask folders (e.g. venacava -> venoussystem) so they are homogeneous.
    for organ in os.listdir(mask_folder):
        if organ in mask_renames:
            os.rename(mask_folder / organ, mask_folder / mask_renames[organ])

    # Remove label folders of useless organs: lung, kidney, skin...
    rm_organs = [file for file in os.listdir(mask_folder)
                 if file != 'liver' and not file.startswith(kept_prefixes)]

    for file in rm_organs:
        rm_any(mask_folder / file)
        print('Patient #{} remove: {}'.format(i, file))

    # Record which mask folders remain for this patient.
    for organ in os.listdir(mask_folder):
        lst_organs.update(organ, i)
    print('----> Patient #{} completed'.format(str(i)))

print('Completed !')

In [None]:
# One line per mask name: number of patients, the name, then the patient ids.
for organ_name, patient_ids in lst_organs.data.items():
    print(len(patient_ids), '---', organ_name, patient_ids)

12 --- artery [1, 4, 5, 6, 7, 8, 9, 11, 12, 13, 17, 20]
20 --- liver [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
20 --- portalvein [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
20 --- venoussystem [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


# Normalize and merge liver and vessel Label

- Some patients use different pixel values in their segmentation masks (`[0 255]` instead of `[0 1]`) --> standardize all masks to `[0 1]`

    > patient: 1, 4, 8, 6, 11, 14
- Combine the venous and portal masks into a single vessel label

In [None]:
def normalizeLabel(path_in, path_out):
    """Binarise the mask slices of one patient.

    Reads `image_0` ... `image_<n-1>` from `path_in`, where n is the number
    of entries in that folder, sets every pixel greater than 0 to 1, and
    saves each slice under the same name in `path_out`.
    """
    src_dir = str(path_in)
    dst_dir = str(path_out)
    slice_count = len(os.listdir(src_dir))

    for idx in range(slice_count):
        file_name = 'image_{}'.format(idx)
        dataset = pydicom.dcmread('{}/{}'.format(src_dir, file_name))
        pixels = dataset.pixel_array
        pixels[pixels > 0] = 1
        # Write the modified array back so it is what gets saved.
        dataset.PixelData = pixels.tobytes()
        dataset.save_as('{}/{}'.format(dst_dir, file_name))

# merge and normalize vessel label
def merge_vessel(path_in, path_out):
    """Merge the portal-vein and venous-system masks of one patient.

    `path_in` must contain the `portalvein` and `venoussystem` folders. For
    each slice index (counted from the entries of `portalvein`), pixels that
    are greater than 0 in either mask are set to 1 in the portal slice, which
    is then saved as `image_<index>` in `path_out`.
    """
    src_root = str(path_in)
    dst_dir = str(path_out)
    portal_dir = '{}/portalvein'.format(src_root)
    venous_dir = '{}/venoussystem'.format(src_root)

    for idx in range(len(os.listdir(portal_dir))):
        file_name = 'image_{}'.format(idx)
        portal_info = pydicom.dcmread('{}/{}'.format(portal_dir, file_name))
        venous_info = pydicom.dcmread('{}/{}'.format(venous_dir, file_name))
        merged = portal_info.pixel_array
        venous_mask = venous_info.pixel_array > 0

        # normalize the portal label (e.g. 255 -> 1) ...
        merged[merged > 0] = 1
        # ... then add every venous pixel to it
        merged[venous_mask] = 1

        # The portal dataset carries the merged pixels into the output file.
        portal_info.PixelData = merged.tobytes()
        portal_info.save_as('{}/{}'.format(dst_dir, file_name))

In [None]:
dataset_path = Path('/content/drive/My Drive/Tumor_src/Datasets/3Dircad/Processed')
dataset_path.mkdir(exist_ok=True)

# Pre-create <Processed>/3Dircadb1.<i>/MASKS_DICOM for the 20 patients,
# one directory level at a time.
for i in range(1, 21):
    patient = dataset_path / '3Dircadb1.{}'.format(i)
    patient_mask = patient / 'MASKS_DICOM'
    for folder in (patient, patient_mask):
        folder.mkdir(exist_ok=True)


In [None]:
import shutil

path = Path('/content/drive/My Drive/Tumor_src/Datasets/3Dircad')

for i in range(1, 21):
    origin = path / ('Origin/3Dircadb1.' + str(i))
    processed = path / ('Processed/3Dircadb1.' + str(i))

    # Liver mask: binarise every slice.
    path_in = origin / 'MASKS_DICOM/liver'
    path_out = processed / 'MASKS_DICOM/liver'
    path_out.mkdir(exist_ok=True)
    normalizeLabel(path_in, path_out)

    # Vessel mask: merge portalvein + venoussystem into one binary label.
    path_in = origin / 'MASKS_DICOM'
    path_out = processed / 'MASKS_DICOM/vessel'
    path_out.mkdir(exist_ok=True)
    merge_vessel(path_in, path_out)

    # CT slices are copied unchanged. copytree raises FileExistsError when the
    # destination already exists, so skip the copy on a re-run; the other
    # steps of this cell are already re-runnable (mkdir(exist_ok=True)).
    # NOTE: a destination left incomplete by an interrupted run is not
    # repaired here and has to be deleted manually.
    path_in = origin / 'PATIENT_DICOM'
    path_out = processed / 'PATIENT_DICOM'
    if not path_out.exists():
        shutil.copytree(str(path_in), str(path_out))

    print('----> Patient #{} completed'.format(str(i)))

print('Normalize Label completed !!!! \n')


----> Patient #1 completed
----> Patient #2 completed
----> Patient #3 completed
----> Patient #4 completed
----> Patient #5 completed
----> Patient #6 completed
----> Patient #7 completed
----> Patient #8 completed
----> Patient #9 completed
----> Patient #10 completed
----> Patient #11 completed
----> Patient #12 completed
----> Patient #13 completed
----> Patient #14 completed
----> Patient #15 completed
----> Patient #16 completed
----> Patient #17 completed
----> Patient #18 completed
----> Patient #19 completed
----> Patient #20 completed
Normalize Label completed !!!! 



# List masks of patients in new processed dataset

In [None]:
path = Path('/content/drive/My Drive/Tumor_src/Datasets/3Dircad/Processed')
new_organ = metric()

# Register every mask folder found under each processed patient.
for i in range(1, 21):
    patient_masks = path / '3Dircadb1.{}'.format(i) / 'MASKS_DICOM'
    for organ in os.listdir(patient_masks):
        new_organ.update(organ, i)

# One line per mask name: number of patients, the name, then the patient ids.
for organ_name, patient_ids in new_organ.data.items():
    print(len(patient_ids), '---', organ_name, patient_ids)

20 --- liver [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
20 --- vessel [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


# Convert dicom to tensor
> Save to file `pth`

In [None]:
class Helper:
    """Namespace for DICOM conversion helpers."""

    @staticmethod
    def dicom2tensor(path):
        """Load a folder of DICOM slices into one float32 tensor.

        The folder is expected to hold files named `image_0` ...
        `image_<n-1>`, where n is the number of entries in the folder. The
        slices are stacked in index order along the first dimension.

        Args:
            path: folder containing the slices (str or path-like).

        Returns:
            A `torch.float32` tensor; empty (shape `(0,)`) for an empty folder.
        """
        path = str(path)
        volume = []
        for index in range(len(os.listdir(path))):
            data = pydicom.dcmread(path + '/image_' + str(index)).pixel_array
            volume.append(data)
        # torch.tensor() on a list of ndarrays is very slow (PyTorch warns
        # about it), so build one contiguous float32 ndarray first and wrap
        # it without another copy.
        return torch.from_numpy(np.array(volume, dtype=np.float32))

In [None]:
# Source: the processed DICOM dataset.
DIR = '/content/drive/My Drive/Tumor_src/Datasets/3Dircad/Processed'

# Destination root plus one sub-folder per kind of saved tensor.
PATH = Path('/content/drive/My Drive/Tumor_src/Datasets/3Dircad/PTH')
VOLUME, LIVER, VESSEL = (PATH / sub for sub in ('VOLUME', 'LIVER', 'VESSEL'))

# The root comes first because mkdir() does not create missing parents here.
for folder in (PATH, VOLUME, LIVER, VESSEL):
    folder.mkdir(exist_ok=True)

In [None]:
# Patient folders are numbered 1..20 (3Dircadb1.1 ... 3Dircadb1.20), like in
# every other cell. range(20) would look for a non-existent 3Dircadb1.0 and
# never reach patient 20.
for i in range(1, 21):
    print('Loading patient #{}'.format(i))
    patient = DIR + '/3Dircadb1.' + str(i)

    # CT volume and the two label volumes, each stacked slice by slice.
    data = Helper.dicom2tensor(patient + '/PATIENT_DICOM')
    liver = Helper.dicom2tensor(patient + '/MASKS_DICOM/liver')
    vessel = Helper.dicom2tensor(patient + '/MASKS_DICOM/vessel')

    data_path = str(VOLUME) + '/volume_' + str(i) + '.pth'
    liver_path = str(LIVER) + '/volume_' + str(i) + '.pth'
    vessel_path = str(VESSEL) + '/volume_' + str(i) + '.pth'

    # Force the labels to {0, 1} regardless of what was stored on disk.
    liver[liver > 0] = 1
    vessel[vessel > 0] = 1

    # Quick sanity report: shape and value range of each volume.
    print('\t', data.shape)
    print('\t', data.max(), data.min())
    print('\t', liver.max(), liver.min())
    print('\t', vessel.max(), vessel.min())

    torch.save(data, data_path)
    torch.save(liver, liver_path)
    torch.save(vessel, vessel_path)
    

Loading patient #1
	 torch.Size([129, 512, 512])
	 tensor(1023.) tensor(-1024.)
	 tensor(1.) tensor(0.)
	 tensor(1.) tensor(0.)
Loading patient #2
	 torch.Size([172, 512, 512])
	 tensor(1023.) tensor(-1024.)
	 tensor(1.) tensor(0.)
	 tensor(1.) tensor(0.)
Loading patient #3
	 torch.Size([200, 512, 512])
	 tensor(3071.) tensor(-1024.)
	 tensor(1.) tensor(0.)
	 tensor(1.) tensor(0.)
Loading patient #4
	 torch.Size([91, 512, 512])
	 tensor(1023.) tensor(-1024.)
	 tensor(1.) tensor(0.)
	 tensor(1.) tensor(0.)
Loading patient #5
	 torch.Size([139, 512, 512])
	 tensor(1023.) tensor(-1024.)
	 tensor(1.) tensor(0.)
	 tensor(1.) tensor(0.)
Loading patient #6
	 torch.Size([135, 512, 512])
	 tensor(1023.) tensor(-1024.)
	 tensor(1.) tensor(0.)
	 tensor(1.) tensor(0.)
Loading patient #7
	 torch.Size([151, 512, 512])
	 tensor(1024.) tensor(-1023.)
	 tensor(1.) tensor(0.)
	 tensor(1.) tensor(0.)
Loading patient #8
	 torch.Size([124, 512, 512])
	 tensor(1525.) tensor(-2048.)
	 tensor(1.) tensor(0.)
	

# Save voxel size of patient into a tensor

In [None]:
import pydicom

origin_pth = '/content/drive/My Drive/Tumor_src/Datasets/3Dircad/Origin'

# One row per patient: [SliceThickness, PixelSpacing[0], PixelSpacing[1]],
# each rounded to 2 decimals and read from the patient's first slice (image_0).
voxel_size = []
for i in range(1, 21):
    first_slice = '{}/3Dircadb1.{}/PATIENT_DICOM/image_0'.format(origin_pth, i)
    dicom = pydicom.dcmread(first_slice)
    thickness = round(dicom.SliceThickness, 2)
    spacing = [round(dicom.PixelSpacing[axis], 2) for axis in (0, 1)]
    voxel_size.append([thickness] + spacing)

voxel_size = torch.tensor(voxel_size)
torch.save(voxel_size, '/content/drive/My Drive/Tumor_src/Datasets/3Dircad/voxel_size.pth')