<a href="https://colab.research.google.com/github/deepakri201/ABDSynth/blob/main/LiverHCCSeg_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

LiverHCCSeg preprocessing code

For now, we use the same directory structure and filenames as the original data, so we can do a comparison to make sure our original and new files are the same.

Later, we will restructure the directory layout of the images/labels.

**TODO:** add more information here, including a description of the notebook's inputs and outputs.

Deepa Krishnaswamy and Cosmin Ciausu

July 2025

Brigham and Women's Hospital

# Environment setup

In [None]:
import os
import numpy as np
import nibabel as nib
from glob import glob as glob
import shutil

# Download the data from Zenodo

In [None]:
# We use the NIfTI format for the images. DICOM files are also available, if we wanted to convert them to NIfTI ourselves.
# We also download the NIfTI-format segmentations.
# https://zenodo.org/records/8179129
# Note: wget keeps the URL query string in the saved filename, so the archive is written as
# "nifti_and_segms.zip?download=1" (the next cell relies on that exact name).

!wget https://zenodo.org/records/8179129/files/nifti_and_segms.zip?download=1

--2025-07-10 15:03:16--  https://zenodo.org/records/8179129/files/nifti_and_segms.zip?download=1
Resolving zenodo.org (zenodo.org)... 188.185.43.25, 188.185.48.194, 188.185.45.92, ...
Connecting to zenodo.org (zenodo.org)|188.185.43.25|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 977885191 (933M) [application/octet-stream]
Saving to: ‘nifti_and_segms.zip?download=1’


2025-07-10 15:06:49 (4.39 MB/s) - ‘nifti_and_segms.zip?download=1’ saved [977885191/977885191]



In [None]:
# We unzip the contents of the downloaded archive.
# The "?download=1" suffix is part of the filename produced by the wget call above, not a typo.
# The archive contains a nifti_and_segms/ directory with one sub-directory per patient (TCGA-*).

!unzip /content/nifti_and_segms.zip?download=1

Archive:  /content/nifti_and_segms.zip?download=1
   creating: nifti_and_segms/
  inflating: __MACOSX/._nifti_and_segms  
   creating: nifti_and_segms/TCGA-BC-A69I/
  inflating: nifti_and_segms/.DS_Store  
  inflating: __MACOSX/nifti_and_segms/._.DS_Store  
   creating: nifti_and_segms/TCGA-DD-A4NF/
  inflating: __MACOSX/nifti_and_segms/._TCGA-DD-A4NF  
   creating: nifti_and_segms/TCGA-DD-A4NH/
   creating: nifti_and_segms/TCGA-G3-AAV2/
   creating: nifti_and_segms/TCGA-G3-AAV3/
   creating: nifti_and_segms/TCGA-G3-A7M7/
   creating: nifti_and_segms/TCGA-G3-A25T/
   creating: nifti_and_segms/TCGA-G3-A3CJ/
   creating: nifti_and_segms/TCGA-BC-A5W4/
  inflating: __MACOSX/nifti_and_segms/._TCGA-BC-A5W4  
   creating: nifti_and_segms/TCGA-BC-4073/
   creating: nifti_and_segms/TCGA-BC-A10Y/
  inflating: __MACOSX/nifti_and_segms/._TCGA-BC-A10Y  
   creating: nifti_and_segms/TCGA-BC-A216/
   creating: nifti_and_segms/TCGA-G3-AAV1/
   creating: nifti_and_segms/TCGA-DD-A4NB/
   creating: nifti

# Conversion of LiverHCCSeg

In [None]:
# Create the output directory tree that stores the images and segmentations to be used for evaluation.
# Layout: <output_main_directory>/<rater1|rater2>/<images|labels>

output_main_directory = "/content/LiverHCCSeg_preprocessed_data"
output_rater1_directory = os.path.join(output_main_directory, "rater1")
output_rater1_images_directory = os.path.join(output_rater1_directory, "images")
output_rater1_labels_directory = os.path.join(output_rater1_directory, "labels")
output_rater2_directory = os.path.join(output_main_directory, "rater2")
output_rater2_images_directory = os.path.join(output_rater2_directory, "images")
output_rater2_labels_directory = os.path.join(output_rater2_directory, "labels")

# os.makedirs(..., exist_ok=True) creates any missing parents (including the main and
# per-rater directories) and is a no-op for directories that already exist, so no
# separate os.path.isdir() guard is needed and the cell can be re-run safely.
for output_directory in (
    output_rater1_images_directory,
    output_rater1_labels_directory,
    output_rater2_images_directory,
    output_rater2_labels_directory,
):
  os.makedirs(output_directory, exist_ok=True)

In [None]:
# We are only interested in the arterial phase (art.nii.gz), so we copy this over to the images directory.
# The same image is copied into both the rater1 and the rater2 images directory.

input_main_directory = "/content/nifti_and_segms"

# Get the PatientIDs: every sub-directory of the input directory is treated as one patient.
# os.listdir returns entries in arbitrary order, so we sort to make the run reproducible.
patient_ids = sorted(
    patient_id for patient_id in os.listdir(input_main_directory)
    if os.path.isdir(os.path.join(input_main_directory, patient_id))
)
num_patient_ids = len(patient_ids)
print('patient_ids: ' + str(patient_ids))
print('num_patient_ids: ' + str(num_patient_ids))

# Locate each patient's art.nii.gz file and copy it to both rater image directories
for patient_id in patient_ids:
  input_patient_directory = os.path.join(input_main_directory, patient_id)
  # The image sits in a sub-directory of the patient directory, hence the recursive search.
  input_image_matches = sorted(glob(os.path.join(input_patient_directory, "**", "art.nii.gz"), recursive=True))
  if not input_image_matches:
    # Report and move on instead of aborting the whole loop with an IndexError.
    print('ERROR: no art.nii.gz found in ' + input_patient_directory)
    continue
  # Only the first match is used, as before.
  input_image_filename = input_image_matches[0]
  output_rater1_image_filename = os.path.join(output_rater1_images_directory, patient_id + '_rater1_art.nii.gz')
  output_rater2_image_filename = os.path.join(output_rater2_images_directory, patient_id + '_rater2_art.nii.gz')
  print('Copying from ' + input_image_filename + ' to ' + output_rater1_image_filename + ' and ' + output_rater2_image_filename)
  for output_image_filename in (output_rater1_image_filename, output_rater2_image_filename):
    try:
      shutil.copy2(input_image_filename, output_image_filename)
    except OSError as copy_error:
      # Best effort: keep processing the remaining files, but say why this one failed.
      print('ERROR: cannot copy from ' + input_image_filename + ' to ' + output_image_filename + ': ' + str(copy_error))


patient_ids: ['TCGA-G3-AAV3', 'TCGA-G3-AAV7', 'TCGA-G3-A25T', 'TCGA-BC-A3KG', 'TCGA-BC-A216', 'TCGA-DD-A4NJ', 'TCGA-DD-A4NB', 'TCGA-BC-A69I', 'TCGA-BC-4073', 'TCGA-G3-AAV1', 'TCGA-G3-AAV2', 'TCGA-BC-A5W4', 'TCGA-DD-A4NF', 'TCGA-DD-A4NH', 'TCGA-G3-A3CJ', 'TCGA-BC-A10Y', 'TCGA-G3-A7M7']
num_patient_ids: 17
Copying from /content/nifti_and_segms/TCGA-G3-AAV3/01-05-2007/art.nii.gz to /content/LiverHCCSeg_preprocessed_data/rater1/images/TCGA-G3-AAV3_rater1_art.nii.gz and /content/LiverHCCSeg_preprocessed_data/rater2/images/TCGA-G3-AAV3_rater2_art.nii.gz
Copying from /content/nifti_and_segms/TCGA-G3-AAV7/06-10-2007/art.nii.gz to /content/LiverHCCSeg_preprocessed_data/rater1/images/TCGA-G3-AAV7_rater1_art.nii.gz and /content/LiverHCCSeg_preprocessed_data/rater2/images/TCGA-G3-AAV7_rater2_art.nii.gz
Copying from /content/nifti_and_segms/TCGA-G3-A25T/06-26-2001/art.nii.gz to /content/LiverHCCSeg_preprocessed_data/rater1/images/TCGA-G3-A25T_rater1_art.nii.gz and /content/LiverHCCSeg_preprocessed_

In [None]:
# This collection has segmentations from two raters
# However, we also need to change the label id according to what is used in TotalSegmentator CT
# Refer to total_v1 here https://github.com/wasserth/TotalSegmentator/blob/master/totalsegmentator/map_to_binary.py

# Maps a LiverHCCSeg label id (key) to the corresponding TotalSegmentator label id (value).
liverhccseg_to_totalsegmentator_dict = {1:5}


def remap_label_ids(seg_array, label_map):
  """Return a copy of seg_array with its label ids translated through label_map.

  Background (0) is left untouched. Every mask is computed on the original
  array, so a mapping whose target ids overlap its source ids (e.g. {1: 2, 2: 3})
  cannot chain replacements.

  Raises:
    KeyError: if seg_array contains a non-zero label id that is missing from label_map.
  """
  remapped_array = seg_array.copy()
  for label_id in np.unique(seg_array):
    if label_id == 0:  # ignore background
      continue
    if label_id not in label_map:
      raise KeyError('label id ' + str(label_id) + ' has no TotalSegmentator mapping')
    remapped_array[seg_array == label_id] = label_map[label_id]
  return remapped_array


def convert_rater_labels(rater, output_labels_directory):
  """Remap and save the '<rater>_liver.nii.gz' segmentation of every patient.

  Args:
    rater: rater prefix used in the input and output filenames ("rater1" or "rater2").
    output_labels_directory: directory that receives '<patient_id>_<rater>_art.nii.gz'.

  Failures for a single patient are printed and do not stop the loop.
  """
  for patient_id in patient_ids:
    input_patient_directory = os.path.join(input_main_directory, patient_id)
    input_label_matches = sorted(glob(os.path.join(input_patient_directory, "**", rater + "_liver.nii.gz"), recursive=True))
    if not input_label_matches:
      # Report and move on instead of aborting the whole loop with an IndexError.
      print('ERR: no ' + rater + '_liver.nii.gz found in ' + input_patient_directory)
      continue
    input_label_filename = input_label_matches[0]
    # The label file is saved under the same '<patient_id>_<rater>_art.nii.gz' name as its image.
    output_label_filename = os.path.join(output_labels_directory, patient_id + '_' + rater + '_art.nii.gz')
    print('Copying from ' + input_label_filename + ' to ' + output_label_filename + ' and adjusting the label id')
    # Adjust the label id
    try:
      seg_img = nib.load(input_label_filename)
      # NOTE(review): get_fdata() returns floating point data while the original header is
      # reused on save - confirm the stored label values/dtype are what downstream code expects.
      seg_array = seg_img.get_fdata()
      print("unique labels : " + str(np.unique(seg_array)))
      seg_array = remap_label_ids(seg_array, liverhccseg_to_totalsegmentator_dict)
      print("unique labels after conversion to TotalSeg labelIDs: "+str(np.unique(seg_array)))
      nib.save(nib.Nifti1Image(seg_array, seg_img.affine, header=seg_img.header), output_label_filename)
    except Exception as conversion_error:
      # Best effort: keep going with the next patient, but surface the real cause
      # (unreadable file, unmapped label id, write failure, ...).
      print('ERR: cannot copy from ' + input_label_filename + ' to ' + output_label_filename + ': ' + repr(conversion_error))


# Process the rater1 liver nii files
convert_rater_labels("rater1", output_rater1_labels_directory)

# Repeat for the rater2 liver nii files
convert_rater_labels("rater2", output_rater2_labels_directory)


Copying from /content/nifti_and_segms/TCGA-G3-AAV3/01-05-2007/rater1_liver.nii.gz to /content/LiverHCCSeg_preprocessed_data/rater1/labels/TCGA-G3-AAV3_rater1_art.nii.gz and adjusting the label id
unique labels : [0. 1.]
unique labels after conversion to TotalSeg labelIDs: [0. 5.]
Copying from /content/nifti_and_segms/TCGA-G3-AAV7/06-10-2007/rater1_liver.nii.gz to /content/LiverHCCSeg_preprocessed_data/rater1/labels/TCGA-G3-AAV7_rater1_art.nii.gz and adjusting the label id
unique labels : [0. 1.]
unique labels after conversion to TotalSeg labelIDs: [0. 5.]
Copying from /content/nifti_and_segms/TCGA-G3-A25T/06-26-2001/rater1_liver.nii.gz to /content/LiverHCCSeg_preprocessed_data/rater1/labels/TCGA-G3-A25T_rater1_art.nii.gz and adjusting the label id
unique labels : [0. 1.]
unique labels after conversion to TotalSeg labelIDs: [0. 5.]
Copying from /content/nifti_and_segms/TCGA-BC-A3KG/02-02-2002/rater1_liver.nii.gz to /content/LiverHCCSeg_preprocessed_data/rater1/labels/TCGA-BC-A3KG_rater1

# Temporary, delete later

In [None]:
# Here we copy the new files to a bucket
# So then we can call the Validation_preprocessed_data.ipynb notebook and all the data will be in buckets

## Parameterization

In [None]:
#@title Enter your Project ID here
# initialize this variable with your Google Cloud Project ID!
project_name = "idc-external-018" #@param {type:"string"}

# Expose the project id to the environment under GCP_PROJECT_ID.
import os
os.environ["GCP_PROJECT_ID"] = project_name

# Set this project as the active project in the gcloud configuration.
!gcloud config set project $project_name

# Authenticate the Colab user; presumably required for the bucket upload in the next cell.
from google.colab import auth
auth.authenticate_user()

Updated property [core/project].


In [None]:
# Upload the preprocessed data (rater1/rater2 images and labels) to a Google Cloud Storage bucket.
input_local_directory = "/content/LiverHCCSeg_preprocessed_data"
output_bucket_directory = "gs://synthseg/preprocessing_for_github"

# -m performs the copy in parallel, -r copies the local directory recursively.
!gsutil -m cp -r $input_local_directory $output_bucket_directory

Copying file:///content/LiverHCCSeg_preprocessed_data/rater1/images/TCGA-DD-A4NB_rater1_art.nii.gz [Content-Type=application/octet-stream]...
Copying file:///content/LiverHCCSeg_preprocessed_data/rater1/images/TCGA-G3-AAV1_rater1_art.nii.gz [Content-Type=application/octet-stream]...
/ [0/68 files][    0.0 B/257.4 MiB]   0% Done                                   / [0/68 files][    0.0 B/257.4 MiB]   0% Done                                   Copying file:///content/LiverHCCSeg_preprocessed_data/rater1/images/TCGA-BC-A69I_rater1_art.nii.gz [Content-Type=application/octet-stream]...
Copying file:///content/LiverHCCSeg_preprocessed_data/rater1/images/TCGA-G3-AAV7_rater1_art.nii.gz [Content-Type=application/octet-stream]...
Copying file:///content/LiverHCCSeg_preprocessed_data/rater1/images/TCGA-BC-A10Y_rater1_art.nii.gz [Content-Type=application/octet-stream]...
/ [0/68 files][    0.0 B/257.4 MiB]   0% Done                                   / [0/68 files][    0.0 B/257.4 MiB]   0% Done  