<a href="https://colab.research.google.com/github/deepakri201/ABDSynth/blob/main/AMOS_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

AMOS preprocessing code

For now, we use the same directory structure and filenames as the original data, so we can do a comparison to make sure our original and new files are the same.

Later, we redo the data structure of the images/labels.

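**Inputs:** the AMOS22 archive (`amos22.zip`) downloaded from Zenodo record 7155725. Only the MRI cases (case IDs >= 500) found in `imagesTr`/`labelsTr` and `imagesVa`/`labelsVa` are used.

**Outputs:** `/content/AMOS_preprocessed_data/{train,val}/{images,labels}`. The images are copied unchanged. In the label files, the AMOS label IDs are replaced by the corresponding TotalSegmentator (`total_v1`) IDs, and the organs that are not part of our evaluation are set to background (0).

[TODO: insert more info later]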

Deepa Krishnaswamy and Cosmin Ciausu

July 2025

Brigham and Women's Hospital

# Environment setup

In [None]:
import os
import numpy as np
import nibabel as nib
from glob import glob as glob
import shutil
import json

# Download the data from Zenodo

In [None]:
# We download the AMOS data
# https://zenodo.org/records/7155725

!wget https://zenodo.org/records/7155725/files/amos22.zip?download=1

--2025-07-11 21:57:09--  https://zenodo.org/records/7155725/files/amos22.zip?download=1
Resolving zenodo.org (zenodo.org)... 188.185.48.194, 188.185.45.92, 188.185.43.25, ...
Connecting to zenodo.org (zenodo.org)|188.185.48.194|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24234336519 (23G) [application/octet-stream]
Saving to: ‘amos22.zip?download=1.1’


2025-07-11 22:14:10 (22.7 MB/s) - ‘amos22.zip?download=1.1’ saved [24234336519/24234336519]



In [None]:
# Unzip the contents

!unzip /content/amos22.zip?download=1

Archive:  /content/amos22.zip?download=1
replace amos22/.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: amos22/.DS_Store        
  inflating: __MACOSX/amos22/._.DS_Store  
  inflating: __MACOSX/amos22/._imagesTr  
  inflating: amos22/readme.md        
  inflating: __MACOSX/amos22/._readme.md  
  inflating: __MACOSX/amos22/._imagesVa  
  inflating: amos22/dataset.json     
  inflating: amos22/labelsTr/amos_0001.nii.gz  
  inflating: amos22/labelsTr/amos_0584.nii.gz  
  inflating: amos22/labelsTr/amos_0162.nii.gz  
  inflating: amos22/labelsTr/amos_0588.nii.gz  
  inflating: amos22/labelsTr/amos_0113.nii.gz  
  inflating: amos22/labelsTr/amos_0170.nii.gz  
  inflating: amos22/labelsTr/amos_0596.nii.gz  
  inflating: amos22/labelsTr/amos_0158.nii.gz  
  inflating: amos22/labelsTr/amos_0058.nii.gz  
  inflating: amos22/labelsTr/amos_0392.nii.gz  
  inflating: amos22/labelsTr/amos_0125.nii.gz  
  inflating: amos22/labelsTr/amos_0025.nii.gz  
  inflating: amos22/labelsTr/amos

# Conversion of AMOS

In [None]:
# Output layout for the images and segmentations used for evaluation:
#   /content/AMOS_preprocessed_data/{train,val}/{images,labels}

output_main_directory = "/content/AMOS_preprocessed_data"

output_train_directory = os.path.join(output_main_directory, "train")
output_train_images_directory = os.path.join(output_train_directory, "images")
output_train_labels_directory = os.path.join(output_train_directory, "labels")

output_val_directory = os.path.join(output_main_directory, "val")
output_val_images_directory = os.path.join(output_val_directory, "images")
output_val_labels_directory = os.path.join(output_val_directory, "labels")

# The top-level directory is created on its own (os.mkdir needs its parent to exist)
if not os.path.isdir(output_main_directory):
  os.mkdir(output_main_directory)

# Then every leaf directory together with its train/val parent.
# exist_ok=True leaves directories from an earlier run untouched.
for leaf_directory in (
    output_train_images_directory,
    output_train_labels_directory,
    output_val_images_directory,
    output_val_labels_directory,
):
  os.makedirs(leaf_directory, exist_ok=True)

## Train

In [None]:
input_main_directory = "/content/amos22"

input_train_images_directory = os.path.join(input_main_directory, "imagesTr")
input_train_labels_directory = os.path.join(input_main_directory, "labelsTr")

# Select the train patient ids that are MRI: a case is MRI if its id is >= 500.

# Names of the train images without the '.nii.gz' extension, sorted
train_image_stems = sorted(
    filename.split('.')[0]
    for filename in os.listdir(input_train_images_directory)
    if filename.endswith('.nii.gz')
)
# The case number is the 4-digit part of the name, i.e. characters 5-8 of 'amos_XXXX'
train_case_numbers = [int(stem[5:9]) for stem in train_image_stems]
# Keep the cases >= 500 and rebuild the full, zero-padded id
patient_ids = [
    f"amos_{case_number:04d}"
    for case_number in train_case_numbers
    if case_number >= 500
]
# Should be 40 for AMOS train
print(patient_ids)
print(len(patient_ids))

['amos_0507', 'amos_0508', 'amos_0510', 'amos_0514', 'amos_0517', 'amos_0518', 'amos_0522', 'amos_0530', 'amos_0532', 'amos_0538', 'amos_0540', 'amos_0541', 'amos_0548', 'amos_0551', 'amos_0554', 'amos_0555', 'amos_0557', 'amos_0558', 'amos_0570', 'amos_0571', 'amos_0578', 'amos_0580', 'amos_0582', 'amos_0583', 'amos_0584', 'amos_0585', 'amos_0586', 'amos_0587', 'amos_0588', 'amos_0589', 'amos_0590', 'amos_0591', 'amos_0592', 'amos_0593', 'amos_0594', 'amos_0595', 'amos_0596', 'amos_0597', 'amos_0599', 'amos_0600']
40


### Images

In [None]:
# Copy the train image files unchanged into the output directory.
# Copying is best-effort: a file that cannot be copied is reported and the loop
# continues with the next patient.

for patient_id in patient_ids:
  input_train_image_filename = os.path.join(input_train_images_directory, patient_id + '.nii.gz')
  output_train_image_filename = os.path.join(output_train_images_directory, patient_id + '.nii.gz')
  print('Copying from ' + input_train_image_filename + ' to ' + output_train_image_filename)
  try:
    shutil.copy2(input_train_image_filename, output_train_image_filename)
  except OSError as copy_error:
    # Only filesystem failures are handled (missing source, permissions, full disk, ...).
    # A bare `except:` would also swallow KeyboardInterrupt, so the loop could not be stopped,
    # and it would hide the reason for the failure.
    print('ERROR: cannot copy from ' + input_train_image_filename + ' to ' + output_train_image_filename + ' (' + str(copy_error) + ')')


Copying from /content/amos22/imagesTr/amos_0507.nii.gz to /content/AMOS_preprocessed_data/train/images/amos_0507.nii.gz
Copying from /content/amos22/imagesTr/amos_0508.nii.gz to /content/AMOS_preprocessed_data/train/images/amos_0508.nii.gz
Copying from /content/amos22/imagesTr/amos_0510.nii.gz to /content/AMOS_preprocessed_data/train/images/amos_0510.nii.gz
Copying from /content/amos22/imagesTr/amos_0514.nii.gz to /content/AMOS_preprocessed_data/train/images/amos_0514.nii.gz
Copying from /content/amos22/imagesTr/amos_0517.nii.gz to /content/AMOS_preprocessed_data/train/images/amos_0517.nii.gz
Copying from /content/amos22/imagesTr/amos_0518.nii.gz to /content/AMOS_preprocessed_data/train/images/amos_0518.nii.gz
Copying from /content/amos22/imagesTr/amos_0522.nii.gz to /content/AMOS_preprocessed_data/train/images/amos_0522.nii.gz
Copying from /content/amos22/imagesTr/amos_0530.nii.gz to /content/AMOS_preprocessed_data/train/images/amos_0530.nii.gz
Copying from /content/amos22/imagesTr/am

### Labels


In [None]:
# The AMOS label names are stored under the 'labels' key of dataset.json
json_filename = "/content/amos22/dataset.json"
with open(json_filename, 'r') as dataset_json_file:
  amos_json = json.load(dataset_json_file)
amos_labels = amos_json['labels']
# Display the id -> organ name table
amos_labels

{'0': 'background',
 '1': 'spleen',
 '2': 'right kidney',
 '3': 'left kidney',
 '4': 'gall bladder',
 '5': 'esophagus',
 '6': 'liver',
 '7': 'stomach',
 '8': 'arota',
 '9': 'postcava',
 '10': 'pancreas',
 '11': 'right adrenal gland',
 '12': 'left adrenal gland',
 '13': 'duodenum',
 '14': 'bladder',
 '15': 'prostate/uterus'}

In [None]:
# Hand-written mapping: AMOS label id -> TotalSegmentator CT label id.
# The TotalSegmentator ids are the total_v1 ones, see
# https://github.com/wasserth/TotalSegmentator/blob/master/totalsegmentator/map_to_binary.py
# The esophagus (5), aorta (8), postcava (9), bladder (14) and prostate/uterus (15)
# are not part of our evaluation, so they have no entry here and are
# not included in the transformed dataset.

amos_to_totalsegmentator_dict = dict([
    # (AMOS id, TotalSegmentator id)
    (1, 1),    # spleen
    (2, 2),    # right kidney
    (3, 3),    # left kidney
    (4, 4),    # gall bladder
    (6, 5),    # liver
    (7, 6),    # stomach
    (10, 10),  # pancreas
    (11, 11),  # right adrenal gland
    (12, 12),  # left adrenal gland
    (13, 56),  # duodenum
])

In [None]:
# Copy the train label files, but change the ids of the segments to match TotalSegmentator.
# Voxels whose AMOS id has no entry in amos_to_totalsegmentator_dict become 0 (background).

for patient_id in patient_ids:
  print(patient_id)
  input_train_label_filename = os.path.join(input_train_labels_directory, patient_id + '.nii.gz')
  output_train_label_filename = os.path.join(output_train_labels_directory, patient_id + '.nii.gz')
  seg_img = nib.load(input_train_label_filename)
  # Read the voxels through dataobj instead of get_fdata(): get_fdata() always returns
  # floating point (float64 by default), which wastes memory on a label map.
  seg_array = np.asanyarray(seg_img.dataobj)
  # Start from an all-background volume that already has the dtype used for saving
  seg_array_output = np.zeros(seg_array.shape, dtype=np.int16)
  # Adjust the label ids. Looping over the mapping avoids sorting the whole volume with
  # np.unique; an AMOS id that is absent from this volume gives an empty mask and writes nothing.
  for amosLabelID, totalsegLabelID in amos_to_totalsegmentator_dict.items():
    seg_array_output[seg_array == amosLabelID] = totalsegLabelID
  # save, reusing the affine and the header of the input label image
  try:
    nib.save(nib.Nifti1Image(seg_array_output, seg_img.affine, header=seg_img.header), output_train_label_filename)
  except Exception as save_error:
    # Saving stays best-effort, but KeyboardInterrupt/SystemExit are no longer swallowed
    # (as they were by the bare `except:`) and the reason for the failure is reported.
    print('ERR: cannot copy from ' + input_train_label_filename + ' to ' + output_train_label_filename + ' (' + str(save_error) + ')')

amos_0507
amos_0508
amos_0510
amos_0514
amos_0517
amos_0518
amos_0522
amos_0530
amos_0532
amos_0538
amos_0540
amos_0541
amos_0548
amos_0551
amos_0554
amos_0555
amos_0557
amos_0558
amos_0570
amos_0571
amos_0578
amos_0580
amos_0582
amos_0583
amos_0584
amos_0585
amos_0586
amos_0587
amos_0588
amos_0589
amos_0590
amos_0591
amos_0592
amos_0593
amos_0594
amos_0595
amos_0596
amos_0597
amos_0599
amos_0600


## Val

In [None]:
input_main_directory = "/content/amos22"

input_val_images_directory = os.path.join(input_main_directory, "imagesVa")
input_val_labels_directory = os.path.join(input_main_directory, "labelsVa")

# Select the val patient ids that are MRI: a case is MRI if its id is >= 500.

# Names of the val images without the '.nii.gz' extension, sorted
val_image_stems = sorted(
    filename.split('.')[0]
    for filename in os.listdir(input_val_images_directory)
    if filename.endswith('.nii.gz')
)
# The case number is the 4-digit part of the name, i.e. characters 5-8 of 'amos_XXXX'
val_case_numbers = [int(stem[5:9]) for stem in val_image_stems]
# Keep the cases >= 500 and rebuild the full, zero-padded id
patient_ids = [
    f"amos_{case_number:04d}"
    for case_number in val_case_numbers
    if case_number >= 500
]
# Should be 20 for AMOS val
print(patient_ids)
print(len(patient_ids))

['amos_0544', 'amos_0545', 'amos_0546', 'amos_0547', 'amos_0549', 'amos_0550', 'amos_0552', 'amos_0553', 'amos_0556', 'amos_0559', 'amos_0561', 'amos_0562', 'amos_0563', 'amos_0568', 'amos_0572', 'amos_0573', 'amos_0575', 'amos_0576', 'amos_0581', 'amos_0598']
20


### Images

In [None]:
# Copy the val image files unchanged into the output directory.
# Copying is best-effort: a file that cannot be copied is reported and the loop
# continues with the next patient.

for patient_id in patient_ids:
  input_val_image_filename = os.path.join(input_val_images_directory, patient_id + '.nii.gz')
  output_val_image_filename = os.path.join(output_val_images_directory, patient_id + '.nii.gz')
  print('Copying from ' + input_val_image_filename + ' to ' + output_val_image_filename)
  try:
    shutil.copy2(input_val_image_filename, output_val_image_filename)
  except OSError as copy_error:
    # Only filesystem failures are handled (missing source, permissions, full disk, ...).
    # A bare `except:` would also swallow KeyboardInterrupt, so the loop could not be stopped,
    # and it would hide the reason for the failure.
    print('ERROR: cannot copy from ' + input_val_image_filename + ' to ' + output_val_image_filename + ' (' + str(copy_error) + ')')


Copying from /content/amos22/imagesVa/amos_0544.nii.gz to /content/AMOS_preprocessed_data/val/images/amos_0544.nii.gz
Copying from /content/amos22/imagesVa/amos_0545.nii.gz to /content/AMOS_preprocessed_data/val/images/amos_0545.nii.gz
Copying from /content/amos22/imagesVa/amos_0546.nii.gz to /content/AMOS_preprocessed_data/val/images/amos_0546.nii.gz
Copying from /content/amos22/imagesVa/amos_0547.nii.gz to /content/AMOS_preprocessed_data/val/images/amos_0547.nii.gz
Copying from /content/amos22/imagesVa/amos_0549.nii.gz to /content/AMOS_preprocessed_data/val/images/amos_0549.nii.gz
Copying from /content/amos22/imagesVa/amos_0550.nii.gz to /content/AMOS_preprocessed_data/val/images/amos_0550.nii.gz
Copying from /content/amos22/imagesVa/amos_0552.nii.gz to /content/AMOS_preprocessed_data/val/images/amos_0552.nii.gz
Copying from /content/amos22/imagesVa/amos_0553.nii.gz to /content/AMOS_preprocessed_data/val/images/amos_0553.nii.gz
Copying from /content/amos22/imagesVa/amos_0556.nii.gz t

### Labels

In [None]:
# Copy the val label files, but change the ids of the segments to match TotalSegmentator.
# Voxels whose AMOS id has no entry in amos_to_totalsegmentator_dict become 0 (background).

for patient_id in patient_ids:
  print(patient_id)
  input_val_label_filename = os.path.join(input_val_labels_directory, patient_id + '.nii.gz')
  output_val_label_filename = os.path.join(output_val_labels_directory, patient_id + '.nii.gz')
  seg_img = nib.load(input_val_label_filename)
  # Read the voxels through dataobj instead of get_fdata(): get_fdata() always returns
  # floating point (float64 by default), which wastes memory on a label map.
  seg_array = np.asanyarray(seg_img.dataobj)
  # Start from an all-background volume that already has the dtype used for saving
  seg_array_output = np.zeros(seg_array.shape, dtype=np.int16)
  # Adjust the label ids. Looping over the mapping avoids sorting the whole volume with
  # np.unique; an AMOS id that is absent from this volume gives an empty mask and writes nothing.
  for amosLabelID, totalsegLabelID in amos_to_totalsegmentator_dict.items():
    seg_array_output[seg_array == amosLabelID] = totalsegLabelID
  # save, reusing the affine and the header of the input label image
  try:
    nib.save(nib.Nifti1Image(seg_array_output, seg_img.affine, header=seg_img.header), output_val_label_filename)
  except Exception as save_error:
    # Saving stays best-effort, but KeyboardInterrupt/SystemExit are no longer swallowed
    # (as they were by the bare `except:`) and the reason for the failure is reported.
    print('ERR: cannot copy from ' + input_val_label_filename + ' to ' + output_val_label_filename + ' (' + str(save_error) + ')')

amos_0544
amos_0545
amos_0546
amos_0547
amos_0549
amos_0550
amos_0552
amos_0553
amos_0556
amos_0559
amos_0561
amos_0562
amos_0563
amos_0568
amos_0572
amos_0573
amos_0575
amos_0576
amos_0581
amos_0598


# Temporary, delete later

In [None]:
# Here we copy the new files to a bucket,
# so that the Validation_preprocessed_data.ipynb notebook can be run with all of its data in buckets.

## Parameterization

In [None]:
#@title Enter your Project ID here
# initialize this variable with your Google Cloud Project ID!
project_name = "idc-external-018" #@param {type:"string"}

# Store the project ID in the GCP_PROJECT_ID environment variable
import os
os.environ["GCP_PROJECT_ID"] = project_name

# Set the project for the gcloud command line tool
# ($project_name is expanded by IPython to the value of the Python variable)
!gcloud config set project $project_name

# Authenticate the current Colab user
# NOTE(review): presumably needed so that the gsutil upload that follows can write to the bucket - confirm
from google.colab import auth
auth.authenticate_user()

Updated property [core/project].


In [None]:
input_local_directory = "/content/AMOS_preprocessed_data"
output_bucket_directory = "gs://synthseg/preprocessing_for_github"

!gsutil -m cp -r $input_local_directory $output_bucket_directory

Copying file:///content/AMOS_preprocessed_data/train/images/amos_0588.nii.gz [Content-Type=application/octet-stream]...
/ [0/120 files][    0.0 B/758.1 MiB]   0% Done                                  Copying file:///content/AMOS_preprocessed_data/train/images/amos_0584.nii.gz [Content-Type=application/octet-stream]...
Copying file:///content/AMOS_preprocessed_data/train/images/amos_0580.nii.gz [Content-Type=application/octet-stream]...
/ [0/120 files][    0.0 B/758.1 MiB]   0% Done                                  Copying file:///content/AMOS_preprocessed_data/train/images/amos_0555.nii.gz [Content-Type=application/octet-stream]...
/ [0/120 files][    0.0 B/758.1 MiB]   0% Done                                  / [0/120 files][    0.0 B/758.1 MiB]   0% Done                                  Copying file:///content/AMOS_preprocessed_data/train/images/amos_0558.nii.gz [Content-Type=application/octet-stream]...
/ [0/120 files][    0.0 B/758.1 MiB]   0% Done                              