In [1]:
import os
from pathlib import Path

## Configuration

Set the input manifest file path and output directory.

In [2]:
# How many part-manifests the input is divided into
num_splits = 4

# Source manifest that will be divided
input_manifest = '/workspace/data/TCIA_TCGA-BRCA_09-16-2015.tcia'

# Folder that receives the generated part-manifests
output_dir = '/workspace/data/'

## Read and Parse the Manifest File

The manifest file has a header section (the first 6 lines, ending with the `ListOfSeriesToDownload=` line) followed by the series IDs, one per line.

In [3]:
# Read the manifest file
with open(input_manifest, 'r') as f:
    lines = f.readlines()

# The header is terminated by the `ListOfSeriesToDownload=` line. Locate that
# line instead of assuming a fixed header length, so a manifest with more or
# fewer header keys is still split at the right place.
header_end_marker = 'ListOfSeriesToDownload='
header_end = next(
    (idx + 1 for idx, line in enumerate(lines) if line.strip() == header_end_marker),
    None,
)
if header_end is None:
    raise ValueError(
        f"No '{header_end_marker}' line found in {input_manifest}; "
        "cannot tell where the header ends."
    )

# Separate header and series IDs
header_lines = lines[:header_end]
# Skip blank lines (e.g. trailing newlines at the end of the file) so they are
# not counted, or later written out, as series IDs. Every kept ID is
# newline-terminated so the parts can be written with writelines().
series_ids = [line.strip() + '\n' for line in lines[header_end:] if line.strip()]

print(f"Total lines: {len(lines)}")
print(f"Header lines: {len(header_lines)}")
print(f"Series IDs: {len(series_ids)}")
print("\nHeader:")
for line in header_lines:
    print(line.rstrip())

Total lines: 1883
Header lines: 6
Series IDs: 1877

Header:
downloadServerUrl=https://public.cancerimagingarchive.net/nbia-download/servlet/DownloadServlet
includeAnnotation=true
noOfrRetry=4
databasketId=manifest-25vRPwyh8987165612391086998.tcia
manifestVersion=3.0
ListOfSeriesToDownload=


## Split Series IDs into Equal Parts

In [4]:
# Base size of a part, plus how many parts must take one extra ID
total_series = len(series_ids)
split_size, remainder = divmod(total_series, num_splits)

print(f"Total series: {total_series}")
print(f"Series per split (base): {split_size}")
print(f"Remainder: {remainder}")

# The first `remainder` parts are one element longer than the rest
part_sizes = [
    split_size + 1 if part < remainder else split_size
    for part in range(num_splits)
]

# Cut series_ids into consecutive, non-overlapping slices
splits = []
offset = 0
for part_no, size in enumerate(part_sizes, start=1):
    stop = offset + size
    chunk = series_ids[offset:stop]
    splits.append(chunk)
    print(f"Split {part_no}: {len(chunk)} series (indices {offset} to {stop - 1})")
    offset = stop

# Sanity check: the parts together must hold every series exactly once
total_in_splits = sum(map(len, splits))
print(f"\nTotal series in all splits: {total_in_splits}")
print(f"Verification: {total_in_splits == total_series}")

Total series: 1877
Series per split (base): 469
Remainder: 1
Split 1: 470 series (indices 0 to 469)
Split 2: 469 series (indices 470 to 938)
Split 3: 469 series (indices 939 to 1407)
Split 4: 469 series (indices 1408 to 1876)

Total series in all splits: 1877
Verification: True


## Write Split Manifest Files

Each output file will contain the full header followed by its share of the series IDs (about 1/`num_splits` of them — one quarter with the default of 4 splits).

In [5]:
# Get the base name of the input file
input_path = Path(input_manifest)
base_name = input_path.stem  # filename without extension
extension = input_path.suffix  # .tcia

# Create the output directory if it does not exist yet; otherwise open() below
# fails with FileNotFoundError on a fresh workspace. An empty output_dir means
# "current directory", which needs no creation (and makedirs('') would raise).
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write each split to a separate file
output_files = []

for i, split_series in enumerate(splits, 1):
    # Create output filename, e.g. <name>_part1_of_4.tcia
    output_filename = f"{base_name}_part{i}_of_{num_splits}{extension}"
    output_path = os.path.join(output_dir, output_filename)

    # Every part repeats the full header so it is a complete manifest by itself
    with open(output_path, 'w') as f:
        # Write header
        f.writelines(header_lines)
        # Write series IDs for this split
        f.writelines(split_series)

    output_files.append(output_path)
    print(f"Created: {output_filename} ({len(split_series)} series)")

print(f"\nSuccessfully created {len(output_files)} manifest files!")

Created: TCIA_TCGA-BRCA_09-16-2015_part1_of_4.tcia (470 series)
Created: TCIA_TCGA-BRCA_09-16-2015_part2_of_4.tcia (469 series)
Created: TCIA_TCGA-BRCA_09-16-2015_part3_of_4.tcia (469 series)
Created: TCIA_TCGA-BRCA_09-16-2015_part4_of_4.tcia (469 series)

Successfully created 4 manifest files!


## Verify Output Files

Check that each output file is valid and contains the expected number of lines.

In [6]:
print("Verification of output files:\n")

# Take the header length from the parsed input manifest rather than repeating
# a hard-coded 6, so this check stays in step with the parsing cell.
header_len = len(header_lines)

for i, output_path in enumerate(output_files, 1):
    with open(output_path, 'r') as f:
        file_lines = f.readlines()

    header = file_lines[:header_len]
    series = file_lines[header_len:]

    print(f"Part {i}: {os.path.basename(output_path)}")
    print(f"  Total lines: {len(file_lines)}")
    print(f"  Header lines: {len(header)}")
    print(f"  Series IDs: {len(series)}")
    print(f"  First series ID: {series[0].strip() if series else 'N/A'}")
    print(f"  Last series ID: {series[-1].strip() if series else 'N/A'}")

    # Actually verify instead of only printing: each file must round-trip to
    # the original header plus exactly the series assigned to this part.
    assert header == header_lines, f"Header mismatch in {output_path}"
    assert series == splits[i - 1], f"Series mismatch in {output_path}"
    print()

Verification of output files:

Part 1: TCIA_TCGA-BRCA_09-16-2015_part1_of_4.tcia
  Total lines: 476
  Header lines: 6
  Series IDs: 470
  First series ID: 1.3.6.1.4.1.14519.5.2.1.1869.4002.208324965693621375957867341309
  Last series ID: 1.3.6.1.4.1.14519.5.2.1.1869.4002.149492185494350856251349770767

Part 2: TCIA_TCGA-BRCA_09-16-2015_part2_of_4.tcia
  Total lines: 475
  Header lines: 6
  Series IDs: 469
  First series ID: 1.3.6.1.4.1.14519.5.2.1.1869.4002.265950682563669640985038952372
  Last series ID: 1.3.6.1.4.1.14519.5.2.1.9203.4002.884031599361005278884781093491

Part 3: TCIA_TCGA-BRCA_09-16-2015_part3_of_4.tcia
  Total lines: 475
  Header lines: 6
  Series IDs: 469
  First series ID: 1.3.6.1.4.1.14519.5.2.1.6450.4002.338410652138881121304256137283
  Last series ID: 1.3.6.1.4.1.14519.5.2.1.3023.4002.225035917136485052141793362807

Part 4: TCIA_TCGA-BRCA_09-16-2015_part4_of_4.tcia
  Total lines: 475
  Header lines: 6
  Series IDs: 469
  First series ID: 1.3.6.1.4.1.14519.5.2.1.33