## Imports

In [46]:
import os
import time
import random
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import pprint
import re
from astropy.io import fits
from astropy.table import Table

# Make a .txt file with the URLs of all the files we want to download

### Function

In [None]:
def make_list_of_urls(API_KEY, BASE_API_URL, ENDING, SNAPSHOT_FILTER):
    '''
    Write 'all_file_urls.txt', a list of file URLs obtained from the API.

    The function requests the listing endpoint built from BASE_API_URL and ENDING,
    keeps the listing entries that contain SNAPSHOT_FILTER, requests the FIRST
    matching entry, and writes the URLs stored under its 'files' key to
    'all_file_urls.txt' in the current working directory (one URL per line; an
    existing file is overwritten).

    Args:
        API_KEY (str): The API key, sent in the 'API-Key' request header.
        BASE_API_URL (str): The base URL of the API (with or without a trailing slash).
        ENDING (str): The listing endpoint path (with or without a leading slash).
        SNAPSHOT_FILTER (str): Substring an entry of the listing must contain to be kept.
    Returns:
        None: The function writes the file URLs to a text file named 'all_file_urls.txt'.
    Raises:
        requests.HTTPError: If an HTTP request returned an unsuccessful status code.
        requests.exceptions.Timeout: If the server does not answer within the timeout.
    Example:
        make_list_of_urls('your_api_key', 'https://www.tng-project.org/', '/api/TNG50-1/files/skirt_images_hsc/', '_realistic_v2_72')
    '''
    def get_endpoint(url):
        # A timeout keeps a stalled connection from blocking the notebook forever.
        r = requests.get(url, headers={'API-Key': API_KEY}, timeout=60)
        r.raise_for_status()

        return r.json()

    # Join base and endpoint with exactly one slash; plain concatenation of
    # 'https://host/' and '/api/...' would produce 'https://host//api/...'.
    listing_url = BASE_API_URL.rstrip('/') + '/' + ENDING.lstrip('/')

    # Get the list of snapshot URLs
    snapshot_urls = get_endpoint(listing_url)

    # Keep only the entries that contain SNAPSHOT_FILTER
    filtered_snapshot_urls = [url for url in snapshot_urls if SNAPSHOT_FILTER in url]
    if not filtered_snapshot_urls:
        # Make the otherwise silent "empty output file" case visible.
        print(f'No listing entry contains {SNAPSHOT_FILTER!r}; all_file_urls.txt will be empty.')

    # Only the first matching entry is queried (the filter is expected to match a single URL)
    all_file_urls = []
    for url in filtered_snapshot_urls[0:1]:
        all_file_urls += get_endpoint(url)['files']

    # Write all file URLs to 'all_file_urls.txt' in the current directory, one per line
    with open('all_file_urls.txt', 'w') as f:
        for url in all_file_urls:
            f.write(url + '\n')

### Parameters

In [None]:
# Prefer the TNG_API_KEY environment variable so the real key never has to be saved
# inside the notebook; without it the placeholder below is used unchanged.
API_KEY = os.environ.get('TNG_API_KEY', 'your_api_key_here')  # or replace the placeholder with your actual API key

# Base URL of the API and the listing endpoint that is queried for file URLs
BASE_API_URL = 'https://www.tng-project.org/'
ENDING = '/api/TNG50-1/files/skirt_images_hsc/'
# Only listing entries containing this substring are kept
SNAPSHOT_FILTER = '_realistic_v2_72'

### Run

In [None]:
# Query the API and write 'all_file_urls.txt' using the parameters defined above.
make_list_of_urls(
    API_KEY=API_KEY,
    BASE_API_URL=BASE_API_URL,
    ENDING=ENDING,
    SNAPSHOT_FILTER=SNAPSHOT_FILTER,
)

# Download the files listed in the .txt file
# <span style="background-color:rgb(150, 150, 60)">Ignore this for now</span>

### Code

In [29]:
def download_parent_files(API_KEY, URL_LIST, BATCH_START, BATCH_SIZE):
    '''

    THIS FUNCTION SHOULD NOT BE RUN FOR NOW, AS WE ARE ONLY INTERESTED IN THE SPLIT FILES


    Downloads parent files from a list of URLs in batches.

    Each file is saved in the current working directory as
    '<third-from-last URL segment>_<last URL segment>'. A failed download is
    reported and skipped; the remaining URLs of the batch are still processed.

    Args:
        API_KEY (str): The API key for authentication.
        URL_LIST (str): The path to the text file containing URLs.
        BATCH_START (int): The starting index for the batch of URLs to download.
        BATCH_SIZE (int): The number of URLs to download in this batch.
    Returns:
        None
    '''
    # Load URLs, ignoring blank lines
    with open(URL_LIST, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]

    # Select the batch
    batch = urls[BATCH_START:BATCH_START + BATCH_SIZE]
    headers = {'API-Key': API_KEY}

    # Download files
    for url in batch:
        parts = url.split('/')
        filename = parts[-3] + '_' + parts[-1]  # e.g. subhaloID_skirt_images_hsc_realistic_v2.fits
        # Stream into a temporary name first so an interrupted download never
        # leaves a truncated file under the final name.
        partial = filename + '.part'
        print(f'Downloading {filename}...')

        try:
            # The context manager closes the streamed response (and releases its
            # connection) even on errors; the timeout avoids hanging forever.
            with requests.get(url, headers=headers, stream=True, timeout=60) as r:
                r.raise_for_status()
                with open(partial, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.replace(partial, filename)
            print(f'✅ Saved: {filename}')
        except Exception as e:
            print(f'❌ Failed to download {url}: {e}')
            try:
                os.remove(partial)
            except OSError:
                pass  # nothing was written yet, or the leftover cannot be removed

        time.sleep(1)  # this is just to not overload the server and not get myself banned

    print('Done!')

### Parameters

In [None]:
# Prefer the TNG_API_KEY environment variable so the real key never has to be saved
# inside the notebook; without it the placeholder below is used unchanged.
API_KEY = os.environ.get('TNG_API_KEY', 'your_api_key_here')  # or replace the placeholder with your actual API key

# Text file with one URL per line
URL_LIST = 'all_file_urls.txt'

# Slice of the URL list to download: URLs [BATCH_START, BATCH_START + BATCH_SIZE)
BATCH_START = 0
BATCH_SIZE = 50

### (DON'T) Run

In [None]:
# NOTE: the section heading marks this call as not to be run for now.
download_parent_files(
    API_KEY=API_KEY,
    URL_LIST=URL_LIST,
    BATCH_START=BATCH_START,
    BATCH_SIZE=BATCH_SIZE,
)

# Download the individual filters from each file (g,r,i,z,y)

### Code

In [60]:
def download_and_split_hsc_images(
    OUTPUT_DIR='split_images',
    URL_LIST=None,
    BATCH_START=None,
    BATCH_SIZE=None,
    API_KEY=None,
    remove_parent: bool = False,
    catalog_path=None,
    parent_file_only: bool = False
):
    """
    Downloads and splits HSC survey FITS images from the TNG50-1 API into individual filters,
    optionally removes the original parent FITS files, and can generate a catalog compatible with Hyrax.

    For every URL in the selected batch the parent FITS file is downloaded into the
    current working directory. Unless `parent_file_only` is set, each of the extensions
    'SUBARU_HSC.G/R/I/Z/Y' found in it is written as its own single-HDU FITS file
    into `OUTPUT_DIR`; missing extensions are reported and skipped. The function waits
    one second after every URL so the API server is not flooded.

    Args:
        OUTPUT_DIR (str, optional): Directory to save split FITS images. Defaults to 'split_images'.
        URL_LIST (str): Path to a text file containing one URL per line. Required.
        BATCH_START (int): Starting index for the batch of URLs to download. Required.
        BATCH_SIZE (int): Number of URLs to process in this batch. Required.
        API_KEY (str): API key required to access the TNG50-1 API.
        remove_parent (bool, optional): If True, delete the original downloaded FITS file after splitting.
            Ignored when `parent_file_only` is True. Defaults to False.
        catalog_path (str, optional): If provided, saves a Hyrax-compatible FITS catalog at this location.
            The catalog will include columns: 'object_id', 'filename', and 'filter'.
        parent_file_only (bool, optional): If True, only download the parent FITS files and skip splitting and catalog creation. Defaults to False.

    Returns:
        None

    Raises:
        TypeError: If URL_LIST, BATCH_START or BATCH_SIZE is not provided.
        requests.HTTPError: If a download returned an unsuccessful status code.

    Notes:
        - Parent files are named SNAPSHOT_SUBHALO_VERSION_parent.fits (e.g., 72_0_v2_parent.fits).
        - Split FITS images will be named as: SNAPSHOT_SUBHALO_FILTER_VERSION_hsc_realistic.fits
          (e.g., 72_0_G_v2_hsc_realistic.fits)
        - Catalog format is compatible with Hyrax's FitsImageDataSet expectations.

    Example:
        # Save split images and keep the original FITS files
        download_and_split_hsc_images(
            OUTPUT_DIR='split_images',
            URL_LIST='urls.txt',
            BATCH_START=0,
            BATCH_SIZE=50,
            API_KEY='YOUR_API_KEY'
        )

        # Save split images, remove the parent file, and write a catalog
        download_and_split_hsc_images(
            OUTPUT_DIR='split_images',
            URL_LIST='urls.txt',
            BATCH_START=0,
            BATCH_SIZE=50,
            API_KEY='YOUR_API_KEY',
            remove_parent=True,
            catalog_path='split_images/catalog.fits'
        )

        # Download only the parent files, no splitting or catalog
        download_and_split_hsc_images(
            URL_LIST='urls.txt',
            BATCH_START=0,
            BATCH_SIZE=10,
            API_KEY='YOUR_API_KEY',
            parent_file_only=True
        )
    """
    # The None defaults only exist to allow keyword-style calls; fail early with a
    # readable message instead of an opaque TypeError from open()/slicing below.
    if URL_LIST is None or BATCH_START is None or BATCH_SIZE is None:
        raise TypeError('URL_LIST, BATCH_START and BATCH_SIZE must be provided')

    # ensure output dir exists
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # load URLs (blank lines ignored) and pick batch
    with open(URL_LIST) as f:
        urls = [u.strip() for u in f if u.strip()]
    batch = urls[BATCH_START : BATCH_START + BATCH_SIZE]

    # catalog rows are only collected when a catalog was requested
    catalog_entries = [] if catalog_path else None

    # helper to pull snapshot, subhalo, version from URL
    def parse_url(u):
        parts = u.split('/')
        snapshot = parts[6]        # e.g. '72'
        subhalo = parts[8]        # e.g. '0'
        fn = parts[-1]       # e.g. 'skirt_images_hsc_realistic_v2.fits'
        v_match = re.search(r'(v\d+)', fn)
        version = v_match.group(1) if v_match else 'v?'
        return snapshot, subhalo, version

    # main loop
    for url in batch:
        snapshot, subhalo, version = parse_url(url)

        # download parent file (into the current working directory)
        fname_parent = f'{snapshot}_{subhalo}_{version}_parent.fits'
        print(f'\nDownloading {fname_parent} …')
        # The context manager closes the streamed response and releases its
        # connection; the timeout keeps a stalled server from hanging the loop.
        with requests.get(url, headers={'API-Key': API_KEY}, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(fname_parent, 'wb') as f:
                for chunk in r.iter_content(8192):
                    f.write(chunk)

        if not parent_file_only:
            # open and split
            with fits.open(fname_parent, memmap=True) as hdul:
                for filt in ['G', 'R', 'I', 'Z', 'Y']:
                    target_ext = f'SUBARU_HSC.{filt}'
                    # first HDU whose EXTNAME matches the requested filter, if any
                    sci_hdu = next(
                        (h for h in hdul if h.header.get('EXTNAME','') == target_ext),
                        None
                    )
                    if sci_hdu is None:
                        print(f' ⚠️  no extension {target_ext} in {fname_parent}')
                        continue

                    new_hdu = fits.PrimaryHDU(data=sci_hdu.data, header=sci_hdu.header)
                    out_name = f'{snapshot}_{subhalo}_{filt}_{version}_hsc_realistic.fits'
                    out_path = os.path.join(OUTPUT_DIR, out_name)
                    new_hdu.writeto(out_path, overwrite=True)
                    print(f' ✅ wrote {out_name}')
                    if catalog_entries is not None:
                        catalog_entries.append({
                            'object_id': f'{snapshot}_{subhalo}',
                            'filename': out_name,
                            'filter': filt
                        })

            # optionally remove parent file
            if remove_parent:
                try:
                    os.remove(fname_parent)
                    print(f' 🗑 removed parent file {fname_parent}')
                except OSError as e:
                    print(f' ⚠️  could not remove {fname_parent}: {e}')

        # be gentle on the API server: pause after EVERY download, including
        # parent-only mode
        time.sleep(1)

    if catalog_entries is not None and not parent_file_only:
        table = Table(rows=catalog_entries, names=['object_id', 'filename', 'filter'])
        table.write(catalog_path, overwrite=True)
        print(f' 📄 wrote catalog with {len(catalog_entries)} entries to {catalog_path}')

### Usage

In [None]:
# Download the first 50 URLs, split them per filter into split_images/, delete the
# parent files and write the catalog. The key comes from the API_KEY defined in the
# parameters cells, so it only has to be set in one place.
download_and_split_hsc_images(
    OUTPUT_DIR="split_images",
    URL_LIST='all_file_urls.txt',
    BATCH_START=0,
    BATCH_SIZE=50,
    API_KEY=API_KEY,
    remove_parent=True,
    catalog_path="split_images/hyrax_catalog.fits"
)


Downloading 72_0_v2_parent.fits …
 ✅ wrote 72_0_G_v2_hsc_realistic.fits
 ✅ wrote 72_0_R_v2_hsc_realistic.fits
 ✅ wrote 72_0_I_v2_hsc_realistic.fits
 ✅ wrote 72_0_Z_v2_hsc_realistic.fits
 ✅ wrote 72_0_Y_v2_hsc_realistic.fits
 🗑 removed parent file 72_0_v2_parent.fits

Downloading 72_1_v2_parent.fits …
 ✅ wrote 72_1_G_v2_hsc_realistic.fits
 ✅ wrote 72_1_R_v2_hsc_realistic.fits
 ✅ wrote 72_1_I_v2_hsc_realistic.fits
 ✅ wrote 72_1_Z_v2_hsc_realistic.fits
 ✅ wrote 72_1_Y_v2_hsc_realistic.fits
 🗑 removed parent file 72_1_v2_parent.fits

Downloading 72_2_v2_parent.fits …
 ✅ wrote 72_2_G_v2_hsc_realistic.fits
 ✅ wrote 72_2_R_v2_hsc_realistic.fits
 ✅ wrote 72_2_I_v2_hsc_realistic.fits
 ✅ wrote 72_2_Z_v2_hsc_realistic.fits
 ✅ wrote 72_2_Y_v2_hsc_realistic.fits
 🗑 removed parent file 72_2_v2_parent.fits

Downloading 72_3_v2_parent.fits …
 ✅ wrote 72_3_G_v2_hsc_realistic.fits
 ✅ wrote 72_3_R_v2_hsc_realistic.fits
 ✅ wrote 72_3_I_v2_hsc_realistic.fits
 ✅ wrote 72_3_Z_v2_hsc_realistic.fits
 ✅ wrote 

In [49]:
# Open the catalog written by the download call above and print a summary of its HDUs.
# The handle is deliberately left open (no `with` block) because the next cell
# reads test_table[1].data.
test_table = fits.open("split_images/hyrax_catalog.fits")
test_table.info()

Filename: split_images/hyrax_catalog.fits
No.    Name      Ver    Type      Cards   Dimensions   Format
  0  PRIMARY       1 PrimaryHDU       4   ()      
  1                1 BinTableHDU     14   25R x 3C   [4A, 28A, 1A]   


In [56]:
# Display the rows stored in HDU 1 of the catalog (the table listed by .info() above).
test_table[1].data

FITS_rec([('72_0', '72_0_G_v2_hsc_realistic.fits', 'G'),
          ('72_0', '72_0_R_v2_hsc_realistic.fits', 'R'),
          ('72_0', '72_0_I_v2_hsc_realistic.fits', 'I'),
          ('72_0', '72_0_Z_v2_hsc_realistic.fits', 'Z'),
          ('72_0', '72_0_Y_v2_hsc_realistic.fits', 'Y'),
          ('72_1', '72_1_G_v2_hsc_realistic.fits', 'G'),
          ('72_1', '72_1_R_v2_hsc_realistic.fits', 'R'),
          ('72_1', '72_1_I_v2_hsc_realistic.fits', 'I'),
          ('72_1', '72_1_Z_v2_hsc_realistic.fits', 'Z'),
          ('72_1', '72_1_Y_v2_hsc_realistic.fits', 'Y'),
          ('72_2', '72_2_G_v2_hsc_realistic.fits', 'G'),
          ('72_2', '72_2_R_v2_hsc_realistic.fits', 'R'),
          ('72_2', '72_2_I_v2_hsc_realistic.fits', 'I'),
          ('72_2', '72_2_Z_v2_hsc_realistic.fits', 'Z'),
          ('72_2', '72_2_Y_v2_hsc_realistic.fits', 'Y'),
          ('72_3', '72_3_G_v2_hsc_realistic.fits', 'G'),
          ('72_3', '72_3_R_v2_hsc_realistic.fits', 'R'),
          ('72_3', '72_3_I_v2_h