## Gaia DR3 collection import

Author: Melissa D

Last run: Jun 12, 2025

ISSUE LINK TODO

New threshold: between 956_376 and 3_264_430

In [None]:
# Optional one-time setup: uncomment to install lsdb into this kernel's environment.
# %pip install lsdb

In [2]:
import hats
import numpy as np
from dask.distributed import Client
from hats_import import CollectionArguments, pipeline_with_client, pipeline, VerificationArguments
from pathlib import Path
from hats_import.catalog.file_readers import CsvReader
from astropy.io import ascii
import lsdb

# Display the installed hats version so the saved output records which release was used.
hats.__version__

'0.6'

In [3]:
## Input files: every GaiaSource* file found in the raw directory.
raw_dir = Path("/data3/epyc/data3/hipscat/raw/gaia/")
file_list = [*raw_dir.glob("GaiaSource*")]
print("found", len(file_list), "files for import")

## Index division hints: evenly spaced boundaries running from global_min
## up to global_max, with global_max appended as the closing boundary.
global_min = 4295806720
global_max = 6917528997577384320
num_row_groups = 3933

id_span = global_max - global_min
increment = int(id_span / num_row_groups)

division_starts = np.arange(start=global_min, stop=global_max, step=increment)
divisions = np.append(division_starts, global_max).tolist()

## Create a schema file from the header of the first ECSV input file.
## Only the leading part of the data section is read (data_end=1), since
## just the column definitions are needed here.
schema_source_file = file_list[0]
empty_astropy_table = ascii.read(schema_source_file, format="ecsv", data_end=1)
empty_astropy_table.write("gaia_schema.parquet", overwrite=True)

found 3388 files for import


In [4]:
# Import plan for the whole collection: the main catalog, two margin caches
# and an index on source_id.
#
# The collection is named "gaia_dr3" and the catalog inside it "gaia" so that
# the artifacts land where the later cells read them back from
# (".../v06/gaia_dr3" and ".../v06/gaia_dr3/gaia"). The inputs are the
# GaiaSource* files collected in `file_list`.
args = (
    CollectionArguments(
        completion_email_address="delucchi@andrew.cmu.edu",
        output_artifact_name="gaia_dr3",
        output_path="/data3/epyc/data3/hats/catalogs/v06",
        progress_bar=True,
        simple_progress_bar=True,
    )
    .catalog(
        output_artifact_name="gaia",
        input_file_list=file_list,
        # Inputs are gzip-compressed text files with '#'-prefixed header lines;
        # column types come from the schema file written in the config cell.
        file_reader=CsvReader(comment="#", schema_file="gaia_schema.parquet", compression="gzip"),
        ra_column="ra",
        dec_column="dec",
        sort_columns="source_id",
        use_schema_file="gaia_schema.parquet",
        highest_healpix_order=8,
        skymap_alt_orders=[2, 4, 6],
        pixel_threshold=2_000_000,
        row_group_kwargs={"num_rows": 200_000},
    )
    # Default margin cache (threshold 10.0) plus a wider, explicitly named one.
    .add_margin(margin_threshold=10.0, is_default=True)
    .add_margin(margin_threshold=300.0, output_artifact_name="gaia_300arcs")
    # Index on source_id, partitioned using the precomputed `divisions` hints.
    .add_index(
        indexing_column="source_id",
        output_artifact_name="gaia_source_id_index",
        include_healpix_29=False,
        include_order_pixel=True,
        compute_partition_size=2_000_000_000,
        division_hints=divisions,
        drop_duplicates=False,
    )
)

In [None]:
# Run the import on a local Dask cluster: 20 single-threaded workers, with
# worker scratch files kept under the shared tmp directory. The client is
# closed automatically when the pipeline finishes or fails.
dask_client_options = {
    "local_directory": "/data3/epyc/data3/hats/tmp/",
    "n_workers": 20,
    "threads_per_worker": 1,
}

with Client(**dask_client_options) as client:
    pipeline_with_client(args, client)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 34084 instead
Planning  : 100%|██████████| 4/4 [00:00<00:00, 93.97it/s]


Mapping   : 100%|██████████| 3388/3388 [35:54<00:00,  1.57it/s]  
Binning   : 100%|██████████| 2/2 [00:18<00:00,  9.07s/it]
Splitting : 100%|██████████| 3388/3388 [3:32:24<00:00,  3.76s/it]  
Reducing  :  11%|█▏        | 229/2016 [07:17<1:04:05,  2.15s/it]

In [None]:
# Open the imported collection and show the length of its margin catalog
# as a quick sanity check that the margin was attached.
collection_path = "/data3/epyc/data3/hats/catalogs/v06/gaia_dr3"
catalog = lsdb.open_catalog(collection_path)
len(catalog.margin)

In [None]:
# Plot the catalog's pixel map to eyeball the resulting partitioning.
catalog.plot_pixels()

In [None]:
%% time
catalog.id_search(values={"source_id":35325045153309056}).compute()

In [None]:
# Verify the imported catalog on disk and write the report under ./verification.
# A separate name is used so the collection import `args` is not overwritten:
# rebinding it here would make a later re-run of the import cell pick up the
# wrong arguments object.
verification_args = VerificationArguments(
    input_catalog_path="/data3/epyc/data3/hats/catalogs/v06/gaia_dr3/gaia",
    output_path="./verification/gaia_dr3",
)
pipeline(verification_args)