## 2MASS collection import

Author: Melissa D

Last run: ???

https://github.com/astronomy-commons/data.lsdb.io/issues/159

New threshold: between 3_387_891 and 11_564_002

In [1]:
import hats
import numpy as np
from dask.distributed import Client
from hats_import import CollectionArguments, pipeline_with_client, pipeline, VerificationArguments
from pathlib import Path
from hats_import.catalog.file_readers import CsvReader
from astropy.io import ascii
import lsdb
import pandas as pd

# Display the installed hats version so the notebook records which release ran this import.
hats.__version__

  from .autonotebook import tqdm as notebook_tqdm


'0.7.1'

In [2]:
# Input location: collect every raw file under raw_dir whose name matches psc*.gz.
raw_dir = Path("/epyc/data3/hipscat/raw/two_mass")
file_list = [raw_file for raw_file in raw_dir.glob("psc*.gz")]
print(f"found {len(file_list)} files for import")

found 92 files for import


In [3]:
# schema.csv is a side file with one row per column ("name", "type").
# Read it once, then build a column-name -> type lookup from those two columns.
type_frame = pd.read_csv(raw_dir / "schema.csv")
type_map = {
    column_name: column_type
    for column_name, column_type in zip(type_frame["name"], type_frame["type"])
}

In [8]:
# Reader for the raw files: header-less, "|"-separated, gzip-compressed text read in
# 250_000-row chunks, with the literal \N treated as a missing value. Column names and
# types both come from the schema side file loaded earlier.
two_mass_reader = CsvReader(
    header=None,
    column_names=type_frame["name"].values.tolist(),
    type_map=type_map,
    sep="|",
    compression="gzip",
    na_values="\\N",
    chunksize=250_000,
)

# Collection arguments: the main catalog plus a default margin, both written under
# the v06 catalogs directory.
args = (
    CollectionArguments(
        output_path="/epyc/data3/hats/catalogs/v06",
        output_artifact_name="two_mass",
        completion_email_address="delucchi@andrew.cmu.edu",
        progress_bar=True,
        simple_progress_bar=True,
    )
    .catalog(
        output_artifact_name="two_mass",
        input_file_list=file_list,
        file_reader=two_mass_reader,
        ra_column="ra",
        dec_column="decl",
        # Partitioning controls.
        pixel_threshold=5_000_000,
        highest_healpix_order=8,
        skymap_alt_orders=[2, 4, 6],
        row_group_kwargs={"num_rows": 200_000},
        # Row count the import is expected to produce in total.
        expected_total_rows=470_992_970,
        # resume=False,
    )
    # NOTE(review): margin_threshold is presumably in arcseconds - confirm against hats-import docs.
    .add_margin(margin_threshold=5.0, is_default=True)
)

In [9]:
# Run the import pipeline on a dask Client with 10 single-threaded workers.
# The client is closed when the `with` block exits.
dask_client_options = {
    "local_directory": "/epyc/data3/hats/tmp/",
    "n_workers": 10,
    "threads_per_worker": 1,
}
with Client(**dask_client_options) as client:
    pipeline_with_client(args, client)

Planning  : 100%|██████████| 3/3 [00:05<00:00,  1.92s/it]
Mapping   : 100%|██████████| 219/219 [00:37<00:00,  5.79it/s]
Binning   :   0%|          | 0/1 [00:00<?, ?it/s]
Reducing  : 100%|██████████| 219/219 [00:01<00:00, 164.68it/s]
Finishing : 100%|██████████| 4/4 [00:00<00:00,  4.70it/s]
Finishing : 100%|██████████| 2/2 [00:00<00:00, 58.34it/s]


In [10]:
# Verify the collection that the import wrote.
# Use a dedicated name here: the cells further down still read `args.catalog_args`
# from the CollectionArguments built for the import, so rebinding `args` to the
# verification arguments would break them on a fresh Restart & Run All.
verification_args = VerificationArguments(
    input_catalog_path="/epyc/data3/hats/catalogs/v06/two_mass",
    output_path="./verification/two_mass",
)
pipeline(verification_args)

Loading dataset and schema.

Starting: Test hats.io.validation.is_valid_collection.
Validating collection at path /epyc/data3/hats/catalogs/v06/two_mass ... 
Validating catalog at path /epyc/data3/hats/catalogs/v06/two_mass/two_mass ... 
Found 219 partitions.
Approximate coverage is 100.00 % of the sky.
Validating catalog at path /epyc/data3/hats/catalogs/v06/two_mass/two_mass_5arcs ... 
Found 219 partitions.
Approximate coverage is 100.00 % of the sky.
Result: PASSED

Starting: Test that files in _metadata match the data files on disk.
Result: PASSED

Starting: Test that number of rows are equal.
	file footers vs catalog properties
	file footers vs _metadata
Result: PASSED

Starting: Test that schemas are equal, excluding metadata.
	_common_metadata vs truth
	_metadata vs truth
	file footers vs truth
Result: PASSED

Verifier results written to verification/two_mass/verifier_results.csv
Elapsed time (seconds): 0.51


In [None]:
from hats_import.catalog.resume_plan import ResumePlan

# Debugging aid: build a ResumePlan from the catalog import arguments, read back the
# row-count histogram at the mapping order, and ask the plan for its alignment file.
# `args` here must be the CollectionArguments used for the import.
catalog_args = args.catalog_args
resume_plan = ResumePlan(import_args=catalog_args)

raw_histogram = resume_plan.read_histogram(
    catalog_args.mapping_healpix_order,
    which_histogram="row_count",
)
resume_plan.get_alignment_file(
    raw_histogram,
    catalog_args.constant_healpix_order,
    catalog_args.highest_healpix_order,
    catalog_args.lowest_healpix_order,
    catalog_args.pixel_threshold,
    catalog_args.drop_empty_siblings,
    470_992_970,  # same value as the expected_total_rows given to .catalog()
)

In [None]:
# For every pixel the resume plan still lists as needing the reduce step, look for an
# already-written output parquet file. If one exists but its parquet metadata cannot be
# read, print an `rm <path>` line for it. Nothing is deleted by this cell.
remaining_reduce = resume_plan.get_reduce_items()

for pixel, _, _ in remaining_reduce:
    reduced_file = Path(
        f"{args.catalog_args.catalog_path}/dataset/"
        f"Norder={pixel.order}/Dir={pixel.dir}/Npix={pixel.pixel}.parquet"
    )
    if not reduced_file.exists():
        # Nothing was written for this pixel yet, so there is nothing to check.
        continue
    try:
        hats.io.file_io.read_parquet_metadata(reduced_file)
    except Exception:
        # Catch Exception rather than using a bare `except:` so that interrupting the
        # kernel (KeyboardInterrupt) or SystemExit is not silently swallowed mid-loop.
        print("rm", reduced_file)