# Re-import helper

A collection of helper snippets for determining appropriate arguments when re-importing a dataset that has previously been imported into HATS.

In [None]:
import hats_import
import hats
from hats.pixel_math import HealpixPixel
import os
import matplotlib.pyplot as plt
import numpy as np
import os
import pyarrow.parquet as pq
import hats
import numpy as np
from pathlib import Path

hats_import.__version__

In [None]:
### Change this path!!!
catalog_dir = "/epyc/data3/hats/catalogs/tic"

### ----------------
### You probably won't have to change anything from here.

# Open the existing catalog and list its partitions (one row per partition,
# identified by the "Norder" and "Npix" columns).
catalog = hats.read_hats(catalog_dir)

info_frame = catalog.partition_info.as_dataframe()

# Record the on-disk size, in bytes, of each partition's data file.
# The column is built in one pass instead of being filled row-by-row with
# `.loc` inside `iterrows()`: that form first creates a float column, is slow
# for large catalogs, and never creates the column when there are no rows.
info_frame["size_on_disk"] = [
    os.path.getsize(
        hats.io.paths.pixel_catalog_file(catalog_dir, HealpixPixel(order, pixel))
    )
    for order, pixel in zip(info_frame["Norder"], info_frame["Npix"])
]

# Cast every column to int, then derive the size in GiB (bytes / 1024**3).
info_frame = info_frame.astype(int)
info_frame["gbs"] = info_frame["size_on_disk"] / (1024 * 1024 * 1024)

In [None]:
# Report how the partition file sizes (the "gbs" column, in GiB) are
# distributed across the catalog.
gbs = info_frame["gbs"]

print(f'healpix orders: {info_frame["Norder"].unique()}')
print(f'num partitions: {len(info_frame["Npix"])}')
print("------")
print(f"min size_on_disk: {gbs.min():.2f}")
print(f"max size_on_disk: {gbs.max():.2f}")
print(f"size_on_disk ratio: {gbs.max()/gbs.min():.2f}")
print(f"total size_on_disk: {gbs.sum():.2f}")
print("------")

plt.hist(gbs)

# Bucket the partitions by file size. The bin edges are in GiB, and each label
# names the bin between two consecutive edges.
# NOTE: sizes above the last edge (100) are not counted by np.histogram, so the
# percentages would not add up to 100 % in that case.
bins = [0, 0.5, 1, 2, 100]
labels = ["small-ish", "sweet-spot", "big-ish", "too-big"]
hist = np.histogram(gbs, bins=bins)[0]
pcts = hist / len(info_frame)
for label, count, pct in zip(labels, hist, pcts):
    print(f"{label} \t: {count} \t({pct*100:.1f} %)")

In [None]:
# Find the five partitions holding the most rows, using the row_count
# statistic of the catalog's right-ascension column.
ra_column = catalog.catalog_info.ra_column
row_count_label = f"{ra_column}: row_count"

stats = catalog.per_pixel_statistics(
    include_columns=[ra_column],
    include_stats=["row_count"],
)
biggest_parts = stats.sort_values(by=[row_count_label], ascending=False).head(5)
print(catalog.catalog_info)

# Last expression of the cell: the notebook renders this table.
biggest_parts

In [None]:
# Use the partition with the most rows as the sample for estimating how many
# rows fit in a file of a given size.
biggest_pixel = biggest_parts.index[0]

# Resolve the file through the same HATS path helper used when measuring the
# partition sizes, rather than hand-assembling the directory layout.
sample_parquet_file = str(hats.io.paths.pixel_catalog_file(catalog_dir, biggest_pixel))

sample_file_size = os.path.getsize(sample_parquet_file)
parquet_file = pq.ParquetFile(sample_parquet_file)
num_rows = parquet_file.metadata.num_rows

## 300 MiB
ideal_file_small = 300 * 1024 * 1024
## 1 GiB
ideal_file_large = 1024 * 1024 * 1024

# Scale the sample's row count by (target size / sample size): the number of
# rows that would produce a file of the target size at the sample's
# bytes-per-row density.
threshold_small = ideal_file_small / sample_file_size * num_rows
threshold_large = ideal_file_large / sample_file_size * num_rows


# Print the values in keyword-argument form.
catalog_info = catalog.catalog_info
print(f"ra_column='{catalog_info.ra_column}',")
print(f"dec_column='{catalog_info.dec_column}',")
print(f"expected_total_rows={int(catalog_info.total_rows):_},")
print(f"pixel_threshold= BETWEEN {int(threshold_small):_} AND {int(threshold_large):_},")