# ALeRCE alert import

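Start by collating the three raw tables (light-curve detections, non-detections, and reference entries) into a single table with one row per object (`oid`), where each table's columns are nested as list-valued columns under that object.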

Then we run the hipscat-import pipeline on the collated parquet files.

In [14]:
import pandas as pd
import pickle
from tqdm import tqdm


import glob
import hipscat_import.pipeline as runner
from hipscat_import.catalog.arguments import ImportArguments


In [19]:
# Named-aggregation specs, one dict per raw table, meant to be unpacked into
# `DataFrame.groupby(...).agg(**spec)`. Apart from the two mean-position
# columns, every source column is gathered into a Python list per group and
# given a table-specific prefix (lc_, nondet_, ref_).

# Detection (light-curve) columns to nest.
_lc_columns = (
    'ra', 'dec', 'candid', 'mjd', 'fid', 'pid', 'diffmaglim', 'isdiffpos', 'nid',
    'magpsf', 'sigmapsf', 'magap', 'sigmagap', 'distnr', 'rb',
    'rbversion', 'drb', 'drbversion', 'magapbig', 'sigmagapbig', 'rfid',
    'magpsf_corr', 'sigmapsf_corr', 'sigmapsf_corr_ext', 'corrected',
    'dubious', 'parent_candid', 'has_stamp', 'step_id_corr',
)

# Non-detection columns to nest.
_nondet_columns = ("mjd", "fid", "diffmaglim")

# Reference-table columns to nest.
_ref_columns = (
    "rfid", "candid", "fid", "rcid", "field", "magnr",
    "sigmagnr", "chinr", "sharpnr", "ranr", "decnr",
    "mjdstartref", "mjdendref", "nframesref",
)

# The mean position comes first; it is later used as the catalog's ra/dec.
lcs_aggs = dict(
    mean_ra=pd.NamedAgg(column="ra", aggfunc="mean"),
    mean_dec=pd.NamedAgg(column="dec", aggfunc="mean"),
)
lcs_aggs.update(
    (f"lc_{name}", pd.NamedAgg(column=name, aggfunc=list)) for name in _lc_columns
)

nondet_aggs = {
    f"nondet_{name}": pd.NamedAgg(column=name, aggfunc=list)
    for name in _nondet_columns
}

ref_aggs = {
    f"ref_{name}": pd.NamedAgg(column=name, aggfunc=list)
    for name in _ref_columns
}



In [21]:
# Directory holding the raw per-chunk pickles; the collated parquet output is
# written to its `nested_named/` subdirectory.
RAW_ALERTS_DIR = "/data3/epyc/data3/hipscat/raw/alerts"


def load_and_nest(table_name, index, aggs):
    """Load one pickled raw table chunk and nest it to one row per object.

    Args:
        table_name: file-name prefix of the raw table ("lcs", "nondet" or "ref").
        index: chunk number; zero-padded to four digits in the file name.
        aggs: mapping of output column name -> pd.NamedAgg, unpacked into
            `groupby("oid").agg(...)`.

    Returns:
        The aggregated frame, indexed by "oid".
    """
    file_name = f"{RAW_ALERTS_DIR}/{table_name}_{index:04}.pickle"
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    with open(file_name, "rb") as pickle_file:
        frame = pickle.load(pickle_file)
    return frame.groupby("oid").agg(**aggs)


for index in tqdm(range(1, 183)):
    lightcurves = load_and_nest("lcs", index, lcs_aggs)
    nondet = load_and_nest("nondet", index, nondet_aggs)
    ref_frame = load_and_nest("ref", index, ref_aggs)

    # suffixes=(False, False) disables suffixing, so an overlapping non-key
    # column name raises instead of being silently renamed.
    # NOTE(review): merge defaults to an inner join, so an object missing from
    # the non-detection or reference table is dropped from the output --
    # confirm this is intended.
    agged = lightcurves.merge(nondet, on="oid", suffixes=(False, False)).merge(
        ref_frame, on="oid", suffixes=(False, False)
    )
    agged.to_parquet(f"{RAW_ALERTS_DIR}/nested_named/file_{index:04}.parquet")

100%|██████████| 182/182 [1:29:37<00:00, 29.55s/it]


In [22]:
# Collect the collated per-chunk parquet files, in name order.
files = sorted(glob.glob("/data3/epyc/data3/hipscat/raw/alerts/nested_named/file_*"))
print(f"found {len(files)} files")
# NOTE(review): the recorded run reports 183 files here, while the collation
# loop iterates indices 1-182 (182 files). Check nested_named/ for a stale or
# extra file before importing.

# Same scratch location is used for the pipeline's and dask's temporary files.
scratch_dir = "/data3/epyc/data3/hipscat/tmp/"

args = ImportArguments(
    # --- input ---
    input_file_list=files,
    file_reader="parquet",
    ra_column="mean_ra",
    dec_column="mean_dec",
    # --- partitioning ---
    highest_healpix_order=6,
    pixel_threshold=40_000,
    # --- execution ---
    dask_n_workers=10,
    dask_threads_per_worker=1,
    dask_tmp=scratch_dir,
    tmp_dir=scratch_dir,
    simple_progress_bar=True,
    # --- output ---
    output_artifact_name="alerce_nested",
    output_path="/data3/epyc/data3/hipscat/test_catalogs/alerce/",
    completion_email_address="delucchi@andrew.cmu.edu",
)
runner.pipeline(args)


found 183 files


Planning  :   0%|          | 0/5 [00:00<?, ?it/s]

Perhaps you already have a cluster running?
Hosting the HTTP server on port 37912 instead


Mapping   :   0%|          | 0/183 [00:00<?, ?it/s]

Binning   :   0%|          | 0/2 [00:00<?, ?it/s]

Splitting :   0%|          | 0/183 [00:00<?, ?it/s]

Reducing  :   0%|          | 0/113 [00:00<?, ?it/s]

Finishing :   0%|          | 0/5 [00:00<?, ?it/s]

In [23]:
import os
import pyarrow.parquet as pq

# Estimate a row-count range from one sample file: scale the sample's row
# count by (target file size / sample file size) for a small and a large
# target size.
#
# Alternative sample (a partition of an already-imported catalog):
# sample_parquet_file = "/data3/epyc/data3/hipscat/test_catalogs/alerce/alerce_sample/Norder=0/Dir=0/Npix=3.parquet"
#
# NOTE(review): this path points at nested_pq/ with a .pickle extension, which
# is not what the collation loop writes (nested_named/file_NNNN.parquet) --
# confirm which sample is intended. Also note this estimate runs after the
# import cell that already uses pixel_threshold.
sample_parquet_file = "/data3/epyc/data3/hipscat/raw/alerts/nested_pq/file_0070.pickle"

# On-disk size in bytes and row count from the parquet footer metadata.
sample_file_size = os.path.getsize(sample_parquet_file)
parquet_file = pq.ParquetFile(sample_parquet_file)
num_rows = parquet_file.metadata.num_rows

mebibyte = 1024 * 1024
ideal_file_small = 300 * mebibyte  # 300MB
ideal_file_large = 1024 * mebibyte  # 1G

threshold_small, threshold_large = (
    target_size / sample_file_size * num_rows
    for target_size in (ideal_file_small, ideal_file_large)
)

print(f"threshold between {int(threshold_small):_} and {int(threshold_large):_}")

threshold between 21_465 and 73_268
