In [7]:
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import healpy as hp

#### -----------------
## Columns that will be repeated per object
repeated_columns = [
    "ps1_objid", "ra", "dec", "ps1_gMeanPSFMag", "ps1_rMeanPSFMag", "ps1_iMeanPSFMag",
    "nobs_g", "nobs_r", "nobs_i", "mean_mag_g", "mean_mag_r", "mean_mag_i"]
id_column = ["ps1_objid", "ra", "dec"]

## Columns that will be constructed per array field
timedomain_columns = ["band", "catflags", "fieldID", "mag", "magerr", "mjd", "rcID"]

## band-specific columns to timedomain_columns
## Each map renames the "<field>_<band>" input columns to the shared, band-free
## names listed in timedomain_columns. The error column is "magerr"; it was
## previously misspelled "maggerr", which leaked into the written tables.
g_column_map = {"catflags_g": "catflags",
                "fieldID_g": "fieldID",
                "mag_g": "mag",
                "magerr_g": "magerr",
                "mjd_g": "mjd",
                "rcID_g": "rcID"}
g_columns = list(g_column_map.keys())
r_column_map = {"catflags_r": "catflags",
                "fieldID_r": "fieldID",
                "mag_r": "mag",
                "magerr_r": "magerr",
                "mjd_r": "mjd",
                "rcID_r": "rcID"}
r_columns = list(r_column_map.keys())
i_column_map = {"catflags_i": "catflags",
                "fieldID_i": "fieldID",
                "mag_i": "mag",
                "magerr_i": "magerr",
                "mjd_i": "mjd",
                "rcID_i": "rcID"}
i_columns = list(i_column_map.keys())

## Array-valued columns (after renaming) that get exploded into one row per element.
## All three band maps share the same target names, so the g map is representative.
explode_columns = list(g_column_map.values())

## For completeness, here are some fields I'm dropping because they seem useless or redundant.
## If you disagree, please let me know.
## dec_detections, dup, index, level_0, ra_detections, zone

## Name of the column that holds the computed spatial index.
new_column = "__hips_hop"

In [2]:
def compute_index(ra, dec, order=20):
    """Compute a 64-bit spatial index for every (ra, dec) row.

    Each position is mapped to its nested HEALPix pixel at ``order`` and the
    pixel number is shifted into the high ``4 + 2*order`` bits of a uint64.
    Rows that fall in the same pixel are told apart by adding a running
    counter (0, 1, 2, ...) in the low bits, assigned in (ra, dec) order
    within the pixel.

    Parameters
    ----------
    ra, dec : array-like
        Coordinates passed to ``hp.ang2pix`` with ``lonlat=True``. Both must
        be one-dimensional and of equal length.
    order : int, optional
        HEALPix order; the pixelization uses ``nside = 2**order``.

    Returns
    -------
    numpy.ndarray of uint64
        One index per input row, aligned with the inputs: element ``k``
        belongs to ``(ra[k], dec[k])``.
    """
    # Coerce to plain arrays so the positional indexing below is never
    # interpreted as label-based indexing (as it would be for a pandas Series).
    ra = np.asarray(ra)
    dec = np.asarray(dec)

    pix = hp.ang2pix(2**order, ra, dec, nest=True, lonlat=True)
    bits = 4 + 2 * order
    idx = pix.astype(np.uint64) << (64 - bits)

    # Sort by pixel first, then ra, then dec, so rows sharing a pixel are
    # contiguous and have a reproducible order within the pixel.
    sort_order = np.lexsort((dec, ra, idx))
    idx = idx[sort_order]

    # For each sorted row: position of the first row with the same pixel value.
    _, first_pos, group = np.unique(idx, return_index=True, return_inverse=True)
    first_pos = first_pos.astype(np.uint64)

    # Rank of each row inside its pixel group, added into the low bits.
    rank_in_pixel = np.arange(len(idx), dtype=np.uint64) - first_pos[group]
    idx += rank_in_pixel

    # Undo the sort with the INVERSE permutation: sorted slot k came from input
    # row sort_order[k], so its value must be written back to that row.
    # (Indexing idx[sort_order] would apply the sort a second time instead.)
    result = np.empty_like(idx)
    result[sort_order] = idx

    return result

In [5]:
%%time

## One combined table: every object's repeated columns plus its exploded per-band arrays

# file_name = "/astro/users/mmd11/data/ztf_row.parquet"
file_name = "/data3/epyc/data3/hipscat/catalogs/ztf_dr14/Norder=1/Dir=0/Npix=33.parquet"

data_frame = pd.read_parquet(file_name, engine="pyarrow")
print(len(data_frame))

# Build one frame per band: keep the repeated object columns, rename that band's
# array columns to the shared band-free names, and tag each row with its band.
just_i = (
    data_frame[repeated_columns + i_columns]
    .rename(columns=i_column_map)
    .assign(band='i')
)
just_g = (
    data_frame[repeated_columns + g_columns]
    .rename(columns=g_column_map)
    .assign(band='g')
)
just_r = (
    data_frame[repeated_columns + r_columns]
    .rename(columns=r_column_map)
    .assign(band='r')
)

# Stack the three bands, then unpack the array columns into one row per element.
explodey = pd.concat([just_i, just_g, just_r]).explode(explode_columns)
print(len(explodey))

# Keep only rows that have a magnitude, ordered by object, band and time.
has_mag = explodey['mag'].notna()
explodey = explodey[has_mag].sort_values(["ps1_objid", 'band', "mjd"])

# Attach the spatial index, make it the frame index, and order rows by it.
explodey[new_column] = compute_index(
    explodey["ra"].values,
    explodey["dec"].values,
    order=20,
)
explodey = explodey.set_index(new_column).sort_index()

explodey.to_parquet("/data3/epyc/data3/hipscat/catalogs/ztf_dr14_explode/Norder1/Npix33/catalog_hop.parquet")

982531
65609157
CPU times: user 5min 29s, sys: 3min, total: 8min 29s
Wall time: 7min 47s


In [2]:
import glob

# Collect the raw shard files and report how many there are, plus the first
# match as a sanity check.
shard_pattern = "/data3/epyc/data3/hipscat/raw/ztf_shards/**parquet"
file_names = glob.glob(shard_pattern)
print(len(file_names))
print(file_names[0])

6020
/data3/epyc/data3/hipscat/raw/ztf_shards/part-00000-shard-0.parquet


In [8]:
%%time

## Write two tables from the same input: an object table and an exploded source table

file_name = "/data3/epyc/data3/hipscat/catalogs/ztf_dr14/Norder=1/Dir=0/Npix=33.parquet"

# --- Object table: only the repeated per-object columns, indexed spatially ---
data_frame = pd.read_parquet(file_name, engine="pyarrow", columns=repeated_columns)
data_frame[new_column] = compute_index(
    data_frame["ra"].values,
    data_frame["dec"].values,
    order=20,
)
data_frame = data_frame.set_index(new_column).sort_index()

data_frame.to_parquet("/data3/epyc/data3/hipscat/catalogs/ztf_dr14_explode/Norder1/Npix33/object_hop.parquet")
print(len(data_frame))

# --- Source table: id columns plus the per-band array columns ---
data_frame = pd.read_parquet(file_name, engine="pyarrow", columns=i_columns+g_columns+r_columns+id_column)

# One frame per band: rename that band's array columns to the shared band-free
# names and tag each row with its band.
just_i = (
    data_frame[id_column + i_columns]
    .rename(columns=i_column_map)
    .assign(band='i')
)
just_g = (
    data_frame[id_column + g_columns]
    .rename(columns=g_column_map)
    .assign(band='g')
)
just_r = (
    data_frame[id_column + r_columns]
    .rename(columns=r_column_map)
    .assign(band='r')
)

# Stack the bands and unpack the array columns into one row per element, then
# keep only rows with a magnitude, ordered by object, band and time.
explodey = pd.concat([just_i, just_g, just_r]).explode(explode_columns)
has_mag = explodey['mag'].notna()
explodey = explodey[has_mag].sort_values(["ps1_objid", 'band', "mjd"])

# Attach the spatial index, make it the frame index, and order rows by it.
explodey[new_column] = compute_index(
    explodey["ra"].values,
    explodey["dec"].values,
    order=20,
)
explodey = explodey.set_index(new_column).sort_index()

print(len(explodey))
explodey.to_parquet("/data3/epyc/data3/hipscat/catalogs/ztf_dr14_explode/Norder1/Npix33/source_hop.parquet")

982531
64138932
CPU times: user 4min 50s, sys: 1min 58s, total: 6min 49s
Wall time: 6min 26s


In [9]:
%%time

## Source table only, written WITHOUT the spatial index column (object table step is disabled below)

file_name = "/data3/epyc/data3/hipscat/catalogs/ztf_dr14/Norder=1/Dir=0/Npix=33.parquet"

# data_frame = pd.read_parquet(file_name, engine="pyarrow", columns=repeated_columns)

# data_frame.to_parquet("/data3/epyc/data3/hipscat/catalogs/ztf_dr14_explode/Norder1/Npix33/object.parquet")
# print(len(data_frame))

data_frame = pd.read_parquet(file_name, engine="pyarrow", columns=i_columns+g_columns+r_columns+id_column)

# One frame per band: rename that band's array columns to the shared band-free
# names and tag each row with its band.
just_i = (
    data_frame[id_column + i_columns]
    .rename(columns=i_column_map)
    .assign(band='i')
)
just_g = (
    data_frame[id_column + g_columns]
    .rename(columns=g_column_map)
    .assign(band='g')
)
just_r = (
    data_frame[id_column + r_columns]
    .rename(columns=r_column_map)
    .assign(band='r')
)

# Stack the bands and unpack the array columns into one row per element, then
# keep only rows with a magnitude, ordered by object, band and time.
explodey = pd.concat([just_i, just_g, just_r]).explode(explode_columns)
has_mag = explodey['mag'].notna()
explodey = explodey[has_mag].sort_values(["ps1_objid", 'band', "mjd"])

print(len(explodey))
explodey.to_parquet("/data3/epyc/data3/hipscat/catalogs/ztf_dr14_explode/Norder1/Npix33/source.parquet")

64138932
CPU times: user 4min 26s, sys: 2min, total: 6min 26s
Wall time: 6min 19s
