**Andy Tzanidakis**

In [21]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import astropy.units as u
%matplotlib inline
%config InlineBackend.figure_format = "retina"
from matplotlib import rcParams
rcParams['savefig.dpi'] = 550
rcParams['font.size'] = 20
plt.rc('font', family='serif')

import lsdb
from lsdb import lsdb_client
# Start the LSDB client with 8 workers and the dask_on_ray option enabled.
# NOTE(review): `client` is never referenced again in this notebook; presumably
# the later .compute() calls pick it up implicitly — confirm.
client = lsdb_client(dask_on_ray=True, num_workers=8)


2023-11-27 10:25:30,742	INFO worker.py:1507 -- Calling ray.init() again after it has already been called.


In [2]:
# HiPSCat catalogs: Gaia and the ZTF DR14 object table.
# NOTE(review): these are absolute, machine-specific paths; consider a
# configurable data directory so the notebook runs elsewhere.
gaia = lsdb.read_hipscat("/data3/epyc/projects3/ivoa_demo/gaia/catalog")
ztf = lsdb.read_hipscat("/data3/epyc/data3/hipscat/catalogs/ztf_axs/ztf_dr14")

# ZTF source table (joined to the objects further down to obtain the
# band / catflags / mjd / mag / magerr rows).
# Loading the sources takes about a minute, since it creates a HEALPix
# alignment on load.
ztf_sources = lsdb.read_hipscat("/data3/epyc/data3/hipscat/catalogs/ztf_axs/ztf_source")

## Task 1

- Cone search Gaia + ZTF FG dwarf sources with classprob_dsc_combmod_star_gaia > 0.5

- Condition sources such that: 
    1. mag[catflags] has been applied (i.e., rows with bad catflags are removed)
    2. Compute biweight location
    3. Query/cut sources based on biweight location 

In [3]:
%%time
# Build the working sample: Gaia sources inside a cone at (ra=31, dec=3) with
# radius=1 (units are whatever cone_search expects — presumably degrees,
# TODO confirm), crossmatched to the ZTF DR14 object table and filtered to:
#   - more than 100 ZTF observations in both g and r,
#   - positive parallax with parallax_over_error > 5,
#   - 5380 < teff_gspphot < 7220 and 4.5 < logg_gspphot < 4.72,
#   - classprob_dsc_combmod_star > 0.5.
# .compute() materialises the result; it is passed to lsdb.from_dataframe later.
_sample = gaia.cone_search(
    ra=31,
    dec=3,
    radius=1,
).crossmatch(
    ztf
).query(
    "nobs_g_ztf_dr14 > 100 and nobs_r_ztf_dr14 > 100 and \
    parallax_gaia > 0 and parallax_over_error_gaia > 5 and \
    teff_gspphot_gaia > 5380 and teff_gspphot_gaia < 7220 and logg_gspphot_gaia > 4.5 and logg_gspphot_gaia < 4.72 and classprob_dsc_combmod_star_gaia > 0.5"
).compute()

[36m(dask:cone_filter-3a50b7d3-fbf9-4912-a32e-4bd6ea406f77 pid=125613)[0m   data_frame["_CONE_SEP"] = df_separations


CPU times: user 1.21 s, sys: 213 ms, total: 1.43 s
Wall time: 14.9 s


In [4]:
# Re-wrap the computed sample as an LSDB catalog so it can be joined against
# the ZTF source table; positions are taken from the Gaia ra/dec columns.
_sample_hips = lsdb.from_dataframe(
    _sample, 
    lowest_order=5, 
    highest_order=8, 
    set_hipscat_index=False, 
    ra_column="ra_gaia", 
    dec_column="dec_gaia", 
    threshold=1_000_000
)

# Attach the ZTF source rows to each object by matching the PS1 object id
# (ps1_objid_ztf_dr14 on the sample side, ps1_objid on the source side), then
# materialise the joined table.
_sample_sources = _sample_hips.join(
    ztf_sources, left_on="ps1_objid_ztf_dr14", right_on="ps1_objid"
).compute()

In [12]:
# Number of distinct index values in the joined table, i.e. how many objects
# ended up with source rows (presumably the index is _hipscat_index, which
# fetch_lc queries on — TODO confirm).
len(np.unique(_sample_sources.index))

102

In [13]:
def fetch_lc(hips_id, table=None, band='r'):
    """Fetch the light curve of one object from a joined object/source table.

    Rows whose ``catflags`` equals -32768 are dropped for every band
    (including ``'i'`` and ``'all'``), so the returned photometry is always
    flag-cleaned in the same way.

    Parameters
    ----------
    hips_id : int
        Value of ``_hipscat_index`` identifying the object.
    table : pandas.DataFrame, optional
        Table holding the source rows. It must expose ``_hipscat_index``
        (index name or column) plus the columns ``band``, ``catflags``,
        ``mjd``, ``mag`` and ``magerr``. When omitted, the notebook-level
        ``_sample_sources`` is used; it is looked up at call time so that
        re-running the join cell is picked up without re-defining this function.
    band : str, optional
        One of ``'g'``, ``'r'``, ``'i'`` or ``'all'`` (default ``'r'``).

    Returns
    -------
    tuple
        ``(mjd, mag, magerr)`` for a single band. For ``band='all'`` the
        nine-element tuple ``(g_mjd, g_mag, g_magerr, r_mjd, r_mag, r_magerr,
        i_mjd, i_mag, i_magerr)``.

    Raises
    ------
    ValueError
        If ``band`` is not one of the supported values.
    """
    if table is None:
        # Late binding: avoids freezing whichever table existed when the
        # function was defined.
        table = _sample_sources

    single_bands = ('g', 'r', 'i')
    if band != 'all' and band not in single_bands:
        # Previously an unknown band fell through and returned None, which only
        # surfaced later as a confusing unpacking error.
        raise ValueError(
            f"band must be one of 'g', 'r', 'i' or 'all', got {band!r}"
        )

    one_object = table.query(f"_hipscat_index == {hips_id}")

    def _band_lc(band_name):
        # Select one band and drop rows carrying the -32768 catflags sentinel.
        one_band = one_object[one_object["band"] == band_name]
        good = one_band["catflags"] != -32768
        return one_band["mjd"][good], one_band["mag"][good], one_band["magerr"][good]

    if band == 'all':
        # Flattened in g, r, i order to keep the original return layout.
        return (*_band_lc('g'), *_band_lc('r'), *_band_lc('i'))

    return _band_lc(band)

In [24]:
%%time

# Example object: a hard-coded _hipscat_index value (not actually drawn at random).
rnd = 5029289713001824256
# r-band (mjd, mag, magerr) triple for that object.
lc_r = fetch_lc(rnd, band='r')

CPU times: user 57.2 ms, sys: 9.31 ms, total: 66.5 ms
Wall time: 63.7 ms


In [28]:
# Back-of-envelope cost of fetching light curves one at a time: ~63 ms per
# fetch_lc call (wall time measured above) times 1,000,000 light curves
# (presumably the target sample size — TODO confirm) is far too slow.
(63*u.ms * (1_000_000)).to(u.hr)

<Quantity 17.5 h>

In [14]:
# let's try writing a custom function
def custom_function(df):
    """Summarise one object's r-band scatter; intended as a per-group ufunc.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows belonging to a single object. Requires the columns
        ``ps1_objid``, ``band``, ``catflags`` and ``mag``.

    Returns
    -------
    pandas.DataFrame
        A single row with ``ps1_objid`` (read from the first input row) and
        ``sigma_cat``, the ``np.std`` of the r-band magnitudes after dropping
        rows whose ``catflags`` equals -32768 (spurious measurements).
    """
    # One combined mask instead of query-then-filter; only the magnitudes are
    # needed, so mjd/magerr are no longer extracted for every group.
    keep = (df["band"] == "r") & (df["catflags"] != -32768)
    r_mag = df.loc[keep, "mag"]

    return pd.DataFrame(
        {
            # Every row of the group shares the id; read the first one directly
            # rather than converting the whole column to a list.
            "ps1_objid": [df["ps1_objid"].iloc[0]],
            "sigma_cat": [np.std(r_mag)],
        }
    )

In [19]:
# Wrap the joined object+source table (a computed, in-memory result) back into
# an LSDB catalog so the per-object analysis can be run through for_each.
# NOTE(review): this assumes the joined table still carries the columns
# "ra_gaia" and "dec_gaia" under exactly these names — confirm after the join.
ztf_sample_with_sources_rehips = lsdb.from_dataframe(
    _sample_sources, 
    lowest_order=5, 
    highest_order=8, 
    set_hipscat_index=False, 
    ra_column="ra_gaia", 
    dec_column="dec_gaia",
    threshold=1_000_000
)

In [22]:
# Run custom_function once per ps1_objid group and collect the per-object
# results into a single materialised table.
var_analysis = ztf_sample_with_sources_rehips.for_each(
    key="ps1_objid",                       # column whose values define the groups
    ufunc=custom_function, # function applied to the rows of each group
    meta={"ps1_objid": "i8", 
          "sigma_cat": "f8"}                       # names and dtypes of the columns custom_function returns
).compute()

In [23]:
# Show the per-object result: one row per ps1_objid with its r-band scatter (sigma_cat).
var_analysis

Unnamed: 0_level_0,Unnamed: 1_level_0,ps1_objid,sigma_cat
ps1_objid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
110430308269557355,0,110430308269557355,0.012260
110450308595118270,0,110450308595118270,0.012616
110460310693527414,0,110460310693527414,0.014508
110490308722577435,0,110490308722577435,0.015595
110520309590336414,0,110520309590336414,0.018491
...,...,...,...
112590307708633572,0,112590307708633572,0.031572
112700312133184343,0,112700312133184343,0.045506
112720311463028137,0,112720311463028137,0.015124
112740310761225460,0,112740310761225460,0.014929


In [None]:
# Q: Can this light-curve processing be done without the intermediate .compute() calls?

