# Assemble WebSky background catalog with Dask


In [2]:
import h5pickle as h5py
import numpy as np
import healpy as hp
import matplotlib.pyplot as plt
from tqdm import tqdm

In [3]:
%load_ext jupyter_ai
%ai list gemini

| Provider | Environment variable | Set? | Models |
|----------|----------------------|------|--------|
| `gemini` | `GOOGLE_API_KEY` | <abbr title="You have set this environment variable, so you can use this provider's models.">✅</abbr> | <ul><li>`gemini:gemini-1.0-pro`</li><li>`gemini:gemini-1.0-pro-001`</li><li>`gemini:gemini-1.0-pro-latest`</li><li>`gemini:gemini-1.0-pro-vision-latest`</li><li>`gemini:gemini-pro`</li><li>`gemini:gemini-pro-vision`</li></ul> |


In [4]:
#%%ai gemini:gemini-pro -f code


In [5]:
import healpy as hp
hp.version

<module 'healpy.version' from '/global/common/software/cmb/zonca/conda/pycmb/lib/python3.10/site-packages/healpy/version.py'>

In [6]:
%alias_magic gm ai -p "gemini:gemini-pro -f code"

Created `%gm` as an alias for `%ai gemini:gemini-pro -f code`.
Created `%%gm` as an alias for `%%ai gemini:gemini-pro -f code`.


In [7]:
import os

num_threads = 128
os.environ["OMP_NUM_THREADS"] = "1"

In [8]:
from dask.distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=num_threads, threads_per_worker=1, processes=True)
client = Client(cluster)

In [9]:
cutoff_flux = 1e-3

In [10]:
plot = False

In [11]:
cd /global/cfs/cdirs/sobs/www/users/Radio_WebSky/matched_catalogs_2

/global/cfs/cdirs/sobs/www/users/Radio_WebSky/matched_catalogs_2


In [12]:
cat = h5py.File("catalog_100.0.h5", "r")

In [13]:
#%%ai gemini:gemini-pro -f code
    
#find the fields in a h5py File

In [14]:
import dask.array as da

There are no metadata in the file, I guess fluxes are in `Jy`

In [15]:
import pandas as pd
import xarray as xr

In [16]:
field = 'flux'

In [17]:
chunk_size = int(1e6)

In [18]:
cat_xr = xr.open_dataset("catalog_100.0.h5", chunks=chunk_size)
cat_xr = cat_xr.rename({"phony_dim_0":"index"})

In [19]:
cutoff_mask = (cat_xr.flux < cutoff_flux).compute()

In [20]:
%load_ext jupyter_ai
%ai list gemini

The jupyter_ai extension is already loaded. To reload it, use:
  %reload_ext jupyter_ai


| Provider | Environment variable | Set? | Models |
|----------|----------------------|------|--------|
| `gemini` | `GOOGLE_API_KEY` | <abbr title="You have set this environment variable, so you can use this provider's models.">✅</abbr> | <ul><li>`gemini:gemini-1.0-pro`</li><li>`gemini:gemini-1.0-pro-001`</li><li>`gemini:gemini-1.0-pro-latest`</li><li>`gemini:gemini-1.0-pro-vision-latest`</li><li>`gemini:gemini-pro`</li><li>`gemini:gemini-pro-vision`</li></ul> |


In [21]:
#%%ai gemini:gemini-pro -f code

In [22]:
pol_coeff = xr.open_dataarray(
    "/pscratch/sd/z/zonca/websky_full_catalog_polarized flux.h5", chunks=chunk_size)[:, cutoff_mask]

In [23]:
pol_coeff

Unnamed: 0,Array,Chunk
Bytes,10.48 GiB,38.11 MiB
Shape,"(5, 281384121)","(5, 999136)"
Dask graph,282 chunks in 3 graph layers,282 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 10.48 GiB 38.11 MiB Shape (5, 281384121) (5, 999136) Dask graph 282 chunks in 3 graph layers Data type float64 numpy.ndarray",281384121  5,

Unnamed: 0,Array,Chunk
Bytes,10.48 GiB,38.11 MiB
Shape,"(5, 281384121)","(5, 999136)"
Dask graph,282 chunks in 3 graph layers,282 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [24]:
temp_coeff = xr.open_dataarray("/pscratch/sd/z/zonca/websky_full_catalog_flux.h5", chunks=chunk_size)[:, cutoff_mask]

In [25]:
output_catalog = xr.Dataset({"logpolycoefpolflux":pol_coeff,"logpolycoefflux":temp_coeff })

In [27]:
output_catalog

Unnamed: 0,Array,Chunk
Bytes,10.48 GiB,38.11 MiB
Shape,"(281384121, 5)","(999136, 5)"
Dask graph,282 chunks in 4 graph layers,282 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 10.48 GiB 38.11 MiB Shape (281384121, 5) (999136, 5) Dask graph 282 chunks in 4 graph layers Data type float64 numpy.ndarray",5  281384121,

Unnamed: 0,Array,Chunk
Bytes,10.48 GiB,38.11 MiB
Shape,"(281384121, 5)","(999136, 5)"
Dask graph,282 chunks in 4 graph layers,282 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.48 GiB,38.11 MiB
Shape,"(281384121, 5)","(999136, 5)"
Dask graph,282 chunks in 4 graph layers,282 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 10.48 GiB 38.11 MiB Shape (281384121, 5) (999136, 5) Dask graph 282 chunks in 4 graph layers Data type float64 numpy.ndarray",5  281384121,

Unnamed: 0,Array,Chunk
Bytes,10.48 GiB,38.11 MiB
Shape,"(281384121, 5)","(999136, 5)"
Dask graph,282 chunks in 4 graph layers,282 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [33]:
output_catalog = output_catalog.transpose()

In [34]:
output_catalog["logpolycoefflux"].shape

(281384121, 5)

In [35]:
output_catalog.logpolycoefflux.attrs["units"] = "Jy"
output_catalog.logpolycoefpolflux.attrs["units"] = "Jy"

In [36]:
for coord in ["theta", "phi"]:
    output_catalog = output_catalog.assign_coords(
        **{coord:(("index"), cat_xr[coord][cutoff_mask].data)})

In [37]:
output_catalog["flux_100"] = np.polynomial.polynomial.polyval(
    np.log(100), output_catalog["logpolycoefflux"][:,::-1], tensor=False)

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
  c0 = c[-i] + c0*x


In [38]:
output_catalog["flux_100"].max()

In [39]:
output_catalog = output_catalog.sortby("flux_100", ascending=False)
del output_catalog["flux_100"]

In [40]:
output_catalog.coords["theta"].attrs["units"] = "rad"
output_catalog.coords["phi"].attrs["units"] = "rad"

In [41]:
output_catalog.attrs["notes"] = \
"""Catalog of sources where the flux in Jy at any frequency is calculated with a 5th order polynomial in the logarithm of the frequency in GHz, separately for temperature and polarization.
The catalog does not contain information about the polarization angle of a source.
The catalog sorted in descending order based on the source flux at 100 GHz"""

In [44]:
output_filename = f"/pscratch/sd/z/zonca/websky_full_catalog_trasp.h5"

In [42]:
output_catalog.to_netcdf(
    output_filename, format="NETCDF4") # requires netcdf4 package

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


In [45]:
import h5py

In [46]:
f = h5py.File(output_filename)

In [47]:
f.keys()

<KeysViewHDF5 ['index', 'logpolycoefflux', 'logpolycoefpolflux', 'phi', 'power', 'theta']>

In [50]:
f["logpolycoefflux"][0]

array([ 5.37899652e-09, -1.29664725e-07,  1.20804354e-06, -5.22231671e-06,
        9.00274650e-06])

In [1]:
!benchmark-pixell-runner

/bin/bash: benchmark-pixell-runner: command not found


In [2]:
!free -m

              total        used        free      shared  buff/cache   available
Mem:         515307      288278      237454        2284        3068      227028
Swap:             0           0           0


In [1]:
import xarray as xr
catalog = xr.open_dataset("/pscratch/sd/z/zonca/websky_full_catalog_trasp.h5")

In [2]:
catalog

In [3]:
catalog["logpolycoefflux"].shape

(281384121, 5)