In [1]:
import h5py
import numpy as np
import healpy as hp
import matplotlib.pyplot as plt

In [2]:
cd /global/cfs/cdirs/sobs/www/users/Radio_WebSky/matched_catalogs_2

/global/cfs/cdirs/sobs/www/users/Radio_WebSky/matched_catalogs_2


In [3]:
# List the catalog files available in the current directory.
%ls

catalog_100.0.h5  catalog_232.0.h5  catalog_353.0.h5  catalog_643.0.h5
catalog_111.0.h5  catalog_24.5.h5   catalog_375.0.h5  catalog_67.8.h5
catalog_129.0.h5  catalog_256.0.h5  catalog_409.0.h5  catalog_70.0.h5
catalog_143.0.h5  catalog_27.3.h5   catalog_41.7.h5   catalog_729.0.h5
catalog_153.0.h5  catalog_275.0.h5  catalog_44.0.h5   catalog_73.7.h5
catalog_164.0.h5  catalog_294.0.h5  catalog_467.0.h5  catalog_79.6.h5
catalog_18.7.h5   catalog_30.0.h5   catalog_47.4.h5   catalog_817.0.h5
catalog_189.0.h5  catalog_306.0.h5  catalog_525.0.h5  catalog_857.0.h5
catalog_21.6.h5   catalog_314.0.h5  catalog_545.0.h5  catalog_90.2.h5
catalog_210.0.h5  catalog_340.0.h5  catalog_584.0.h5  catalog_906.0.h5
catalog_217.0.h5  catalog_35.9.h5   catalog_63.9.h5


In [4]:
# Frequencies to process, spelled exactly as they appear in the
# `catalog_<freq>.h5` filenames (presumably GHz -- the files carry no metadata).
# Kept as strings so the filenames can be rebuilt verbatim.
freqs = [
    "18.7", "24.5", "44.0", "70.0", "100.0", "143.0", "217.0",
    "353.0", "545.0", "643.0", "729.0", "857.0", "906.0",
]

In [5]:
# Read-only handle on the 100.0 catalog; the flux cut below is defined on it.
cat = h5py.File("catalog_100.0.h5", mode="r")

There is no metadata in the file; I assume fluxes are in `Jy`.

In [6]:
# Flux threshold for the bright-source selection (1 mJy if fluxes are in Jy,
# which is an assumption: the catalog has no unit metadata).
cutoff_flux = 0.001

In [7]:
# Boolean mask over the sources of the 100.0 catalog: True where the flux
# exceeds the cutoff.
high_flux_sources_mask = np.greater(cat["flux"][:], cutoff_flux)

In [8]:
# Number of sources above the cutoff.
np.count_nonzero(high_flux_sources_mask)

372255

In [9]:
# Selected sources as a percentage of the whole catalog.
100 * high_flux_sources_mask.mean()

0.13211945911740433

In [10]:
# Peek at the first three entries of every dataset stored in the file.
for dataset_name in cat:
    print(dataset_name, cat[dataset_name][:3])

flux [3.24291534e-07 3.16862867e-07 3.17171157e-07]
phi [3.22861886 3.22861886 3.22861886]
polarized flux [1.42910628e-09 1.99535624e-08 2.29563857e-09]
theta [1.64009452 1.64009452 1.64009452]


In [11]:
# Positions, within the full catalog, of the sources above the cutoff.
all_indices = np.flatnonzero(high_flux_sources_mask)

In [12]:
# Sanity check: must equal the number of True entries in the mask.
all_indices.shape[0]

372255

In [13]:
# Ensure the indices are in increasing order. np.sort stays in NumPy instead of
# round-tripping every element through Python's sorted(); the result is the
# same array (np.nonzero already returns indices in ascending order).
all_indices = np.sort(all_indices)

In [14]:
import pandas as pd
import xarray as xr

In [15]:
# Names of the datasets stored in each catalog file.
# NOTE(review): not referenced by any later cell visible here.
columns = ["theta", "phi", "flux", "polarized flux"]

In [16]:
# Zero-initialised (source, frequency) table, filled from the catalogs later.
# `dims` is given explicitly: inferring dimension names from the order of a
# `coords` dict is deprecated in xarray and rejected by recent releases.
flux = xr.DataArray(
    data=np.zeros((len(all_indices), len(freqs)), dtype=np.float64),
    dims=["index", "freq"],
    coords={"index": all_indices, "freq": [float(freq) for freq in freqs]},
    name="flux",
)
# Independent copy with the same shape, to hold the normalised fluxes.
fluxnorm = flux.copy()

In [17]:
# Separate zero-filled table, same layout as `flux`, for the polarized fluxes.
polarized_flux = flux.copy(deep=True)

In [18]:
# Gather the three tables in one Dataset, then fill flux and polarized flux
# frequency by frequency, keeping only the sources selected by the mask.
sources_xr = xr.Dataset(
    {"flux": flux, "polarized_flux": polarized_flux, "fluxnorm": fluxnorm}
)
for freq in freqs:
    print(freq)
    # The context manager closes each file once it has been read, instead of
    # leaking one open HDF5 handle per frequency. A loop-local name is used so
    # the global `cat` keeps pointing at the 100.0 catalog the mask was built
    # from (assumes all matched catalogs list the same sources in the same
    # order, which applying one mask to every file already requires).
    with h5py.File(f"catalog_{freq}.h5", "r") as freq_cat:
        for column in ["flux", "polarized_flux"]:
            # Dataset names in the file use a space where the Dataset uses "_".
            sources_xr[column].loc[
                dict(index=all_indices, freq=float(freq))
            ] = freq_cat[column.replace("_", " ")][high_flux_sources_mask]

18.7
24.5
44.0
70.0
100.0
143.0
217.0
353.0
545.0
643.0
729.0
857.0
906.0


In [19]:
# Reorder the sources by increasing flux at the first frequency of `freqs`.
# NOTE(review): from here on the row order no longer matches `all_indices` or
# a `high_flux_sources_mask` read; per-source data attached later must be
# permuted the same way.
first_freq_flux = sources_xr["flux"].sel(freq=float(freqs[0]))
sources_xr = sources_xr.sortby(first_freq_flux)

In [20]:
# Relabel the rows 0..N-1 so later cells can address a source by its position.
# This overwrites the original catalog indices that were carried through the sort.
sources_xr.coords["index"] = np.arange(len(sources_xr.coords["index"]))

In [21]:
# Normalise each source's spectrum by its own flux at freq=100.
# One broadcast division replaces a Python loop over every source with
# label-based indexing; the element-wise results are identical.
# `drop=True` removes the scalar `freq` coordinate from the reference so it
# broadcasts cleanly against the (index, freq) table.
sources_xr["fluxnorm"] = sources_xr["flux"] / sources_xr["flux"].sel(
    freq=100, drop=True
)

In [22]:
#print(sources_xr["fluxnorm"].loc[dict(index=s)], sources_xr["flux"].loc[dict(index=s)])

In [23]:
#sources_xr.fluxnorm.plot(vmin=0, vmax=100)
#plt.figure()
#sources_xr.flux.plot(vmin=0, vmax=100)

In [24]:
# Zero-filled (source, power) tables for the five polynomial coefficients of
# each fitted quantity: flux, normalised flux and polarized flux.
powers = np.arange(5)
empty_coefficients = xr.DataArray(
    np.zeros((len(all_indices), len(powers)), dtype=np.float64),
    dims=["index", "power"],
    coords={"power": powers},
)
for coef_name in ["logpolycoefflux", "logpolycoefnorm", "logpolycoefpolflux"]:
    # Each variable gets its own copy so the fits do not share storage.
    sources_xr[coef_name] = empty_coefficients.copy()

In [25]:
from scipy.optimize import curve_fit


def model(freq, a, b, c, d, e):
    """Evaluate a 4th-degree polynomial in the natural log of `freq`.

    Returns a + b*x + c*x**2 + d*x**3 + e*x**4 with x = log(freq).
    `freq` may be a scalar or an array; the result has the same shape.
    """
    x = np.log(freq)
    linear = b * x
    quadratic = c * x**2
    cubic = d * x**3
    quartic = e * x**4
    # Summed from lowest to highest order.
    return a + linear + quadratic + cubic + quartic


# Fit `model` independently to every source, for each of the three spectra.
# Pairs are (coefficient variable to fill, data variable to fit), in the
# order the fits are performed.
fit_targets = [
    ("logpolycoefflux", "flux"),
    ("logpolycoefpolflux", "polarized_flux"),
    ("logpolycoefnorm", "fluxnorm"),
]
for s in range(len(all_indices)):
    for coef_name, data_name in fit_targets:
        # Only the best-fit parameters are stored; the covariance is unused.
        popt, cov = curve_fit(
            model, sources_xr.coords["freq"], sources_xr[data_name].sel(index=s)
        )
        sources_xr[coef_name].loc[dict(index=s)] = popt

In [26]:
# for s in range(len(all_indices)):
#     plt.figure()
#     sources_xr.flux.sel(index=s).plot(marker="o", linestyle="none")  # , xscale="log")
#     sources_xr.fluxnorm.sel(index=s).plot(
#         marker="o", linestyle="none"
#     )  # , xscale="log")

#     plt.loglog(
#         sources_xr.coords["freq"],
#         model(sources_xr.coords["freq"], *sources_xr.logpolycoefflux.sel(index=s)),
#     )
#     plt.loglog(
#         sources_xr.coords["freq"],
#         model(sources_xr.coords["freq"], *sources_xr.logpolycoefnorm.sel(index=s)),
#     )
#     plt.grid()
#     break

In [27]:
# Range spanned by the fitted flux coefficients, over all sources and powers.
coef_flux = sources_xr["logpolycoefflux"]
coef_flux.min(), coef_flux.max()

(<xarray.DataArray 'logpolycoefflux' ()>
 array(-17557.80288493),
 <xarray.DataArray 'logpolycoefflux' ()>
 array(23993.59927165))

In [28]:
# plt.figure(figsize=(12, 5))
# plt.subplot(121)
# sources_xr.logpolycoefflux.plot(vmax=50, vmin=-50)
# plt.subplot(122)
# sources_xr.logpolycoefnorm.plot(vmax=50, vmin=-50)

In [29]:
# plt.figure(figsize=(15, 8))

# for power in range(5):
#     plt.subplot(231 + power)

#     np.fabs(sources_xr.logpolycoefflux.loc[dict(power=power)]).plot.hist(
#         bins=np.logspace(-0, 4, 20), density=False, lw=3, label="fluxes"
#     )

#     np.fabs(sources_xr.logpolycoefnorm.loc[dict(power=power)]).plot.hist(
#         bins=np.logspace(-0, 4, 20),
#         density=False,
#         histtype="step",
#         lw=2,
#         label="normalized fluxes",
#         linestyle="--",
#     )
#     plt.grid()
#     plt.title(f"Power {power}")
#     plt.legend()
#     plt.xscale("log")
#     plt.xlabel(None)

In [30]:
# Only the flux and polarized-flux coefficients go into the output catalog.
output_variables = ["logpolycoefflux", "logpolycoefpolflux"]
output_catalog = sources_xr[output_variables]

In [31]:
# Label the rows with indices into the original catalog.
# NOTE(review): `all_indices` is in ascending catalog order, whereas the rows
# were re-sorted by flux in an earlier cell -- confirm the two orders agree.
output_catalog["index"] = all_indices

In [32]:
# Record the (assumed) flux unit on both coefficient variables.
for coef_name in ["logpolycoefflux", "logpolycoefpolflux"]:
    output_catalog[coef_name].attrs["units"] = "Jy"

In [33]:
# The rows of `output_catalog` are ordered by increasing flux at freqs[0] (the
# earlier `sortby` cell), but a boolean-mask read returns sources in catalog
# order. Attaching positions or catalog indices without the same permutation
# pairs every fit with the wrong source, so rebuild the permutation first.
with h5py.File(f"catalog_{freqs[0]}.h5", "r") as sort_cat:
    sort_key = sort_cat["flux"][high_flux_sources_mask]
# Stable sort, matching how xarray's `sortby` (np.lexsort) orders equal keys.
sort_order = np.argsort(sort_key, kind="stable")
# Fail loudly if the rows are not in the order reconstructed here.
np.testing.assert_array_equal(
    sort_key[sort_order].astype(np.float64),
    sources_xr["flux"].sel(freq=float(freqs[0])).values,
)

# Catalog indices in the same (flux-sorted) order as the coefficient rows.
output_catalog["index"] = all_indices[sort_order]
for coord in ["theta", "phi"]:
    coord_values = cat[coord][high_flux_sources_mask].astype(np.float64)
    output_catalog = output_catalog.assign_coords(
        **{coord: ("index", coord_values[sort_order])}
    )

In [34]:
# Display the dataset summary to check variables, coordinates and attributes.
output_catalog

In [35]:
# Output path; the file is written below in NETCDF4 format despite the .h5 suffix.
output_filename = "websky_high_flux_catalog_1mJy.h5"

In [36]:
# Tag both angular coordinates with their unit.
for angle_name in ["theta", "phi"]:
    output_catalog.coords[angle_name].attrs["units"] = "rad"

In [37]:
# Write the catalog to disk (requires the netcdf4 package).
output_catalog.to_netcdf(path=output_filename, format="NETCDF4")

In [38]:
# Check the size and permissions of the file just written.
%ls -lah $output_filename

-rw-rw---- 1 zonca sobs 37M Jun 27 16:53 websky_high_flux_catalog_1mJy.h5


In [39]:
import xarray

In [40]:
# Round-trip check: the written file can be read back with xarray.
xarray.open_dataset(output_filename)

In [41]:
import h5py
# The same file is also readable with plain h5py. The handle `f` is left open
# because the next cell reads attributes through it.
f = h5py.File(output_filename, 'r')
f["logpolycoefflux"]

<HDF5 dataset "logpolycoefflux": shape (372255, 5), type "<f8">

In [42]:
# The units attribute set through xarray is visible as an HDF5 attribute.
f["logpolycoefflux"].attrs["units"]

b'Jy'