In [1]:
import h5py
import numpy as np
import healpy as hp
import matplotlib.pyplot as plt

In [2]:
# Band frequencies, kept as strings because they are interpolated verbatim into
# the catalog file names (`catalog_{freq}.h5`) when the files are read below.
# NOTE(review): presumably GHz — a later cell labels the 100.0 entry "100 GHz".
freqs = [
    "18.7",
    "24.5",
    "44.0",
    "70.0",
    "100.0",
    "143.0",
    "217.0",
    "353.0",
    "545.0",
    "643.0",
    "729.0",
    "857.0",
    "906.0",
]

In [3]:
# Open the 100.0 catalog read-only; the bright-source selection below is derived
# from it. The handle is left open because later cells keep reading from `cat`.
cat = h5py.File("data/matched_catalogs_2/catalog_100.0.h5", "r")

The file contains no metadata; the fluxes are assumed to be in `Jy`.

In [4]:
# Flux threshold for the bright-source selection: 1e-3 is 1 mJy if the catalog
# fluxes are in Jy (assumed — the file carries no unit metadata).
cutoff_flux = 1e-3

In [5]:
# Boolean flag per catalog row: True where the flux exceeds the threshold.
# The full flux column is read into memory only for the comparison.
high_flux_sources_mask = np.greater(cat["flux"][:], cutoff_flux)

In [6]:
# Number of sources above the threshold.
np.sum(high_flux_sources_mask)

372255

In [7]:
# Share of the catalog above the threshold, in percent.
100 * high_flux_sources_mask.mean()

0.13211945911740433

In [8]:
# Peek at the first three entries of every dataset stored in the catalog file.
for name in cat.keys():
    dataset = cat[name]
    print(name, dataset[:3])

flux [3.24291534e-07 3.16862867e-07 3.17171157e-07]
phi [3.22861886 3.22861886 3.22861886]
polarized flux [1.42910628e-09 1.99535624e-08 2.29563857e-09]
theta [1.64009452 1.64009452 1.64009452]


In [9]:
# Row numbers (positions in the full catalog) of the sources above the threshold.
all_indices = np.flatnonzero(high_flux_sources_mask)

In [10]:
# Number of selected sources; must match the mask count above.
all_indices.size

372255

In [11]:
# Make the ascending order explicit. np.sort returns a sorted copy natively,
# instead of iterating ~372k NumPy scalars through Python's sorted() and
# rebuilding the array from the resulting list.
all_indices = np.sort(all_indices)

`all_indices` contains the indices of the selected sources in the original Websky catalog

In [12]:
# Display the sorted catalog indices of the selected sources.
all_indices

array([    11253,     16428,     24110, ..., 281755117, 281755430,
       281755795])

In [13]:
import pandas as pd
import xarray as xr

In [14]:
# Names of the datasets stored in each catalog file (as printed above).
# NOTE(review): `columns` is not referenced by any other cell visible here.
columns = ["theta", "phi", "flux", "polarized flux"]

In [15]:
# Zero-filled (source, frequency) table labelled by the original catalog index
# and by the numeric frequency; it is filled band by band further down.
n_sources, n_freqs = len(all_indices), len(freqs)
flux = xr.DataArray(
    data=np.zeros((n_sources, n_freqs), dtype=np.float64),
    coords={"index": all_indices, "freq": [float(f) for f in freqs]},
    name="flux",
)
# Independent copy with the same labels, later overwritten with the normalized flux.
fluxnorm = flux.copy()

In [16]:
# Separate zero-filled table (deep copy, the default) for the polarized flux.
polarized_flux = flux.copy(deep=True)

In [17]:
# Collect the three (source, frequency) tables in one dataset, then fill the
# flux and polarized flux band by band from the per-frequency catalog files.
sources_xr = xr.Dataset(
    {"flux": flux, "polarized_flux": polarized_flux, "fluxnorm": fluxnorm}
)
for freq in freqs:
    print(freq)
    # A context manager closes each file as soon as its band has been copied;
    # the previous version left all thirteen handles open. A dedicated name is
    # used so the global `cat` (the 100.0 catalog the mask was derived from) is
    # no longer silently rebound to whichever file happened to be read last.
    # NOTE(review): later cells reading `cat` now see the 100.0 file; this
    # assumes rows are aligned across the per-frequency files, which the shared
    # mask below already relies on.
    with h5py.File(f"data/matched_catalogs_2/catalog_{freq}.h5", "r") as freq_cat:
        for column in ["flux", "polarized_flux"]:
            # Dataset names in the file use a space where the xarray names use "_".
            sources_xr[column].loc[dict(index=all_indices, freq=float(freq))] = freq_cat[
                column.replace("_", " ")
            ][high_flux_sources_mask]

18.7


24.5


44.0


70.0


100.0


143.0


217.0


353.0


545.0


643.0


729.0


857.0


906.0


In [18]:
# Display the populated dataset.
sources_xr

In [19]:
# Positions (not catalog indices) of the faintest and brightest source at
# 100 GHz, before the dataset is sorted.
flux_at_100 = sources_xr.flux.sel(freq=100)
min_idx = int(flux_at_100.argmin().item())
max_idx = int(flux_at_100.argmax().item())
print("Index with minimum flux at 100 GHz:", min_idx)
print("Index with maximum flux at 100 GHz:", max_idx)

Index with minimum flux at 100 GHz: 64456
Index with maximum flux at 100 GHz: 9474


In [20]:
# Reorder all sources by decreasing 100 GHz flux; the "index" coordinate moves
# with the data, so it keeps pointing at the original catalog rows.
sort_key = sources_xr.flux.sel(freq=100.0)
sources_xr = sources_xr.sortby(sort_key, ascending=False)

In [21]:
# NOTE(review): len() of an xarray Dataset counts its data variables
# (flux, polarized_flux, fluxnorm -> 3), not the sources; the number of
# sources is the size of the "index" dimension.
len(sources_xr)

3

In [22]:
# After sorting, the brightest source must sit at position 0 and the faintest
# at the last position.
sorted_flux_at_100 = sources_xr.flux.sel(freq=100)
max_loc = int(sorted_flux_at_100.argmax().item())
min_loc = int(sorted_flux_at_100.argmin().item())
print("Location (position) of maximum flux at 100 GHz:", max_loc)
print("Location (position) of minimum flux at 100 GHz:", min_loc)


Location (position) of maximum flux at 100 GHz: 0
Location (position) of minimum flux at 100 GHz: 372254


In [23]:
# Show the "index" coordinate, now ordered by decreasing 100 GHz flux.
sources_xr.coords["index"]

In [24]:
# this was needed for some missing functionality in an older version of xarray
#sources_xr.coords["index"] = np.arange(len(sources_xr.coords["index"]))

In [25]:
# Normalize every source's spectrum by its own flux at 100 GHz.
reference_flux = sources_xr["flux"].sel(freq=100)
sources_xr["fluxnorm"] = sources_xr["flux"] / reference_flux


In [26]:
#print(sources_xr["fluxnorm"].loc[dict(index=s)], sources_xr["flux"].loc[dict(index=s)])

In [27]:
#sources_xr.fluxnorm.plot(vmin=0, vmax=100)
#plt.figure()
#sources_xr.flux.plot(vmin=0, vmax=100)

In [28]:
# Zero-filled (source, power) tables for the fitted coefficients. The "power"
# coordinate runs 4, 3, 2, 1, 0 so column 0 holds the highest-order term.
n_coefficients = 5
powers = np.arange(n_coefficients)[::-1]
sources_xr["logpolycoefflux"] = xr.DataArray(
    np.zeros((len(all_indices), n_coefficients), dtype=np.float64),
    dims=["index", "power"],
    coords={"power": powers},
)
# Same layout for the normalized-flux and polarized-flux fits.
for coef_name in ("logpolycoefnorm", "logpolycoefpolflux"):
    sources_xr[coef_name] = sources_xr["logpolycoefflux"].copy()

In [29]:
from scipy.optimize import curve_fit
import time

def is_interactive():
    """Return True when running under an active IPython shell or kernel.

    Returns False when IPython is not installed or no IPython instance exists.
    """
    try:
        from IPython import get_ipython
    except ImportError:
        return False
    return get_ipython() is not None

def model(freq, a, b, c, d, e):
    """Evaluate a quartic polynomial in the natural log of `freq`.

    Coefficients are given from the highest power down: `a` multiplies
    log(freq)**4 and `e` is the constant term.
    """
    x = np.log(freq)
    # Accumulate the terms from the highest power down, in the same order as
    # a left-to-right sum of the five terms.
    result = a * x**4
    result = result + b * x**3
    result = result + c * x**2
    result = result + d * x
    return result + e

# Fit a 4th-order polynomial in log(freq) to the flux, polarized flux and
# normalized flux of every source.
#
# The polynomial (same functional form as `model`) is linear in its five
# coefficients, so the least-squares solution has a closed form. np.polyfit
# solves it for all sources sharing the same frequencies in a single call,
# replacing ~1.1 million iterative curve_fit calls (three per source) issued
# from a Python loop.
start_time = time.time()
total = len(sources_xr.coords["index"])
log_freq = np.log(sources_xr.coords["freq"].values)

fit_targets = {
    "flux": "logpolycoefflux",
    "polarized_flux": "logpolycoefpolflux",
    "fluxnorm": "logpolycoefnorm",
}
for source_name, coef_name in fit_targets.items():
    # np.polyfit expects one data set per column: shape (n_freq, n_sources).
    samples = sources_xr[source_name].transpose("freq", "index").values
    # curve_fit rejected non-finite input with a ValueError; keep that
    # failure mode explicit instead of silently producing NaN coefficients.
    if not np.isfinite(samples).all():
        raise ValueError(f"Non-finite values in '{source_name}', cannot fit")
    # Rows of the result run from the log_freq**4 term down to the constant,
    # i.e. (a, b, c, d, e), matching the descending "power" coordinate.
    coefficients = np.polyfit(log_freq, samples, deg=4)
    # Positional assignment: the rows follow the dataset's current source order.
    sources_xr[coef_name] = (("index", "power"), coefficients.T)

print(f"Fitted {total} sources in {time.time() - start_time:.1f} s")

Fitting sources:   0%|          | 0/372255 [00:00<?, ?it/s]

In [30]:
# for s in range(len(all_indices)):
#     plt.figure()
#     sources_xr.flux.sel(index=s).plot(marker="o", linestyle="none")  # , xscale="log")
#     sources_xr.fluxnorm.sel(index=s).plot(
#         marker="o", linestyle="none"
#     )  # , xscale="log")

#     plt.loglog(
#         sources_xr.coords["freq"],
#         model(sources_xr.coords["freq"], *sources_xr.logpolycoefflux.sel(index=s)),
#     )
#     plt.loglog(
#         sources_xr.coords["freq"],
#         model(sources_xr.coords["freq"], *sources_xr.logpolycoefnorm.sel(index=s)),
#     )
#     plt.grid()
#     break

In [31]:
# Range of the fitted flux coefficients, as a quick sanity check.
flux_coefficients = sources_xr.logpolycoefflux
flux_coefficients.min(), flux_coefficients.max()

(<xarray.DataArray 'logpolycoefflux' ()> Size: 8B
 array(-17557.80083613),
 <xarray.DataArray 'logpolycoefflux' ()> Size: 8B
 array(23993.59700202))

In [32]:
# plt.figure(figsize=(12, 5))
# plt.subplot(121)
# sources_xr.logpolycoefflux.plot(vmax=50, vmin=-50)
# plt.subplot(122)
# sources_xr.logpolycoefnorm.plot(vmax=50, vmin=-50)

In [33]:
# plt.figure(figsize=(15, 8))

# for power in range(5):
#     plt.subplot(231 + power)

#     np.fabs(sources_xr.logpolycoefflux.loc[dict(power=power)]).plot.hist(
#         bins=np.logspace(-0, 4, 20), density=False, lw=3, label="fluxes"
#     )

#     np.fabs(sources_xr.logpolycoefnorm.loc[dict(power=power)]).plot.hist(
#         bins=np.logspace(-0, 4, 20),
#         density=False,
#         histtype="step",
#         lw=2,
#         label="normalized fluxes",
#         linestyle="--",
#     )
#     plt.grid()
#     plt.title(f"Power {power}")
#     plt.legend()
#     plt.xscale("log")
#     plt.xlabel(None)

In [34]:
# Only the fitted coefficients of the total and polarized flux are exported.
output_variables = ["logpolycoefflux", "logpolycoefpolflux"]
output_catalog = sources_xr[output_variables]

In [35]:
#no need to override this again
#output_catalog["index"] = all_indices

In [36]:
# Tag both coefficient tables with the flux unit.
for coef_name in ("logpolycoefflux", "logpolycoefpolflux"):
    output_catalog[coef_name].attrs["units"] = "Jy"

In [37]:
# Attach the sky position of every source as coordinates along "index".
#
# BUG FIX: `output_catalog` is ordered by decreasing 100 GHz flux (see the
# `sortby` cell), while the masked HDF5 read returns rows in catalog order,
# i.e. in the order of `all_indices`. Assigning those rows positionally
# attached positions to the wrong sources. The rows are therefore labelled
# with `all_indices` first and then reordered to follow the catalog's own
# "index" coordinate.
catalog_index_order = output_catalog.coords["index"].values
for coord in ["theta", "phi"]:
    in_file_order = xr.DataArray(
        cat[coord][high_flux_sources_mask].astype(np.float64),
        dims="index",
        coords={"index": all_indices},
    )
    # Label-based selection returns the values in the requested label order.
    aligned = in_file_order.sel(index=catalog_index_order)
    output_catalog = output_catalog.assign_coords(
        **{coord: ("index", aligned.values)}
    )

In [38]:
# Display the catalog that will be written to disk.
output_catalog

In [39]:
# Destination of the fitted catalog; it is written in NetCDF4 format below
# despite the .h5 suffix.
output_filename = "data/websky_high_flux_catalog_1mJy.h5"

In [40]:
# List any existing output file before it is written below.
!ls -l $output_filename

ls: cannot access 'data/websky_high_flux_catalog_1mJy.h5': No such file or directory


In [41]:
# Document unit and reference frame on both position coordinates.
for position_name in ("theta", "phi"):
    position_attrs = output_catalog.coords[position_name].attrs
    position_attrs["units"] = "rad"
    position_attrs["reference_frame"] = "Galactic"

In [42]:
# Free-text provenance stored as a global attribute of the output file:
# selection threshold, fit model, reference frame, ordering and the meaning
# of the "index" coordinate.
output_catalog.attrs["description"] = (
    "Websky catalog of sources with flux > 1 mJy at 100 GHz, fitted with a 4th order polynomial in log frequency. "
    "Galactic reference frame. Sorted by flux at 100 GHz (descending). "
    "The 'index' coordinate gives the original index in the Websky catalog."
)

In [43]:
# Write the catalog to disk in NetCDF4 format; mode="w" replaces an existing file.
output_catalog.to_netcdf(output_filename, format="NETCDF4", mode="w")  # requires netcdf4 package

In [44]:
# Confirm the file was written and check its size.
%ls -lah $output_filename

-rw-rw-r-- 1 azonca azonca 37M May 12 22:41 data/websky_high_flux_catalog_1mJy.h5


In [45]:
import xarray

In [46]:
# Read the file back with xarray as a round-trip check.
# NOTE(review): the returned dataset is displayed but never closed.
xarray.open_dataset(output_filename)

In [47]:
import h5py
# Inspect the same file through h5py to confirm it is readable as plain HDF5.
# The handle `f` is left open because the next cell reads from it.
f = h5py.File(output_filename, 'r')
f["logpolycoefflux"]

<HDF5 dataset "logpolycoefflux": shape (372255, 5), type "<f8">

In [48]:
# The units attribute comes back as bytes (b'Jy') when read through h5py.
f["logpolycoefflux"].attrs["units"]

b'Jy'