In [90]:
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import umap.plot as uplot
import h5py
import pickle
from sklearn.utils import Bunch
from datetime import datetime
import bokeh.plotting as bk
import bokeh.transform as btr
from bokeh.models import *
from bokeh.palettes import *
bk.output_notebook()
%matplotlib inline
sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})


# get filepaths of all files in '/mnt/c/Users/findm/Desktop/surf2023/waterfall_data/data/'
filepaths = glob.glob('/mnt/c/Users/findm/Desktop/surf2023/waterfall_data/data/*.h5')

In [2]:
f = h5py.File(filepaths[-1], 'r')
print(list(f['frb'].keys()))
print(list(f['frb'].attrs.keys()))

['calibrated_wfall', 'extent', 'model_spec', 'model_ts', 'model_wfall', 'plot_freq', 'plot_time', 'spec', 'ts', 'wfall']
['calibration_observation_date', 'calibration_source_name', 'dm', 'scatterfit', 'tns_name']
<class 'numpy.float64'>


- `extent`: the extent of the waterfall data
- `plot_freq`: The values of the frequecy indices in $\rm{MHz}$
- `plot_time`: The value of the time indices in $\rm{\mu s}$
- `wfall`: waterfall data
- `model_wfall`: waterfall from fitted data
- `spec`: Dynamic Spectrum
- `model_spec`: model-fitted dynamic spectrum
- `ts`: time series data
- `model_ts`: model-fitted time series
- `caliberated_wfall`: The waterfall data with calibration applied
- `calibrated_observation_date`
- `calibration_source_name`
- `dm`
- `scatterfit`
- `tns_name`

In [4]:
# make a new sklearn-compatible dataset with one entry for every frb file
# each entry should have tns_name and dm as metadata, and plot_freq, plot_time, wfall, spec, ts as data

tns_names = []
dates = []
# dms = []
# plot_freqs = []
# plot_times = []
# wfalls = []
specs = []
tss = []

for f in filepaths:
    with h5py.File(f, 'r') as frb:
        tns_names.append(frb['frb'].attrs['tns_name'].decode())
        dates.append(datetime.strptime(frb['frb'].attrs['tns_name'].decode()[3:-1], '%Y%m%d'))
        # dms.append(frb['frb'].attrs['dm'])
        # plot_freqs.append(frb['frb']['plot_freq'][:])
        # plot_times.append(frb['frb']['plot_time'][:])
        # wfalls.append(frb['frb']['wfall'][:])
        specs.append(frb['frb']['spec'][:])
        tss.append(frb['frb']['ts'][:])
        
frb_dataset = Bunch(tns_name=tns_names, date=dates, spec=specs, ts=tss)

In [5]:
from cfod import catalog
catalog = catalog.as_dataframe()

| Column Name | Description | Units |
| --- | --- | --- |
| tns_name | TNS name of the burst | - |
| previous_name | Previous name (if applicable) | - |
| repeater_name | Associated repeater name (if applicable) | - |
| ra | Right ascension (J2000) | degrees |
| ra_err | Right ascension error (68% confidence) | degrees |
| dec | Declination (J2000) | degrees |
| dec_err | Declination error (68% confidence) | degrees |
| gl | Galactic longitude | degrees |
| gb | Galactic latitude | degrees |
| exp_up | Exposure for upper transit of the source | hour |
| exp_up_err | Exposure error for upper transit of the source | hour |
| exp_low | Exposure for lower transit of the source | hour |
| exp_low_err | Exposure error for lower transit of the source | hour |
| bonsai_snr | Detection SNR | - |
| bonsai_dm | Detection DM | pc cm−3 |
| low_ft_68 | Lower limit fluence threshold (68% confidence) | Jy ms |
| up_ft_68 | Upper limit fluence threshold (68% confidence) | Jy ms |
| low_ft_95 | Lower limit fluence threshold (95% confidence) | Jy ms |
| up_ft_95 | Upper limit fluence threshold (95% confidence) | Jy ms |
| snr_fitb | SNR determined using the fitting algorithm fitburst | - |
| dm_fitb | DM determined using the fitting algorithm fitburst | pc cm−3 |
| dm_fitb_err | DM error determined using the fitting algorithm fitburst | pc cm−3 |
| dm_exc_ne2001 | DM excess between DM determined by fitburst and NE2001 assuming the best-fit sky position of the source | pc cm−3 |
| dm_exc_ymw16 | DM excess between DM determined by fitburst and YMW16 assuming the best-fit sky position of the source | pc cm−3 |
| bc_width | Box car width of the pulse | s |
| scat_time | Scattering time (at 600MHz) | s |
| scat_time_err | Scattering time (at 600MHz) error | s |
| flux | Flux | Jy |
| flux_err | Flux error | Jy |
| fluence | Fluence | Jy ms |
| fluence_err | Fluence error | Jy ms |
| sub_num | Sub-burst number (1 if the FRB has only one burst) | - |
| mjd_400 | Time of arrival in UTC at CHIME location (topocentric) with reference to 400.19 MHz for the specific sub-burst | MJD |
| mjd_400_err | Time of arrival in UTC at CHIME location (topocentric) error with reference to 400.19 MHz for the specific sub-burst | MJD |
| mjd_inf | Time of arrival in UTC at CHIME location (topocentric) with reference to infinite frequency for the specific sub-burst | MJD |
| mjd_inf_err | Time of arrival in UTC at CHIME location (topocentric) error with reference to infinite frequency for the specific sub-burst | MJD |
| width_fitb | Width of sub-burst using fitburst | s |
| width_fitb_err | Width error of sub-burst using fitburst | s |
| sp_idx | Spectral index for the sub-burst | - |
| sp_idx_err | Spectral index error for the sub-burst | - |
| sp_run | Spectral running for the sub-burst | - |
| sp_run_err | Spectral running error for the sub-burst | - |
| high_freq | Highest frequency band of detection for the sub-burst | MHz |
| low_freq | Lowest frequency band of detection for the sub-burst | MHz |
| peak_freq | Peak frequency for the sub-burst | MHz |
| chi_sq | Chi-squared from fitburst | - |
| dof | Number of degrees of freedom in fitburst | - |
| flag_frac | Fraction of spectral channels flagged in fitburst | - |

In [6]:
# get attribute of frb by name
def get_attr(tns_name, attr_name, log=False):
    x = catalog[catalog['tns_name'] == tns_name][attr_name].values[0]
    if isinstance(x, str):
        x = x.replace('<', '')
        x = x.replace('>', '')
        x = float(x)
    if log:
        x = np.log10(x)
    return x

In [89]:
reducer = umap.UMAP(
    n_neighbors=15, 
    min_dist=0.1, 
    n_components=2, 
    metric='euclidean', 
    verbose=True
)
reducer.fit(np.nan_to_num(frb_dataset.spec))

embedding_df = pd.DataFrame(reducer.embedding_, columns=('x', 'y'))
embedding_df['tns_name'] = frb_dataset.tns_name
embedding_df['date'] = [d.year for d in frb_dataset.date]
embedding_df['dm'] = [get_attr(tns_name, 'dm_fitb') for tns_name in frb_dataset.tns_name]
embedding_df['width'] = [get_attr(tns_name, 'width_fitb')*1000000 for tns_name in frb_dataset.tns_name]

def make_colormap(col_name):
    return btr.linear_cmap(col_name, Plasma256, low=min(embedding_df[col_name]), high=max(embedding_df[col_name]))

tooltips = """
    <div>
        <div>
            <span style="font-size: 15px; font-weight: bold;">@tns_name</span>
        </div>
        <div>
            <span style="font-size: 10px; color: #696;">Width (µs): </span>
            <span style="font-size: 10px; color: #696;">@width</span>
        </div>
    </div>
"""

plot = bk.figure(
    title='UMAP projection of FRB data',
    toolbar_location='above',
    tools=(
        CopyTool(),
        HoverTool(tooltips=tooltips)
    )
)

plot.circle(
    x="x",
    y="y", 
    source=embedding_df, 
    size=7,
    line_color=None,
    color=make_colormap('width'),
)

plot.grid.visible = False
plot.axis.visible = False

bk.show(plot)


UMAP( verbose=True)
Wed Jul  5 02:31:22 2023 Construct fuzzy simplicial set
Wed Jul  5 02:31:23 2023 Finding Nearest Neighbors
Wed Jul  5 02:31:23 2023 Finished Nearest Neighbor Search
Wed Jul  5 02:31:23 2023 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Wed Jul  5 02:31:29 2023 Finished embedding


In [107]:
bk.save(plot, 'umap_frb_spec.html')

  bk.save(plot, 'umap_frb_spec.html')
  bk.save(plot, 'umap_frb_spec.html')


'/home/guutz/dawn/umap_frb_spec.html'

In [81]:
import pickle
chime_frb_529_interpolated_ts = pickle.load(open('chime_frb_529_interpolated_ts.pkl','rb'))
data = chime_frb_529_interpolated_ts[:][3000:8000]

In [82]:
data

array([], shape=(0, 14962), dtype=float64)

In [101]:
list1 = frb_dataset['tns_name']
list2 = list(catalog['tns_name'])
missing = set(list1).difference(list2)
added = set(list2).difference(list1)
print('Missing values in second list:', ','.join(missing))
print('Additional values in second list:\n', ',\n'.join(added))

Missing values in second list: 
Additional values in second list:
 FRB20190322B,
FRB20190317B,
FRB20190317A,
FRB20181222D,
FRB20190329A,
FRB20190228A,
FRB20181220A


In [105]:
duplicates = [x for x in set(list2) if list2.count(x) > 1]
print(duplicates)
print(len(duplicates))

['FRB20181028A', 'FRB20190501B', 'FRB20190422A', 'FRB20190605B', 'FRB20181117B', 'FRB20181223A', 'FRB20190625E', 'FRB20181226A', 'FRB20181224E', 'FRB20190611A', 'FRB20181019A', 'FRB20190308C', 'FRB20190423B', 'FRB20180917A', 'FRB20181119D', 'FRB20190308B', 'FRB20190519B', 'FRB20181222E', 'FRB20180814B', 'FRB20190109A', 'FRB20190208A', 'FRB20190122C', 'FRB20190301A', 'FRB20181125A', 'FRB20181226B', 'FRB20190104A', 'FRB20181128C', 'FRB20190519A', 'FRB20190421A', 'FRB20190423A', 'FRB20190124C', 'FRB20190604F', 'FRB20190131D', 'FRB20190411C', 'FRB20181222A', 'FRB20190601C', 'FRB20190527A', 'FRB20190213B', 'FRB20181104C', 'FRB20181128A', 'FRB20190609A', 'FRB20190111A']
42


In [106]:
len(set(list2))

536