## Data storage

Data from this notebook will be stored at: `/media3/CRP7/hosts/data/homogenized`

## Data Format

The columns available in the alert package are described [here](https://fink-portal.org/api/v1/columns). 

Further explanation about the schema is given by ZTF [in this document](https://irsa.ipac.caltech.edu/data/ZTF/docs/ztf_explanatory_supplement.pdf). 

Meaning of columns (full list [here](https://fink-portal.org/api/v1/columns)):

fwhm:  "Full Width Half Max assuming a Gaussian core, from SExtractor (pixels)"  
aimage:  "Windowed profile RMS afloat major axis from SExtractor (pixels)"  
bimage: "Windowed profile RMS afloat minor axis from SExtractor (pixels)"  
isdiffpos: "t or 1 => candidate is from positive (sci minus ref) subtraction; f or 0 => candidate is from negative (ref minus sci) subtraction"  

## Read the object-based files and combine them into the same format for TNS, SIMBAD, SPICY

In [1]:
import io
import gzip
import copy
import re
import os

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import astropy.io.fits as fits

from pathlib import Path
from datetime import datetime
from concurrent.futures import ProcessPoolExecutor
from itertools import repeat

In [2]:
# Cross-match columns that are later collapsed from one value per alert to a
# single value per object (see the loop applying `filter_na_get_unique`).
REDUCE_COLUMNS = [
    'd:mangrove_2MASS_name',
    'd:mangrove_HyperLEDA_name',
    'd:mangrove_ang_dist',
    'd:mangrove_lum_dist',
]

# Columns stored as one array per object (one entry per alert). The
# cross-match columns above are part of this list as well.
TIME_SERIES_COLUMNS = [
    'i:aimage',
    'i:aimagerat',
    'i:bimage',
    'i:bimagerat',
    'i:candid',
    'i:chinr',
    'i:chipsf',
    'i:classtar',
    'i:dec',
    'i:fid',
    'i:fwhm',
    'i:isdiffpos',
    'i:jd',
    'i:maggaia',
    'i:maggaiabright',
    'i:magpsf',
    'i:neargaia',
    'i:neargaiabright',
    'i:ra',
    'i:sigmapsf',
    'v:classification',
    *REDUCE_COLUMNS,
]

# Cutout columns; only the stamps of the first and last alert are kept.
STAMP_COLUMNS = [
    f'b:cutout{stamp_kind}_stampData'
    for stamp_kind in ('Science', 'Template', 'Difference')
]

In [3]:
def process_simbad_tns_pickle(path_obj):
    """Aggregate one alert-level pickle into a table with one row per object.

    Parameters
    ----------
    path_obj : str or path-like
        Location of a pickled DataFrame holding one row per alert. It must
        contain 'i:objectId', 'i:jd' and the columns in STAMP_COLUMNS.

    Returns
    -------
    pandas.DataFrame
        One row per 'i:objectId' with
        * every column of TIME_SERIES_COLUMNS as a numpy array holding the
          per-alert values (columns absent from the file are filled with None),
        * '<stamp>_first' / '<stamp>_last' columns holding the stamps of the
          alerts with the smallest / largest 'i:jd'.
    """
    # NOTE(review): unpickling executes arbitrary code; only point this at
    # files produced by a trusted pipeline.
    #
    # The first/last alert lookup below goes through row labels
    # (idxmin/idxmax -> .loc). Give every row a unique label so that a
    # non-unique index stored in the pickle cannot make .loc return several
    # rows for a single object.
    df = pd.read_pickle(str(path_obj)).reset_index(drop=True)

    id_col = 'i:objectId'
    for col in TIME_SERIES_COLUMNS:
        if col not in df:
            df[col] = None

    # Collect the per-alert values of each object into tuples, then arrays.
    ts_df = df[[id_col] + TIME_SERIES_COLUMNS].groupby(id_col, as_index=False).aggregate(tuple)
    for col in TIME_SERIES_COLUMNS:
        ts_df[col] = ts_df[col].apply(np.array)

    # Row labels of the earliest and latest alert of each object.
    jd_groupby = df[[id_col, 'i:jd']].groupby(id_col)
    first_labels = jd_groupby.idxmin().values.flatten()
    last_labels = jd_groupby.idxmax().values.flatten()

    first_df = df.loc[first_labels, [id_col] + STAMP_COLUMNS].copy()
    last_df = df.loc[last_labels, [id_col] + STAMP_COLUMNS].copy()
    for col in STAMP_COLUMNS:
        first_df[col] = first_df[col].apply(np.array)
        last_df[col] = last_df[col].apply(np.array)
    first_df = first_df.rename(columns={col: f'{col}_first' for col in STAMP_COLUMNS})
    last_df = last_df.rename(columns={col: f'{col}_last' for col in STAMP_COLUMNS})

    obj_df = ts_df.merge(first_df, on=id_col).merge(last_df, on=id_col)
    return obj_df


In [4]:
# Aggregate every TNS alert pickle into per-object rows using 8 worker
# processes, then stack the per-file tables.
print(f'Started processing at {datetime.now()}')
# glob() yields paths in filesystem order, and executor.map() returns results
# in input order: sorting the paths makes the row order of `xmatch_tns`
# reproducible from run to run.
tns_path_objs = sorted(Path('/media3/CRP7/hosts/data/TNS/Apr2023/obj_info').glob('TNS*.pickle'))
with ProcessPoolExecutor(max_workers=8) as executor:
    obj_df_iterator = executor.map(process_simbad_tns_pickle, tns_path_objs)
    xmatch_tns = pd.concat(obj_df_iterator, ignore_index=True)
print(f'Finished processing at {datetime.now()}')

Started processing at 2023-09-12 12:35:23.513938
Finished processing at 2023-09-12 12:36:04.795066


In [5]:
# Aggregate every SIMBAD alert pickle (one sub-folder per Fink class) into
# per-object rows using 8 worker processes, then stack the per-file tables.
print(f'Started processing at {datetime.now()}')
# glob() yields paths in filesystem order, and executor.map() returns results
# in input order: sorting the paths makes the row order of `xmatch_simbad`
# reproducible from run to run.
simbad_path_objs = sorted(Path('/media3/CRP7/hosts/data/SIMBAD/Apr2023/obj_info').glob('finkclass=*/*.pickle'))
with ProcessPoolExecutor(max_workers=8) as executor:
    obj_df_iterator = executor.map(process_simbad_tns_pickle, simbad_path_objs)
    xmatch_simbad = pd.concat(obj_df_iterator, ignore_index=True)
print(f'Finished processing at {datetime.now()}')

Started processing at 2023-09-12 12:36:04.808675
Finished processing at 2023-09-12 12:46:02.933648


In [6]:
# Sanity check: (rows, columns) of the stacked per-object SIMBAD table.
xmatch_simbad.shape

(16857, 32)

In [7]:
# Load the SPICY cross-match from its parquet directory; it is reshaped to the
# TNS/SIMBAD column layout by `homogenize_spicy_df` below.
xmatch_spicy = pd.read_parquet('/media3/CRP7/hosts/data/SPICY/SPICY_CROSSMATCHED_2_ASEC_SMALL/')

In [8]:
def read_spicy_fits_image(bytes_str):
    """Decode one gzip-compressed FITS stamp and return its primary-HDU data.

    Parameters
    ----------
    bytes_str : bytes
        Gzip-compressed content of a FITS file.

    Returns
    -------
    numpy.ndarray or None
        A copy of the data of the first HDU, or None when that HDU carries
        no data.
    """
    # Close both the gzip stream and the HDU list once the data has been read;
    # the function is applied to every row, so leaked handles add up.
    with gzip.open(io.BytesIO(bytes_str)) as gz_file:
        with fits.open(gz_file) as hdu_list:
            data = hdu_list[0].data
            # Copy so the returned array does not depend on the closed file.
            return None if data is None else np.array(data)

In [9]:
def homogenize_spicy_df(xmatch_spicy):
    """Reshape the SPICY cross-match table to the TNS/SIMBAD column layout.

    Parameters
    ----------
    xmatch_spicy : pandas.DataFrame
        SPICY table with one row per object. It must contain 'objectId',
        'i:aimage' (one array per row) and a '<stamp>_small' column for every
        entry of STAMP_COLUMNS, each holding a sequence of gzip-compressed
        FITS stamps. The frame passed in is left untouched.

    Returns
    -------
    pandas.DataFrame
        New frame in which
        * missing TIME_SERIES_COLUMNS are added as arrays of None with the
          same length as the row's 'i:aimage' array,
        * '<stamp>_first' / '<stamp>_last' hold the decoded first / last
          stamp of each '<stamp>_small' sequence (all '_first' columns come
          before all '_last' columns),
        * the '<stamp>_small' columns are dropped,
        * 'objectId' is renamed to 'i:objectId'.
    """
    # Work on a shallow copy: the columns added below land on the copy only,
    # so the caller's frame is not modified as a side effect.
    spicy_df = xmatch_spicy.copy(deep=False)

    for col in TIME_SERIES_COLUMNS:
        if col not in spicy_df:
            spicy_df[col] = spicy_df['i:aimage'].apply(lambda x: np.array([None] * len(x)))

    # Outer loop over first/last keeps the column order expected downstream:
    # the three '_first' columns, then the three '_last' columns.
    for suffix, position in (('first', 0), ('last', -1)):
        for col in STAMP_COLUMNS:
            spicy_df[f'{col}_{suffix}'] = spicy_df[f'{col}_small'].apply(
                lambda stamps: read_spicy_fits_image(stamps[position])
            )

    spicy_df = spicy_df.drop(columns=[f'{col}_small' for col in STAMP_COLUMNS])
    return spicy_df.rename(columns={'objectId': 'i:objectId'})

In [10]:
# NOTE: this cell cannot be re-run on its own result. `homogenize_spicy_df`
# drops the '*_small' stamp columns it reads from, so reload `xmatch_spicy`
# (previous read_parquet cell) before executing it again.
xmatch_spicy = homogenize_spicy_df(xmatch_spicy)
# Displayed as the cell output: (rows, columns) of the reshaped table.
xmatch_spicy.shape

(3003, 32)

In [11]:
# The three tables must expose the same columns in the same order.
list(xmatch_tns.columns) == list(xmatch_simbad.columns) == list(xmatch_spicy.columns)

True

In [12]:
def filter_na_get_unique(x, raise_on_duplicate=False):
    """Return the single non-missing value contained in `x`, if there is one.

    Entries that are missing (None / NaN) or equal to the strings 'None' or
    'nan' are ignored.

    Parameters
    ----------
    x : array-like or scalar
        Values to reduce. Numpy arrays, lists, tuples and bare scalars are
        accepted; a scalar is treated as a one-element array, so applying the
        function to an already reduced value returns it unchanged.
    raise_on_duplicate : bool, default False
        What to do when more than one distinct value remains: raise a
        ValueError (True) or print a message and return None (False).

    Returns
    -------
    object or None
        The unique remaining value, or None when nothing remains or when the
        remaining values disagree.

    Raises
    ------
    ValueError
        If several distinct values remain and `raise_on_duplicate` is True.
    """
    # Normalise the input so boolean-mask indexing works for any sequence and
    # for scalars; an ndarray argument passes through unchanged.
    values = np.atleast_1d(np.asarray(x))
    as_series = pd.Series(values)
    keep = (
        (as_series != 'None').to_numpy(dtype=bool)
        & (as_series != 'nan').to_numpy(dtype=bool)
        & ~np.asarray(pd.isna(values), dtype=bool)
    )
    uniques = np.unique(values[keep])

    if len(uniques) == 1:
        return uniques[0]
    if len(uniques) > 1:
        message = f'Multiple matches {uniques} contained in array {x}'
        if raise_on_duplicate:
            raise ValueError(message)
        print(message)
    return None


In [13]:
# Collapse each per-alert cross-match array into a single value per object,
# for the three tables in turn (TNS, SIMBAD, SPICY).
for col in REDUCE_COLUMNS:
    for frame in (xmatch_tns, xmatch_simbad, xmatch_spicy):
        frame[col] = frame[col].apply(filter_na_get_unique)

Multiple matches ['166.5028539785387' '183.5741118958228'] contained in array ['166.5028539785387' '183.5741118958228' '166.5028539785387'
 '166.5028539785387' '166.5028539785387' '183.5741118958228'
 '183.5741118958228' '183.5741118958228' '183.5741118958228'
 '166.5028539785387' '183.5741118958228' '183.5741118958228'
 '183.5741118958228' '183.5741118958228' '183.5741118958228'
 '183.5741118958228' '183.5741118958228' '183.5741118958228'
 '183.5741118958228' '183.5741118958228' '183.5741118958228'
 '183.5741118958228' '183.5741118958228' '166.5028539785387'
 '183.5741118958228' '183.5741118958228' '183.5741118958228'
 '183.5741118958228' '166.5028539785387']
Multiple matches ['174.6425019939592' '207.14468030217245'] contained in array ['207.14468030217245' '207.14468030217245' '207.14468030217245'
 '207.14468030217245' '207.14468030217245' '207.14468030217245'
 '207.14468030217245' '207.14468030217245' '207.14468030217245'
 '207.14468030217245' '207.14468030217245' '207.144680302172

In [14]:
def get_classification_tuple(x):
    """Summarise the per-alert classifications of one object.

    Entries that are missing (None / NaN) or equal to the strings 'None' or
    'nan' are ignored. 'Unknown' is never reported as the best class.

    Parameters
    ----------
    x : array-like
        Per-alert classification labels.

    Returns
    -------
    tuple
        (best, frac_known, frac_all) where
        * best is the most frequent label other than 'Unknown' (ties go to
          the label that sorts first), or None if there is none,
        * frac_known is its count divided by the number of valid labels other
          than 'Unknown',
        * frac_all is its count divided by the number of valid labels,
          'Unknown' included.
        Both fractions are 0 when no usable label exists.
    """
    # Drop missing entries *before* np.unique: np.unique sorts its input, and
    # sorting an object array that mixes None with strings raises TypeError.
    values = np.atleast_1d(np.asarray(x))
    as_series = pd.Series(values)
    is_valid = (
        (as_series != 'None').to_numpy(dtype=bool)
        & (as_series != 'nan').to_numpy(dtype=bool)
        & ~np.asarray(pd.isna(values), dtype=bool)
    )
    labels, counts = np.unique(values[is_valid], return_counts=True)

    count_available = 0
    count_incl_unknown = 0
    most_frequent_value = None
    most_frequent_count = 0
    for value, count in zip(labels, counts):
        count_incl_unknown += count
        if value == 'Unknown':
            continue
        count_available += count
        # Strict '>' keeps the first label (in sorted order) on ties.
        if count > most_frequent_count:
            most_frequent_value = value
            most_frequent_count = count

    # max(1, ...) avoids a division by zero when nothing was counted.
    return (
        most_frequent_value,
        most_frequent_count / max(1, count_available),
        most_frequent_count / max(1, count_incl_unknown),
    )

def add_reduced_classification_cols(df):
    """Add per-object classification summary columns to `df`, in place.

    For every row, the array stored in 'v:classification' is reduced with
    `get_classification_tuple` and the three results are written to
    'v:classification_best', 'v:classification_prob' and
    'v:classification_prob_incl_unknown'.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'v:classification' column holding one array per row.
        The frame is modified in place; it may be empty.

    Returns
    -------
    None
    """
    class_col = 'v:classification'
    out_cols = [
        'v:classification_best',
        'v:classification_prob',
        'v:classification_prob_incl_unknown',
    ]
    summaries = [get_classification_tuple(labels) for labels in df[class_col]]
    # zip(*[]) yields nothing, which cannot be unpacked into three columns:
    # fall back to three empty columns for an empty frame.
    per_column = list(zip(*summaries)) if summaries else [(), (), ()]
    for out_col, column_values in zip(out_cols, per_column):
        df[out_col] = column_values
    return

In [15]:
# Append the three classification summary columns to each table (in place).
for frame in (xmatch_tns, xmatch_simbad, xmatch_spicy):
    add_reduced_classification_cols(frame)

In [16]:
# Column layout must still agree across tables after the added columns.
list(xmatch_tns.columns) == list(xmatch_simbad.columns) == list(xmatch_spicy.columns)

True

In [17]:
# (rows, columns) of each table after adding the classification summaries.
xmatch_tns.shape, xmatch_simbad.shape, xmatch_spicy.shape

((862, 35), (16857, 35), (3003, 35))

### Add TNS spectroscopic columns to TNS, fill for other surveys

In [18]:
def _split_tns_fields(line):
    """Split a line on whitespace, keeping single-quoted segments as one field."""
    fields = []
    # After splitting on the quote character, odd-numbered segments are the
    # ones that were enclosed in quotes.
    for position, segment in enumerate(line.split("'")):
        if position % 2 == 0:
            fields.extend(segment.split())
        else:
            fields.append(segment)
    return fields


def _make_names_unique(names):
    """Append underscores to repeated names until every name is distinct."""
    seen = set()
    unique_names = []
    for name in names:
        while name in seen:
            name += '_'
        seen.add(name)
        unique_names.append(name)
    return unique_names


def read_tns_dat_file(path):
    """Read a whitespace-separated TNS table with single-quoted fields.

    The first line is the header; repeated column names are made unique by
    appending underscores. Every following non-blank line is one row. Fields
    equal to the string 'None' become None. Rows with fewer fields than the
    header are padded with None; surplus fields are ignored.

    Parameters
    ----------
    path : str or path-like
        Location of the text file.

    Returns
    -------
    pandas.DataFrame
        One column per header entry, all values kept as strings (or None).
        An empty file gives an empty DataFrame.
    """
    header = None
    columns = None
    with open(path, 'r') as tns_infile:
        for line in tns_infile:
            fields = _split_tns_fields(line)
            if header is None:
                header = _make_names_unique(fields)
                columns = {name: [] for name in header}
                continue
            if not fields:
                # Blank line: nothing to record.
                continue
            # Pad short rows so every column receives exactly one value per
            # row; otherwise the columns end up with unequal lengths and the
            # DataFrame constructor fails.
            fields += [None] * (len(header) - len(fields))
            for name, value in zip(header, fields):
                columns[name].append(None if value == 'None' else value)
    return pd.DataFrame(columns)


In [19]:
# TNS event table: keep only the spectroscopic / host columns used below.
tns_events_df = read_tns_dat_file(
    '/media3/CRP7/hosts/data/TNS/Apr2023/tns_events.dat'
)[['SN', 'objtype', 'redshift', 'host', 'host_redshift']]

# One (object id, class) pair per row; the id column is renamed to 'SN2' so
# it can be dropped after the merge.
tns_class_df = (
    pd.read_pickle('/media3/CRP7/hosts/data/TNS/Apr2023/objects.pickle')
    [['i:objectId', 'class']]
    .drop_duplicates(ignore_index=True)
    .rename(columns={'i:objectId': 'SN2'})
)

tns_events_df.shape, tns_class_df.shape

((862, 5), (862, 2))

In [20]:
# Attach the TNS event and class columns to the TNS table; the join keys
# coming from the right-hand tables are dropped afterwards.
xmatch_tns = (
    xmatch_tns
    .merge(tns_events_df, left_on='i:objectId', right_on='SN')
    .merge(tns_class_df, left_on='i:objectId', right_on='SN2')
    .drop(columns=['SN', 'SN2'])
)

# Give the other tables the same columns, filled with None.
for frame in (xmatch_simbad, xmatch_spicy):
    for col in xmatch_tns.columns:
        if col not in frame:
            frame[col] = None

### Remove duplicated object ids from other surveys

In [21]:
# Drop from SIMBAD and SPICY every object that is already present in TNS.
xmatch_simbad = (
    xmatch_simbad[~xmatch_simbad['i:objectId'].isin(xmatch_tns['i:objectId'])]
    .reset_index(drop=True)
)
xmatch_spicy = (
    xmatch_spicy[~xmatch_spicy['i:objectId'].isin(xmatch_tns['i:objectId'])]
    .reset_index(drop=True)
)

In [22]:
# Final layout check before saving: same columns, same order, in all tables.
list(xmatch_tns.columns) == list(xmatch_simbad.columns) == list(xmatch_spicy.columns)

True

In [23]:
# (rows, columns) of each table after removing objects already in TNS.
xmatch_tns.shape, xmatch_simbad.shape, xmatch_spicy.shape

((862, 40), (16584, 40), (3002, 40))

In [24]:
# Write the three homogenized tables as pickles into the output folder.
output_folder = '/media3/CRP7/hosts/data/homogenized/'
os.makedirs(output_folder, exist_ok=True)
for file_name, frame in (
    ('tns.pkl', xmatch_tns),
    ('simbad.pkl', xmatch_simbad),
    ('spicy.pkl', xmatch_spicy),
):
    with open(os.path.join(output_folder, file_name), 'wb') as out_file:
        frame.to_pickle(out_file)