## Data storage

The data produced by this notebook is stored in `/media3/CRP7/hosts/data/homogenized`.

## Data Format

The columns available in the alert package are described [here](https://fink-portal.org/api/v1/columns). 

Further explanation about the schema is given by ZTF [in this document](https://irsa.ipac.caltech.edu/data/ZTF/docs/ztf_explanatory_supplement.pdf). 

Meaning of columns (full list [here](https://fink-portal.org/api/v1/columns)):

fwhm:  "Full Width Half Max assuming a Gaussian core, from SExtractor (pixels)"  
aimage:  "Windowed profile RMS afloat major axis from SExtractor (pixels)"  
bimage: "Windowed profile RMS afloat minor axis from SExtractor (pixels)"  
isdiffpos: "t or 1 => candidate is from positive (sci minus ref) subtraction; f or 0 => candidate is from negative (ref minus sci) subtraction"  

## Read the object-based files and combine into same format for TNS, SIMBAD, SPICY

In [None]:
import io
import gzip
import copy
import re
import os

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import astropy.io.fits as fits

from pathlib import Path
from datetime import datetime
from concurrent.futures import ProcessPoolExecutor
from itertools import repeat

In [None]:
# Columns that process_simbad_tns_pickle collapses into one array per object.
# Every entry carries the 'i:' prefix except the trailing 'v:classification';
# the order below is the column order of the resulting table.
TIME_SERIES_COLUMNS = [
    f'i:{field}'
    for field in (
        'aimage', 'aimagerat', 'bimage', 'bimagerat',
        'candid', 'chinr', 'chipsf', 'classtar',
        'dec', 'fid', 'fwhm', 'isdiffpos',
        'jd', 'maggaia', 'maggaiabright', 'magpsf',
        'neargaia', 'neargaiabright', 'ra', 'sigmapsf',
    )
] + ['v:classification']

# Image-cutout columns; one per cutout kind, in science/template/difference order.
STAMP_COLUMNS = [
    f'b:cutout{kind}_stampData'
    for kind in ('Science', 'Template', 'Difference')
]

In [None]:
def process_simbad_tns_pickle(path_obj):
    """Collapse one per-alert pickle into a table with a single row per object.

    Parameters
    ----------
    path_obj : path-like
        Location of a pickled DataFrame containing 'i:objectId', every column
        listed in TIME_SERIES_COLUMNS and every column listed in STAMP_COLUMNS.

    Returns
    -------
    pandas.DataFrame
        One row per 'i:objectId' with
        * each TIME_SERIES_COLUMNS column holding a numpy array of all values
          recorded for that object, and
        * '<stamp>_first' / '<stamp>_last' columns holding the STAMP_COLUMNS
          values (as numpy arrays) of the rows with the smallest / largest
          'i:jd' for that object.
    """
    id_col = 'i:objectId'

    # NOTE(security): unpickling can execute arbitrary code -- only use trusted files.
    # Rows are renumbered so every index label is unique: the labels returned by
    # idxmin/idxmax below are fed to .loc, and a repeated label would select
    # several rows per object and duplicate objects after the merges.
    df = pd.read_pickle(str(path_obj)).reset_index(drop=True)

    # Gather every time-series column into one array per object.
    ts_df = df[[id_col] + TIME_SERIES_COLUMNS].groupby(id_col, as_index=False).aggregate(tuple)
    for col in TIME_SERIES_COLUMNS:
        ts_df[col] = ts_df[col].apply(np.array)

    jd_by_object = df.groupby(id_col)['i:jd']

    def _stamps_at(row_labels, suffix):
        # Stamp columns of the selected rows, renamed to '<stamp>_<suffix>'.
        # .copy() makes the slice independent of df before it is modified.
        stamps = df.loc[row_labels, [id_col] + STAMP_COLUMNS].copy()
        for col in STAMP_COLUMNS:
            stamps[col] = stamps[col].apply(np.array)
        return stamps.rename(columns={col: f'{col}_{suffix}' for col in STAMP_COLUMNS})

    first_df = _stamps_at(jd_by_object.idxmin().to_numpy(), 'first')
    last_df = _stamps_at(jd_by_object.idxmax().to_numpy(), 'last')

    return ts_df.merge(first_df, on=id_col).merge(last_df, on=id_col)


In [None]:
# Homogenize every TNS pickle in parallel and stack the per-object tables.
print(f'Started processing at {datetime.now()}')
tns_path_objs = Path('/media3/CRP7/hosts/data/TNS/Apr2023/obj_info').glob('TNS*.pickle')
with ProcessPoolExecutor(max_workers=8) as executor:
    # process_simbad_tns_pickle takes a single argument (the path), so map()
    # is given exactly one iterable.
    obj_df_iterator = executor.map(process_simbad_tns_pickle, tns_path_objs)
    xmatch_tns = pd.concat(obj_df_iterator, ignore_index=True)
print(f'Finished processing at {datetime.now()}')

In [None]:
# Quick size check: (rows, columns) of the combined TNS table.
xmatch_tns.shape

In [None]:
# Homogenize every SIMBAD pickle (one sub-folder per 'finkclass=*') in parallel
# and stack the per-object tables into a single frame.
simbad_obj_dir = Path('/media3/CRP7/hosts/data/SIMBAD/Apr2023/obj_info')

print(f'Started processing at {datetime.now()}')
simbad_path_objs = simbad_obj_dir.glob('finkclass=*/*.pickle')
with ProcessPoolExecutor(max_workers=8) as executor:
    obj_df_iterator = executor.map(
        process_simbad_tns_pickle,
        simbad_path_objs,
    )
    xmatch_simbad = pd.concat(obj_df_iterator, ignore_index=True)
print(f'Finished processing at {datetime.now()}')

In [None]:
# Quick size check: (rows, columns) of the combined SIMBAD table.
xmatch_simbad.shape

In [None]:
# Load the SPICY cross-match table from parquet; it is converted to the common
# per-object layout by homogenize_spicy_df further down.
xmatch_spicy = pd.read_parquet('/media3/CRP7/hosts/data/SPICY/SPICY_CROSSMATCHED_2_ASEC_SMALL/')

In [None]:
def read_spicy_fits_image(bytes_str):
    """Decode one gzip-compressed FITS payload into its primary-HDU data.

    Parameters
    ----------
    bytes_str : bytes
        Gzip-compressed FITS file content.

    Returns
    -------
    The ``data`` attribute of the first (primary) HDU.
    """
    # Context managers close both the gzip stream and the HDU list; the
    # original left them open for every stamp that was decoded.
    with gzip.open(io.BytesIO(bytes_str)) as gz_file:
        # memmap=False: the data is read into memory, so it stays valid after
        # the file objects are closed.
        with fits.open(gz_file, memmap=False) as hdu_list:
            # .data is loaded lazily, so it must be accessed before closing.
            return hdu_list[0].data

In [None]:
def homogenize_spicy_df(xmatch_spicy):
    """Convert the SPICY cross-match table to the common per-object layout.

    For every column in STAMP_COLUMNS, the '<stamp>_small' column (a sequence
    of gzip-compressed FITS payloads per row) is replaced by '<stamp>_first'
    and '<stamp>_last', holding the decoded image of the first and last entry
    of that sequence. 'objectId' is renamed to 'i:objectId'.

    Parameters
    ----------
    xmatch_spicy : pandas.DataFrame
        Table with one '<stamp>_small' column per entry of STAMP_COLUMNS.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        New table with the '_small' columns removed and the '_first' columns
        followed by the '_last' columns appended.
    """
    # Work on a copy: the original implementation added columns to the
    # caller's frame before `drop` rebound the local name.
    homogenized = xmatch_spicy.copy()
    small_cols = [f'{col}_small' for col in STAMP_COLUMNS]

    # All '_first' columns are added before any '_last' column, which keeps
    # the column order of the original implementation.
    for suffix, position in (('first', 0), ('last', -1)):
        for col, small_col in zip(STAMP_COLUMNS, small_cols):
            homogenized[f'{col}_{suffix}'] = xmatch_spicy[small_col].apply(
                lambda stamps, position=position: read_spicy_fits_image(stamps[position])
            )

    homogenized = homogenized.drop(columns=small_cols)
    return homogenized.rename(columns={'objectId': 'i:objectId'})

In [None]:
# Rebinds xmatch_spicy to the homogenized table and shows its (rows, columns).
# NOTE: not re-runnable on its own -- homogenize_spicy_df drops the '*_small'
# stamp columns it reads, so reload the parquet data before running this again.
xmatch_spicy = homogenize_spicy_df(xmatch_spicy)
xmatch_spicy.shape

In [None]:
# Persist the three homogenized tables as pickles in one output folder.
output_folder = '/media3/CRP7/hosts/data/homogenized/'
os.makedirs(output_folder, exist_ok=True)

homogenized_tables = {
    'tns.pkl': xmatch_tns,
    'simbad.pkl': xmatch_simbad,
    'spicy.pkl': xmatch_spicy,
}
for file_name, table in homogenized_tables.items():
    with open(os.path.join(output_folder, file_name), 'wb') as out_file:
        table.to_pickle(out_file)