# Crossmatching our dippers
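This notebook crossmatches the ZTF dip candidates (the `wtf_aug2020_dip_candidates` AXS table) against external catalogs (Gaia EDR3, Green 19 stellar parameters, Bailer-Jones distances, and AllWISE), writes each crossmatch to a Parquet dataset, and joins the candidates with their Zooniverse dip labels.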

# Setup

In [8]:
import getpass
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import axs
import pyspark.sql.functions as sparkfunc

import dipper

In [9]:
%matplotlib inline

In [10]:
def spark_start(local_dir):
    """Build (or fetch) the Hive-enabled Spark session used by this notebook.

    Parameters
    ----------
    local_dir : str
        Directory used for the SQL warehouse, for Spark's local scratch
        space, and as Derby's ``derby.system.home``.

    Returns
    -------
    The object returned by ``SparkSession.builder...getOrCreate()``.
    """
    from pyspark.sql import SparkSession

    # (key, value) pairs, applied to the builder in this order.
    settings = (
        ("spark.sql.warehouse.dir", local_dir),
        ('spark.master', "local[20]"),
        ('spark.driver.memory', '256G'),  # 128
        ('spark.local.dir', local_dir),
        ('spark.memory.offHeap.enabled', 'true'),
        ('spark.memory.offHeap.size', '4G'),  # 256
        ("spark.sql.execution.arrow.enabled", "true"),
        ("spark.driver.maxResultSize", "128G"),
        ("spark.driver.extraJavaOptions", f"-Dderby.system.home={local_dir}"),
    )

    builder = SparkSession.builder.appName("LSD2")
    for key, value in settings:
        builder = builder.config(key, value)

    return builder.enableHiveSupport().getOrCreate()

# Per-user scratch directory handed to spark_start() (warehouse, local dir, Derby home).
username = getpass.getuser()
spark_tmp_dir = f"/epyc/users/{username}/spark-tmp/"

# Start the session and wrap it in an AXS catalog for table access.
spark_session = spark_start(spark_tmp_dir)
catalog = axs.AxsCatalog(spark_session)

In [11]:
spark_session

https://epyc.astro.washington.edu/jupyter/user/ecbellm/proxy/4049/jobs/

In [13]:
catalog.list_table_names()

['gaia_dr2_1am_dup',
 'allwise_1am_dup',
 'unwise_v1',
 'cklein_flare',
 'unwise_v2',
 'catalina_variables_n',
 'gaia',
 'gaia_dr2_1am_dup_ssd',
 'ztf_dr3_detections',
 'ps1',
 'gaia_dr2_wds_j2019',
 'gaia_dr2_wds_j2019_fall2020temp',
 'green19_stellar_params',
 'rosat_2rxs',
 'ztf_dr3',
 'rosat_2rxs_z4am_b2am',
 'ztf_dr3_2rxs_obj',
 'wtf_ztf_dr3',
 'ztf_wds_kjb_dr3',
 'ztf_aug2020_combined',
 'karenws_cut_wtf_fits_r_band',
 'karenws_cut_wtf_fits',
 'ztf_aug2020_combined_z4am_b2am',
 'ztf_aug2020_2rxs_obj',
 'skymapper_dr2',
 'test_skymapper',
 'skymapper_dr2_ver2',
 'skymapper_dr2_ver3',
 'ztf_rrlyr',
 'gaia_source_edr3',
 'gaia_edr3_distances',
 'rrlyrae_sample_andy',
 'stevengs_test_small_df',
 'ztf5',
 'ztf_wds_expected_kjb',
 'ztf_dr4_detections',
 'ztf_wds_kjb',
 'feh_rrlyr_ab_020620',
 'kepler_rrlyrae',
 'ztf_kepler_rrlyrae',
 'ps_uband',
 'ps_uband_ver2',
 'debug_match_a',
 'debug_match_b',
 'debug_match_c',
 'gaia_dr2_wds_j2019_spring2021temp',
 'temp_kjb',
 'ztf_rrlyr_grid_50

### Match back to class labels from Zooniverse

In [14]:
# Load the dip-candidate table from the AXS catalog; every crossmatch below starts from it.
dip_cat = catalog.load('wtf_aug2020_dip_candidates')

In [15]:
# Pull the candidates into pandas after exclude_duplicates(), ordered by
# descending `dip.asymmetry_significance`.
asymmetry_desc = sparkfunc.col('dip.asymmetry_significance').desc()
unique_dips = dip_cat.exclude_duplicates()
best_dippers_df = unique_dips.sort(asymmetry_desc).toPandas()

In [None]:
best_dippers_df

In [17]:
best_dippers_df.columns

Index(['ps1_objid', 'ra', 'dec', 'mean_mag_g', 'mean_mag_r', 'mean_mag_i',
       'ra_stddev', 'dec_stddev', 'ps1_gMeanPSFMag', 'ps1_rMeanPSFMag',
       'ps1_iMeanPSFMag', 'ra_detections', 'dec_detections', 'mjd_g', 'mag_g',
       'magerr_g', 'psfflux_g', 'psffluxerr_g', 'catflags_g', 'expid_g',
       'rcID_g', 'fieldID_g', 'xpos_g', 'ypos_g', 'nobs_g', 'mjd_r', 'mag_r',
       'magerr_r', 'psfflux_r', 'psffluxerr_r', 'catflags_r', 'expid_r',
       'rcID_r', 'fieldID_r', 'xpos_r', 'ypos_r', 'nobs_r', 'mjd_i', 'mag_i',
       'magerr_i', 'psfflux_i', 'psffluxerr_i', 'catflags_i', 'expid_i',
       'rcID_i', 'fieldID_i', 'xpos_i', 'ypos_i', 'nobs_i', 'zone', 'dup',
       'dip'],
      dtype='object')

### Gaia

In [18]:
# Load the 'gaia_source_edr3' table from the AXS catalog.
gaia = catalog.load('gaia_source_edr3')

In [19]:
gaia.columns

['solution_id',
 'designation',
 'source_id',
 'random_index',
 'ref_epoch',
 'ra',
 'ra_error',
 'dec',
 'dec_error',
 'parallax',
 'parallax_error',
 'parallax_over_error',
 'pm',
 'pmra',
 'pmra_error',
 'pmdec',
 'pmdec_error',
 'ra_dec_corr',
 'ra_parallax_corr',
 'ra_pmra_corr',
 'ra_pmdec_corr',
 'dec_parallax_corr',
 'dec_pmra_corr',
 'dec_pmdec_corr',
 'parallax_pmra_corr',
 'parallax_pmdec_corr',
 'pmra_pmdec_corr',
 'astrometric_n_obs_al',
 'astrometric_n_obs_ac',
 'astrometric_n_good_obs_al',
 'astrometric_n_bad_obs_al',
 'astrometric_gof_al',
 'astrometric_chi2_al',
 'astrometric_excess_noise',
 'astrometric_excess_noise_sig',
 'astrometric_params_solved',
 'astrometric_primary_flag',
 'nu_eff_used_in_astrometry',
 'pseudocolour',
 'pseudocolour_error',
 'ra_pseudocolour_corr',
 'dec_pseudocolour_corr',
 'parallax_pseudocolour_corr',
 'pmra_pseudocolour_corr',
 'pmdec_pseudocolour_corr',
 'astrometric_matched_transits',
 'visibility_periods_used',
 'astrometric_sigma5d_max

In [20]:
%%time
dip_x_gaia = dip_cat.crossmatch(gaia, return_min=True).select('ps1_objid','source_id',
'ruwe',  'bp_rp', 'bp_g', 'g_rp',
 'phot_g_mean_flux',
 'phot_g_mean_flux_error',
 'phot_g_mean_mag',
 'phot_bp_mean_flux',
 'phot_bp_mean_flux_error',
 'phot_bp_mean_mag',
 'phot_rp_mean_flux',
 'phot_rp_mean_flux_error',
 'phot_rp_mean_mag',
 'parallax',
 'parallax_error',
 'parallax_over_error',
 'pm',
 'pmra',
 'pmra_error',
 'pmdec',
 'pmdec_error').write.parquet(f'./wtf_aug2020_dip_candidates_x_gaia_source_edr3')

CPU times: user 208 ms, sys: 92.7 ms, total: 301 ms
Wall time: 13min 40s


In [None]:
# Read the Gaia crossmatch Parquet output (same relative path as the write above)
# into pandas and display it.
df_dip_x_gaia =  pd.read_parquet('wtf_aug2020_dip_candidates_x_gaia_source_edr3')
df_dip_x_gaia

### Green 19

In [24]:
# Load the 'green19_stellar_params' table from the AXS catalog.
g19 = catalog.load("green19_stellar_params")

In [25]:
g19.columns

['dm_16',
 'E_16',
 'Mr_16',
 'FeH_16',
 'dm_50',
 'E_50',
 'Mr_50',
 'FeH_50',
 'dm_84',
 'E_84',
 'Mr_84',
 'FeH_84',
 'obj_id',
 'l',
 'b',
 'gaia_id',
 'chisq',
 'ra',
 'dec',
 '__index_level_0__',
 'zone',
 'dup']

In [32]:
%%time
dip_x_g19 = dip_cat.crossmatch(g19, return_min=True).select('ps1_objid','dm_16',
        'E_16', 'Mr_16','FeH_16','dm_50','E_50','Mr_50','FeH_50','dm_84', 'E_84',
        'Mr_84', 'FeH_84','chisq','gaia_id').write.parquet(f'./wtf_aug2020_dip_candidates_x_green19_stellar_params')

CPU times: user 35.1 ms, sys: 24.4 ms, total: 59.5 ms
Wall time: 1min 19s


In [None]:
# Read the Green 19 crossmatch Parquet output (same relative path as the write above)
# into pandas and display it.
df_dip_x_g19 =  pd.read_parquet('wtf_aug2020_dip_candidates_x_green19_stellar_params')
df_dip_x_g19

### Bailer-Jones distances

In [35]:
# Load the 'gaia_edr3_distances' table from the AXS catalog.
bjdist = catalog.load('gaia_edr3_distances')

In [36]:
bjdist.columns

['ra',
 'dec',
 'zone',
 'dup',
 'source_id',
 'r_med_geo',
 'r_lo_geo',
 'r_hi_geo',
 'r_med_photogeo',
 'r_lo_photogeo',
 'r_hi_photogeo',
 'flag']

In [37]:
%%time
dip_x_bjdist = dip_cat.crossmatch(bjdist, return_min=True).select('ps1_objid','source_id',
'r_med_geo',
 'r_lo_geo',
 'r_hi_geo',
 'r_med_photogeo',
 'r_lo_photogeo',
 'r_hi_photogeo',
 'flag').write.parquet(f'./wtf_aug2020_dip_candidates_x_gaia_edr3_distances')

CPU times: user 54.7 ms, sys: 25.6 ms, total: 80.3 ms
Wall time: 2min 32s


In [None]:
# Read the distance crossmatch Parquet output (same relative path as the write above)
# into pandas and display it.
df_dip_x_bjd =  pd.read_parquet('wtf_aug2020_dip_candidates_x_gaia_edr3_distances')
df_dip_x_bjd

### AllWISE

In [39]:
# Load the 'allwise_1am_dup' table from the AXS catalog.
wise = catalog.load('allwise_1am_dup')

In [40]:
wise.columns

['designation',
 'ra',
 'dec',
 'sigra',
 'sigdec',
 'sigradec',
 'glon',
 'glat',
 'elon',
 'elat',
 'wx',
 'wy',
 'cntr',
 'source_id',
 'coadd_id',
 'src',
 'w1mpro',
 'w1sigmpro',
 'w1snr',
 'w1rchi2',
 'w2mpro',
 'w2sigmpro',
 'w2snr',
 'w2rchi2',
 'w3mpro',
 'w3sigmpro',
 'w3snr',
 'w3rchi2',
 'w4mpro',
 'w4sigmpro',
 'w4snr',
 'w4rchi2',
 'rchi2',
 'nb',
 'na',
 'w1sat',
 'w2sat',
 'w3sat',
 'w4sat',
 'satnum',
 'ra_pm',
 'dec_pm',
 'sigra_pm',
 'sigdec_pm',
 'sigradec_pm',
 'pmra',
 'sigpmra',
 'pmdec',
 'sigpmdec',
 'w1rchi2_pm',
 'w2rchi2_pm',
 'w3rchi2_pm',
 'w4rchi2_pm',
 'rchi2_pm',
 'pmcode',
 'cc_flags',
 'rel',
 'ext_flg',
 'var_flg',
 'ph_qual',
 'det_bit',
 'moon_lev',
 'w1nm',
 'w1m',
 'w2nm',
 'w2m',
 'w3nm',
 'w3m',
 'w4nm',
 'w4m',
 'w1cov',
 'w2cov',
 'w3cov',
 'w4cov',
 'w1cc_map',
 'w1cc_map_str',
 'w2cc_map',
 'w2cc_map_str',
 'w3cc_map',
 'w3cc_map_str',
 'w4cc_map',
 'w4cc_map_str',
 'best_use_cntr',
 'ngrp',
 'w1flux',
 'w1sigflux',
 'w1sky',
 'w1sigsk',
 '

In [41]:
%%time
dip_x_wise = dip_cat.crossmatch(wise, return_min=True).select('ps1_objid','source_id',
                                                               'w1mpro',
 'w1sigmpro',
 'w1snr',
 'w1rchi2',
 'w2mpro',
 'w2sigmpro',
 'w2snr',
 'w2rchi2',
 'w3mpro',
 'w3sigmpro',
 'w3snr',
 'w3rchi2',
 'w4mpro',
 'w4sigmpro',
 'w4snr',
 'w4rchi2',
 'rchi2',
 'na', 'w1sat',
 'w2sat',
 'w3sat',
 'w4sat',
 'satnum'
).write.parquet(f'./wtf_aug2020_dip_candidates_x_allwise_1am_dup')

CPU times: user 72.1 ms, sys: 35.6 ms, total: 108 ms
Wall time: 3min 34s


In [None]:
# Read the AllWISE crossmatch Parquet output (same relative path as the write above)
# into pandas and display it.
df_dip_x_wise =  pd.read_parquet('wtf_aug2020_dip_candidates_x_allwise_1am_dup')
df_dip_x_wise

### Dip labels from Zooniverse

Read in the Zooniverse labels produced by `zooniverse_analysis.ipynb`.

In [9]:
# Load the per-object dip labels from CSV.
# NOTE(review): the value counts shown below include the literal label "None".
# Newer pandas releases reportedly treat the string "None" as a missing value by
# default in read_csv -- confirm the label survives in the pandas version in use
# (e.g. via the na_values / keep_default_na options).
dip_labels = pd.read_csv('ztf_dips_labelled.csv')

In [None]:
dip_labels.head()

In [11]:
dip_labels.dip_label.value_counts()

None        569
Lithium      13
Platinum     11
Gold          4
Name: dip_label, dtype: int64

In [12]:
# Attach the Zooniverse labels to the dip candidates. pd.merge defaults to an
# inner join, so rows without a partner on either side are dropped.
# The guard keeps the cell idempotent: merging a second time would rename
# `dip_label` to `dip_label_x` / `dip_label_y` and break the cells below that
# read `dip_label`.
if 'dip_label' not in best_dippers_df.columns:
    best_dippers_df = pd.merge(best_dippers_df, dip_labels, left_on='ps1_objid', right_on='ps1_id')

In [13]:
# Columns written to the labelled summary CSV: identifier, position, mean
# magnitudes, PS1 PSF magnitudes, and the dip label.
labelled_export_cols = [
    'ps1_objid', 'ra', 'dec',
    'mean_mag_g', 'mean_mag_r', 'mean_mag_i',
    'ps1_gMeanPSFMag', 'ps1_rMeanPSFMag', 'ps1_iMeanPSFMag',
    'dip_label',
]
best_dippers_df[labelled_export_cols].to_csv('ztf_dips_labelled_allinfo.csv')

In [14]:
best_dippers_df.dip_label.value_counts()

None        568
Lithium      13
Platinum     11
Gold          4
Name: dip_label, dtype: int64

In [30]:
!pwd

/data/epyc/users/ecbellm/ZTF_Boyajian
