In [1]:
import os
import sys
import warnings

PROJECT_DIR = '/Users/research/projects/kepler-ecc-rp/'
sys.path.append(PROJECT_DIR)

from datetime import datetime
today = datetime.today().strftime("%Y-%m-%d")

import astropy.constants as apc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from utils.astro import calc_T14_circ, calc_sma
from utils.io import load_dr25_data_from_hdf5

# shorthand used in the density expression below
pi = np.pi

# Unit-conversion factors built from astropy.constants; `.value` strips the units
# so the results are plain floats.
RSAU = (apc.R_sun/apc.au).value                                 # solar radius [AU]
RSRE = (apc.R_sun/apc.R_earth).value                            # R_sun/R_earth
# mean solar density M_sun / (4/3 pi R_sun^3); the /1000 converts kg/m^3 to g/cm^3
# (assumes astropy's constants are in SI units -- TODO confirm)
RHOSUN_GCM3 = (3*apc.M_sun/(4*pi*apc.R_sun**3)).value/1000      # solar density [g/cm^3]



In [2]:
# NOTE(review): absolute, machine-specific path -- edit before running elsewhere
DATA_DIR = '/Users/research/data/DR25_chains/'

# input catalogs, all stored under <PROJECT_DIR>/Catalogs
DR25_KOIS    = os.path.join(PROJECT_DIR, 'Catalogs/kepler_q1_q17_thompson.csv')
BERGER_IN    = os.path.join(PROJECT_DIR, 'Catalogs/berger_2020_gaia_kepler_tab1_input.txt')
BERGER_OUT   = os.path.join(PROJECT_DIR, 'Catalogs/berger_2020_gaia_kepler_tab2_output.txt')
JZ_DETWEIGHT = os.path.join(PROJECT_DIR, 'Catalogs/zink_kepler_detection_weights.csv')
MORTON_2016  = os.path.join(PROJECT_DIR, 'Catalogs/morton_2016_kepler_fpp.txt')
FURLAN_2017  = os.path.join(PROJECT_DIR, 'Catalogs/furlan_2017_kepler_radius_correction_factors.txt')

# HDF5 file of DR25 posterior chains, stored under DATA_DIR
DR25_CHAINS  = os.path.join(DATA_DIR, 'dr25-chains_trimmed-thinned.hdf')

## Read in DR25 catalog

In [3]:
# load the DR25 KOI table (the first 53 lines of the file are skipped)
dr25 = pd.read_csv(DR25_KOIS, skiprows=53)

# an object is rejected if it is dispositioned FALSE POSITIVE, or if it is not
# CONFIRMED and its koi_pdisposition is FALSE POSITIVE
flagged_fp = np.array(dr25.koi_disposition == 'FALSE POSITIVE', dtype='bool')
unconfirmed_pfp = np.array((dr25.koi_disposition != 'CONFIRMED') &
                           (dr25.koi_pdisposition == 'FALSE POSITIVE')
                          )
fp = flagged_fp | unconfirmed_pfp

# keep the survivors; reset_index() stores the old row labels in an 'index' column
dr25 = dr25[~fp].reset_index()

n_planets = len(dr25)
n_stars = len(np.unique(dr25.kepid))
print('{0} planets | {1} stars'.format(n_planets, n_stars))

4078 planets | 3087 stars


In [4]:
# count up number of planets in each system: every row receives the number of
# rows that share its kepid (single pass instead of one boolean mask per row)
planets_per_star = dr25.kepid.value_counts()
sy_pnum = dr25.kepid.map(planets_per_star).to_numpy()
dr25['sy_pnum'] = sy_pnum

## Read in Jon Zink's detection weights

In [5]:
# read in the detection-probability table (first column is used as the index)
jz_weight = pd.read_csv(JZ_DETWEIGHT, index_col=0)

# A weight is only used when its ID occurs exactly once in the table, so every
# duplicated (or missing) ID is discarded before building the lookup.
jz_unique = jz_weight.dropna(subset=['ID']).drop_duplicates(subset='ID', keep=False)
detprob_by_id = jz_unique.set_index('ID')['DetectProb']

# numeric KOI identifier: drop the leading letter of kepoi_name and parse the
# rest as a float (e.g. 'K00752.01' -> 752.01); missing names become NaN
koi_number = pd.to_numeric(dr25.kepoi_name.str[1:], errors='coerce')

# planets without a unique match keep detprob = NaN
dr25['detprob'] = koi_number.map(detprob_by_id).astype('float')

## Read in Berger+ 2020 (Kepler-Gaia DR2)

In [6]:
# read in stellar input parameters
# The file is an '&'-delimited table; its first row holds the column names.
with open(BERGER_IN, "r") as infile:
    rows = [line.split("&") for line in infile]

# strip the trailing newline, then any trailing backslashes/spaces, from the
# last field of every row ("\\ " is the escaped spelling of backslash + space)
for row in rows:
    row[-1] = row[-1].strip("\n").strip("\\ ")

raw_gaia_data = np.array(rows)

# header row -> column names, remaining rows -> (string-valued) data
raw_gaia_in = pd.DataFrame({k: raw_gaia_data[1:, i] for i, k in enumerate(raw_gaia_data[0])})

In [7]:
# read in stellar output parameters
# The file is an '&'-delimited table; its first row holds the column names.
with open(BERGER_OUT, "r") as infile:
    rows = [line.split("&") for line in infile]

# strip the trailing newline, then any trailing backslashes/spaces, from the
# last field of every row ("\\ " is the escaped spelling of backslash + space)
for row in rows:
    row[-1] = row[-1].strip("\n").strip("\\ ")

raw_gaia_data = np.array(rows)

# header row -> column names, remaining rows -> (string-valued) data
raw_gaia_out = pd.DataFrame({k: raw_gaia_data[1:, i] for i, k in enumerate(raw_gaia_data[0])})

In [8]:
# join the output and input stellar tables on their shared KIC column
raw_gaia = pd.merge(raw_gaia_out, raw_gaia_in, on='KIC')

In [9]:
# Typed copy of the merged Berger+ 2020 table. Columns are inserted in a fixed
# order: kic, photometry/astrometry, isochrone quantities, then age_flag.
gaia = pd.DataFrame()

gaia['kic'] = np.array(raw_gaia['KIC'], dtype='int')

# columns copied under their own name
for same_name in ['gmag', 'gmag_err', 'kmag', 'kmag_err',
                  'parallax', 'parallax_err', 'RUWE']:
    gaia[same_name] = np.array(raw_gaia[same_name], dtype='float')

# isochrone quantities: (catalog name, source name), each with a value and two errors
# Berger+2020 actually reports logrho -- 'rhostar' is converted further down
iso_columns = [('mstar',   'iso_mass'),
               ('rstar',   'iso_rad'),
               ('rhostar', 'iso_rho'),
               ('teff',    'iso_teff'),
               ('feh',     'iso_feh'),
               ('logg',    'iso_logg'),
               ('age',     'iso_age')]

for new_name, old_name in iso_columns:
    for suffix in ['', '_err1', '_err2']:
        gaia[new_name + suffix] = np.array(raw_gaia[old_name + suffix], dtype='float')

gaia['age_flag'] = np.array(raw_gaia['unReAgeFlag'], dtype='str')

# sanitize age_flag: '' -> 0 and '*' -> 1, then store as integers
flags = np.array(gaia['age_flag'])
flags[flags == ''] = 0
flags[flags == '*'] = 1

gaia['age_flag'] = np.array(flags, dtype='int')

# Berger+2020 uses log(rho) +/- log(sigma_rho) instead of sigma_logrho;
# the lower error is stored as a negative number
gaia['rhostar'] = np.round(10**gaia['rhostar'], 3)
gaia['rhostar_err1'] = np.round(10**gaia['rhostar_err1'], 3)
gaia['rhostar_err2'] = np.round(-(10**gaia['rhostar_err2']), 3)

## Cross-match Kepler DR25 vs Gaia DR2

In [10]:
kic_dr25 = np.array(dr25.kepid, dtype='int')
kic_gaia = np.array(gaia.kic, dtype='int')

# keep KOIs whose star is in the Gaia table AND that have a detection probability
use = np.isin(kic_dr25, kic_gaia) & dr25.detprob.notna().to_numpy()
index = np.flatnonzero(use)

# Select the matched rows by position and give the result a fresh 0..N-1 index.
# Keeping the original (non-contiguous) dr25 row labels would make any later
# `frame[col] = gk_match.<col>` assignment align on those labels and attach
# values to the wrong rows.
gk_match = dr25.iloc[index].reset_index(drop=True)
Nobj = len(gk_match)

#### System parameters

In [11]:
my_catalog = pd.DataFrame()

# system parameters
# Every column is assigned as a plain array (i.e. by position) so that row i of
# my_catalog always corresponds to row i of gk_match, whatever index gk_match has.
my_catalog['planet_name'] = np.array(gk_match.kepoi_name, dtype='str')
my_catalog['disposition'] = np.array(gk_match.koi_disposition, dtype='str')
my_catalog['kic_id'] = np.array(gk_match.kepid, dtype='int')

# koi_id = first six characters of the planet name
koi_id = [pname[:6] for pname in my_catalog.planet_name]
my_catalog['koi_id'] = np.array(koi_id)

# positional assignment: a bare Series here would be aligned on index labels
my_catalog['kep_mag'] = np.array(gk_match.koi_kepmag)

#### Stellar parameters

In [12]:
# every column of the Gaia table except the KIC identifier
star_keys = list(gaia.columns)
star_keys.remove('kic')

In [13]:
# stellar parameters from Gaia DR2
star_keys = list(gaia.keys())
star_keys.remove('kic')

# A star's parameters are used only when its KIC occurs exactly once in the Gaia
# table; KICs that are absent or duplicated yield NaN for every stellar column.
gaia_by_kic = gaia.drop_duplicates(subset='kic', keep=False).set_index('kic')

# one row per catalog entry, in catalog order (repeated KICs are allowed here)
matched_stars = gaia_by_kic.reindex(np.array(my_catalog.kic_id))

for key in star_keys:
    my_catalog[key] = matched_stars[key].to_numpy(dtype='float')

In [14]:
# ensure consistency: all rows that share a KIC get the same value (the
# NaN-ignoring median over those rows) for every stellar column
kic_ids = np.unique(my_catalog.kic_id)

for kic in kic_ids:
    same_star = np.array(my_catalog.kic_id == kic, dtype='bool')

    for key in star_keys:
        group_values = my_catalog.loc[same_star, key]
        my_catalog.loc[same_star, key] = np.nanmedian(group_values)

# store the age flag as integers again
my_catalog['age_flag'] = np.array(my_catalog['age_flag'], dtype='int')

#### Transit parameters

In [15]:
# transit parameters from Kepler DR25 (many of these will be overwritten from chains)
# Everything is copied as a plain array, i.e. by position: assigning a gk_match
# Series directly would align on index labels, and gk_match does not necessarily
# share my_catalog's 0..N-1 index.
my_catalog['npl'] = np.array(gk_match.sy_pnum, dtype='int')
my_catalog['snr'] = np.array(gk_match.koi_model_snr)
my_catalog['rcf'] = np.zeros_like(np.array(my_catalog.snr))
my_catalog['detprob'] = np.array(gk_match.detprob)

for suffix in ['', '_err1', '_err2']:
    my_catalog['period' + suffix] = np.array(gk_match['koi_period' + suffix])

for suffix in ['', '_err1', '_err2']:
    my_catalog['epoch' + suffix] = np.array(gk_match['koi_time0bk' + suffix])

# ror starts as zeros shaped like the depth columns
for suffix in ['', '_err1', '_err2']:
    my_catalog['ror' + suffix] = np.zeros_like(np.array(gk_match['koi_depth' + suffix]))

for suffix in ['', '_err1', '_err2']:
    my_catalog['duration' + suffix] = np.array(gk_match['koi_duration' + suffix])

for suffix in ['', '_err1', '_err2']:
    my_catalog['impact' + suffix] = np.array(gk_match['koi_impact' + suffix])

# reset dataframe indexes
my_catalog = my_catalog.reset_index(drop=True)

In [16]:
# (number of planets, number of distinct host stars)
my_catalog.shape[0], np.unique(my_catalog.kic_id).size

(3742, 2800)

## Update transit parameter values using DR25 posterior chains

In [17]:
targets = np.array(my_catalog.planet_name)
failure = []

# catalog column <-> posterior-sample column, paired by position
catalog_keys = 'period epoch ror duration impact'.split()
samples_keys = 'PERIOD EPOCH ROR DUR14 IMPACT'.split()

# load posterior chains and replace the catalog values with the posterior median;
# err1/err2 are the 84th/16th percentile offsets from that median
for t in targets:
    use = my_catalog.planet_name == t

    try:
        samples = pd.DataFrame(load_dr25_data_from_hdf5(DR25_CHAINS, t))
        samples['DUR14'] = calc_T14_circ(samples.PERIOD, samples.ROR, samples.IMPACT, samples.RHOTILDE)

        for ck, sk in zip(catalog_keys, samples_keys):
            median = np.median(samples[sk])
            lower, upper = np.percentile(samples[sk], [16, 84])

            my_catalog.loc[use, ck] = median
            my_catalog.loc[use, ck+'_err1'] = upper - median
            my_catalog.loc[use, ck+'_err2'] = lower - median

    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
    # propagate; the cause is reported so failures can be diagnosed
    except Exception as err:
        warnings.warn("{0} failed to load ({1}: {2})".format(t, type(err).__name__, err))
        failure.append(t)


# remove failed objects from catalog
my_catalog = my_catalog[~np.isin(my_catalog.planet_name, failure)]
my_catalog = my_catalog.reset_index(drop=True)
targets = np.array(my_catalog.planet_name)

  result = getattr(ufunc, method)(*inputs, **kwargs)


## Add trackers for flux contamination and false positives

#### Furlan+ 2017 radius correction factors

In [18]:
# columns 0 and 17 of the Furlan+ 2017 table (after 37 skipped rows) are read as
# the KOI number and the radius correction factor
furlan_rcfs = pd.DataFrame(np.loadtxt(FURLAN_2017, usecols=[0,17], skiprows=37), columns=['koi_id', 'rcf'])

# numeric KOI number -> 'K' + five zero-padded digits
kois = ["K"+str(int(koi)).zfill(5) for koi in furlan_rcfs.koi_id]
furlan_rcfs['koi_id'] = kois

# a correction factor is used only when its KOI occurs exactly once in the table
rcf_by_koi = furlan_rcfs.drop_duplicates(subset='koi_id', keep=False).set_index('koi_id')['rcf']

# the host designation is the planet name minus its last three characters;
# targets without a unique match get NaN
host_koi = pd.Series([t[:-3] for t in targets], dtype='object')
rcf = host_koi.map(rcf_by_koi).to_numpy(dtype='float')

my_catalog['rcf'] = rcf

#### Gaia RUWE

In [19]:
# use a lower-case name for the Gaia RUWE column
my_catalog = my_catalog.rename({"RUWE": "ruwe"}, axis="columns")

#### Morton+ 2016 FPP

In [38]:
# space-delimited table; the first 15 lines are skipped and column names supplied here
morton_fpp = pd.read_csv(MORTON_2016, skiprows=15, delimiter=' ', names='planet_name disposition fpp fpp_err'.split())

# numeric KOI (e.g. 752.01) -> 'K00752.01'. Formatting with two fixed decimals
# keeps a trailing zero (752.10 -> 'K00752.10') that str() would drop.
planet_name = ['K{0:08.2f}'.format(float(koi)) for koi in morton_fpp.planet_name]
morton_fpp['planet_name'] = planet_name

# an FPP is used only when its planet name occurs exactly once in the table
fpp_by_name = morton_fpp.drop_duplicates(subset='planet_name', keep=False).set_index('planet_name')['fpp']

# targets without a unique match get NaN
fpp = pd.Series(list(targets), dtype='object').map(fpp_by_name).to_numpy(dtype='float')

my_catalog['fpp'] = fpp

## Calculate self-consistent physical parameters

#### Self-consistent planet radii

In [44]:
# calculate self-consistent planet radii

# reference radii [R_earth] used throughout this cell
rp_lower_lim = 1.0      # lower edge of the super-Earth range
rp_gap10_loc = 1.84     # radius-gap location at P = 10 days
rp_giant_lim = 4.0      # lower edge of the giant-planet range

# symmetric uncertainties: quadrature mean of the upper and lower errors
ror = my_catalog.ror
ror_err = np.sqrt(my_catalog.ror_err1**2 + my_catalog.ror_err2**2)/np.sqrt(2)

Rstar = my_catalog.rstar
Rstar_err = np.sqrt(my_catalog.rstar_err1**2 + my_catalog.rstar_err2**2)/np.sqrt(2)

# radius gap location from Petigura+2022; R = R0*(P/10)^y, R0 = 1.84 +/- 0.03, y = -0.11 +/- 0.02
my_catalog['rgap'] = np.array(rp_gap10_loc*(my_catalog.period/10)**-0.11)
my_catalog['rgap_err'] = my_catalog.rgap * np.sqrt( 0.02**2*np.log(my_catalog.period/10)**2 + (0.03/rp_gap10_loc)**2)

# physical planet radius [R_earth], with fractional errors added in quadrature
my_catalog['rp'] = np.array(ror*Rstar*RSRE)
my_catalog['rp_err'] = np.array(my_catalog.rp * np.sqrt((ror_err/ror)**2 + (Rstar_err/Rstar)**2))

# radius corrected to P=10 days (see Ho & Van Eylen 2023); equivalent to using diagonal bins
my_catalog['rp10'] = np.exp(np.log(my_catalog.rp) - np.log(my_catalog.rgap) + np.log(rp_gap10_loc))
# NOTE(review): absolute errors are combined here although rp10 is a ratio -- confirm intended
my_catalog['rp10_err'] = np.sqrt(my_catalog.rp_err**2 + my_catalog.rgap_err**2)

# radius adjusted for super-Earths and sub-Neptunes only
rp = my_catalog.rp
rgap = my_catalog.rgap
rp_adj = np.array(rp)

# size classes: super-Earths (below the gap), sub-Neptunes (above it), giants
SE = (rp >= rp_lower_lim) & (rp < rgap)
SN = (rp >= rgap) & (rp < rp_giant_lim)
GP = (rp >= rp_giant_lim)

# piecewise-linear rescaling: [1, rgap) -> [1, 1.84) and [rgap, 4) -> [1.84, 4)
se_rescaled = (rp - rp_lower_lim)/(rgap - rp_lower_lim) * (rp_gap10_loc - rp_lower_lim) + rp_lower_lim
sn_rescaled = (rp - rgap)/(rp_giant_lim - rgap) * (rp_giant_lim - rp_gap10_loc) + rp_gap10_loc

rp_adj[SE] = se_rescaled[SE]
rp_adj[SN] = sn_rescaled[SN]
rp_adj[GP] = rp[GP]

# rescaled planets take the rp10 uncertainty; all others keep rp_err
rescaled = SE | SN
rp_adj_err = np.copy(my_catalog['rp_err'])
rp_adj_err[rescaled] = np.array(my_catalog['rp10_err'])[rescaled]

my_catalog['rpadj'] = rp_adj
my_catalog['rpadj_err'] = rp_adj_err

  result = getattr(ufunc, method)(*inputs, **kwargs)


#### Semi-major axis

In [45]:
# semi-major axis; the RSAU factor converts calc_sma's result to AU
# (presumably calc_sma returns solar radii -- verify against utils.astro)
my_catalog['sma'] = calc_sma(my_catalog.period, my_catalog.mstar) * RSAU

# Propagate the stellar-mass uncertainty only: a ~ M^(1/3) at fixed period, so
# sigma_a = (a/3) * sigma_M / M. `sma` already carries the RSAU factor, so it
# must not be applied a second time here.
my_catalog['sma_err1'] = (1./3) * my_catalog.sma * my_catalog.mstar_err1/my_catalog.mstar
my_catalog['sma_err2'] = (1./3) * my_catalog.sma * my_catalog.mstar_err2/my_catalog.mstar

## Save catalog

In [46]:
# write the finished cross-matched catalog next to the input catalogs
catalog_path = os.path.join(PROJECT_DIR, 'Catalogs/kepler_dr25_gaia_dr2_crossmatch.csv')
my_catalog.to_csv(catalog_path)