# Build Simulated Catalog

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from   scipy import stats
from   scipy.interpolate import interp1d
import csv
import sys
import os
import importlib as imp

sys.path.insert(0, "./forecaster/")
import mr_forecast as mr


import alderaan.io as io
from alderaan.constants import *
from alderaan.utils import *

In [2]:
# ---- configuration ----
# Project root; the input and output catalogs live under PRIMARY_DIR/Catalogs/
PRIMARY_DIR  = '/Users/research/projects/alderaan/'
CSV_IN  = PRIMARY_DIR + "Catalogs/cumulative_koi_catalog.csv"
CSV_OUT = PRIMARY_DIR + "Catalogs/simulated_catalog_eccentric.csv"

# Sample-size scales used further down:
#   7*N1 single-planet systems (N1 per TTV type),
#   8*N2 two-planet systems,
#   N3 three-planet systems.
N1 = 70
N2 = 30
N3 = 50

# Seed for numpy's global random state. None (the default) re-seeds from OS
# entropy, i.e. every run draws a different catalog; set an integer to make
# the simulated catalog reproducible.
SEED = None
np.random.seed(SEED)

# Read in real KOI data and do some quick cleanup

In [3]:
# Load the real KOI table from CSV_IN into a dict keyed by column name
print('Reading in data from csv file')

csv_keys, csv_values = io.read_csv_file(CSV_IN)

# one entry per csv column
real_kois = {key: io.get_csv_data(key, csv_keys, csv_values) for key in csv_keys}

# column used to count the rows that were loaded
k0 = "koi_id"
n_loaded = len(real_kois[k0])

print("Loaded {0} real KOIs".format(n_loaded))

Reading in data from csv file
Loaded 4273 real KOIs


In [4]:
# Convert every column to a numpy array. Columns that parse as numbers become
# float arrays; columns that do not are kept with their inferred dtype.
for k in real_kois.keys():
    try:
        real_kois[k] = np.asarray(real_kois[k], dtype="float")
    except (ValueError, TypeError):
        # non-numeric column; only conversion failures are caught so that
        # unrelated errors (e.g. KeyboardInterrupt) are not swallowed
        real_kois[k] = np.asarray(real_kois[k])


# planet counts and KIC identifiers are integers
real_kois["npl"] = np.asarray(real_kois["npl"], dtype="int")
real_kois["kic_id"] = np.asarray(real_kois["kic_id"], dtype="int")

In [5]:
# Remove every target that hosts at least one planet with P < 5 days.
# Matching is done on kic_id, so all rows sharing that kic_id are dropped.
short_period = real_kois["period"] < 5.0
usp_kics = list(real_kois["kic_id"][short_period])

bad = np.isin(real_kois["kic_id"], usp_kics)
keep = ~bad

for k in list(real_kois):
    real_kois[k] = real_kois[k][keep]

In [6]:
# eliminate any NaN-valued systems

# step 1: flag every row that has a NaN in any floating-point column
bad = np.zeros(len(real_kois["koi_id"]), dtype="bool")

for k in real_kois.keys():
    if real_kois[k].dtype.kind == "f":
        bad |= np.isnan(real_kois[k])

# step 2: extend the flag to every row that shares a koi_id with a flagged
# row, so systems are removed as a whole (single vectorized membership test
# instead of one full-array comparison per row)
bad = np.isin(real_kois["koi_id"], real_kois["koi_id"][bad])

for k in real_kois.keys():
    real_kois[k] = real_kois[k][~bad]

In [7]:
# recalculate radius ratio to ensure consistency
# ror = prad / rstar / RSRE, overwriting the catalog's own 'ror' column
# (RSRE comes from alderaan.constants; presumably the solar radius in Earth radii -- confirm)
real_kois["ror"] = real_kois["prad"]/real_kois["rstar"]/RSRE

In [8]:
# display the column names available in the real KOI table
real_kois.keys()

dict_keys(['planet_name', 'disposition', 'koi_id', 'kic_id', 'npl', 'kep_mag', 'mstar', 'mstar_err1', 'mstar_err2', 'rstar', 'rstar_err1', 'rstar_err2', 'logrho', 'logrho_err1', 'logrho_err2', 'Teff', 'Teff_err1', 'Teff_err2', 'FeH', 'FeH_err1', 'FeH_err2', 'logg', 'logg_err1', 'logg_err2', 'limbdark_1', 'limbdark_2', 'period', 'period_err1', 'period_err2', 'epoch', 'epoch_err1', 'epoch_err2', 'prad', 'prad_err1', 'prad_err2', 'impact', 'impact_err1', 'impact_err2', 'depth', 'depth_err1', 'depth_err2', 'duration', 'duration_err1', 'duration_err2', 'ror', 'ror_err1', 'ror_err2', 'dor', 'dor_err1', 'dor_err2', 'snr', 'num_transits'])

In [9]:
# Columns of the simulated catalog. Most are copied from the real KOI table;
# 'pmass', 'ecc', 'omega' and 'ttv_type' are not columns of real_kois and are
# filled in by the cells below.
my_keys = ['planet_name', 'disposition', 'koi_id', 'kic_id', 
           'npl', 'kep_mag', 
           'mstar', 'mstar_err1', 'mstar_err2', 
           'rstar', 'rstar_err1', 'rstar_err2',
           'logrho', 'logrho_err1', 'logrho_err2',
           'limbdark_1', 'limbdark_2', 
           'pmass', 'prad', 'period', 'epoch', 'impact','ecc', 'omega', 
           'depth', 'duration', 'ror', 'dor',
           'snr', 'ttv_type']

# (1) Single planet systems

In [10]:
# one empty list per catalog column; filled when systems are drawn
singles = {key: [] for key in my_keys}

In [11]:
# Draw 7*N1 distinct single-planet systems; each consecutive block of N1
# systems receives one TTV type.
use1 = ((real_kois["npl"] == 1)
        & (real_kois["rstar"] < 1.5)
        & (real_kois["period"] < 500.)
        & (real_kois["ror"] < 0.2))

select_systems = np.random.choice(real_kois["koi_id"][use1], size=7*N1, replace=False)

# TTV label for block number i // N1
ttv_labels = ["linear", "quadratic", "cubic", "sinusoidal", "gaussian", "usp", "grazing"]

# only columns that exist in the real catalog can be copied over
copied_keys = [k for k in singles.keys() if k in real_kois]

for i, ss in enumerate(select_systems):
    # first row belonging to this system
    row = np.where(real_kois["koi_id"] == ss)[0][0]

    for k in copied_keys:
        singles[k].append(real_kois[k][row])

    singles["ttv_type"].append(ttv_labels[i // N1])

for k in list(singles):
    singles[k] = np.asarray(singles[k])

In [12]:
# shift epochs forward by 1/phi of a period (period/1.618)
singles["epoch"] += singles["period"]/1.618

# assign impact parameter, uniform on [0, 0.95)
singles["impact"] = np.random.uniform(0,0.95, size=7*N1)

# set mass values to NaN. A float array is used rather than a list of "nan"
# strings so that stacking this column with the numeric masses of the other
# samples yields a float array instead of a string array.
singles["pmass"] = np.full(7*N1, np.nan)

In [13]:
# Eccentricity vectors for the singles (see Mills 2019 for scale): each
# component is a mixture of a wide (sigma = 0.167) and a narrow
# (sigma = 0.0355) normal; one quarter of the draws use the narrow scale.
n_singles = 7*N1
n_narrow = n_singles//4
n_wide = n_singles - n_narrow

esinw = np.concatenate([0.167*np.random.normal(size=n_wide),
                        0.0355*np.random.normal(size=n_narrow)])
ecosw = np.concatenate([0.167*np.random.normal(size=n_wide),
                        0.0355*np.random.normal(size=n_narrow)])

# the two components are shuffled independently
np.random.shuffle(esinw)
np.random.shuffle(ecosw)

singles["ecc"] = np.sqrt(esinw**2 + ecosw**2)
singles["omega"] = np.arctan2(esinw,ecosw)

# redraw any eccentricity above 0.7 uniformly on [0, 0.7)
bad = singles["ecc"] > 0.7
n_bad = np.sum(bad)
singles["ecc"][bad] = np.random.uniform(0,0.7, size=n_bad)

In [14]:
# Turn the systems labelled "usp" into ultra-short-period planets
usp = singles["ttv_type"] == "usp"
n_usp = np.sum(usp)

# target periods, uniform on [0.75, 5) days
new_per = np.random.uniform(0.75, 5, size=n_usp)

# divide the real period by an integer so the new period is at most new_per;
# integer adjustment prevents synthetic transits from overlapping real transits
ratio = np.ceil(singles["period"][usp]/new_per)
singles["period"][usp] = singles["period"][usp]/ratio

# adjust radii to be smaller than R_jup: redraw log-uniformly between 4 and 11
# NOTE(review): this mask covers all singles, not only the USP subset -- confirm intent
large = singles["prad"] > RJRE
n_large = np.sum(large)
singles["prad"][large] = np.exp(np.random.uniform(np.log(4), np.log(11), size=n_large))

# USP planets are placed on circular orbits
singles["ecc"][usp] = 0.0

In [15]:
# Turn the systems labelled "grazing" into grazing transits
grazing = singles["ttv_type"] == "grazing"

ror_grazing = singles["ror"][grazing]
prad_grazing = singles["prad"][grazing]

# impact parameter drawn uniformly on [1 - ror, 1 + ror) for each planet
new_b = np.random.uniform(1 - ror_grazing, 1 + ror_grazing)

# planets with b > 1 have their radius inflated by sqrt(2)
new_r = np.where(new_b > 1, prad_grazing*np.sqrt(2), prad_grazing)

singles["impact"][grazing] = new_b
singles["prad"][grazing] = new_r


# adjust radii to be larger than 1.5 R_earth (mask covers all singles)
small = singles["prad"] < 1.5
n_small = np.sum(small)
singles["prad"][small] = np.random.uniform(1.5, 3.5, size=n_small)


# recalculate radius ratio for the grazing planets
singles["ror"][grazing] = singles["prad"][grazing]/singles["rstar"][grazing]/RSRE

# (2) Mean-motion resonance systems (2 planets)

In [16]:
def draw_parameters(npl, Mstar, per_min=None, per_max=None, force_resonance=False,
                    hill_sep_min=8.0, per_limit=365.0):
    """
    Draw planet masses and orbital periods and enforce pairwise Hill stability

    Masses and periods are redrawn together until every adjacent pair of
    planets is separated by at least `hill_sep_min` mutual Hill radii.

    Parameters
    ----------
    npl : int
        number of planets (must be at least 2)
    Mstar : float
        stellar mass [M_sun]
    per_min : float, optional
        lower bound of the uniform draw for the innermost period [days] (default=3.0)
    per_max : float, optional
        upper bound of the uniform draw for the innermost period [days] (default=50.0)
    force_resonance : bool
        True to ensure that all planet pairs are near mean motion resonance (default=False)
    hill_sep_min : float, optional
        minimum separation of adjacent planets in mutual Hill radii (default=8.0)
    per_limit : float, optional
        periods are redrawn until none exceeds this value [days] (default=365.0)

    Returns
    -------
    mass : ndarray
        planet masses [M_sun]
    per : ndarray
        orbital periods [days]

    Raises
    ------
    ValueError
        if npl < 2
    """
    if npl < 2:
        raise ValueError("Expected at least 2 planets")

    if per_min is None: per_min = 3.0
    if per_max is None: per_max = 50.0

    # supported resonances j_out:j_in -> 2:1, 3:2, 4:3, 5:4, 3:1, 5:3
    # (built once here rather than on every loop iteration)
    j_out_options = np.array([2, 3, 4, 5, 3, 5])
    j_in_options = np.array([1, 2, 3, 4, 1, 3])

    # loop until a pairwise Hill stable solution is found
    d_Hill = np.zeros(npl-1)

    while np.any(d_Hill < hill_sep_min):
        # planet masses: log-uniform between 1.5 and 17, then divided by MSME
        log_mass = np.random.uniform(np.log(1.5), np.log(17.), size=npl)
        mass = np.exp(log_mass)/MSME

        # planet periods: redraw the whole chain until none exceeds per_limit
        per = np.ones(npl)

        while True:
            per[0] = np.random.uniform(per_min,per_max)

            for i in range(npl-1):

                if force_resonance:
                    # TTV period drawn log-uniformly between 4*per[i] and 2400;
                    # 75% chance just wide of resonance, 25% chance just narrow
                    log_P_ttv = np.random.uniform(np.log(4*per[i]), np.log(2400))
                    P_ttv = np.exp(log_P_ttv)*np.random.choice([-1,1,1,1])

                    idx = np.random.randint(0, len(j_out_options))

                    j_out = j_out_options[idx]
                    j_in = j_in_options[idx]

                    # outer period solving 1/P_ttv = j_in/per[i] - j_out/per[i+1]
                    per[i+1] = -j_out/(1/P_ttv - j_in/per[i])

                else:
                    # period ratio of at least 1.5 plus a lognormal excess
                    P_ratio = 1.5 + np.random.lognormal(mean=0, sigma=1)
                    per[i+1] = per[i]*P_ratio

            if not np.any(per > per_limit):
                break

        # check stability: separation > hill_sep_min mutual Hill radii
        sma = get_sma(per, Mstar)
        r_Hill = ((mass[1:]+mass[:-1])/(3*Mstar))**(1/3) * (sma[1:]+sma[:-1])/2
        d_Hill = (sma[1:]-sma[:-1])/r_Hill


    # return parameters
    return mass, per

In [17]:
# one empty list per catalog column; filled when systems are drawn
doubles = {key: [] for key in my_keys}

In [18]:
# Pick 8*N2 distinct two-planet systems around stars with rstar < 1.5
use2 = (real_kois["npl"] == 2) & (real_kois["rstar"] < 1.5)
candidates2 = np.unique(real_kois["koi_id"][use2])

select_systems = np.random.choice(candidates2, size=8*N2, replace=False)

In [19]:
# Copy the real catalog rows of each selected system: the shorter-period
# planet first, then the longer-period planet.
copied_keys = [k for k in doubles.keys() if k in real_kois]

for ss in select_systems:
    rows = np.where(real_kois["koi_id"] == ss)[0]
    periods = real_kois["period"][rows]

    inner_row = rows[np.argmin(periods)]
    outer_row = rows[np.argmax(periods)]

    for k in copied_keys:
        doubles[k].append(real_kois[k][inner_row])
        doubles[k].append(real_kois[k][outer_row])

# the first 12*N2 rows are labelled "rebound", the remaining 4*N2 "eccentric"
doubles["ttv_type"] = ["rebound"]*(12*N2) + ["eccentric"]*(4*N2)

for k in list(doubles):
    doubles[k] = np.asarray(doubles[k])

In [20]:
# Draw a Hill-stable (mass, period) pair for every two-planet system.
#
# np.unique returns the koi_ids sorted, which is not the row order of
# `doubles`. The results are therefore written back through the system's own
# row mask, so each system keeps the masses/periods drawn with its own stellar
# mass, period range and ttv_type, and the output arrays line up row-by-row
# with `doubles`.
n_rows = len(doubles["koi_id"])
hill_stable_masses = np.zeros(n_rows)
hill_stable_periods = np.zeros(n_rows)

for koi in np.unique(doubles["koi_id"]):
    use = doubles["koi_id"] == koi

    Mstar = doubles["mstar"][use][0]
    periods = doubles["period"][use]

    # "rebound" systems are forced near resonance; all others are not
    force_resonance = doubles["ttv_type"][use][0] == "rebound"

    # innermost period is drawn within a factor sqrt(2) of the real inner period
    mass, per = draw_parameters(2, Mstar,
                                per_min=periods.min()/np.sqrt(2),
                                per_max=np.sqrt(2)*periods.min(),
                                force_resonance=force_resonance)

    # draw_parameters returns masses divided by MSME; multiply to undo that
    hill_stable_masses[use] = mass*MSME
    hill_stable_periods[use] = per

In [21]:
# Convert each drawn mass to a radius with forecaster; the input uncertainty
# is set to 1% of the mass and element [0] of the returned tuple is kept.
hill_stable_radii = np.array(
    [mr.Mstat2R(hsm, hsm/100, sample_size=100)[0] for hsm in hill_stable_masses],
    dtype=hill_stable_masses.dtype,
)

In [22]:
n_doubles = 16*N2

# swap in the Hill-stable periods, masses and radii (copies, not views)
doubles["period"] = hill_stable_periods.copy()
doubles["pmass"] = hill_stable_masses.copy()
doubles["prad"] = hill_stable_radii.copy()

# impact parameter, uniform on [0, 0.95)
doubles["impact"] = np.random.uniform(0,0.95, size=n_doubles)

# placeholder eccentricity vectors (will be redrawn during TTV simulation)
doubles["ecc"] = np.full(n_doubles, np.nan)
doubles["omega"] = np.full(n_doubles, np.nan)

In [23]:
# Eccentricity vectors for the doubles that are not labelled "rebound"
# (see Mills 2019 for scale): half of the draws use sigma = 0.167, half 0.0355.
nonres = doubles["ttv_type"] != "rebound"
n_each = 2*N2

esinw = np.concatenate([0.167*np.random.normal(size=n_each),
                        0.0355*np.random.normal(size=n_each)])
ecosw = np.concatenate([0.167*np.random.normal(size=n_each),
                        0.0355*np.random.normal(size=n_each)])

# the two components are shuffled independently
np.random.shuffle(esinw)
np.random.shuffle(ecosw)


doubles["ecc"][nonres] = np.sqrt(esinw**2 + ecosw**2)
doubles["omega"][nonres] = np.arctan2(esinw,ecosw)

# redraw any eccentricity above 0.7 uniformly on [0, 0.7);
# NaN placeholders compare False and are left untouched
bad = doubles["ecc"] > 0.7
n_bad = np.sum(bad)
doubles["ecc"][bad] = np.random.uniform(0,0.7, size=n_bad)

# (3) Multi-planet systems

In [24]:
# one empty list per catalog column ("ttv_type" is one of my_keys)
multis = {key: [] for key in my_keys}
multis["ttv_type"] = []

In [25]:
# Three-planet systems: a random subset of N3; four-planet systems: all of them
use3 = (real_kois["npl"] == 3) & (real_kois["rstar"] < 1.5)
candidates3 = np.unique(real_kois["koi_id"][use3])
select3 = np.random.choice(candidates3, size=N3, replace=False)

use4 = (real_kois["npl"] == 4) & (real_kois["rstar"] < 1.5)
select4 = np.unique(real_kois["koi_id"][use4])
N4 = len(select4)

select_systems = np.concatenate([select3, select4])

In [26]:
# Copy every real catalog row of each selected system; all are labelled "rebound"
copied_keys = [k for k in multis.keys() if k in real_kois]

for ss in select_systems:
    for loc in np.flatnonzero(real_kois["koi_id"] == ss):
        for k in copied_keys:
            multis[k].append(real_kois[k][loc])

        multis["ttv_type"].append("rebound")


for k in list(multis):
    multis[k] = np.asarray(multis[k])

In [27]:
# Number of planet rows actually collected. Using the real row count rather
# than 3*N3 + 4*N4 keeps every column the same length even if a system's row
# count differs from its 'npl' value.
n_multis = len(multis["koi_id"])

# assign impact parameter, uniform on [0, 0.95)
multis["impact"] = np.random.uniform(0,0.95, size=n_multis)

# adjust radii to be in range (R_earth, R_jup)
small = multis["prad"] < 1.0
large = multis["prad"] > RJRE

# small radii are redrawn log-uniformly on [1, 4), large ones on [4, 11)
multis["prad"][small] = np.exp(np.random.uniform(np.log(1), np.log(4), size=np.sum(small)))
multis["prad"][large] = np.exp(np.random.uniform(np.log(4), np.log(11), size=np.sum(large)))

# assign placeholder eccentricity vectors (will be redrawn during TTV simulation)
multis["ecc"] = np.full(n_multis, np.nan)
multis["omega"] = np.full(n_multis, np.nan)

In [28]:
# perturb periods and radii: one fractional offset in [0.02, 0.05) per system,
# applied to both the periods and the radii of all its planets
fractional_perturbation = np.random.uniform(0.02,0.05,size=N3+N4)

for ss, frac in zip(select_systems, fractional_perturbation):
    use = multis["koi_id"] == ss
    scale = 1 + frac

    multis["period"][use] *= scale
    multis["prad"][use] *= scale


# perturb epochs: shift each by a uniform draw on [0, period)
multis["epoch"] += np.random.uniform(0, multis["period"])

In [29]:
# Convert each radius to a mass with forecaster; the input uncertainty is set
# to 1% of the radius and element [0] of the returned tuple is kept.
masses = np.array(
    [mr.Rstat2M(r, r/100, sample_size=100)[0] for r in multis["prad"]],
    dtype=multis["prad"].dtype,
)

# masses above MJME are redrawn log-uniformly between 17 and 300
bad = masses > MJME
n_bad = np.sum(bad)
masses[bad] = np.exp(np.random.uniform(np.log(17.),np.log(300),size=n_bad))

multis["pmass"] = masses.copy()

# Fix dispositions and planet names to SIMULATED

In [30]:
# Stack the three samples column by column: singles, then doubles, then multis
my_catalog = {
    key: np.hstack([singles[key], doubles[key], multis[key]]).reshape(-1)
    for key in my_keys
}

# every entry of the simulated catalog gets the same disposition
n_catalog = len(my_catalog["disposition"])
my_catalog['disposition'] = np.array(["SIMULATED"]*n_catalog)

In [31]:
# fix planet names: replace the first character of every name with "S"
renamed = ["S" + pname[1:] for pname in my_catalog["planet_name"]]
my_catalog["planet_name"][:] = renamed

# Perturb stellar masses and radii within uncertainties

In [32]:
# Perturb each host star's mass and radius exactly once.
#
# Iterating over the unique koi_ids (rather than over every planet row)
# matters: a system with several planets appears several times in the catalog,
# and looping over rows would re-perturb its already perturbed star once per
# planet.
for koi in np.unique(my_catalog["koi_id"]):
    use = my_catalog["koi_id"] == koi

    # perturb masses: symmetric error = RMS of the two catalog error values
    mstar = my_catalog["mstar"][use][0]
    mstar_err = np.sqrt(my_catalog["mstar_err1"][use][0]**2 + my_catalog["mstar_err2"][use][0]**2)/np.sqrt(2)

    # standard normal truncated at +/- 3 sigma
    my_catalog["mstar"][use] = mstar + mstar_err*stats.truncnorm.rvs(-3,3)

    # fall back to a +/- 25% uniform scaling if the draw came out negative
    if np.any(my_catalog["mstar"][use] < 0):
        my_catalog["mstar"][use] = mstar*np.random.uniform(0.75,1.25)


    # perturb radii (same recipe as for the masses)
    rstar = my_catalog["rstar"][use][0]
    rstar_err = np.sqrt(my_catalog["rstar_err1"][use][0]**2 + my_catalog["rstar_err2"][use][0]**2)/np.sqrt(2)

    my_catalog["rstar"][use] = rstar + rstar_err*stats.truncnorm.rvs(-3,3)

    if np.any(my_catalog["rstar"][use] < 0):
        my_catalog["rstar"][use] = rstar*np.random.uniform(0.75,1.25)


# recompute log10 stellar density from the perturbed mass and radius
my_catalog["logrho"] = np.log10(my_catalog["mstar"]/my_catalog["rstar"]**3*RHOSUN_GCM3)

# Recalculate transit depths, durations, $r_p/R_{\star}$, and $a/R_{\star}$

In [33]:
# Recompute the derived transit quantities from the simulated parameters

# planet-to-star radius ratio and semi-major axis, shared by the lines below
ror = my_catalog["prad"]/my_catalog["rstar"]/RSRE
sma = get_sma(my_catalog["period"], my_catalog["mstar"])

# transit depth, scaled by 1e6
my_catalog["depth"] = get_transit_depth(ror, my_catalog["impact"])*1e6

# transit duration, scaled by 24
my_catalog["duration"] = 24*get_dur_tot(my_catalog["period"],
                                        my_catalog["prad"]/RSRE,
                                        my_catalog["rstar"],
                                        my_catalog["impact"],
                                        sma,
                                        my_catalog["ecc"],
                                        my_catalog["omega"])

my_catalog["ror"] = ror
my_catalog["dor"] = sma/my_catalog["rstar"]

# Calculate SNR and remove systems hosting any planets with SNR < 7.1

In [34]:
# Read in CDPP data
cdpp = io.load_cdpp_data(PRIMARY_DIR + "Catalogs/keplerstellar_cdpp.csv")

# do some data cleanup: empty strings become "nan" so that numeric columns
# can be parsed as float
for k in cdpp.keys():
    cdpp[k] = np.asarray(cdpp[k])
    cdpp[k][cdpp[k] == ""] = "nan"


# convert the columns that parse as numbers; the others are left as they are
for k in cdpp.keys():
    try:
        cdpp[k] = np.asarray(cdpp[k], dtype="float")
    except (ValueError, TypeError):
        # non-numeric column; only conversion failures are ignored
        pass

cdpp["kepid"] = np.asarray(cdpp["kepid"], dtype="int")
cdpp["nkoi"]  = np.asarray(cdpp["nkoi"], dtype="int")

In [35]:
# Interpolate each target's CDPP curve at the planet's transit duration
cdpp_interp = []
bad_dur = []    # True where the catalog duration was NaN and had to be recomputed

for i, kic in enumerate(my_catalog["kic_id"]):
    cdpp_dur, cdpp_rms = io.pull_cdpp_rms(cdpp, kic)

    tdur = my_catalog["duration"][i]
    duration_missing = bool(np.isnan(tdur))

    if duration_missing:
        # Recompute the duration without eccentricity arguments. The factor
        # of 24 matches the scaling used when my_catalog["duration"] was
        # computed, so both branches give tdur in the same units.
        tdur = 24*get_dur_tot(my_catalog["period"][i], my_catalog["prad"][i]/RSRE, my_catalog["rstar"][i],
                              my_catalog["impact"][i], get_sma(my_catalog["period"][i], my_catalog["mstar"][i]))

    bad_dur.append(duration_missing)

    # linear interpolation, extrapolating outside the tabulated durations
    cdpp_interp.append(interp1d(cdpp_dur, cdpp_rms, bounds_error=False, fill_value="extrapolate")(tdur))


cdpp_interp = np.squeeze(cdpp_interp)

In [36]:
# Read in occurrence rate data products
keys, vals = io.read_csv_file(PRIMARY_DIR + "Catalogs/keplerstellar_occurence.csv")

occ = {}
for k in keys: 
    occ[k] = io.get_csv_data(k, keys, vals)


# do some data cleanup: empty strings become "nan" so that numeric columns
# can be parsed as float
for k in occ.keys():
    occ[k] = np.asarray(occ[k])
    occ[k][occ[k] == ""] = "nan"


# convert the columns that parse as numbers; the others are left as they are
for k in occ.keys():
    try:
        occ[k] = np.asarray(occ[k], dtype="float")
    except (ValueError, TypeError):
        # non-numeric column; only conversion failures are ignored
        pass


occ["kepid"] = np.asarray(occ["kepid"], dtype="int")
occ["nkoi"]  = np.asarray(occ["nkoi"], dtype="int")

In [37]:
# For every catalog row, pick the occurrence-table row of the same target
# with the largest (non-NaN) dataspan
best_rows = []

for kic in my_catalog["kic_id"]:
    rows = np.where(occ["kepid"] == kic)[0]
    best_rows.append(rows[np.nanargmax(occ["dataspan"][rows])])

best_rows = np.asarray(best_rows, dtype="int")

dutycycle = np.asarray(occ["dutycycle"][best_rows])
dataspan = np.asarray(occ["dataspan"][best_rows])

# expected number of transits = dataspan / period * dutycycle
expected_num_transits = dataspan/my_catalog["period"]*dutycycle

# SNR = depth / CDPP * sqrt(number of transits)
snr = my_catalog["depth"]/cdpp_interp * np.sqrt(expected_num_transits)

my_catalog["snr"] = snr

In [38]:
# Flag every row belonging to a system in which at least one planet has
# SNR < 7.1 (single vectorized membership test instead of one full-array
# comparison per row). NaN SNRs compare False and do not flag a system.
low_snr = np.isin(my_catalog["koi_id"], my_catalog["koi_id"][snr < 7.1])

# drop the flagged systems from every column
for k in my_catalog.keys():
    my_catalog[k] = my_catalog[k][~low_snr]

# Do some cleanup

In [39]:
# display the column names of the simulated catalog
my_catalog.keys()

dict_keys(['planet_name', 'disposition', 'koi_id', 'kic_id', 'npl', 'kep_mag', 'mstar', 'mstar_err1', 'mstar_err2', 'rstar', 'rstar_err1', 'rstar_err2', 'logrho', 'logrho_err1', 'logrho_err2', 'limbdark_1', 'limbdark_2', 'pmass', 'prad', 'period', 'epoch', 'impact', 'ecc', 'omega', 'depth', 'duration', 'ror', 'dor', 'snr', 'ttv_type'])

In [40]:
all_keys = list(my_catalog.keys())
int_keys = ['kic_id', 'npl', 'depth']
string_keys = ['planet_name', 'disposition', 'koi_id', 'ttv_type']
precise_keys = ['period', 'epoch']


for k in my_catalog.keys():
    if np.isin(k, int_keys):
        my_catalog[k] = np.array(my_catalog[k], dtype="int")
    elif np.isin(k, string_keys):
        my_catalog[k] = my_catalog[k]
    elif np.isin(k, precise_keys):
        my_catalog[k] = np.round(np.array(my_catalog[k], dtype="float"), 5)
    else:
        my_catalog[k] = np.round(np.array(my_catalog[k], dtype="float"), 3)

# Write out the catalog

In [41]:
# Set WRITENEW to False to run the notebook without overwriting CSV_OUT
WRITENEW = True
if WRITENEW:
    # newline="" lets the csv writer control line endings itself, as required
    # by the csv module (otherwise extra blank rows appear on some platforms)
    with open(CSV_OUT, "w", newline="") as outfile:
        writer = csv.writer(outfile)
        # header row, then one row per planet (columns zipped together)
        writer.writerow(my_catalog.keys())
        writer.writerows(zip(*my_catalog.values()))