# This is to get the final training and testing sets. 
The core here is data consistency: have all the estimates and their uncertainties, and make sure the data has been standardized properly, etc.

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
from astropy.table import Table
from astropy.io.misc.hdf5 import write_table_hdf5
import matplotlib.pyplot as plt

In [2]:
# Load the full-information table from CSV.
dataAll = pd.read_csv('../data/BINGO_full_info.csv')

# Quality cuts: keep rows with SNR of at least 100 and a strictly positive
# distance. Rows where either value is NaN fail the comparison and are dropped.
# (Bare mid-cell expressions such as `dataAll.head()` are never displayed by
# the notebook -- only the last expression of a cell is -- so they were removed.)
mask = (dataAll['SNR'] >= 100) & (dataAll['dist'] > 0)
dataAll = dataAll[mask]

# Displayed as the cell output: number of rows surviving the cuts.
len(dataAll)

3046

In [None]:
# Quick look at the distribution of the `mass` column after the quality cuts.
plt.hist(dataAll['mass'])

Step 1 is the training set. We select only stars with SNR of at least 100 (the cut applied above is `SNR >= 100`) as our training set. We also make sure that the RC stars with mass greater than 1.2 solar masses are included.

In [3]:
def keep_non_negative(frame, column, label):
    """Report and keep the rows of ``frame`` whose ``column`` is >= 0.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to filter.
    column : str
        Name of the column that must be non-negative.
    label : str
        Text appended to the printed "Found <n> stars with " message.

    Returns
    -------
    pandas.DataFrame
        The rows of ``frame`` that pass the cut. Rows where ``column`` is NaN
        fail the comparison and are therefore dropped as well.
    """
    passed = frame[column] >= 0
    print("Found " + str(np.sum(passed)) + " stars with " + label)
    return frame.loc[passed]


nstars = len(dataAll)

print("The number of stars in the APOGEE-TGAS DR14 sample is:")
print(nstars)

# The cuts are applied one after another, so every printed count is the number
# of stars that survive all cuts up to and including that one.
# NOTE(review): the age cut is `>= 0` although the message says "positive";
# an age of exactly 0 would pass and give -inf once log10(age) is taken later.
dataAge = keep_non_negative(dataAll, 'age', "positive ages")
dataAge = keep_non_negative(dataAge, 'phot_g_mean_flux', "good g fluxes")
dataAge = keep_non_negative(dataAge, 'phot_bp_mean_flux', "good bp fluxes")
dataAge = keep_non_negative(dataAge, 'phot_rp_mean_flux', "good rp fluxes")
dataAge = keep_non_negative(dataAge, 'J_ERR', "good J err values")
dataAge = keep_non_negative(dataAge, 'H_ERR', "good H err values")

# Later cells add new columns to `data`; take an explicit copy so those
# assignments act on an independent frame instead of a slice of a slice
# (which is what triggers pandas' SettingWithCopyWarning).
data = keep_non_negative(dataAge, 'K_ERR', "good K err values").copy()

The number of stars in the APOGEE-TGAS DR14 sample is:
3046
Found 3046 stars with positive ages
Found 3046 stars with good g fluxes
Found 3046 stars with good bp fluxes
Found 3046 stars with good rp fluxes
Found 3042 stars with good J err values
Found 3039 stars with good H err values
Found 3036 stars with good K err values


In [4]:
# Final features: logg, teff, alpham, mh, cfe, nfe, gmag, bpmag, rpmag, j, h, k & their errors
# Final targets: age and distance and their errors


def _half_mag_spread(flux, flux_error):
    """Return 0.5 * 2.5 * [log10(flux + error) - log10(flux - error)].

    With magnitudes defined as -2.5 * log10(flux), this is half of the
    magnitude difference between flux - error and flux + error.
    """
    log_hi = np.log10(flux + flux_error)
    log_lo = np.log10(flux - flux_error)
    return 0.5 * 2.5 * (log_hi - log_lo)


# G band
g_mag = data['phot_g_mean_mag']
g_flux, g_flux_error = data['phot_g_mean_flux'], data['phot_g_mean_flux_error']
gmagErr = _half_mag_spread(g_flux, g_flux_error)

# BP band
bp_mag = data['phot_bp_mean_mag']
bp_flux, bp_flux_error = data['phot_bp_mean_flux'], data['phot_bp_mean_flux_error']
bpmagErr = _half_mag_spread(bp_flux, bp_flux_error)

# RP band
rp_mag = data['phot_rp_mean_mag']
rp_flux, rp_flux_error = data['phot_rp_mean_flux'], data['phot_rp_mean_flux_error']
rpmagErr = _half_mag_spread(rp_flux, rp_flux_error)

# Store each magnitude next to its error under short column names
# (G, G_ERR, BP, BP_ERR, RP, RP_ERR -- in that order).
for band, mag, mag_err in (
    ('G', g_mag, gmagErr),
    ('BP', bp_mag, bpmagErr),
    ('RP', rp_mag, rpmagErr),
):
    data[band] = mag
    data[band + '_ERR'] = mag_err

In [None]:
# Sanity check: smallest value in the `dist` column of the filtered sample.
min(data['dist'])

In [5]:
# Target 1: age and age uncertainty.
# age_68U / age_68L are presumably the upper / lower 68% bounds on the age
# (TODO confirm against the catalogue description).
age, age_68l, age_68u = (data[col] for col in ('age', 'age_68L', 'age_68U'))

# Linear-space uncertainty: half of the distance between the two bounds.
ageErr = (age_68u - age_68l) * 0.5
data['ageErr'] = ageErr

# Same construction in log10 space.
logAge = np.log10(age)
logAgeErr = (np.log10(age_68u) - np.log10(age_68l)) * 0.5

data['logAge'] = logAge
data['logAgeErr'] = logAgeErr

In [None]:
# Distribution of log10(age); the cell output is the number of entries.
plt.hist(x=logAge)
len(logAge.index)

In [6]:
# Target 2: distance and distance uncertainty.
dist, dist_68l, dist_68u = (data[col] for col in ('dist', 'dist_68L', 'dist_68U'))
dist_err = (dist_68u - dist_68l) * 0.5

# Dividing by 1000 gives the distance in kpc
# (presumably the catalogue values are in pc -- TODO confirm).
# Make sure you change this in the distance modulus
dist_kpc = dist / 1000
dist_68u_kpc = dist_68u / 1000
dist_68l_kpc = dist_68l / 1000
dist_kpc_err = dist_err / 1000

# log10 distance and its error (half the log-space distance between the bounds).
logDistKpc = np.log10(dist_kpc)
logDistKpcErr = (np.log10(dist_68u_kpc) - np.log10(dist_68l_kpc)) * 0.5

# Also make sure you save an APOGEE id etc. Done
# Columns are added in this exact order.
for column, values in (
    ('distKpc', dist_kpc),
    ('distKpc_68u', dist_68u_kpc),
    ('distKpc_68l', dist_68l_kpc),
    ('distErrKpc', dist_kpc_err),
    ('logDistKpc', logDistKpc),
    ('logDistKpcErr', logDistKpcErr),
):
    data[column] = values

In [7]:
# Columns kept in the final table, built group by group.
id_cols = ['APOGEE_ID_1', 'source_id']

# Measured quantities; every one has a matching '<name>_ERR' column.
measured_cols = ['LOGG', 'TEFF', 'ALPHA_M', 'M_H', 'C_FE', 'N_FE',
                 'G', 'BP', 'RP', 'J', 'H', 'K']
measured_err_cols = [name + '_ERR' for name in measured_cols]

# Targets and their uncertainties.
age_cols = ['age', 'age_68U', 'age_68L', 'ageErr', 'logAge', 'logAgeErr']
dist_cols = ['distKpc', 'distKpc_68u', 'distKpc_68l', 'distErrKpc',
             'logDistKpc', 'logDistKpcErr']

feats = id_cols + measured_cols + measured_err_cols + age_cols + dist_cols

# Selecting with a list raises KeyError if any listed column is missing.
dataOfInterest = data[feats]

In [None]:
# Distribution of the log10-distance uncertainty in the final table.
plt.hist(dataOfInterest.loc[:, 'logDistKpcErr'])

In [8]:
# Write the final table to disk. The DataFrame index is written too, as the
# first (unnamed) column, because `index` is left at its default.
dataOfInterest.to_csv(path_or_buf='../train_data/AllOriginal.csv')

In [None]:
# Preview the first five rows of the saved table.
dataOfInterest.head(n=5)

In [None]:
# Number of rows in the final table.
len(dataOfInterest.index)