# Import Libraries

In [1]:
import pandas as pd


In [2]:
# Path to the cumulative dataset, relative to the notebook's working directory.
CUMULATIVE_CSV = 'data/cumulative.csv'

# Read the whole file into a single DataFrame used by the rest of the notebook.
df = pd.read_csv(CUMULATIVE_CSV)

# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


# Disposition vs P_Disposition

Disposition (koi_disposition) is the category of the KOI from the Exoplanet Archive. P_Disposition (koi_pdisposition) is the "pipeline flag that designates the most probable physical explanation of the KOI".

The difference between these two columns is not immediately clear. They share some category labels, so it is not obvious from the labels alone where the two columns agree and where they differ.

We do some data wrangling to look at how these categories within the two columns match up and differ. 

In [27]:
# Tally how often each label occurs in the two disposition columns.
archive_counts = df['koi_disposition'].value_counts()
pipeline_counts = df['koi_pdisposition'].value_counts()

# Print both tallies, separated by one blank line.
print(archive_counts, pipeline_counts, sep='\n\n')

FALSE POSITIVE    5023
CONFIRMED         2293
CANDIDATE         2248
Name: koi_disposition, dtype: int64

FALSE POSITIVE    5068
CANDIDATE         4496
Name: koi_pdisposition, dtype: int64


We have some immediate observations:

1) There are three possible values for koi_disposition vs two values for koi_pdisposition

2) The number of False Positives in the two columns is roughly the same: koi_pdisposition has 45 more False Positives (5068 vs. 5023)

3) Following on from the first observation, and before looking at how the rows actually intersect, it appears that the Candidate value for koi_pdisposition is almost the combination of the Confirmed and Candidate values in koi_disposition (4496 vs. 2293 + 2248 = 4541)

To get a better idea of how these values align, we will:

a) Create a temporary dataframe looking only at rows which have a certain value in either column

b) Then look at the value counts of the other column.

In [32]:
# Split the rows by label, once per disposition column, so that each subset
# can be compared against the values it takes in the *other* column.

def show_counterpart_counts(labelled_frames, counterpart_col):
    """Print each label, then the value counts of `counterpart_col` in its subset.

    labelled_frames: iterable of (label, DataFrame) pairs.
    counterpart_col: name of the column whose values are tallied per subset.
    """
    for label, subset in labelled_frames:
        print(label)
        print(subset[counterpart_col].value_counts())


# Subsets selected on koi_disposition.
d_falseposi = df[df['koi_disposition'] == 'FALSE POSITIVE']
d_confirmed = df[df['koi_disposition'] == 'CONFIRMED']
d_candidate = df[df['koi_disposition'] == 'CANDIDATE']

disp_cols = [
    ('False Positive', d_falseposi),
    ('Confirmed', d_confirmed),
    ('Candidate', d_candidate),
]

# Subsets selected on koi_pdisposition.
dp_falseposi = df[df['koi_pdisposition'] == 'FALSE POSITIVE']
dp_candidate = df[df['koi_pdisposition'] == 'CANDIDATE']

dp_cols = [
    ('False Positive', dp_falseposi),
    ('Candidate', dp_candidate),
]

# For each koi_disposition subset, show the koi_pdisposition values it holds.
show_counterpart_counts(disp_cols, 'koi_pdisposition')

# Visual gap between the two directions of the comparison.
print(end='\n\n')

# For each koi_pdisposition subset, show the koi_disposition values it holds.
show_counterpart_counts(dp_cols, 'koi_disposition')


False Positive
FALSE POSITIVE    5023
Name: koi_pdisposition, dtype: int64
Confirmed
CANDIDATE         2248
FALSE POSITIVE      45
Name: koi_pdisposition, dtype: int64
Candidate
CANDIDATE    2248
Name: koi_pdisposition, dtype: int64


False Positive
FALSE POSITIVE    5023
CONFIRMED           45
Name: koi_disposition, dtype: int64
Candidate
CANDIDATE    2248
CONFIRMED    2248
Name: koi_disposition, dtype: int64


Now we have a more detailed look at how these columns are related.

Every single False Positive within the disposition column is also a False Positive for p_disposition. On the other hand, there are 45 False Positives for p_disposition that are actually listed as Confirmed in the disposition column. There are 2248 Candidates and 2248 Confirmed in the disposition column that are all listed as just Candidates for p_disposition.

In [33]:
# Column-name groups used to slice the dataset by theme.

# Identifier columns.
id_cols = [
    'kepid',
    'kepoi_name',
]

# Name and disposition columns grouped under the Exoplanet Archive.
exo_archive_cols = [
    'kepler_name',
    'koi_disposition',
]

# Pipeline disposition, its score, and the koi_fpflag_* columns.
project_disposition_cols = [
    'koi_pdisposition',
    'koi_score',
    'koi_fpflag_nt',
    'koi_fpflag_ss',
    'koi_fpflag_co',
    'koi_fpflag_ec',
]

# Candidate transit columns; some may be absent from the loaded file.
transit_cols = [
    'koi_period',
    'koi_time0bk',
    'koi_eccen',
    'koi_longp',
    'koi_duration',
    'koi_ingress',
    'koi_depth',
    'koi_ror',
    'koi_srho',
    'koi_fittype',
    'koi_prad',
    'koi_sma',
    'koi_incl',
    'koi_teq',
    'koi_insol',
    'koi_dor',
    'koi_limbdark_mod',
    'koi_parm_prov',
]

# Keep only the transit_cols entries that exist as columns of df, preserving
# their order in transit_cols, so later selections do not reference columns
# missing from the loaded file.
actual_transit = []
for col in transit_cols:
    if col in df.columns:
        actual_transit.append(col)