In [1]:
from sklearn import tree
import pandas as pd
import os
import numpy as np

# Preprocess the Data

In [2]:
# Pull in cumulative.csv
rawData = pd.read_csv(os.path.join("Resources", "cumulative.csv"))
rawData.head()

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


## Preprocess the raw dataset prior to fitting the model

In [3]:
rawData.count()

rowid                9564
kepid                9564
kepoi_name           9564
kepler_name          2294
koi_disposition      9564
koi_pdisposition     9564
koi_score            8054
koi_fpflag_nt        9564
koi_fpflag_ss        9564
koi_fpflag_co        9564
koi_fpflag_ec        9564
koi_period           9564
koi_period_err1      9110
koi_period_err2      9110
koi_time0bk          9564
koi_time0bk_err1     9110
koi_time0bk_err2     9110
koi_impact           9201
koi_impact_err1      9110
koi_impact_err2      9110
koi_duration         9564
koi_duration_err1    9110
koi_duration_err2    9110
koi_depth            9201
koi_depth_err1       9110
koi_depth_err2       9110
koi_prad             9201
koi_prad_err1        9201
koi_prad_err2        9201
koi_teq              9201
koi_teq_err1            0
koi_teq_err2            0
koi_insol            9243
koi_insol_err1       9243
koi_insol_err2       9243
koi_model_snr        9201
koi_tce_plnt_num     9218
koi_tce_delivname    9218
koi_steff   

In [4]:
rawData.shape

(9564, 50)

In [5]:
# Features
features = rawData

In [6]:
features.shape

(9564, 50)

In [7]:
features = features.dropna(subset=['koi_score', 'koi_period_err1','koi_period_err2','koi_time0bk_err1','koi_time0bk_err2','koi_impact','koi_impact_err1','koi_impact_err2','koi_duration_err1','koi_duration_err2','koi_depth','koi_depth_err1','koi_depth_err2','koi_prad','koi_prad_err1','koi_prad_err2','koi_teq','koi_insol','koi_insol_err1','koi_insol_err2','koi_model_snr','koi_steff','koi_steff_err1','koi_steff_err2','koi_slogg','koi_slogg_err1','koi_slogg_err2','koi_srad','koi_srad_err1','koi_srad_err2'])

In [8]:
features.shape

(7803, 50)

In [9]:
features['Label'] = (features.koi_disposition == 'CONFIRMED').astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [10]:
# define target

y = np.array(features.Label)
# convert to numerical value

# show y
y

array([1, 1, 0, ..., 0, 0, 0])

In [11]:
# drop non used columns
features = features.drop('rowid', axis=1)
features = features.drop('kepid', axis=1)
features = features.drop('kepoi_name', axis=1)
features = features.drop('Label', axis=1)
features = features.drop('koi_score', axis=1)
features = features.drop('koi_disposition', axis=1)
features = features.drop('koi_pdisposition', axis=1)
features = features.drop('kepler_name', axis=1)
features = features.drop('koi_tce_plnt_num', axis=1)
features = features.drop('koi_tce_delivname', axis=1)
# error columns removed per feature review below
features = features.drop('koi_period_err1', axis=1)
features = features.drop('koi_period_err2', axis=1)
features = features.drop('koi_time0bk_err1', axis=1)
features = features.drop('koi_time0bk_err2', axis=1)
features = features.drop('koi_impact_err1', axis=1)
features = features.drop('koi_impact_err2', axis=1)
features = features.drop('koi_duration_err1', axis=1)
features = features.drop('koi_duration_err2', axis=1)
features = features.drop('koi_depth_err1', axis=1)
features = features.drop('koi_depth_err2', axis=1)
features = features.drop('koi_prad_err1', axis=1)
features = features.drop('koi_prad_err2', axis=1)
features = features.drop('koi_teq_err1', axis=1)
features = features.drop('koi_teq_err2', axis=1)
features = features.drop('koi_insol_err1', axis=1)
features = features.drop('koi_insol_err2', axis=1)
features = features.drop('koi_steff_err1', axis=1)
features = features.drop('koi_steff_err2', axis=1)
features = features.drop('koi_slogg_err1', axis=1)
features = features.drop('koi_slogg_err2', axis=1)
features = features.drop('koi_srad_err1', axis=1)
features = features.drop('koi_srad_err2', axis=1)

In [12]:
features.count()

koi_fpflag_nt    7803
koi_fpflag_ss    7803
koi_fpflag_co    7803
koi_fpflag_ec    7803
koi_period       7803
koi_time0bk      7803
koi_impact       7803
koi_duration     7803
koi_depth        7803
koi_prad         7803
koi_teq          7803
koi_insol        7803
koi_model_snr    7803
koi_steff        7803
koi_slogg        7803
koi_srad         7803
ra               7803
dec              7803
koi_kepmag       7803
dtype: int64

In [13]:
features.shape

(7803, 19)

In [14]:
features.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
0,0,0,0,0,9.488036,170.53875,0.146,2.9575,615.8,2.26,793.0,93.59,35.8,5455.0,4.467,0.927,291.93423,48.141651,15.347
1,0,0,0,0,54.418383,162.51384,0.586,4.507,874.8,2.83,443.0,9.11,25.8,5455.0,4.467,0.927,291.93423,48.141651,15.347
2,0,1,0,0,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638.0,39.3,76.3,5853.0,4.544,0.868,297.00482,48.134129,15.436
3,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395.0,891.96,505.6,5805.0,4.564,0.791,285.53461,48.28521,15.597
4,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406.0,926.16,40.9,6031.0,4.438,1.046,288.75488,48.2262,15.509


In [15]:
# only numerical values - there should not be any
features = features.select_dtypes(include=['number'])
features.shape                                   

(7803, 19)

In [16]:
features.describe()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
count,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0
mean,0.114828,0.263873,0.215174,0.133667,37.314267,157.877359,0.609685,5.349509,25728.006946,26.965784,1133.811739,7679.77,298.205267,5689.23978,4.320219,1.678491,292.073715,43.836122,14.309852
std,0.318834,0.440759,0.410969,0.340315,86.826061,57.892061,0.707276,6.198227,83507.082445,313.055376,832.013752,167949.9,853.877036,788.589568,0.420923,5.871606,4.787246,3.599892,1.335016
min,0.0,0.0,0.0,0.0,0.25982,120.565925,0.0,0.3028,4.5,0.14,92.0,0.02,2.4,2661.0,0.047,0.109,279.85608,36.577381,6.966
25%,0.0,0.0,0.0,0.0,2.455472,132.63356,0.213,2.417,162.35,1.41,610.0,32.635,14.4,5306.5,4.228,0.826,288.68173,40.81496,13.519
50%,0.0,0.0,0.0,0.0,7.698431,136.0162,0.579,3.736,447.8,2.46,934.0,180.14,27.4,5753.0,4.442,0.993,292.29459,43.718182,14.582
75%,0.0,1.0,0.0,0.0,24.089219,159.775731,0.9075,5.973,1862.35,19.015,1426.0,975.975,99.35,6100.0,4.545,1.3195,295.895305,46.73485,15.3335
max,1.0,1.0,1.0,1.0,1071.232624,1472.522306,25.224,138.54,921670.0,26042.9,14667.0,10947550.0,9054.7,15896.0,5.364,180.013,301.72076,52.33601,19.065


In [17]:
feature_names = features.columns
feature_names

Index(['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
       'koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth',
       'koi_prad', 'koi_teq', 'koi_insol', 'koi_model_snr', 'koi_steff',
       'koi_slogg', 'koi_srad', 'ra', 'dec', 'koi_kepmag'],
      dtype='object')

In [18]:
# X is an array of arrays based on the dataframe
X = features.to_numpy(copy=True)
X

array([[  0.      ,   0.      ,   0.      , ..., 291.93423 ,  48.141651,
         15.347   ],
       [  0.      ,   0.      ,   0.      , ..., 291.93423 ,  48.141651,
         15.347   ],
       [  0.      ,   1.      ,   0.      , ..., 297.00482 ,  48.134129,
         15.436   ],
       ...,
       [  0.      ,   0.      ,   0.      , ..., 286.50937 ,  47.163219,
         14.757   ],
       [  0.      ,   0.      ,   1.      , ..., 294.16489 ,  47.176281,
         15.385   ],
       [  0.      ,   0.      ,   1.      , ..., 297.00977 ,  47.121021,
         14.826   ]])

## Perform feature selection and remove unnecessary features

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_train, y_train)

1.0

In [22]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_
importances

array([0.02486544, 0.0579346 , 0.08220616, 0.02255377, 0.04652864,
       0.03602047, 0.07087274, 0.04663028, 0.0614051 , 0.11316039,
       0.04240811, 0.04083403, 0.17297702, 0.03033544, 0.02998916,
       0.02862562, 0.03154503, 0.02969323, 0.03141478])

In [23]:
# We can sort the features by their importance
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.17297702334056986, 'koi_model_snr'),
 (0.11316038609389577, 'koi_prad'),
 (0.08220615664057876, 'koi_fpflag_co'),
 (0.07087274005415378, 'koi_impact'),
 (0.061405097118620355, 'koi_depth'),
 (0.05793459594361778, 'koi_fpflag_ss'),
 (0.04663027511520748, 'koi_duration'),
 (0.046528639798448373, 'koi_period'),
 (0.04240811187155299, 'koi_teq'),
 (0.04083402729187062, 'koi_insol'),
 (0.03602047317872307, 'koi_time0bk'),
 (0.03154502992625068, 'ra'),
 (0.0314147808867297, 'koi_kepmag'),
 (0.030335439702600858, 'koi_steff'),
 (0.02998916359378712, 'koi_slogg'),
 (0.029693231200510848, 'dec'),
 (0.028625616318438202, 'koi_srad'),
 (0.024865438296039682, 'koi_fpflag_nt'),
 (0.022553773628404192, 'koi_fpflag_ec')]

In [24]:
# validate per test data
rf.score(X_test, y_test)

0.9036391594054332