In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os


# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Given to us
df = pd.read_csv("cumulative.csv")
df = df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()
df.head()


Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


# Create a Train Test Split

Use `koi_disposition` for the y values

In [3]:
# See example Stu_Voice_Recognition - Class 2
X = df.drop("koi_disposition", axis=1)
feature_names = X.columns
y = df["koi_disposition"].values.reshape(-1, 1)
print(X.shape, y.shape)


(8744, 40) (8744, 1)


In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [5]:
X_train.head()


Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
8017,0,1,1,0,0.806277,4.947e-06,-4.947e-06,131.78567,0.00672,-0.00672,...,-184.0,4.471,0.054,-0.229,0.996,0.324,-0.108,290.81723,38.53912,13.614
1233,0,1,1,0,3.582077,4.318e-06,-4.318e-06,355.515064,0.000864,-0.000864,...,-235.0,4.422,0.09,-0.195,0.993,0.283,-0.131,296.07822,43.13694,15.193
2592,0,0,0,0,5.060923,2.616e-05,-2.616e-05,134.47316,0.00473,-0.00473,...,-112.0,4.492,0.048,-0.112,0.911,0.121,-0.06,289.91742,40.828606,13.346
4770,0,1,0,1,8.480304,3.32e-07,-3.32e-07,135.854534,3.1e-05,-3.1e-05,...,-169.0,3.946,0.195,-0.105,2.21,0.375,-0.563,298.8002,46.665539,7.631
6632,0,0,0,1,4.994716,4.495e-05,-4.495e-05,136.1833,0.0095,-0.0095,...,-194.0,3.706,0.32,-0.08,2.83,0.458,-1.068,282.58215,46.81551,13.352


# Pre-processing

Scale the data using the MinMaxScaler

In [6]:
# See example Ins_Data_Preprocessing - Class 1
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)


  return self.partial_fit(X, y)


In [7]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


# Decision Tree

In [8]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_scaled, y_train)
clf.score(X_test_scaled, y_test)

0.8467520585544374

# Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train_scaled, y_train)
rf.score(X_test_scaled, y_test)

  This is separate from the ipykernel package so we can avoid doing imports until


0.8956999085086916

# Importance Features

In [10]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.11273570168464522, 'koi_fpflag_co'),
 (0.09925866742942198, 'koi_fpflag_nt'),
 (0.06668376727095261, 'koi_fpflag_ss'),
 (0.0546520641317526, 'koi_model_snr'),
 (0.044601055198661255, 'koi_prad'),
 (0.03661726524051108, 'koi_fpflag_ec'),
 (0.03618985974519387, 'koi_duration_err2'),
 (0.035337532167481894, 'koi_prad_err2'),
 (0.03506029813579917, 'koi_duration_err1'),
 (0.03201226879473367, 'koi_prad_err1'),
 (0.030727334660309275, 'koi_steff_err1'),
 (0.027047535887085212, 'koi_steff_err2'),
 (0.02455869340695758, 'koi_time0bk_err2'),
 (0.02453265983256864, 'koi_duration'),
 (0.02368644414268955, 'koi_depth'),
 (0.021478572323460555, 'koi_time0bk_err1'),
 (0.020657178349964896, 'koi_period'),
 (0.020243776342384456, 'koi_period_err2'),
 (0.017781171899269692, 'koi_period_err1'),
 (0.01708746569640883, 'koi_impact'),
 (0.016006322062405133, 'koi_teq'),
 (0.014886462527475795, 'koi_insol_err1'),
 (0.014389835198211146, 'koi_time0bk'),
 (0.013311902193062804, 'koi_depth_err1'),
 (0.013