In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade



In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [16]:
from sklearn import tree
import pandas as pd
import os

# Read the CSV and Perform Basic Data Cleaning

In [17]:
df = pd.read_csv("Resources/exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [18]:
# Set features. This will also be used as your x values.
target = df["koi_disposition"]
target_names = ["negative" , "positive"]

In [19]:
data = df.drop("koi_disposition", axis =1)
feature_names = data.columns
data.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Create a Train Test Split

Use `koi_disposition` for the y values

In [20]:
# Split the data using train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [21]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
6122,0,0,0,0,6.768901,7.38e-05,-7.38e-05,133.07724,0.00844,-0.00844,...,-171,4.327,0.153,-0.187,1.125,0.31,-0.207,294.40472,39.351681,14.725
6370,0,1,0,1,0.733726,6.06e-06,-6.06e-06,132.02005,0.00795,-0.00795,...,-175,4.578,0.033,-0.187,0.797,0.211,-0.056,284.50391,42.46386,15.77
2879,1,0,0,0,7.652707,6.54e-05,-6.54e-05,134.46038,0.00619,-0.00619,...,-189,4.481,0.05,-0.2,0.963,0.29,-0.097,295.50211,38.98354,13.099
107,0,0,0,0,7.953547,1.91e-05,-1.91e-05,174.66224,0.00182,-0.00182,...,-85,4.536,0.056,-0.016,0.779,0.023,-0.049,291.15878,40.750271,15.66
29,0,0,0,0,4.959319,5.15e-07,-5.15e-07,172.258529,8.3e-05,-8.3e-05,...,-77,4.359,0.11,-0.11,1.082,0.173,-0.13,292.16705,48.727589,15.263


In [22]:
# Create a Decision Tree Classifier
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [23]:
data.describe()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
count,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,...,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0
mean,0.157059,0.244743,0.202975,0.125018,56.191248,0.001851122,-0.001851122,164.48882,0.00934,-0.00934,...,-161.20698,4.305049,0.121091,-0.14048,1.740749,0.35271,-0.388568,292.082406,43.812143,14.271508
std,0.363882,0.429966,0.402243,0.330763,117.570962,0.007184503,0.007184503,67.020475,0.021989,0.021989,...,71.448481,0.439238,0.132048,0.08199,5.903415,0.839017,1.907797,4.762908,3.606167,1.350802
min,0.0,0.0,0.0,0.0,0.25982,1.1e-08,-0.1568,120.515914,9e-06,-0.569,...,-1733.0,0.047,0.0,-1.007,0.109,0.0,-103.825,279.85608,36.577381,6.966
25%,0.0,0.0,0.0,0.0,2.620126,5.005e-06,-0.0002401,132.683917,0.001145,-0.01,...,-197.0,4.209,0.044,-0.195,0.829,0.128,-0.252,288.70473,40.79776,13.455
50%,0.0,0.0,0.0,0.0,8.947426,3.3e-05,-3.3e-05,136.73923,0.00399,-0.00399,...,-159.0,4.436,0.07,-0.127,0.999,0.248,-0.111,292.31476,43.679661,14.534
75%,0.0,0.0,0.0,0.0,34.282605,0.0002401,-5.005e-06,169.937005,0.01,-0.001145,...,-112.0,4.543,0.149,-0.088,1.357,0.357,-0.069,295.88855,46.693659,15.322
max,1.0,1.0,1.0,1.0,1071.232624,0.1568,-1.1e-08,1472.522306,0.569,-9e-06,...,0.0,5.364,1.472,0.0,180.013,25.956,0.0,301.72076,52.33601,19.065


In [24]:
clf.score(X_test, y_test)

0.8432494279176201

In [25]:
# Fit the classifier to the data
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
# 0.729167

In [26]:
rf.score(X_test, y_test)

0.8998855835240275

In [27]:
features_data=sorted(zip(rf.feature_importances_, feature_names), reverse=True)

In [28]:
 features_data

[(0.10302616234264474, 'koi_fpflag_co'),
 (0.10285816543681475, 'koi_fpflag_nt'),
 (0.07102747968771927, 'koi_fpflag_ss'),
 (0.05543272025317121, 'koi_model_snr'),
 (0.043473806424630876, 'koi_prad'),
 (0.036737116211899706, 'koi_duration_err1'),
 (0.03330703486650935, 'koi_steff_err1'),
 (0.03238812774167004, 'koi_fpflag_ec'),
 (0.03232317403594832, 'koi_duration_err2'),
 (0.030376263862039097, 'koi_prad_err2'),
 (0.030139684987393528, 'koi_prad_err1'),
 (0.02801401689980022, 'koi_steff_err2'),
 (0.023097922715847968, 'koi_period'),
 (0.022169342736265724, 'koi_duration'),
 (0.022119475800461804, 'koi_time0bk_err2'),
 (0.021719091763947772, 'koi_depth'),
 (0.020820671833089836, 'koi_time0bk_err1'),
 (0.019794025848119588, 'koi_impact'),
 (0.018747293393737582, 'koi_period_err2'),
 (0.018499686241235484, 'koi_insol_err1'),
 (0.01813387168173789, 'koi_period_err1'),
 (0.017000548799375056, 'koi_insol'),
 (0.016180259928194097, 'koi_teq'),
 (0.01594120863102529, 'koi_insol_err2'),
 (0.01

In [29]:
[x[0] for x in features_data]

[0.10302616234264474,
 0.10285816543681475,
 0.07102747968771927,
 0.05543272025317121,
 0.043473806424630876,
 0.036737116211899706,
 0.03330703486650935,
 0.03238812774167004,
 0.03232317403594832,
 0.030376263862039097,
 0.030139684987393528,
 0.02801401689980022,
 0.023097922715847968,
 0.022169342736265724,
 0.022119475800461804,
 0.021719091763947772,
 0.020820671833089836,
 0.019794025848119588,
 0.018747293393737582,
 0.018499686241235484,
 0.01813387168173789,
 0.017000548799375056,
 0.016180259928194097,
 0.01594120863102529,
 0.013577079859226217,
 0.013490117376311138,
 0.012739188997306496,
 0.012208811725963536,
 0.01210752925237702,
 0.011899080959351032,
 0.011522885314311393,
 0.01106956707527889,
 0.010224131668037942,
 0.010180047755712036,
 0.010147923907617819,
 0.009259318292792392,
 0.008926526893299752,
 0.008463519098151477,
 0.008153530147367208,
 0.002703589553616458]

In [30]:
np.sum([x[0] for x in features_data])

NameError: name 'np' is not defined

In [31]:
X_train.iloc[:,1:]

Unnamed: 0,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
6122,0,0,0,6.768901,7.380000e-05,-7.380000e-05,133.077240,0.008440,-0.008440,0.150,...,-171,4.327,0.153,-0.187,1.125,0.310,-0.207,294.40472,39.351681,14.725
6370,1,0,1,0.733726,6.060000e-06,-6.060000e-06,132.020050,0.007950,-0.007950,0.291,...,-175,4.578,0.033,-0.187,0.797,0.211,-0.056,284.50391,42.463860,15.770
2879,0,0,0,7.652707,6.540000e-05,-6.540000e-05,134.460380,0.006190,-0.006190,0.970,...,-189,4.481,0.050,-0.200,0.963,0.290,-0.097,295.50211,38.983540,13.099
107,0,0,0,7.953547,1.910000e-05,-1.910000e-05,174.662240,0.001820,-0.001820,0.300,...,-85,4.536,0.056,-0.016,0.779,0.023,-0.049,291.15878,40.750271,15.660
29,0,0,0,4.959319,5.150000e-07,-5.150000e-07,172.258529,0.000083,-0.000083,0.831,...,-77,4.359,0.110,-0.110,1.082,0.173,-0.130,292.16705,48.727589,15.263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,1,0,0,8.268081,6.340000e-07,-6.340000e-07,135.056330,0.000064,-0.000064,1.099,...,-190,4.502,0.050,-0.200,0.922,0.273,-0.091,292.53125,46.728699,15.768
5191,0,0,0,11.161938,1.677000e-04,-1.677000e-04,133.553800,0.013000,-0.013000,0.739,...,-124,4.072,0.188,-0.101,1.640,0.281,-0.343,295.21268,49.562180,13.374
5226,1,0,0,6.150251,7.000000e-07,-7.000000e-07,134.422825,0.000088,-0.000088,1.270,...,-458,3.896,0.270,-0.180,2.867,0.988,-1.087,297.18176,45.988441,10.622
5390,0,0,0,3.343285,4.380000e-05,-4.380000e-05,134.845100,0.011200,-0.011200,1.210,...,-197,3.773,0.293,-0.098,2.652,0.433,-0.939,296.86258,41.147419,13.276


In [32]:
# from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train.iloc[:,1:], y_train)

In [33]:
rf.score(X_test.iloc[:,1:], y_test)

0.8495423340961098

In [34]:
feature_names = data.iloc[:,1:].columns

In [35]:
features_data=sorted(zip(rf.feature_importances_, feature_names), reverse=True)

In [36]:
features_data

[(0.10344017465445904, 'koi_fpflag_co'),
 (0.060196545036592594, 'koi_fpflag_ss'),
 (0.05977887712185082, 'koi_model_snr'),
 (0.05225463360393756, 'koi_prad'),
 (0.040066996274379915, 'koi_duration_err1'),
 (0.038810540229084416, 'koi_duration_err2'),
 (0.03574270854346386, 'koi_steff_err1'),
 (0.034836675292299404, 'koi_fpflag_ec'),
 (0.03372038590059782, 'koi_prad_err2'),
 (0.03080390089757596, 'koi_steff_err2'),
 (0.030515533146731534, 'koi_prad_err1'),
 (0.02987490723451296, 'koi_period'),
 (0.026732029426637435, 'koi_depth'),
 (0.026335595291524964, 'koi_duration'),
 (0.024582231328786282, 'koi_time0bk_err1'),
 (0.024020629030178445, 'koi_time0bk_err2'),
 (0.023758670011733964, 'koi_insol_err1'),
 (0.02231462987325875, 'koi_period_err2'),
 (0.02087478023936006, 'koi_impact'),
 (0.019827440607126427, 'koi_period_err1'),
 (0.018538029083482347, 'koi_insol'),
 (0.018514834449038106, 'koi_teq'),
 (0.01794255377744076, 'koi_time0bk'),
 (0.01661007872399498, 'koi_srad_err1'),
 (0.016501

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [37]:
# Scale your data

# Train the Model



In [38]:
print(f"Training Data Score: {model2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model2.score(X_test_scaled, y_test)}")

NameError: name 'model2' is not defined

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
# Create the GridSearchCV model

In [None]:
# Train the model with GridSearch

In [None]:
print(grid2.best_params_)
print(grid2.best_score_)

# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'model_random_forest.sav'
joblib.dump(your_model, filename)