In [5]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade



In [6]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Read the CSV and Perform Basic Data Cleaning

In [8]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

#Drop rows where a candidate hasn't been declared confirmed or false
df.drop(df[df['koi_disposition'] == 'CANDIDATE'].index, inplace = True)
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Selecting features

In [10]:
#set target (y-values)
target = df[['koi_disposition']]

# Set features (x-values)
selected_features = df[['koi_fpflag_nt',
                        'koi_fpflag_ss',
                        'koi_fpflag_co',
                        'koi_fpflag_ec', 
                        'koi_period',
                        'koi_time0bk', 
                        'koi_impact',
                        'koi_duration',
                        'koi_depth',
                        'koi_prad',
                        'koi_teq',
                        'koi_insol',
                        'koi_model_snr',
                        'koi_tce_plnt_num',
                        'koi_steff',
                        'koi_slogg',
                        'koi_srad',
                        'ra',
                        'dec',
                        'koi_kepmag']]


In [11]:
##confirming deletion of "candidate" rows

target_list = target.values
np.unique(target_list)


array(['CONFIRMED', 'FALSE POSITIVE'], dtype=object)

# Label Encoding

In [21]:
#reformat data
target = df[['koi_disposition']]
y = target.values.reshape(-1,1)


#label encoding
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_targets = label_encoder.transform(y)

encoded_targets


  return f(*args, **kwargs)


array([0, 1, 1, ..., 1, 1, 1])

# Train Test Split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(selected_features, encoded_targets, random_state= 43)

# Pre-processing


In [14]:
# Scale your data with MinMaxScaler


X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)

X_test_scaled = X_scaler.transform(X_test)



# Train the Model



In [15]:
rf = RandomForestClassifier(n_estimators = 1000)
rf = rf.fit(X_train_scaled, y_train)
training_score = rf.score(X_train_scaled, y_train)
test_score = rf.score(X_test_scaled, y_test)

In [16]:
print(f'Training Score: {training_score}')
print(f'Test Score: {test_score}')

Training Score: 1.0
Test Score: 0.9894419306184012


In [17]:
importances = rf.feature_importances_
sorted(zip(rf.feature_importances_, list(selected_features.columns.values)), reverse = True)

[(0.18257799431463212, 'koi_fpflag_co'),
 (0.1469682702786833, 'koi_fpflag_nt'),
 (0.1306953111591826, 'koi_fpflag_ss'),
 (0.11170405536553929, 'koi_prad'),
 (0.07363980938329014, 'koi_model_snr'),
 (0.05827644478038467, 'koi_fpflag_ec'),
 (0.04518453291076131, 'koi_period'),
 (0.04287324128494685, 'koi_depth'),
 (0.03877539374324937, 'koi_teq'),
 (0.03859254404176461, 'koi_insol'),
 (0.03722279905724088, 'koi_impact'),
 (0.0197962378467779, 'koi_duration'),
 (0.018444722596687998, 'koi_time0bk'),
 (0.010020410329967122, 'koi_steff'),
 (0.009359239593527363, 'koi_srad'),
 (0.009211077894028181, 'koi_tce_plnt_num'),
 (0.00811588349390463, 'koi_slogg'),
 (0.00773766313798227, 'ra'),
 (0.0055170785422815624, 'koi_kepmag'),
 (0.005287290245167826, 'dec')]

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [19]:
# Create the GridSearchCV model

n_estimators = [10, 100, 250, 500, 1000, 2000]
criterion= ["gini", "entropy"]
max_depth = [2, 8, 16, 32, 64, 128, 500, 1000]
bootstrap = [True, False]
param_grid = {"n_estimators": n_estimators,
              "criterion": criterion,
              "max_depth": max_depth,
              "bootstrap": bootstrap}

grid = GridSearchCV(rf, param_grid, verbose=3)



In [20]:
# Train the model with GridSearch

grid.fit(X_train_scaled, y_train)



Fitting 5 folds for each of 192 candidates, totalling 960 fits
[CV 1/5] END bootstrap=True, criterion=gini, max_depth=2, n_estimators=10; total time=   0.0s
[CV 2/5] END bootstrap=True, criterion=gini, max_depth=2, n_estimators=10; total time=   0.0s
[CV 3/5] END bootstrap=True, criterion=gini, max_depth=2, n_estimators=10; total time=   0.0s
[CV 4/5] END bootstrap=True, criterion=gini, max_depth=2, n_estimators=10; total time=   0.0s
[CV 5/5] END bootstrap=True, criterion=gini, max_depth=2, n_estimators=10; total time=   0.0s
[CV 1/5] END bootstrap=True, criterion=gini, max_depth=2, n_estimators=100; total time=   0.2s
[CV 2/5] END bootstrap=True, criterion=gini, max_depth=2, n_estimators=100; total time=   0.2s
[CV 3/5] END bootstrap=True, criterion=gini, max_depth=2, n_estimators=100; total time=   0.2s
[CV 4/5] END bootstrap=True, criterion=gini, max_depth=2, n_estimators=100; total time=   0.2s
[CV 5/5] END bootstrap=True, criterion=gini, max_depth=2, n_estimators=100; total time=

[CV 2/5] END bootstrap=True, criterion=gini, max_depth=16, n_estimators=2000; total time=  11.2s
[CV 3/5] END bootstrap=True, criterion=gini, max_depth=16, n_estimators=2000; total time=  12.8s
[CV 4/5] END bootstrap=True, criterion=gini, max_depth=16, n_estimators=2000; total time=  19.0s
[CV 5/5] END bootstrap=True, criterion=gini, max_depth=16, n_estimators=2000; total time=  11.8s
[CV 1/5] END bootstrap=True, criterion=gini, max_depth=32, n_estimators=10; total time=   0.0s
[CV 2/5] END bootstrap=True, criterion=gini, max_depth=32, n_estimators=10; total time=   0.0s
[CV 3/5] END bootstrap=True, criterion=gini, max_depth=32, n_estimators=10; total time=   0.0s
[CV 4/5] END bootstrap=True, criterion=gini, max_depth=32, n_estimators=10; total time=   0.0s
[CV 5/5] END bootstrap=True, criterion=gini, max_depth=32, n_estimators=10; total time=   0.0s
[CV 1/5] END bootstrap=True, criterion=gini, max_depth=32, n_estimators=100; total time=   0.6s
[CV 2/5] END bootstrap=True, criterion=gi

[CV 3/5] END bootstrap=True, criterion=gini, max_depth=128, n_estimators=1000; total time=   5.7s
[CV 4/5] END bootstrap=True, criterion=gini, max_depth=128, n_estimators=1000; total time=   5.7s
[CV 5/5] END bootstrap=True, criterion=gini, max_depth=128, n_estimators=1000; total time=   5.9s
[CV 1/5] END bootstrap=True, criterion=gini, max_depth=128, n_estimators=2000; total time=  12.1s
[CV 2/5] END bootstrap=True, criterion=gini, max_depth=128, n_estimators=2000; total time=  16.7s
[CV 3/5] END bootstrap=True, criterion=gini, max_depth=128, n_estimators=2000; total time=  14.6s
[CV 4/5] END bootstrap=True, criterion=gini, max_depth=128, n_estimators=2000; total time=  11.9s
[CV 5/5] END bootstrap=True, criterion=gini, max_depth=128, n_estimators=2000; total time=  11.7s
[CV 1/5] END bootstrap=True, criterion=gini, max_depth=500, n_estimators=10; total time=   0.0s
[CV 2/5] END bootstrap=True, criterion=gini, max_depth=500, n_estimators=10; total time=   0.0s
[CV 3/5] END bootstrap=T

[CV 2/5] END bootstrap=True, criterion=entropy, max_depth=2, n_estimators=500; total time=   1.6s
[CV 3/5] END bootstrap=True, criterion=entropy, max_depth=2, n_estimators=500; total time=   1.6s
[CV 4/5] END bootstrap=True, criterion=entropy, max_depth=2, n_estimators=500; total time=   1.7s
[CV 5/5] END bootstrap=True, criterion=entropy, max_depth=2, n_estimators=500; total time=   1.5s
[CV 1/5] END bootstrap=True, criterion=entropy, max_depth=2, n_estimators=1000; total time=   3.5s
[CV 2/5] END bootstrap=True, criterion=entropy, max_depth=2, n_estimators=1000; total time=   3.4s
[CV 3/5] END bootstrap=True, criterion=entropy, max_depth=2, n_estimators=1000; total time=   3.4s
[CV 4/5] END bootstrap=True, criterion=entropy, max_depth=2, n_estimators=1000; total time=   3.6s
[CV 5/5] END bootstrap=True, criterion=entropy, max_depth=2, n_estimators=1000; total time=   3.4s
[CV 1/5] END bootstrap=True, criterion=entropy, max_depth=2, n_estimators=2000; total time=   6.8s
[CV 2/5] END b

[CV 1/5] END bootstrap=True, criterion=entropy, max_depth=32, n_estimators=250; total time=   1.7s
[CV 2/5] END bootstrap=True, criterion=entropy, max_depth=32, n_estimators=250; total time=   1.6s
[CV 3/5] END bootstrap=True, criterion=entropy, max_depth=32, n_estimators=250; total time=   1.9s
[CV 4/5] END bootstrap=True, criterion=entropy, max_depth=32, n_estimators=250; total time=   1.6s
[CV 5/5] END bootstrap=True, criterion=entropy, max_depth=32, n_estimators=250; total time=   1.7s
[CV 1/5] END bootstrap=True, criterion=entropy, max_depth=32, n_estimators=500; total time=   3.4s
[CV 2/5] END bootstrap=True, criterion=entropy, max_depth=32, n_estimators=500; total time=   3.2s
[CV 3/5] END bootstrap=True, criterion=entropy, max_depth=32, n_estimators=500; total time=   3.8s
[CV 4/5] END bootstrap=True, criterion=entropy, max_depth=32, n_estimators=500; total time=   3.5s
[CV 5/5] END bootstrap=True, criterion=entropy, max_depth=32, n_estimators=500; total time=   3.6s
[CV 1/5] E

[CV 1/5] END bootstrap=True, criterion=entropy, max_depth=500, n_estimators=100; total time=   0.5s
[CV 2/5] END bootstrap=True, criterion=entropy, max_depth=500, n_estimators=100; total time=   0.5s
[CV 3/5] END bootstrap=True, criterion=entropy, max_depth=500, n_estimators=100; total time=   0.8s
[CV 4/5] END bootstrap=True, criterion=entropy, max_depth=500, n_estimators=100; total time=   0.6s
[CV 5/5] END bootstrap=True, criterion=entropy, max_depth=500, n_estimators=100; total time=   0.6s
[CV 1/5] END bootstrap=True, criterion=entropy, max_depth=500, n_estimators=250; total time=   1.6s
[CV 2/5] END bootstrap=True, criterion=entropy, max_depth=500, n_estimators=250; total time=   1.8s
[CV 3/5] END bootstrap=True, criterion=entropy, max_depth=500, n_estimators=250; total time=   1.6s
[CV 4/5] END bootstrap=True, criterion=entropy, max_depth=500, n_estimators=250; total time=   1.7s
[CV 5/5] END bootstrap=True, criterion=entropy, max_depth=500, n_estimators=250; total time=   1.7s


[CV 4/5] END bootstrap=False, criterion=gini, max_depth=2, n_estimators=2000; total time=   6.4s
[CV 5/5] END bootstrap=False, criterion=gini, max_depth=2, n_estimators=2000; total time=   6.3s
[CV 1/5] END bootstrap=False, criterion=gini, max_depth=8, n_estimators=10; total time=   0.0s
[CV 2/5] END bootstrap=False, criterion=gini, max_depth=8, n_estimators=10; total time=   0.0s
[CV 3/5] END bootstrap=False, criterion=gini, max_depth=8, n_estimators=10; total time=   0.0s
[CV 4/5] END bootstrap=False, criterion=gini, max_depth=8, n_estimators=10; total time=   0.0s
[CV 5/5] END bootstrap=False, criterion=gini, max_depth=8, n_estimators=10; total time=   0.0s
[CV 1/5] END bootstrap=False, criterion=gini, max_depth=8, n_estimators=100; total time=   0.6s
[CV 2/5] END bootstrap=False, criterion=gini, max_depth=8, n_estimators=100; total time=   0.5s
[CV 3/5] END bootstrap=False, criterion=gini, max_depth=8, n_estimators=100; total time=   0.6s
[CV 4/5] END bootstrap=False, criterion=gin

[CV 4/5] END bootstrap=False, criterion=gini, max_depth=32, n_estimators=1000; total time=   8.0s
[CV 5/5] END bootstrap=False, criterion=gini, max_depth=32, n_estimators=1000; total time=   8.2s
[CV 1/5] END bootstrap=False, criterion=gini, max_depth=32, n_estimators=2000; total time=  17.5s
[CV 2/5] END bootstrap=False, criterion=gini, max_depth=32, n_estimators=2000; total time=  18.0s
[CV 3/5] END bootstrap=False, criterion=gini, max_depth=32, n_estimators=2000; total time=  18.8s
[CV 4/5] END bootstrap=False, criterion=gini, max_depth=32, n_estimators=2000; total time=  18.6s
[CV 5/5] END bootstrap=False, criterion=gini, max_depth=32, n_estimators=2000; total time=  17.5s
[CV 1/5] END bootstrap=False, criterion=gini, max_depth=64, n_estimators=10; total time=   0.0s
[CV 2/5] END bootstrap=False, criterion=gini, max_depth=64, n_estimators=10; total time=   0.0s
[CV 3/5] END bootstrap=False, criterion=gini, max_depth=64, n_estimators=10; total time=   0.0s
[CV 4/5] END bootstrap=Fal

[CV 3/5] END bootstrap=False, criterion=gini, max_depth=500, n_estimators=500; total time=   4.0s
[CV 4/5] END bootstrap=False, criterion=gini, max_depth=500, n_estimators=500; total time=   3.9s
[CV 5/5] END bootstrap=False, criterion=gini, max_depth=500, n_estimators=500; total time=   4.2s
[CV 1/5] END bootstrap=False, criterion=gini, max_depth=500, n_estimators=1000; total time=  16.5s
[CV 2/5] END bootstrap=False, criterion=gini, max_depth=500, n_estimators=1000; total time=   8.6s
[CV 3/5] END bootstrap=False, criterion=gini, max_depth=500, n_estimators=1000; total time=   8.1s
[CV 4/5] END bootstrap=False, criterion=gini, max_depth=500, n_estimators=1000; total time=   8.3s
[CV 5/5] END bootstrap=False, criterion=gini, max_depth=500, n_estimators=1000; total time=   8.4s
[CV 1/5] END bootstrap=False, criterion=gini, max_depth=500, n_estimators=2000; total time=  33.9s
[CV 2/5] END bootstrap=False, criterion=gini, max_depth=500, n_estimators=2000; total time=  19.5s
[CV 3/5] END 

[CV 1/5] END bootstrap=False, criterion=entropy, max_depth=8, n_estimators=250; total time=   2.2s
[CV 2/5] END bootstrap=False, criterion=entropy, max_depth=8, n_estimators=250; total time=   2.2s
[CV 3/5] END bootstrap=False, criterion=entropy, max_depth=8, n_estimators=250; total time=   4.4s
[CV 4/5] END bootstrap=False, criterion=entropy, max_depth=8, n_estimators=250; total time=   6.1s
[CV 5/5] END bootstrap=False, criterion=entropy, max_depth=8, n_estimators=250; total time=   5.8s
[CV 1/5] END bootstrap=False, criterion=entropy, max_depth=8, n_estimators=500; total time=   6.6s
[CV 2/5] END bootstrap=False, criterion=entropy, max_depth=8, n_estimators=500; total time=   4.8s
[CV 3/5] END bootstrap=False, criterion=entropy, max_depth=8, n_estimators=500; total time=   5.1s
[CV 4/5] END bootstrap=False, criterion=entropy, max_depth=8, n_estimators=500; total time=   4.5s
[CV 5/5] END bootstrap=False, criterion=entropy, max_depth=8, n_estimators=500; total time=   4.8s
[CV 1/5] E

[CV 3/5] END bootstrap=False, criterion=entropy, max_depth=64, n_estimators=10; total time=   0.2s
[CV 4/5] END bootstrap=False, criterion=entropy, max_depth=64, n_estimators=10; total time=   0.2s
[CV 5/5] END bootstrap=False, criterion=entropy, max_depth=64, n_estimators=10; total time=   0.1s
[CV 1/5] END bootstrap=False, criterion=entropy, max_depth=64, n_estimators=100; total time=   2.6s
[CV 2/5] END bootstrap=False, criterion=entropy, max_depth=64, n_estimators=100; total time=   2.6s
[CV 3/5] END bootstrap=False, criterion=entropy, max_depth=64, n_estimators=100; total time=   2.6s
[CV 4/5] END bootstrap=False, criterion=entropy, max_depth=64, n_estimators=100; total time=   2.5s
[CV 5/5] END bootstrap=False, criterion=entropy, max_depth=64, n_estimators=100; total time=   1.4s
[CV 1/5] END bootstrap=False, criterion=entropy, max_depth=64, n_estimators=250; total time=   3.0s
[CV 2/5] END bootstrap=False, criterion=entropy, max_depth=64, n_estimators=250; total time=   2.5s
[CV

[CV 5/5] END bootstrap=False, criterion=entropy, max_depth=500, n_estimators=1000; total time=  11.0s
[CV 1/5] END bootstrap=False, criterion=entropy, max_depth=500, n_estimators=2000; total time=  32.8s
[CV 2/5] END bootstrap=False, criterion=entropy, max_depth=500, n_estimators=2000; total time=  34.4s
[CV 3/5] END bootstrap=False, criterion=entropy, max_depth=500, n_estimators=2000; total time=  42.4s
[CV 4/5] END bootstrap=False, criterion=entropy, max_depth=500, n_estimators=2000; total time=  22.9s
[CV 5/5] END bootstrap=False, criterion=entropy, max_depth=500, n_estimators=2000; total time=  27.8s
[CV 1/5] END bootstrap=False, criterion=entropy, max_depth=1000, n_estimators=10; total time=   0.0s
[CV 2/5] END bootstrap=False, criterion=entropy, max_depth=1000, n_estimators=10; total time=   0.0s
[CV 3/5] END bootstrap=False, criterion=entropy, max_depth=1000, n_estimators=10; total time=   0.0s
[CV 4/5] END bootstrap=False, criterion=entropy, max_depth=1000, n_estimators=10; tot

GridSearchCV(estimator=RandomForestClassifier(n_estimators=1000),
             param_grid={'bootstrap': [True, False],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 8, 16, 32, 64, 128, 500, 1000],
                         'n_estimators': [10, 100, 250, 500, 1000, 2000]},
             verbose=3)

In [22]:
print(grid.best_params_)
print(grid.best_score_)

{'bootstrap': False, 'criterion': 'entropy', 'max_depth': 16, 'n_estimators': 1000}
0.9904487848045258


# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'your_name.sav'
joblib.dump(your_model, filename)