In [2]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade



In [3]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [90]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Read the CSV and Perform Basic Data Cleaning

In [63]:
# Load the raw table and clean it in one chain:
#   1. drop columns in which every value is null
#   2. drop every remaining row that contains a null
#   3. drop rows whose disposition is still 'CANDIDATE', i.e. not yet
#      declared confirmed or false positive
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
    .loc[lambda frame: frame['koi_disposition'] != 'CANDIDATE']
)
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Selecting features

In [64]:
# Set target (y-values): the disposition column, kept as a one-column DataFrame
target = df[['koi_disposition']]

# Set features (x-values): the columns of df that are fed to the model
feature_columns = [
    'koi_fpflag_nt',
    'koi_fpflag_ss',
    'koi_fpflag_co',
    'koi_fpflag_ec',
    'koi_period',
    'koi_time0bk',
    'koi_impact',
    'koi_duration',
    'koi_depth',
    'koi_prad',
    'koi_teq',
    'koi_insol',
    'koi_model_snr',
    'koi_tce_plnt_num',
    'koi_steff',
    'koi_slogg',
    'koi_srad',
    'ra',
    'dec',
    'koi_kepmag',
]
selected_features = df[feature_columns]

# Preview the selected feature columns
selected_features.head()

In [65]:
# Sanity check: list the distinct dispositions left in the target column.
# 'CANDIDATE' should not be among them after the cleaning step.
target_list = target.to_numpy()
np.unique(target_list)


array(['CONFIRMED', 'FALSE POSITIVE'], dtype=object)

# Label Encoding

In [82]:
# Reformat data: LabelEncoder works on a 1-D array of labels, so flatten the
# one-column frame instead of passing an (n, 1) column vector (which makes
# scikit-learn emit a DataConversionWarning).
target = df[['koi_disposition']]
y = np.ravel(target.values)

# Label encoding: learn the distinct disposition strings and map each row to
# its integer class id in a single step.
label_encoder = LabelEncoder()
encoded_targets = label_encoder.fit_transform(y)

encoded_targets


array([0, 1, 1, ..., 1, 1, 1])

# Train Test Split

In [83]:
# Split features and encoded labels into train and test partitions.
# No test_size is passed, so train_test_split's default proportions apply;
# random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    selected_features,
    encoded_targets,
    random_state=43,
)

# Pre-processing


In [86]:
# Scale your data with MinMaxScaler.
# The scaler is fitted on the training features only and then applied to both
# partitions, so the test set does not influence the fitted scaling.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)



# Train the Model



In [96]:
# Fit a 1000-tree random forest on the scaled training features.
# random_state fixes the forest's internal randomness so the scores and
# feature importances below are reproducible across runs (43 matches the
# seed used for the train/test split).
rf = RandomForestClassifier(n_estimators=1000, random_state=43)
rf = rf.fit(X_train_scaled, y_train)

# Mean accuracy on the data the model was fitted on and on the held-out split
training_score = rf.score(X_train_scaled, y_train)
test_score = rf.score(X_test_scaled, y_test)

In [98]:
# Report both accuracies, one per line
print(
    f'Training Score: {training_score}',
    f'Test Score: {test_score}',
    sep='\n',
)

Training Score: 1.0
Test Score: 0.9894419306184012


In [107]:
# Pair each importance with its feature column name and list the pairs from
# most to least important.
importances = rf.feature_importances_
feature_names = selected_features.columns.tolist()
sorted(zip(importances, feature_names), reverse=True)

[(0.18262389218557884, 'koi_fpflag_co'),
 (0.13984852568144127, 'koi_fpflag_nt'),
 (0.1275426210574252, 'koi_fpflag_ss'),
 (0.11410639745802965, 'koi_prad'),
 (0.07854356081909786, 'koi_model_snr'),
 (0.05804223520599843, 'koi_fpflag_ec'),
 (0.050077914973038574, 'koi_period'),
 (0.04116892441555246, 'koi_teq'),
 (0.04034115801425574, 'koi_depth'),
 (0.03692526560268546, 'koi_impact'),
 (0.0353199111210911, 'koi_insol'),
 (0.019009515181496846, 'koi_duration'),
 (0.018228291533865234, 'koi_time0bk'),
 (0.011032317203463461, 'koi_steff'),
 (0.01077418444660602, 'koi_tce_plnt_num'),
 (0.009040736692679557, 'koi_srad'),
 (0.008227373990028089, 'ra'),
 (0.007925410504593212, 'koi_slogg'),
 (0.005615287448004364, 'koi_kepmag'),
 (0.0056064764650687006, 'dec')]

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
# Create the GridSearchCV model

In [None]:
# Train the model with GridSearch

In [None]:
print(grid2.best_params_)
print(grid2.best_score_)

# Save the Model

In [None]:
# Save the fitted random forest (`rf`) to disk with joblib.
# TODO: replace "your_name" in the filename with your name before turning
# this in to BCS.
# NOTE: only the classifier is saved. It was fitted on MinMax-scaled features,
# so new data must be transformed with the same fitted scaler (`X_scaler`)
# before calling predict on the reloaded model.
# If joblib fails to import, try running the install command in terminal/git-bash.
import joblib
filename = 'your_name.sav'
joblib.dump(rf, filename)