In [None]:
# Updating sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [2]:
# Installing joblib
# !pip install joblib

In [3]:
import pandas as pd

In [4]:
# Reading CSV
df = pd.read_csv("source_data/exoplanet_data.csv")
# Dropping the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Dropping the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [5]:
# Setting features/x values
selected_features = df.drop("koi_disposition", axis=1)

In [6]:
# Setting y value
y = df["koi_disposition"]
print(selected_features.shape, y.shape)

(6991, 40) (6991,)


In [7]:
# Creating test/train split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(selected_features, y, random_state=21)
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
6966,1,0,1,0,361.901618,0.008898,-0.008898,405.3021,0.02,-0.02,...,-136,4.603,0.04,-0.06,0.717,0.086,-0.058,290.37891,39.694,12.673
1714,0,0,0,0,6.739683,6.4e-05,-6.4e-05,132.29296,0.00816,-0.00816,...,-71,4.261,0.168,-0.112,1.176,0.183,-0.203,280.82339,42.636051,14.455
225,0,0,0,0,3.166354,3e-06,-3e-06,170.966145,0.000631,-0.000631,...,-211,4.47,0.052,-0.208,0.998,0.318,-0.106,286.36011,45.414291,15.603
5266,0,0,0,0,25.090157,0.000482,-0.000482,138.4988,0.0162,-0.0162,...,-183,3.985,0.266,-0.114,1.788,0.351,-0.571,298.67377,43.795479,12.014
5468,0,1,0,0,7.234966,2e-06,-2e-06,134.582307,0.000207,-0.000207,...,-186,4.573,0.04,-0.16,0.798,0.206,-0.069,290.22617,37.75964,16.981


In [8]:
# Scaling the data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Training the model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200)
model.fit(X_train_scaled, y_train)

print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.8832951945080092


In [10]:
importances = model.feature_importances_
importances

array([0.10024405, 0.06912427, 0.10895143, 0.03391889, 0.02222167,
       0.01964898, 0.01861658, 0.0133119 , 0.02230137, 0.02120567,
       0.02016448, 0.01108689, 0.0101998 , 0.02370824, 0.03231444,
       0.03557253, 0.02092239, 0.01380853, 0.01428526, 0.04929524,
       0.03078548, 0.03025581, 0.01648772, 0.01361671, 0.01734265,
       0.01117992, 0.0612531 , 0.00322881, 0.00932537, 0.03003395,
       0.02506854, 0.00914312, 0.008394  , 0.01028688, 0.00926525,
       0.01126508, 0.00871171, 0.01173911, 0.01126848, 0.01044569])

In [11]:
feature_names = selected_features.columns
sorted(zip(model.feature_importances_, feature_names), reverse=True)

[(0.10895143194717612, 'koi_fpflag_co'),
 (0.10024404874357488, 'koi_fpflag_nt'),
 (0.06912427047482474, 'koi_fpflag_ss'),
 (0.061253095522498836, 'koi_model_snr'),
 (0.04929524189279832, 'koi_prad'),
 (0.03557252803301809, 'koi_duration_err2'),
 (0.03391889479710922, 'koi_fpflag_ec'),
 (0.03231444145757325, 'koi_duration_err1'),
 (0.03078547534052932, 'koi_prad_err1'),
 (0.03025580800793375, 'koi_prad_err2'),
 (0.030033951037824393, 'koi_steff_err1'),
 (0.025068543041762096, 'koi_steff_err2'),
 (0.02370824019756247, 'koi_duration'),
 (0.022301373189958014, 'koi_time0bk_err1'),
 (0.022221670939614392, 'koi_period'),
 (0.021205669421756695, 'koi_time0bk_err2'),
 (0.02092239403697677, 'koi_depth'),
 (0.020164475126395104, 'koi_impact'),
 (0.019648978972360665, 'koi_period_err1'),
 (0.018616583871767466, 'koi_period_err2'),
 (0.017342652349395183, 'koi_insol_err1'),
 (0.016487721545092596, 'koi_teq'),
 (0.014285261013565307, 'koi_depth_err2'),
 (0.01380852960822885, 'koi_depth_err1'),
 (0

In [12]:
# Re-setting significant features/x values
sig_features = df.drop(["koi_disposition", "koi_tce_plnt_num", "koi_slogg", "koi_srad_err2", "koi_slogg_err1", "koi_steff", "koi_slogg_err2", "koi_srad"], axis=1)

In [13]:
X_train2, X_test2, y_train, y_test = train_test_split(sig_features, y, random_state=21)
X_train2.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_steff_err1,koi_steff_err2,koi_srad_err1,ra,dec,koi_kepmag
6966,1,0,1,0,361.901618,0.008898,-0.008898,405.3021,0.02,-0.02,...,0.37,0.14,-0.09,8.5,136,-136,0.086,290.37891,39.694,12.673
1714,0,0,0,0,6.739683,6.4e-05,-6.4e-05,132.29296,0.00816,-0.00816,...,246.9,102.89,-86.42,15.4,82,-71,0.183,280.82339,42.636051,14.455
225,0,0,0,0,3.166354,3e-06,-3e-06,170.966145,0.000631,-0.000631,...,621.92,586.4,-191.31,128.3,169,-211,0.318,286.36011,45.414291,15.603
5266,0,0,0,0,25.090157,0.000482,-0.000482,138.4988,0.0162,-0.0162,...,124.33,76.2,-73.41,12.1,183,-183,0.351,298.67377,43.795479,12.014
5468,0,1,0,0,7.234966,2e-06,-2e-06,134.582307,0.000207,-0.000207,...,119.12,95.5,-32.16,671.3,186,-186,0.206,290.22617,37.75964,16.981


In [14]:
# Scaling the data
X_scaler2 = MinMaxScaler().fit(X_train2)
X_train_scaled2 = X_scaler2.transform(X_train2)
X_test_scaled2 = X_scaler2.transform(X_test2)

In [15]:
# Training the model on the significant features
model.fit(X_train_scaled2, y_train)

print(f"Training Data Score: {model.score(X_train_scaled2, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled2, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.8850114416475973


In [16]:
# Creating the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth': [5, 10, 20], 'min_samples_split': [5, 10, 20]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [17]:
# RandomForestClassifier().get_params().keys()

In [18]:
# Training the model with GridSearch
grid.fit(X_train_scaled2, y_train);

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END ..max_depth=5, min_samples_split=5;, score=0.848 total time=   0.9s
[CV 2/5] END ..max_depth=5, min_samples_split=5;, score=0.843 total time=   0.9s
[CV 3/5] END ..max_depth=5, min_samples_split=5;, score=0.866 total time=   0.9s
[CV 4/5] END ..max_depth=5, min_samples_split=5;, score=0.889 total time=   1.0s
[CV 5/5] END ..max_depth=5, min_samples_split=5;, score=0.870 total time=   0.9s
[CV 1/5] END .max_depth=5, min_samples_split=10;, score=0.845 total time=   0.9s
[CV 2/5] END .max_depth=5, min_samples_split=10;, score=0.844 total time=   0.9s
[CV 3/5] END .max_depth=5, min_samples_split=10;, score=0.877 total time=   1.0s
[CV 4/5] END .max_depth=5, min_samples_split=10;, score=0.887 total time=   0.9s
[CV 5/5] END .max_depth=5, min_samples_split=10;, score=0.873 total time=   0.9s
[CV 1/5] END .max_depth=5, min_samples_split=20;, score=0.844 total time=   0.9s
[CV 2/5] END .max_depth=5, min_samples_split=20;,

In [19]:
# Making inferences with the tuned model
print(grid.best_params_)
print(grid.best_score_)

{'max_depth': 20, 'min_samples_split': 5}
0.8960556764348453


In [20]:
# Calculating classification report
predictions = grid.predict(X_test_scaled2)

from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names = ["Confirmed", "False Positive", "Candidate"]))

                precision    recall  f1-score   support

     Confirmed       0.79      0.69      0.74       411
False Positive       0.78      0.82      0.80       455
     Candidate       0.96      0.99      0.98       882

      accuracy                           0.88      1748
     macro avg       0.85      0.84      0.84      1748
  weighted avg       0.88      0.88      0.88      1748



In [21]:
# Saving the model
import joblib

filename = 'random_forest_model.sav'
joblib.dump(model, filename)

['random_forest_model.sav']