In [1]:
# Update sklearn to prevent version mismatches
#!pip install sklearn --upgrade

In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
#!pip install joblib

In [1]:
from rfpimp import *
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,...,-211,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.050000e-05,-1.050000e-05,172.979370,0.001900,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6986,FALSE POSITIVE,0,0,0,1,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,...,-152,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
6987,FALSE POSITIVE,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,...,-166,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
6988,CANDIDATE,0,0,0,0,1.739849,1.780000e-05,-1.780000e-05,133.001270,0.007690,...,-220,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
6989,FALSE POSITIVE,0,0,1,0,0.681402,2.430000e-06,-2.430000e-06,132.181750,0.002850,...,-236,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385


# Select your features (columns)

In [3]:
# Set features. This will also be used as your x values.
#selected_features = df.drop(['koi_disposition'], axis = 1)
X = df.drop(['koi_disposition'], axis = 1)


# Create a Train Test Split

Use `koi_disposition` for the y values

In [4]:
y = df['koi_disposition']
y_names = ['CONFIRMED','FALSE POSITIVE','CANDIDATE']
y
#df.koi_disposition.value_counts()

0            CONFIRMED
1       FALSE POSITIVE
2       FALSE POSITIVE
3            CONFIRMED
4            CONFIRMED
             ...      
6986    FALSE POSITIVE
6987    FALSE POSITIVE
6988         CANDIDATE
6989    FALSE POSITIVE
6990    FALSE POSITIVE
Name: koi_disposition, Length: 6991, dtype: object

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

X_train.shape, X_test.shape, y_train.shape, y_test.shape
#y_train

((5592, 40), (1399, 40), (5592,), (1399,))

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [6]:
# Scale your data
#No need to scale the data for Random Forests

# Train the Model


In [7]:
# Create a random forest classifier
#rf = RandomForestClassifier(n_estimators=200)
rf = RandomForestClassifier()
rf = rf.fit(X_train, y_train)

print(f"Training Data Score: {rf.score(X_train, y_train)}")
print(f"Testing Data Score: {rf.score(X_test, y_test)}")



Training Data Score: 1.0
Testing Data Score: 0.908506075768406


In [8]:
#Dinnara
# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_
importances
#it tells you that the 4th feature is the most important

array([0.09961445, 0.06382055, 0.10646996, 0.03907414, 0.02311699,
       0.01924093, 0.01717273, 0.01186414, 0.02617912, 0.0234044 ,
       0.01511591, 0.01142112, 0.0104001 , 0.02363759, 0.03173649,
       0.02924713, 0.02091222, 0.01403109, 0.01375511, 0.05283528,
       0.03640327, 0.03735329, 0.0136633 , 0.01517681, 0.01662452,
       0.01426184, 0.05754099, 0.00334439, 0.01004438, 0.02163528,
       0.02969188, 0.00868311, 0.0090074 , 0.0098131 , 0.00982214,
       0.01179546, 0.00797071, 0.01316848, 0.01051014, 0.01044006])

In [9]:
# We can sort the features by their importance
sorted(zip(rf.feature_importances_, X.columns), reverse=True)

[(0.10646995881848048, 'koi_fpflag_co'),
 (0.0996144547208986, 'koi_fpflag_nt'),
 (0.06382055038292603, 'koi_fpflag_ss'),
 (0.057540994757141896, 'koi_model_snr'),
 (0.052835276069531235, 'koi_prad'),
 (0.039074135082760204, 'koi_fpflag_ec'),
 (0.03735329132695259, 'koi_prad_err2'),
 (0.03640326914312254, 'koi_prad_err1'),
 (0.03173648945900215, 'koi_duration_err1'),
 (0.029691880919211516, 'koi_steff_err2'),
 (0.02924712833294282, 'koi_duration_err2'),
 (0.02617912219223948, 'koi_time0bk_err1'),
 (0.023637590808727394, 'koi_duration'),
 (0.023404401706838423, 'koi_time0bk_err2'),
 (0.023116994352295715, 'koi_period'),
 (0.021635278461220882, 'koi_steff_err1'),
 (0.020912221811496223, 'koi_depth'),
 (0.01924093047759733, 'koi_period_err1'),
 (0.017172725071493092, 'koi_period_err2'),
 (0.016624520513602775, 'koi_insol_err1'),
 (0.015176814611783057, 'koi_insol'),
 (0.015115914242472916, 'koi_impact'),
 (0.014261837108723158, 'koi_insol_err2'),
 (0.014031089343674736, 'koi_depth_err1'),

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [10]:
# Create the GridSearchCV model

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators':[50, 100, 200, 300], 'random_state':[0,1,2,3]}
    
grid = GridSearchCV(rf, param_grid, verbose=3, return_train_score=True)


In [11]:
# Train the model with GridSearch
grid.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END n_estimators=50, random_state=0;, score=(train=1.000, test=0.879) total time=   0.6s
[CV 2/5] END n_estimators=50, random_state=0;, score=(train=1.000, test=0.880) total time=   0.6s
[CV 3/5] END n_estimators=50, random_state=0;, score=(train=1.000, test=0.901) total time=   0.6s
[CV 4/5] END n_estimators=50, random_state=0;, score=(train=1.000, test=0.877) total time=   0.6s
[CV 5/5] END n_estimators=50, random_state=0;, score=(train=1.000, test=0.890) total time=   0.6s
[CV 1/5] END n_estimators=50, random_state=1;, score=(train=1.000, test=0.889) total time=   0.5s
[CV 2/5] END n_estimators=50, random_state=1;, score=(train=1.000, test=0.888) total time=   0.6s
[CV 3/5] END n_estimators=50, random_state=1;, score=(train=1.000, test=0.902) total time=   0.5s
[CV 4/5] END n_estimators=50, random_state=1;, score=(train=1.000, test=0.875) total time=   0.6s
[CV 5/5] END n_estimators=50, random_state=1;, score=(tra

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'n_estimators': [50, 100, 200, 300],
                         'random_state': [0, 1, 2, 3]},
             return_train_score=True, verbose=3)

In [12]:
print(grid.best_params_)
print(grid.best_score_)

{'n_estimators': 200, 'random_state': 1}
0.8923471793912594


In [13]:
#Dinnara
predictions = grid.predict(X_test)

In [14]:
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   ['CONFIRMED' 'CANDIDATE' 'CANDIDATE' 'CONFIRMED' 'CANDIDATE' 'CONFIRMED'
 'FALSE POSITIVE' 'FALSE POSITIVE' 'CONFIRMED' 'CONFIRMED']
First 10 Actual labels: ['CONFIRMED', 'CANDIDATE', 'CANDIDATE', 'CONFIRMED', 'CANDIDATE', 'CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', 'CONFIRMED', 'CANDIDATE']


In [15]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=y_names))

                precision    recall  f1-score   support

     CONFIRMED       0.85      0.78      0.82       343
FALSE POSITIVE       0.81      0.85      0.83       337
     CANDIDATE       0.98      1.00      0.99       719

      accuracy                           0.91      1399
     macro avg       0.88      0.88      0.88      1399
  weighted avg       0.91      0.91      0.91      1399



In [16]:
results = pd.DataFrame(grid.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.547984,0.002918,0.008954,7.2e-05,50,0,"{'n_estimators': 50, 'random_state': 0}",0.879357,0.88025,0.900716,...,0.885374,0.008907,16,1.0,1.0,0.999776,0.999776,1.0,0.999911,0.000109
1,0.542088,0.004207,0.009012,7.9e-05,50,1,"{'n_estimators': 50, 'random_state': 1}",0.889187,0.888293,0.90161,...,0.887339,0.008788,12,0.999776,1.0,1.0,1.0,1.0,0.999955,8.9e-05
2,0.547474,0.003186,0.009038,0.000101,50,2,"{'n_estimators': 50, 'random_state': 2}",0.881144,0.873101,0.907871,...,0.886448,0.012321,14,1.0,0.999776,0.999553,0.999776,0.999776,0.999776,0.000141
3,0.54627,0.003515,0.009004,9.8e-05,50,3,"{'n_estimators': 50, 'random_state': 3}",0.887399,0.887399,0.898927,...,0.888412,0.006277,9,1.0,0.999776,1.0,1.0,0.999776,0.999911,0.00011
4,1.085801,0.005504,0.016118,0.000105,100,0,"{'n_estimators': 100, 'random_state': 0}",0.887399,0.88025,0.90161,...,0.887519,0.008264,10,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,1.085175,0.00755,0.016273,8.4e-05,100,1,"{'n_estimators': 100, 'random_state': 1}",0.892761,0.889187,0.90966,...,0.892347,0.010587,2,1.0,1.0,1.0,1.0,1.0,1.0,0.0
6,1.08265,0.009773,0.016315,0.000129,100,2,"{'n_estimators': 100, 'random_state': 2}",0.893655,0.876676,0.905188,...,0.888592,0.01047,8,1.0,1.0,1.0,1.0,1.0,1.0,0.0
7,1.080155,0.006967,0.016367,0.00013,100,3,"{'n_estimators': 100, 'random_state': 3}",0.887399,0.885612,0.897138,...,0.886803,0.007189,13,1.0,1.0,1.0,1.0,1.0,1.0,0.0
8,2.174207,0.015965,0.03056,0.00024,200,0,"{'n_estimators': 200, 'random_state': 0}",0.883825,0.886506,0.899821,...,0.886266,0.0074,15,1.0,1.0,1.0,1.0,1.0,1.0,0.0
9,2.198749,0.017584,0.030639,0.000108,200,1,"{'n_estimators': 200, 'random_state': 1}",0.89008,0.889187,0.908766,...,0.892347,0.008871,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
