In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
from sklearn.ensemble import RandomForestClassifier

# Read the CSV and Perform Basic Data Cleaning

In [3]:
df = pd.read_csv("Resources/exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [4]:
# Set features. This will also be used as your x values.
selected_features = df.drop(columns=['koi_disposition'])
selected_features.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [5]:
# Assign the data to X and y
# Reshape
X = selected_features
y = df.koi_disposition.values.reshape(-1, 1)
print("Shape: ", X.shape, y.shape)
X[:3], y[:3]

Shape:  (6991, 40) (6991, 1)


(   koi_fpflag_nt  koi_fpflag_ss  koi_fpflag_co  koi_fpflag_ec  koi_period  \
 0              0              0              0              0   54.418383   
 1              0              1              0              0   19.899140   
 2              0              1              0              0    1.736952   
 
    koi_period_err1  koi_period_err2  koi_time0bk  koi_time0bk_err1  \
 0     2.479000e-04    -2.479000e-04   162.513840          0.003520   
 1     1.490000e-05    -1.490000e-05   175.850252          0.000581   
 2     2.630000e-07    -2.630000e-07   170.307565          0.000115   
 
    koi_time0bk_err2  ...  koi_steff_err2  koi_slogg  koi_slogg_err1  \
 0         -0.003520  ...             -81      4.467           0.064   
 1         -0.000581  ...            -176      4.544           0.044   
 2         -0.000115  ...            -174      4.564           0.053   
 
    koi_slogg_err2  koi_srad  koi_srad_err1  koi_srad_err2         ra  \
 0          -0.096     0.927         

# Create a Train Test Split

Use `koi_disposition` for the y values

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [7]:
X_train.head(3)

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
3563,0,0,0,0,10.548413,5.47e-05,-5.47e-05,139.06402,0.00411,-0.00411,...,-133,4.387,0.066,-0.123,1.092,0.181,-0.097,298.09543,44.737061,13.204
4099,0,0,0,0,24.754385,0.0001365,-0.0001365,140.20732,0.00446,-0.00446,...,-144,4.519,0.078,-0.052,0.804,0.056,-0.076,295.73535,42.576248,15.514
5460,0,0,0,0,1.057336,1.23e-07,-1.23e-07,131.792007,9.6e-05,-9.6e-05,...,-140,4.594,0.054,-0.027,0.683,0.054,-0.06,292.18417,49.31004,15.414


In [8]:
y_train

array([['CANDIDATE'],
       ['CONFIRMED'],
       ['CANDIDATE'],
       ...,
       ['FALSE POSITIVE'],
       ['CONFIRMED'],
       ['CONFIRMED']], dtype=object)

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [10]:
# Scale your data
from sklearn.preprocessing import StandardScaler
X_standard = StandardScaler().fit(X_train)

X_train_standard = X_standard.transform(X_train)
X_test_standard = X_standard.transform(X_test)

In [12]:
rf = RandomForestClassifier(n_estimators=100)
rf = rf.fit(X_train_standard, y_train)
rf.score(X_train_standard, y_train)

  rf = rf.fit(X_train_standard, y_train)


1.0

In [13]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [14]:
importances = rf.feature_importances_
importances

array([0.09181784, 0.05895257, 0.11349811, 0.03280491, 0.02246104,
       0.0169764 , 0.0188455 , 0.01529371, 0.02537053, 0.02028489,
       0.01868265, 0.01040641, 0.01103246, 0.02385316, 0.03214792,
       0.02902286, 0.02504046, 0.0118812 , 0.01278916, 0.04391584,
       0.03559608, 0.03567801, 0.01631204, 0.0152867 , 0.01849683,
       0.01691123, 0.05888182, 0.0028435 , 0.00903936, 0.02943526,
       0.0320347 , 0.0099348 , 0.00947875, 0.00951101, 0.00899766,
       0.01091255, 0.00937709, 0.01324166, 0.01139062, 0.01156273])

In [15]:
pd.DataFrame(sorted(zip(rf.feature_importances_, X_train.columns), reverse=True)).head(10)

Unnamed: 0,0,1
0,0.113498,koi_fpflag_co
1,0.091818,koi_fpflag_nt
2,0.058953,koi_fpflag_ss
3,0.058882,koi_model_snr
4,0.043916,koi_prad
5,0.035678,koi_prad_err2
6,0.035596,koi_prad_err1
7,0.032805,koi_fpflag_ec
8,0.032148,koi_duration_err1
9,0.032035,koi_steff_err2


In [16]:
rf.predict_proba(X_test_standard)
rf.classes_

array(['CANDIDATE', 'CONFIRMED', 'FALSE POSITIVE'], dtype=object)

In [17]:
# Set features. This will also be used as your x values.
best_features = df[['koi_fpflag_co', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_model_snr',
                    'koi_prad', 'koi_prad_err2', 'koi_fpflag_ec', 'koi_prad_err1',
                    'koi_duration_err1', 'koi_steff_err1']]
best_features.head()

Unnamed: 0,koi_fpflag_co,koi_fpflag_nt,koi_fpflag_ss,koi_model_snr,koi_prad,koi_prad_err2,koi_fpflag_ec,koi_prad_err1,koi_duration_err1,koi_steff_err1
0,0,0,0,25.8,2.83,-0.19,0,0.32,0.116,81
1,0,0,1,76.3,14.6,-1.31,0,3.92,0.0341,158
2,0,0,1,505.6,33.46,-2.83,0,8.5,0.00537,157
3,0,0,0,40.9,2.75,-0.35,0,0.88,0.042,169
4,0,0,0,40.2,2.77,-0.3,0,0.9,0.0673,189


In [18]:
# Assign the data to X and y
# Reshape
X = best_features
y = df.koi_disposition.values.reshape(-1, 1)
print("Shape: ", X.shape, y.shape)
X[:3], y[:3]

Shape:  (6991, 10) (6991, 1)


(   koi_fpflag_co  koi_fpflag_nt  koi_fpflag_ss  koi_model_snr  koi_prad  \
 0              0              0              0           25.8      2.83   
 1              0              0              1           76.3     14.60   
 2              0              0              1          505.6     33.46   
 
    koi_prad_err2  koi_fpflag_ec  koi_prad_err1  koi_duration_err1  \
 0          -0.19              0           0.32            0.11600   
 1          -1.31              0           3.92            0.03410   
 2          -2.83              0           8.50            0.00537   
 
    koi_steff_err1  
 0              81  
 1             158  
 2             157  ,
 array([['CONFIRMED'],
        ['FALSE POSITIVE'],
        ['FALSE POSITIVE']], dtype=object))

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Train the Model



In [20]:
# Scale your data
from sklearn.preprocessing import StandardScaler
X_standard = StandardScaler().fit(X_train)

X_train_standard = X_standard.transform(X_train)
X_test_standard = X_standard.transform(X_test)

In [21]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)
encoded_y_train

  return f(**kwargs)


array([0, 2, 2, ..., 2, 2, 2])

In [22]:
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train_standard, encoded_y_train)
rf.score(X_train_standard, encoded_y_train)

1.0

In [23]:
# Calculate the mean_squared_error and the r-squared value
from sklearn.metrics import mean_squared_error, r2_score

predicted = rf.predict(X_test_standard)

mse = mean_squared_error(encoded_y_test, predicted)
r2 = r2_score(encoded_y_test, predicted)

print('mse: ', mse, 'r2: ', r2)

mse:  0.12471395881006865 r2:  0.8108025984703899


In [24]:
print(f"Training Data Score: {rf.score(X_train_standard, encoded_y_train)}")
print(f"Testing Data Score: {rf.score(X_test_standard, encoded_y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.8890160183066361


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [40]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [47]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'criterion': ['gini','entropy'],
             'max_features': ['auto', 'sqrt', 'log2'],
             'class_weight': ['balanced', 'balanced_subsample']}
grid = GridSearchCV(rf, param_grid, verbose=3)
grid

GridSearchCV(estimator=RandomForestClassifier(n_estimators=200),
             param_grid={'class_weight': ['balanced', 'balanced_subsample'],
                         'criterion': ['gini', 'entropy'],
                         'max_features': ['auto', 'sqrt', 'log2']},
             verbose=3)

In [48]:
grid.fit(X_train_standard, encoded_y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] class_weight=balanced, criterion=gini, max_features=auto ........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  class_weight=balanced, criterion=gini, max_features=auto, score=0.903, total=   1.8s
[CV] class_weight=balanced, criterion=gini, max_features=auto ........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s


[CV]  class_weight=balanced, criterion=gini, max_features=auto, score=0.898, total=   1.7s
[CV] class_weight=balanced, criterion=gini, max_features=auto ........


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.4s remaining:    0.0s


[CV]  class_weight=balanced, criterion=gini, max_features=auto, score=0.888, total=   1.8s
[CV] class_weight=balanced, criterion=gini, max_features=auto ........
[CV]  class_weight=balanced, criterion=gini, max_features=auto, score=0.885, total=   1.6s
[CV] class_weight=balanced, criterion=gini, max_features=auto ........
[CV]  class_weight=balanced, criterion=gini, max_features=auto, score=0.876, total=   1.9s
[CV] class_weight=balanced, criterion=gini, max_features=sqrt ........
[CV]  class_weight=balanced, criterion=gini, max_features=sqrt, score=0.908, total=   1.8s
[CV] class_weight=balanced, criterion=gini, max_features=sqrt ........
[CV]  class_weight=balanced, criterion=gini, max_features=sqrt, score=0.894, total=   1.7s
[CV] class_weight=balanced, criterion=gini, max_features=sqrt ........
[CV]  class_weight=balanced, criterion=gini, max_features=sqrt, score=0.890, total=   1.7s
[CV] class_weight=balanced, criterion=gini, max_features=sqrt ........
[CV]  class_weight=balanced,

[CV]  class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, score=0.897, total=   2.7s
[CV] class_weight=balanced_subsample, criterion=entropy, max_features=sqrt 
[CV]  class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, score=0.889, total=   2.6s
[CV] class_weight=balanced_subsample, criterion=entropy, max_features=sqrt 
[CV]  class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, score=0.883, total=   2.6s
[CV] class_weight=balanced_subsample, criterion=entropy, max_features=sqrt 
[CV]  class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, score=0.872, total=   2.8s
[CV] class_weight=balanced_subsample, criterion=entropy, max_features=log2 
[CV]  class_weight=balanced_subsample, criterion=entropy, max_features=log2, score=0.906, total=   2.7s
[CV] class_weight=balanced_subsample, criterion=entropy, max_features=log2 
[CV]  class_weight=balanced_subsample, criterion=entropy, max_features=log2, score=0.895, total=   2

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  2.2min finished


GridSearchCV(estimator=RandomForestClassifier(n_estimators=200),
             param_grid={'class_weight': ['balanced', 'balanced_subsample'],
                         'criterion': ['gini', 'entropy'],
                         'max_features': ['auto', 'sqrt', 'log2']},
             verbose=3)

In [49]:
print(grid.best_params_)

{'class_weight': 'balanced', 'criterion': 'entropy', 'max_features': 'auto'}


In [50]:
predictions = grid.predict(X_test_standard)

In [51]:
print('Test Acc: %.3f' % grid.score(X_test_standard, encoded_y_test))

Test Acc: 0.887


In [53]:
# Train the model with GridSearch
from sklearn.metrics import classification_report
print(classification_report(encoded_y_test, predictions,
                            target_names=['CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE']))

                precision    recall  f1-score   support

     CONFIRMED       0.82      0.70      0.76       411
FALSE POSITIVE       0.78      0.86      0.81       484
     CANDIDATE       0.98      0.99      0.99       853

      accuracy                           0.89      1748
     macro avg       0.86      0.85      0.85      1748
  weighted avg       0.89      0.89      0.89      1748



In [57]:
print('Best Params= ', grid.best_params_)
print('Best Score= ', grid.best_score_)

Best Params=  {'class_weight': 'balanced', 'criterion': 'entropy', 'max_features': 'auto'}
Best Score=  0.8907079807013586


# Save the Model

In [60]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'Craig_Matherson_Model_2.sav'
joblib.dump(grid, filename)

['Craig_Matherson_Model_2.sav']