In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade



In [2]:
# install joblib to save model 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import numpy as np
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier


# Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

#Drop rows where a condidate hasn't been declared confirmed or false
df.drop(df[df['koi_disposition'] == 'CANDIDATE'].index, inplace = True)
df.head()


Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select features

In [5]:
#set target (y-values)
target = df[['koi_disposition']]

# Set features (x-values)
selected_features = df[['koi_fpflag_nt',
                        'koi_fpflag_ss',
                        'koi_fpflag_co',
                        'koi_fpflag_ec', 
                        'koi_period',
                        'koi_time0bk', 
                        'koi_impact',
                        'koi_duration',
                        'koi_depth',
                        'koi_prad',
                        'koi_teq',
                        'koi_insol',
                        'koi_model_snr',
                        'koi_tce_plnt_num',
                        'koi_steff',
                        'koi_slogg',
                        'koi_srad',
                        'ra',
                        'dec',
                        'koi_kepmag']]

selected_features= selected_features.values
selected_features.shape

(5304, 20)

In [6]:
## confirming deletion of "candidate" rows

target_list = target.values
np.unique(target_list)


array(['CONFIRMED', 'FALSE POSITIVE'], dtype=object)

# Label Encoding

In [7]:
#reformat data
target = df[['koi_disposition']]
y = target.values.reshape(-1,1)
y= np.ravel(y)

#label encoding
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_targets = label_encoder.transform(y)

encoded_targets


array([0, 1, 1, ..., 1, 1, 1])

# Create a Train Test Split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(selected_features, encoded_targets, random_state= 43)
X_train


array([[  0.      ,   1.      ,   0.      , ..., 288.96494 ,  42.469391,
         14.858   ],
       [  0.      ,   1.      ,   0.      , ..., 297.67096 ,  41.87598 ,
         13.826   ],
       [  0.      ,   0.      ,   0.      , ..., 284.86014 ,  39.240551,
         13.446   ],
       ...,
       [  0.      ,   0.      ,   0.      , ..., 296.159   ,  44.56311 ,
         15.965   ],
       [  0.      ,   0.      ,   1.      , ..., 291.8269  ,  41.7099  ,
         15.563   ],
       [  0.      ,   1.      ,   0.      , ..., 292.36017 ,  42.38184 ,
         15.943   ]])

# One hot encoding

In [9]:
one_hot_y_train = to_categorical(y_train)
one_hot_y_test = to_categorical(y_test)


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [10]:
# Scale your data

X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)

X_test_scaled = X_scaler.transform(X_test)

# Train the Model



In [11]:
def create_model():
    model = Sequential()
    model.add(Dense(units = 6, activation = 'relu', input_dim = 20))
    model.add(Dense(units = 6, activation = 'relu'))
    model.add(Dense(units = 2, activation = 'softmax'))
    model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model

model = create_model()


In [12]:
#fit the model
model.fit(X_train, one_hot_y_train, epochs = 100, shuffle=True, verbose= 3)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x23dc5e89a60>

In [13]:
model_loss, model_accuracy = model.evaluate(X_test_scaled, one_hot_y_test, verbose=3)
print(model_loss)
print(model_accuracy)

0.47569361329078674
0.6711915731430054


# Hyperparameter Tuning using GridSearchCV


In [14]:

# Create the GridSearchCV model
epochs = [10, 20, 100, 250]
shuffle = [True,False]

model = KerasClassifier(build_fn= create_model, epochs=10, batch_size=32, verbose=0)

param_grid = {'epochs':epochs, 'shuffle':shuffle }

grid = GridSearchCV(model, param_grid, verbose= 3, scoring='accuracy')


In [15]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
[CV 1/5] END ........................epochs=10, shuffle=True; total time=   1.4s
[CV 2/5] END ........................epochs=10, shuffle=True; total time=   1.5s
[CV 3/5] END ........................epochs=10, shuffle=True; total time=   1.1s
[CV 4/5] END ........................epochs=10, shuffle=True; total time=   1.2s
[CV 5/5] END ........................epochs=10, shuffle=True; total time=   1.1s
[CV 1/5] END .......................epochs=10, shuffle=False; total time=   1.2s
[CV 2/5] END .......................epochs=10, shuffle=False; total time=   1.0s
[CV 3/5] END .......................epo

GridSearchCV(estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x0000023DC750D820>,
             param_grid={'epochs': [10, 20, 100, 250],
                         'shuffle': [True, False]},
             scoring='accuracy', verbose=3)

In [16]:
print(grid.best_params_)
print(grid.best_score_)

{'epochs': 250, 'shuffle': True}
0.9909512973673398


In [18]:
def create_model():
    model = Sequential()
    model.add(Dense(units = 6, activation = 'relu', input_dim = 20))
    model.add(Dense(units = 6, activation = 'relu'))
    model.add(Dense(units = 2, activation = 'softmax'))
    model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model

model = create_model()

In [20]:
#fit the model
model.fit(X_train, one_hot_y_train, epochs = 250, shuffle=True, verbose= 3)

model_loss, model_accuracy = model.evaluate(X_test_scaled, one_hot_y_test, verbose=3)


Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

In [22]:
print(model_loss)
print(model_accuracy)

0.8943232893943787
0.6711915731430054
