In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade



In [2]:
# Install joblib, intended for saving the model.
# NOTE(review): the cells visible here save the model with `model.save(...)`
# and never import joblib -- confirm whether this install is still needed.
# Restart your kernel after installing.
!pip install joblib



In [3]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load the raw table and clean it in a single chained expression:
#   1. discard columns in which every value is null
#   2. discard every remaining row that contains at least one null
df = (
    pd.read_csv("exoplanet_data2.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,1,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,1,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [5]:
# Columns kept for modelling: the target (koi_disposition) followed by the
# 19 columns that will become the x values.
feature_columns = [
    'koi_disposition',
    'koi_fpflag_nt',
    'koi_fpflag_ss',
    'koi_fpflag_co',
    'koi_fpflag_ec',
    'koi_period',
    'koi_time0bk',
    'koi_impact',
    'koi_duration',
    'koi_depth',
    'koi_prad',
    'koi_teq',
    'koi_insol',
    'koi_model_snr',
    'koi_steff',
    'koi_slogg',
    'koi_srad',
    'ra',
    'dec',
    'koi_kepmag',
]
selected_features = df[feature_columns]
selected_features.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
0,1,0,0,0,0,54.418383,162.51384,0.586,4.507,874.8,2.83,443,9.11,25.8,5455,4.467,0.927,291.93423,48.141651,15.347
1,0,0,1,0,0,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638,39.3,76.3,5853,4.544,0.868,297.00482,48.134129,15.436
2,0,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395,891.96,505.6,5805,4.564,0.791,285.53461,48.28521,15.597
3,1,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406,926.16,40.9,6031,4.438,1.046,288.75488,48.2262,15.509
4,1,0,0,0,0,4.134435,172.97937,0.762,3.1402,686.0,2.77,1160,427.65,40.2,6046,4.486,0.972,296.28613,48.22467,15.714


In [6]:
# Separate the target column (y) from the remaining feature columns (X)
target_column = "koi_disposition"
y = selected_features[target_column]
X = selected_features.drop(columns=target_column)
print(X.shape, y.shape)

(6991, 19) (6991,)


## Use train_test_split to create training and testing data

In [7]:
from sklearn.model_selection import train_test_split

# Split with the default train/test proportions; random_state is fixed so the
# same rows land in each split on every run.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

## Data Preprocessing

It is really important to scale our data before using multilayer perceptron models.

Without scaling, features with very different ranges often make it difficult for the training cycle to converge.

In [8]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only
X_scaler = StandardScaler()
X_scaler.fit(X_train)

Remember to scale both the training and testing data, using the scaler that was fit on the training data only.

In [9]:
# Apply the scaler fitted on the training split to both splits
X_train_scaled, X_test_scaled = [
    X_scaler.transform(split) for split in (X_train, X_test)
]

One-hot encode the labels

In [10]:
from tensorflow.keras.utils import to_categorical

In [11]:
# One-hot encoding
# Pin the encoded width to the class count of the full label vector `y`.
# Encoding each split on its own lets to_categorical infer the width from that
# split's largest label, so the train and test matrices could end up with a
# different number of columns if the highest class were missing from one split.
label_class_count = int(y.max()) + 1
y_train_categorical = to_categorical(y_train, num_classes=label_class_count)
y_test_categorical = to_categorical(y_test, num_classes=label_class_count)
y_train_categorical

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)

## Creating our Model

We must first decide what kind of model to apply to our data.

When the target we are predicting is numerical (continuous), we use a regressor model.

When the target is categorical (a set of discrete labels, as it is here), we use a classifier model.

## Defining our Model Architecture (the layers)

We first need to create a sequential model

In [12]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential

# Seed the random number generators before any layer is created, so that the
# initial weights (and therefore the training run) are repeatable across
# Restart & Run All. The value matches the random_state used for the split.
# NOTE(review): some operations may still be nondeterministic on certain
# hardware -- treat this as "as repeatable as TensorFlow allows".
np.random.seed(1)
tf.random.set_seed(1)

# Empty container model; layers are appended to it in the following cells
model = Sequential()

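Next, we add our first layer. This layer requires you to specify both the number of inputs and the number of nodes that you want in the hidden layer. In the same cell we then stack three more hidden layers of the same width; these do not need the number of inputs, because it is inferred from the previous layer.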

In [13]:
from tensorflow.keras.layers import Dense

# Read the input width from the scaled training matrix instead of hard-coding
# it, so the network stays in step with the selected feature list.
number_inputs = X_train_scaled.shape[1]
number_hidden_nodes = 6

# First hidden layer: the only one that has to be told the input size
model.add(Dense(units=number_hidden_nodes,
                activation='relu', input_dim=number_inputs))

# Three further hidden layers of the same width
for hidden_layer_index in range(3):
    model.add(Dense(units=number_hidden_nodes, activation='relu'))

Our final layer is the output layer. Here, we need to specify the activation function (typically `softmax` for classification) and the number of classes (labels) that we are trying to predict (3 in this example).

In [14]:
# One output node per class. The count is read from the width of the one-hot
# encoded labels rather than hard-coded, so the output layer always matches
# the label matrix that the model is trained on.
number_classes = y_train_categorical.shape[1]
model.add(Dense(units=number_classes, activation='softmax'))

## Model Summary

In [15]:
# Print each layer's output shape and parameter count, plus the totals
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 6)                 120       
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 42        
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 42        
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 42        
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 21        
Total params: 267
Trainable params: 267
Non-trainable params: 0
_________________________________________________________________


## Compile the Model

Now that we have our model architecture defined, we must compile the model using a loss function and optimizer. We can also specify additional training metrics such as accuracy.

In [16]:
# Use categorical crossentropy for categorical data and mean squared error for regression
# Hint: your output layer in this example is using softmax for multi-class classification (categorical)
# If your output layer activation was `linear` then you may want to use `mse` for loss
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

## Training the Model
Finally, we train our model using our training data

Training consists of updating our weights using our optimizer and loss function. In this example, we train for 1000 epochs, where one epoch is one full pass over the training data.

We also choose to shuffle our training data and increase the detail printed out during each training cycle.

In [17]:
# Fit (train) the model on the scaled features and the one-hot labels.
# verbose=2 prints one summary line per epoch.
fit_options = dict(epochs=1000, shuffle=True, verbose=2)
model.fit(X_train_scaled, y_train_categorical, **fit_options)

Train on 5243 samples
Epoch 1/1000
5243/5243 - 3s - loss: 0.9282 - accuracy: 0.4639
Epoch 2/1000
5243/5243 - 0s - loss: 0.6751 - accuracy: 0.6660
Epoch 3/1000
5243/5243 - 0s - loss: 0.5806 - accuracy: 0.7212
Epoch 4/1000
5243/5243 - 0s - loss: 0.5326 - accuracy: 0.7480
Epoch 5/1000
5243/5243 - 0s - loss: 0.5041 - accuracy: 0.7475
Epoch 6/1000
5243/5243 - 0s - loss: 0.4841 - accuracy: 0.7477
Epoch 7/1000
5243/5243 - 0s - loss: 0.4696 - accuracy: 0.7477
Epoch 8/1000
5243/5243 - 0s - loss: 0.4587 - accuracy: 0.7473
Epoch 9/1000
5243/5243 - 0s - loss: 0.4498 - accuracy: 0.7471
Epoch 10/1000
5243/5243 - 0s - loss: 0.4425 - accuracy: 0.7471
Epoch 11/1000
5243/5243 - 0s - loss: 0.4364 - accuracy: 0.7473
Epoch 12/1000
5243/5243 - 0s - loss: 0.4314 - accuracy: 0.7471
Epoch 13/1000
5243/5243 - 0s - loss: 0.4270 - accuracy: 0.7471
Epoch 14/1000
5243/5243 - 0s - loss: 0.4232 - accuracy: 0.7475
Epoch 15/1000
5243/5243 - 0s - loss: 0.4200 - accuracy: 0.7475
Epoch 16/1000
5243/5243 - 0s - loss: 0.417

Epoch 131/1000
5243/5243 - 0s - loss: 0.3319 - accuracy: 0.8280
Epoch 132/1000
5243/5243 - 0s - loss: 0.3313 - accuracy: 0.8241
Epoch 133/1000
5243/5243 - 0s - loss: 0.3309 - accuracy: 0.8257
Epoch 134/1000
5243/5243 - 0s - loss: 0.3319 - accuracy: 0.8241
Epoch 135/1000
5243/5243 - 0s - loss: 0.3308 - accuracy: 0.8238
Epoch 136/1000
5243/5243 - 0s - loss: 0.3306 - accuracy: 0.8266
Epoch 137/1000
5243/5243 - 0s - loss: 0.3305 - accuracy: 0.8232
Epoch 138/1000
5243/5243 - 0s - loss: 0.3301 - accuracy: 0.8283
Epoch 139/1000
5243/5243 - 0s - loss: 0.3299 - accuracy: 0.8253
Epoch 140/1000
5243/5243 - 0s - loss: 0.3289 - accuracy: 0.8272
Epoch 141/1000
5243/5243 - 0s - loss: 0.3297 - accuracy: 0.8270
Epoch 142/1000
5243/5243 - 0s - loss: 0.3290 - accuracy: 0.8264
Epoch 143/1000
5243/5243 - 0s - loss: 0.3283 - accuracy: 0.8274
Epoch 144/1000
5243/5243 - 0s - loss: 0.3292 - accuracy: 0.8247
Epoch 145/1000
5243/5243 - 0s - loss: 0.3290 - accuracy: 0.8251
Epoch 146/1000
5243/5243 - 0s - loss: 0.

5243/5243 - 0s - loss: 0.3142 - accuracy: 0.8339
Epoch 260/1000
5243/5243 - 0s - loss: 0.3157 - accuracy: 0.8350
Epoch 261/1000
5243/5243 - 0s - loss: 0.3152 - accuracy: 0.8333
Epoch 262/1000
5243/5243 - 0s - loss: 0.3145 - accuracy: 0.8362
Epoch 263/1000
5243/5243 - 0s - loss: 0.3155 - accuracy: 0.8331
Epoch 264/1000
5243/5243 - 0s - loss: 0.3138 - accuracy: 0.8306
Epoch 265/1000
5243/5243 - 0s - loss: 0.3144 - accuracy: 0.8335
Epoch 266/1000
5243/5243 - 0s - loss: 0.3143 - accuracy: 0.8320
Epoch 267/1000
5243/5243 - 0s - loss: 0.3144 - accuracy: 0.8344
Epoch 268/1000
5243/5243 - 0s - loss: 0.3144 - accuracy: 0.8373
Epoch 269/1000
5243/5243 - 0s - loss: 0.3140 - accuracy: 0.8327
Epoch 270/1000
5243/5243 - 0s - loss: 0.3129 - accuracy: 0.8358
Epoch 271/1000
5243/5243 - 0s - loss: 0.3142 - accuracy: 0.8320
Epoch 272/1000
5243/5243 - 0s - loss: 0.3139 - accuracy: 0.8331
Epoch 273/1000
5243/5243 - 0s - loss: 0.3145 - accuracy: 0.8375
Epoch 274/1000
5243/5243 - 0s - loss: 0.3131 - accuracy

Epoch 388/1000
5243/5243 - 0s - loss: 0.3034 - accuracy: 0.8419
Epoch 389/1000
5243/5243 - 0s - loss: 0.3033 - accuracy: 0.8388
Epoch 390/1000
5243/5243 - 0s - loss: 0.3036 - accuracy: 0.8442
Epoch 391/1000
5243/5243 - 0s - loss: 0.3035 - accuracy: 0.8421
Epoch 392/1000
5243/5243 - 0s - loss: 0.3029 - accuracy: 0.8423
Epoch 393/1000
5243/5243 - 0s - loss: 0.3023 - accuracy: 0.8405
Epoch 394/1000
5243/5243 - 0s - loss: 0.3028 - accuracy: 0.8419
Epoch 395/1000
5243/5243 - 0s - loss: 0.3039 - accuracy: 0.8394
Epoch 396/1000
5243/5243 - 0s - loss: 0.3021 - accuracy: 0.8392
Epoch 397/1000
5243/5243 - 0s - loss: 0.3028 - accuracy: 0.8386
Epoch 398/1000
5243/5243 - 0s - loss: 0.3026 - accuracy: 0.8411
Epoch 399/1000
5243/5243 - 0s - loss: 0.3031 - accuracy: 0.8398
Epoch 400/1000
5243/5243 - 0s - loss: 0.3032 - accuracy: 0.8425
Epoch 401/1000
5243/5243 - 0s - loss: 0.3011 - accuracy: 0.8430
Epoch 402/1000
5243/5243 - 0s - loss: 0.3024 - accuracy: 0.8411
Epoch 403/1000
5243/5243 - 0s - loss: 0.

5243/5243 - 0s - loss: 0.2867 - accuracy: 0.8547
Epoch 517/1000
5243/5243 - 0s - loss: 0.2853 - accuracy: 0.8562
Epoch 518/1000
5243/5243 - 0s - loss: 0.2869 - accuracy: 0.8592
Epoch 519/1000
5243/5243 - 0s - loss: 0.2848 - accuracy: 0.8549
Epoch 520/1000
5243/5243 - 0s - loss: 0.2843 - accuracy: 0.8571
Epoch 521/1000
5243/5243 - 0s - loss: 0.2857 - accuracy: 0.8549
Epoch 522/1000
5243/5243 - 0s - loss: 0.2837 - accuracy: 0.8564
Epoch 523/1000
5243/5243 - 0s - loss: 0.2828 - accuracy: 0.8619
Epoch 524/1000
5243/5243 - 0s - loss: 0.2833 - accuracy: 0.8594
Epoch 525/1000
5243/5243 - 0s - loss: 0.2832 - accuracy: 0.8562
Epoch 526/1000
5243/5243 - 0s - loss: 0.2826 - accuracy: 0.8579
Epoch 527/1000
5243/5243 - 0s - loss: 0.2817 - accuracy: 0.8594
Epoch 528/1000
5243/5243 - 0s - loss: 0.2817 - accuracy: 0.8598
Epoch 529/1000
5243/5243 - 0s - loss: 0.2845 - accuracy: 0.8558
Epoch 530/1000
5243/5243 - 0s - loss: 0.2835 - accuracy: 0.8535
Epoch 531/1000
5243/5243 - 0s - loss: 0.2817 - accuracy

Epoch 645/1000
5243/5243 - 0s - loss: 0.2626 - accuracy: 0.8798
Epoch 646/1000
5243/5243 - 0s - loss: 0.2636 - accuracy: 0.8814
Epoch 647/1000
5243/5243 - 0s - loss: 0.2639 - accuracy: 0.8806
Epoch 648/1000
5243/5243 - 0s - loss: 0.2623 - accuracy: 0.8806
Epoch 649/1000
5243/5243 - 0s - loss: 0.2615 - accuracy: 0.8823
Epoch 650/1000
5243/5243 - 0s - loss: 0.2657 - accuracy: 0.8795
Epoch 651/1000
5243/5243 - 0s - loss: 0.2630 - accuracy: 0.8806
Epoch 652/1000
5243/5243 - 0s - loss: 0.2629 - accuracy: 0.8812
Epoch 653/1000
5243/5243 - 0s - loss: 0.2618 - accuracy: 0.8810
Epoch 654/1000
5243/5243 - 0s - loss: 0.2642 - accuracy: 0.8806
Epoch 655/1000
5243/5243 - 0s - loss: 0.2630 - accuracy: 0.8835
Epoch 656/1000
5243/5243 - 0s - loss: 0.2619 - accuracy: 0.8825
Epoch 657/1000
5243/5243 - 0s - loss: 0.2619 - accuracy: 0.8821
Epoch 658/1000
5243/5243 - 0s - loss: 0.2629 - accuracy: 0.8798
Epoch 659/1000
5243/5243 - 0s - loss: 0.2614 - accuracy: 0.8814
Epoch 660/1000
5243/5243 - 0s - loss: 0.

5243/5243 - 0s - loss: 0.2561 - accuracy: 0.8846
Epoch 774/1000
5243/5243 - 0s - loss: 0.2545 - accuracy: 0.8850
Epoch 775/1000
5243/5243 - 0s - loss: 0.2585 - accuracy: 0.8865
Epoch 776/1000
5243/5243 - 0s - loss: 0.2577 - accuracy: 0.8835
Epoch 777/1000
5243/5243 - 0s - loss: 0.2587 - accuracy: 0.8863
Epoch 778/1000
5243/5243 - 0s - loss: 0.2564 - accuracy: 0.8863
Epoch 779/1000
5243/5243 - 0s - loss: 0.2551 - accuracy: 0.8865
Epoch 780/1000
5243/5243 - 0s - loss: 0.2573 - accuracy: 0.8867
Epoch 781/1000
5243/5243 - 0s - loss: 0.2553 - accuracy: 0.8869
Epoch 782/1000
5243/5243 - 0s - loss: 0.2576 - accuracy: 0.8863
Epoch 783/1000
5243/5243 - 0s - loss: 0.2583 - accuracy: 0.8859
Epoch 784/1000
5243/5243 - 0s - loss: 0.2545 - accuracy: 0.8850
Epoch 785/1000
5243/5243 - 0s - loss: 0.2555 - accuracy: 0.8880
Epoch 786/1000
5243/5243 - 0s - loss: 0.2560 - accuracy: 0.8850
Epoch 787/1000
5243/5243 - 0s - loss: 0.2541 - accuracy: 0.8863
Epoch 788/1000
5243/5243 - 0s - loss: 0.2556 - accuracy

Epoch 902/1000
5243/5243 - 0s - loss: 0.2534 - accuracy: 0.8877
Epoch 903/1000
5243/5243 - 0s - loss: 0.2529 - accuracy: 0.8888
Epoch 904/1000
5243/5243 - 0s - loss: 0.2528 - accuracy: 0.8873
Epoch 905/1000
5243/5243 - 0s - loss: 0.2526 - accuracy: 0.8913
Epoch 906/1000
5243/5243 - 0s - loss: 0.2510 - accuracy: 0.8911
Epoch 907/1000
5243/5243 - 0s - loss: 0.2512 - accuracy: 0.8886
Epoch 908/1000
5243/5243 - 0s - loss: 0.2552 - accuracy: 0.8875
Epoch 909/1000
5243/5243 - 0s - loss: 0.2542 - accuracy: 0.8888
Epoch 910/1000
5243/5243 - 0s - loss: 0.2526 - accuracy: 0.8898
Epoch 911/1000
5243/5243 - 0s - loss: 0.2534 - accuracy: 0.8877
Epoch 912/1000
5243/5243 - 0s - loss: 0.2516 - accuracy: 0.8882
Epoch 913/1000
5243/5243 - 0s - loss: 0.2521 - accuracy: 0.8886
Epoch 914/1000
5243/5243 - 0s - loss: 0.2513 - accuracy: 0.8894
Epoch 915/1000
5243/5243 - 0s - loss: 0.2541 - accuracy: 0.8884
Epoch 916/1000
5243/5243 - 0s - loss: 0.2517 - accuracy: 0.8884
Epoch 917/1000
5243/5243 - 0s - loss: 0.

<tensorflow.python.keras.callbacks.History at 0x7fb7a4c50be0>

## Quantifying the Model
We use our testing data to validate our model. This is how we determine the validity of our model (i.e. the ability to predict new and previously unseen data points)

In [18]:
# Score the trained model on the held-out testing data, then unpack the
# returned loss / accuracy pair for reporting
test_metrics = model.evaluate(
    x=X_test_scaled, y=y_test_categorical, verbose=2)
model_loss, model_accuracy = test_metrics
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1748/1 - 0s - loss: 0.3901 - accuracy: 0.8902
Loss: 0.4743309524293895, Accuracy: 0.8901602029800415


## Saving the Model
We can save our model using the HDF5 binary format with the extension `.h5`

In [19]:
# Save the trained model to an HDF5 file; the relative path means it is
# written to the current working directory
model.save("model_5.h5")