In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade



In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load the CSV, discard columns that contain no data at all, then discard
# every row that still has at least one missing value.
df = (
    pd.read_csv("exoplanet_data2.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,1,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,1,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [5]:
# Columns kept for modelling. "koi_disposition" is the label column; it is
# split off from the remaining (x value) columns in the next cell.
feature_columns = [
    'koi_disposition',
    'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
    'koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth',
    'koi_prad', 'koi_teq', 'koi_insol', 'koi_model_snr',
    'koi_steff', 'koi_slogg', 'koi_srad',
    'ra', 'dec', 'koi_kepmag',
]
selected_features = df[feature_columns]
selected_features.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
0,1,0,0,0,0,54.418383,162.51384,0.586,4.507,874.8,2.83,443,9.11,25.8,5455,4.467,0.927,291.93423,48.141651,15.347
1,0,0,1,0,0,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638,39.3,76.3,5853,4.544,0.868,297.00482,48.134129,15.436
2,0,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395,891.96,505.6,5805,4.564,0.791,285.53461,48.28521,15.597
3,1,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406,926.16,40.9,6031,4.438,1.046,288.75488,48.2262,15.509
4,1,0,0,0,0,4.134435,172.97937,0.762,3.1402,686.0,2.77,1160,427.65,40.2,6046,4.486,0.972,296.28613,48.22467,15.714


In [6]:
# Split the selected columns into the feature matrix X and the target vector y
label_column = "koi_disposition"
X = selected_features.drop(columns=[label_column])
y = selected_features[label_column]
print(X.shape, y.shape)

(6991, 19) (6991,)


## Use train_test_split to create training and testing data

In [7]:
from sklearn.model_selection import train_test_split

# Split features and labels into training and testing parts; the fixed
# random_state keeps the split the same on every run.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

## Data Preprocessing

It is really important to scale our data before using multilayer perceptron models. 

Without scaling, it is often difficult for the training cycle to converge.

In [8]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only; the fitted scaler is then
# applied to both the training and the testing features.
X_scaler = StandardScaler()
X_scaler = X_scaler.fit(X_train)

Remember to scale both the training and testing data

In [9]:
# Apply the already-fitted scaler to the training and testing features alike
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

One-hot encode the labels

In [10]:
from tensorflow.keras.utils import to_categorical

In [11]:
# One-hot encoding
# Use one shared class count for both splits. Without num_classes,
# to_categorical sizes each matrix from the largest label seen in that call,
# so a split that happens to lack the highest class would get fewer columns
# than the other split (and than the model's output layer).
label_class_count = int(max(y_train.max(), y_test.max())) + 1
y_train_categorical = to_categorical(y_train, num_classes=label_class_count)
y_test_categorical = to_categorical(y_test, num_classes=label_class_count)
y_train_categorical

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)

## Creating our Model

We must first decide what kind of model to apply to our data.

When the target we want to predict is numerical, we use a regressor model.

When the target is categorical (as it is here), we use a classifier model.

## Defining our Model Architecture (the layers)

We first need to create a sequential model

In [12]:
from tensorflow.keras.models import Sequential

# Start from an empty Sequential model; its layers are added with
# model.add(...) in the cells that follow.
model = Sequential()

Next, we add our first layer. This layer requires you to specify both the number of inputs and the number of nodes that you want in the hidden layer.

In [13]:
from tensorflow.keras.layers import Dense
# Take the input width from the scaled training matrix instead of hard-coding
# it, so the layer stays correct if the selected feature columns change.
number_inputs = X_train_scaled.shape[1]
number_hidden_nodes = 4
# First (hidden) layer: ReLU units fed by every input feature.
model.add(Dense(units=number_hidden_nodes,
                activation='relu', input_dim=number_inputs))

Our final layer is the output layer. Here, we need to specify the activation function (typically `softmax` for classification) and the number of classes (labels) that we are trying to predict (3 in this example).

In [14]:
# One output unit per column of the one-hot encoded labels, so the output
# layer always matches the label matrix the model is trained on.
number_classes = y_train_categorical.shape[1]
# Output layer: softmax over the classes.
model.add(Dense(units=number_classes, activation='softmax'))

## Model Summary

In [15]:
# Print each layer with its output shape and parameter count
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 4)                 80        
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 15        
Total params: 95
Trainable params: 95
Non-trainable params: 0
_________________________________________________________________


## Compile the Model

Now that we have our model architecture defined, we must compile the model using a loss function and optimizer. We can also specify additional training metrics such as accuracy.

In [16]:
# Loss choice: categorical crossentropy for categorical targets, mean squared
# error for regression.
# The output layer in this example uses a softmax activation (categorical
# output), so categorical crossentropy is used here; with a `linear` output
# activation, `mse` may be the better loss.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

## Training the Model
Finally, we train our model using our training data

Training consists of updating our weights using our optimizer and loss function. In this example, we choose 1000 iterations (loops) of training that are called epochs.

We also choose to shuffle our training data and increase the detail printed out during each training cycle.

In [17]:
# Fit (train) the model
# Keep the History object that fit() returns: it lets later cells inspect the
# training run, and assigning it stops its bare repr from being shown as the
# cell output after the training log.
history = model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=1000,
    shuffle=True,
    verbose=2
)

Train on 5243 samples
Epoch 1/1000
5243/5243 - 1s - loss: 1.1017 - accuracy: 0.3796
Epoch 2/1000
5243/5243 - 0s - loss: 0.8497 - accuracy: 0.5817
Epoch 3/1000
5243/5243 - 0s - loss: 0.7188 - accuracy: 0.6988
Epoch 4/1000
5243/5243 - 0s - loss: 0.6278 - accuracy: 0.7471
Epoch 5/1000
5243/5243 - 0s - loss: 0.5596 - accuracy: 0.7692
Epoch 6/1000
5243/5243 - 0s - loss: 0.5126 - accuracy: 0.7795
Epoch 7/1000
5243/5243 - 0s - loss: 0.4823 - accuracy: 0.7763
Epoch 8/1000
5243/5243 - 0s - loss: 0.4614 - accuracy: 0.7772
Epoch 9/1000
5243/5243 - 0s - loss: 0.4462 - accuracy: 0.7788
Epoch 10/1000
5243/5243 - 0s - loss: 0.4342 - accuracy: 0.7776
Epoch 11/1000
5243/5243 - 0s - loss: 0.4240 - accuracy: 0.7761
Epoch 12/1000
5243/5243 - 0s - loss: 0.4154 - accuracy: 0.7776
Epoch 13/1000
5243/5243 - 0s - loss: 0.4079 - accuracy: 0.7786
Epoch 14/1000
5243/5243 - 0s - loss: 0.4020 - accuracy: 0.7803
Epoch 15/1000
5243/5243 - 0s - loss: 0.3970 - accuracy: 0.7810
Epoch 16/1000
5243/5243 - 0s - loss: 0.392

Epoch 131/1000
5243/5243 - 0s - loss: 0.3421 - accuracy: 0.8257
Epoch 132/1000
5243/5243 - 0s - loss: 0.3420 - accuracy: 0.8268
Epoch 133/1000
5243/5243 - 0s - loss: 0.3420 - accuracy: 0.8278
Epoch 134/1000
5243/5243 - 0s - loss: 0.3416 - accuracy: 0.8251
Epoch 135/1000
5243/5243 - 0s - loss: 0.3418 - accuracy: 0.8276
Epoch 136/1000
5243/5243 - 0s - loss: 0.3414 - accuracy: 0.8278
Epoch 137/1000
5243/5243 - 0s - loss: 0.3414 - accuracy: 0.8259
Epoch 138/1000
5243/5243 - 0s - loss: 0.3411 - accuracy: 0.8291
Epoch 139/1000
5243/5243 - 0s - loss: 0.3410 - accuracy: 0.8276
Epoch 140/1000
5243/5243 - 0s - loss: 0.3408 - accuracy: 0.8268
Epoch 141/1000
5243/5243 - 0s - loss: 0.3409 - accuracy: 0.8261
Epoch 142/1000
5243/5243 - 0s - loss: 0.3404 - accuracy: 0.8274
Epoch 143/1000
5243/5243 - 0s - loss: 0.3407 - accuracy: 0.8287
Epoch 144/1000
5243/5243 - 0s - loss: 0.3403 - accuracy: 0.8276
Epoch 145/1000
5243/5243 - 0s - loss: 0.3400 - accuracy: 0.8285
Epoch 146/1000
5243/5243 - 0s - loss: 0.

5243/5243 - 0s - loss: 0.3287 - accuracy: 0.8341
Epoch 260/1000
5243/5243 - 0s - loss: 0.3286 - accuracy: 0.8360
Epoch 261/1000
5243/5243 - 0s - loss: 0.3285 - accuracy: 0.8364
Epoch 262/1000
5243/5243 - 0s - loss: 0.3282 - accuracy: 0.8369
Epoch 263/1000
5243/5243 - 0s - loss: 0.3280 - accuracy: 0.8360
Epoch 264/1000
5243/5243 - 0s - loss: 0.3285 - accuracy: 0.8371
Epoch 265/1000
5243/5243 - 0s - loss: 0.3281 - accuracy: 0.8348
Epoch 266/1000
5243/5243 - 0s - loss: 0.3281 - accuracy: 0.8371
Epoch 267/1000
5243/5243 - 0s - loss: 0.3280 - accuracy: 0.8360
Epoch 268/1000
5243/5243 - 0s - loss: 0.3278 - accuracy: 0.8356
Epoch 269/1000
5243/5243 - 0s - loss: 0.3274 - accuracy: 0.8383
Epoch 270/1000
5243/5243 - 0s - loss: 0.3280 - accuracy: 0.8360
Epoch 271/1000
5243/5243 - 0s - loss: 0.3277 - accuracy: 0.8358
Epoch 272/1000
5243/5243 - 0s - loss: 0.3278 - accuracy: 0.8356
Epoch 273/1000
5243/5243 - 0s - loss: 0.3280 - accuracy: 0.8369
Epoch 274/1000
5243/5243 - 0s - loss: 0.3276 - accuracy

Epoch 388/1000
5243/5243 - 0s - loss: 0.3213 - accuracy: 0.8442
Epoch 389/1000
5243/5243 - 0s - loss: 0.3210 - accuracy: 0.8461
Epoch 390/1000
5243/5243 - 0s - loss: 0.3210 - accuracy: 0.8446
Epoch 391/1000
5243/5243 - 0s - loss: 0.3210 - accuracy: 0.8426
Epoch 392/1000
5243/5243 - 0s - loss: 0.3208 - accuracy: 0.8444
Epoch 393/1000
5243/5243 - 0s - loss: 0.3211 - accuracy: 0.8459
Epoch 394/1000
5243/5243 - 0s - loss: 0.3214 - accuracy: 0.8444
Epoch 395/1000
5243/5243 - 0s - loss: 0.3214 - accuracy: 0.8407
Epoch 396/1000
5243/5243 - 0s - loss: 0.3211 - accuracy: 0.8459
Epoch 397/1000
5243/5243 - 0s - loss: 0.3208 - accuracy: 0.8449
Epoch 398/1000
5243/5243 - 0s - loss: 0.3205 - accuracy: 0.8446
Epoch 399/1000
5243/5243 - 0s - loss: 0.3202 - accuracy: 0.8449
Epoch 400/1000
5243/5243 - 0s - loss: 0.3204 - accuracy: 0.8432
Epoch 401/1000
5243/5243 - 0s - loss: 0.3205 - accuracy: 0.8419
Epoch 402/1000
5243/5243 - 0s - loss: 0.3205 - accuracy: 0.8447
Epoch 403/1000
5243/5243 - 0s - loss: 0.

5243/5243 - 0s - loss: 0.3099 - accuracy: 0.8564
Epoch 517/1000
5243/5243 - 0s - loss: 0.3097 - accuracy: 0.8552
Epoch 518/1000
5243/5243 - 0s - loss: 0.3092 - accuracy: 0.8550
Epoch 519/1000
5243/5243 - 0s - loss: 0.3093 - accuracy: 0.8564
Epoch 520/1000
5243/5243 - 0s - loss: 0.3093 - accuracy: 0.8577
Epoch 521/1000
5243/5243 - 0s - loss: 0.3086 - accuracy: 0.8571
Epoch 522/1000
5243/5243 - 0s - loss: 0.3089 - accuracy: 0.8568
Epoch 523/1000
5243/5243 - 0s - loss: 0.3087 - accuracy: 0.8562
Epoch 524/1000
5243/5243 - 0s - loss: 0.3084 - accuracy: 0.8568
Epoch 525/1000
5243/5243 - 0s - loss: 0.3085 - accuracy: 0.8571
Epoch 526/1000
5243/5243 - 0s - loss: 0.3087 - accuracy: 0.8556
Epoch 527/1000
5243/5243 - 0s - loss: 0.3076 - accuracy: 0.8573
Epoch 528/1000
5243/5243 - 0s - loss: 0.3080 - accuracy: 0.8592
Epoch 529/1000
5243/5243 - 0s - loss: 0.3074 - accuracy: 0.8566
Epoch 530/1000
5243/5243 - 0s - loss: 0.3071 - accuracy: 0.8575
Epoch 531/1000
5243/5243 - 0s - loss: 0.3079 - accuracy

Epoch 645/1000
5243/5243 - 0s - loss: 0.2907 - accuracy: 0.8732
Epoch 646/1000
5243/5243 - 0s - loss: 0.2908 - accuracy: 0.8743
Epoch 647/1000
5243/5243 - 0s - loss: 0.2908 - accuracy: 0.8701
Epoch 648/1000
5243/5243 - 0s - loss: 0.2902 - accuracy: 0.8737
Epoch 649/1000
5243/5243 - 0s - loss: 0.2908 - accuracy: 0.8743
Epoch 650/1000
5243/5243 - 0s - loss: 0.2899 - accuracy: 0.8728
Epoch 651/1000
5243/5243 - 0s - loss: 0.2896 - accuracy: 0.8749
Epoch 652/1000
5243/5243 - 0s - loss: 0.2897 - accuracy: 0.8724
Epoch 653/1000
5243/5243 - 0s - loss: 0.2908 - accuracy: 0.8732
Epoch 654/1000
5243/5243 - 0s - loss: 0.2904 - accuracy: 0.8713
Epoch 655/1000
5243/5243 - 0s - loss: 0.2901 - accuracy: 0.8735
Epoch 656/1000
5243/5243 - 0s - loss: 0.2906 - accuracy: 0.8726
Epoch 657/1000
5243/5243 - 0s - loss: 0.2897 - accuracy: 0.8756
Epoch 658/1000
5243/5243 - 0s - loss: 0.2901 - accuracy: 0.8770
Epoch 659/1000
5243/5243 - 0s - loss: 0.2893 - accuracy: 0.8735
Epoch 660/1000
5243/5243 - 0s - loss: 0.

5243/5243 - 0s - loss: 0.2788 - accuracy: 0.8844
Epoch 774/1000
5243/5243 - 0s - loss: 0.2793 - accuracy: 0.8840
Epoch 775/1000
5243/5243 - 0s - loss: 0.2795 - accuracy: 0.8806
Epoch 776/1000
5243/5243 - 0s - loss: 0.2795 - accuracy: 0.8848
Epoch 777/1000
5243/5243 - 0s - loss: 0.2789 - accuracy: 0.8814
Epoch 778/1000
5243/5243 - 0s - loss: 0.2800 - accuracy: 0.8806
Epoch 779/1000
5243/5243 - 0s - loss: 0.2786 - accuracy: 0.8804
Epoch 780/1000
5243/5243 - 0s - loss: 0.2786 - accuracy: 0.8827
Epoch 781/1000
5243/5243 - 0s - loss: 0.2794 - accuracy: 0.8827
Epoch 782/1000
5243/5243 - 0s - loss: 0.2791 - accuracy: 0.8816
Epoch 783/1000
5243/5243 - 0s - loss: 0.2787 - accuracy: 0.8825
Epoch 784/1000
5243/5243 - 0s - loss: 0.2783 - accuracy: 0.8829
Epoch 785/1000
5243/5243 - 0s - loss: 0.2784 - accuracy: 0.8825
Epoch 786/1000
5243/5243 - 0s - loss: 0.2786 - accuracy: 0.8848
Epoch 787/1000
5243/5243 - 0s - loss: 0.2788 - accuracy: 0.8810
Epoch 788/1000
5243/5243 - 0s - loss: 0.2780 - accuracy

Epoch 902/1000
5243/5243 - 0s - loss: 0.2724 - accuracy: 0.8840
Epoch 903/1000
5243/5243 - 0s - loss: 0.2721 - accuracy: 0.8856
Epoch 904/1000
5243/5243 - 0s - loss: 0.2717 - accuracy: 0.8840
Epoch 905/1000
5243/5243 - 0s - loss: 0.2725 - accuracy: 0.8840
Epoch 906/1000
5243/5243 - 0s - loss: 0.2716 - accuracy: 0.8846
Epoch 907/1000
5243/5243 - 0s - loss: 0.2719 - accuracy: 0.8842
Epoch 908/1000
5243/5243 - 0s - loss: 0.2724 - accuracy: 0.8863
Epoch 909/1000
5243/5243 - 0s - loss: 0.2714 - accuracy: 0.8856
Epoch 910/1000
5243/5243 - 0s - loss: 0.2720 - accuracy: 0.8854
Epoch 911/1000
5243/5243 - 0s - loss: 0.2724 - accuracy: 0.8835
Epoch 912/1000
5243/5243 - 0s - loss: 0.2718 - accuracy: 0.8869
Epoch 913/1000
5243/5243 - 0s - loss: 0.2711 - accuracy: 0.8842
Epoch 914/1000
5243/5243 - 0s - loss: 0.2719 - accuracy: 0.8831
Epoch 915/1000
5243/5243 - 0s - loss: 0.2714 - accuracy: 0.8884
Epoch 916/1000
5243/5243 - 0s - loss: 0.2719 - accuracy: 0.8873
Epoch 917/1000
5243/5243 - 0s - loss: 0.

<tensorflow.python.keras.callbacks.History at 0x7f8d7ed2ea20>

## Quantifying the Model
We use our testing data to validate our model. This is how we determine the validity of our model (i.e. the ability to predict new and previously unseen data points)

In [18]:
# Score the trained model on the scaled test features and one-hot test labels
test_metrics = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
model_loss, model_accuracy = test_metrics
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1748/1 - 0s - loss: 0.2828 - accuracy: 0.8976
Loss: 0.28730923012410475, Accuracy: 0.8975972533226013


## Saving the Model
We can save our model using the HDF5 binary format with the extension `.h5`

In [22]:
# Save the trained model to "model_4.h5" (a relative path; the `.h5`
# extension is the HDF5 format described above)
model.save("model_4.h5")