In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("exoplanet_data_cleaned.csv", index_col = 0)

In [3]:
df.head()

Unnamed: 0,kepoi_name,koi_disposition,koi_pdisposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,K00752.01,y,m,negative,negative,negative,negative,9.488036,2.78e-05,-2.78e-05,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,K00752.02,y,m,negative,negative,negative,negative,54.418383,0.000248,-0.000248,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,K00753.01,n,n,negative,positive,negative,negative,19.89914,1.49e-05,-1.49e-05,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,K00754.01,n,n,negative,positive,negative,negative,1.736952,2.63e-07,-2.63e-07,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,K00755.01,y,m,negative,negative,negative,negative,2.525592,3.76e-06,-3.76e-06,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


---

#### Drop all rows with "m" (= "CANDIDATE") for "koi_disposition":

In [4]:
df_culled = df[df["koi_disposition"] != "m"]

#### Isolate target vector: "koi_disposition"; rename as y_disp.

In [5]:
y_disp = df_culled[["koi_disposition"]].reset_index(drop = True)

In [6]:
y_disp.value_counts()

koi_disposition
n                  5023
y                  2293
dtype: int64

In [7]:
df_culled.drop(columns = ["kepoi_name", "koi_disposition", "koi_pdisposition"], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [8]:
df_culled.reset_index(drop = True, inplace = True)

In [9]:
df_dummy = pd.get_dummies(df_culled)

In [10]:
df_dummy

Unnamed: 0,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,...,koi_fpflag_nt_positive,koi_fpflag_ss_negative,koi_fpflag_ss_positive,koi_fpflag_co_negative,koi_fpflag_co_positive,koi_fpflag_ec_negative,koi_fpflag_ec_positive,koi_tce_delivname_q1_q16_tce,koi_tce_delivname_q1_q17_dr24_tce,koi_tce_delivname_q1_q17_dr25_tce
0,9.488036,2.780000e-05,-2.780000e-05,170.538750,0.002160,-0.002160,0.146,0.318,-0.146,2.95750,...,0,1,0,1,0,1,0,0,0,1
1,54.418383,2.480000e-04,-2.480000e-04,162.513840,0.003520,-0.003520,0.586,0.059,-0.443,4.50700,...,0,1,0,1,0,1,0,0,0,1
2,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,-0.000581,0.969,5.126,-0.077,1.78220,...,0,0,1,1,0,1,0,0,0,1
3,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,-0.000115,1.276,0.115,-0.092,2.40641,...,0,0,1,1,0,1,0,0,0,1
4,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,-0.001130,0.701,0.235,-0.478,1.65450,...,0,1,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7311,23.627035,2.260000e-04,-2.260000e-04,150.036200,0.010900,-0.010900,1.096,38.210,-0.106,11.48100,...,0,0,1,1,0,1,0,0,0,1
7312,8.589871,1.850000e-04,-1.850000e-04,132.016100,0.015700,-0.015700,0.765,0.023,-0.541,4.80600,...,0,1,0,1,0,0,1,0,0,1
7313,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,-0.000170,1.252,0.051,-0.049,3.22210,...,0,0,1,0,1,1,0,0,0,1
7314,0.681402,2.430000e-06,-2.430000e-06,132.181750,0.002850,-0.002850,0.147,0.309,-0.147,0.86500,...,0,1,0,0,1,1,0,0,0,1


In [11]:
df_dummy.shape

(7316, 47)

#### Impute the data using medians:

In [12]:
from sklearn.impute import SimpleImputer

In [13]:
impute_median = SimpleImputer(missing_values=np.nan, strategy = 'median')

In [14]:
impute_median.fit(df_dummy)

SimpleImputer(strategy='median')

In [15]:
df_imp = impute_median.transform(df_dummy)

#### Train, test, split the data:

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_imp, y_disp, random_state = 46)

#### Scale (by median):

In [17]:
from sklearn.preprocessing import MinMaxScaler

In [18]:
X_scaler = MinMaxScaler().fit(X_train)

In [19]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [20]:
X_train_scaled.shape

(5487, 47)

In [21]:
X_test_scaled.shape

(1829, 47)

In [22]:
y_train.shape

(5487, 1)

In [23]:
y_test.shape

(1829, 1)

#### Label encode the target:

In [24]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [25]:
# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(np.ravel(y_train, order = "c"))
encoded_y_train = label_encoder.transform(np.ravel(y_train, order = "c"))
encoded_y_test = label_encoder.transform(np.ravel(y_test, order = "c"))

In [26]:
# Step 2: Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

### Create deep learning neural network:

In [27]:
# first, create a normal neural network with 47 inputs, two hidden layers (20, 10), and 2 outputs.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *

model = Sequential()
model.add(Dense(units = 20, activation = 'relu', input_dim = 47))
model.add(Dense(units = 10, activation = 'relu'))
model.add(Dense(units = 2, activation = 'softmax'))

In [28]:
# Compile the model
model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = ['AUC', 'accuracy'])

In [None]:
# Fit the model to the training data
model.fit(
    X_train_scaled,
    y_train_categorical,
    batch_size = 1,
    epochs = 100,
    shuffle = True,
    verbose = 2
)

Epoch 1/100
5487/5487 - 4s - loss: 0.0999 - auc: 0.9921 - accuracy: 0.9677
Epoch 2/100
5487/5487 - 4s - loss: 0.0435 - auc: 0.9938 - accuracy: 0.9887
Epoch 3/100
5487/5487 - 4s - loss: 0.0398 - auc: 0.9944 - accuracy: 0.9896
Epoch 4/100
5487/5487 - 4s - loss: 0.0381 - auc: 0.9946 - accuracy: 0.9902
Epoch 5/100
5487/5487 - 4s - loss: 0.0376 - auc: 0.9951 - accuracy: 0.9892
Epoch 6/100
5487/5487 - 4s - loss: 0.0366 - auc: 0.9945 - accuracy: 0.9903
Epoch 7/100
5487/5487 - 4s - loss: 0.0360 - auc: 0.9950 - accuracy: 0.9902
Epoch 8/100
5487/5487 - 4s - loss: 0.0346 - auc: 0.9954 - accuracy: 0.9909
Epoch 9/100
5487/5487 - 4s - loss: 0.0355 - auc: 0.9951 - accuracy: 0.9898
Epoch 10/100
5487/5487 - 5s - loss: 0.0349 - auc: 0.9955 - accuracy: 0.9907
Epoch 11/100
5487/5487 - 5s - loss: 0.0338 - auc: 0.9957 - accuracy: 0.9903
Epoch 12/100
5487/5487 - 4s - loss: 0.0338 - auc: 0.9947 - accuracy: 0.9913
Epoch 13/100
5487/5487 - 4s - loss: 0.0336 - auc: 0.9961 - accuracy: 0.9907
Epoch 14/100
5487/548

Beyond 10 epochs the model starts to overfit the data

In [None]:
model.evaluate(X_test_scaled, y_test_categorical)

Big improvement relative to the logit!

### Grid search with help of KerasClassifier:

#### Warning! This takes a while to run!

In [105]:
def create_model(optimizer='adam', init_mode = 'uniform'):
    model = Sequential()
    model.add(Dense(20, kernel_initializer = init_mode, input_dim = 47))
    model.add(Activation('relu')) # An "activation" is just a non-linear function applied to the output
#     model.add(Dropout(0.1))   # Dropout helps protect the model from memorizing or "overfitting" the training data
    model.add(Dense(10, kernel_initializer = init_mode))
    model.add(Activation('relu'))
#     model.add(Dropout(0.1))
    model.add(Dense(2, kernel_initializer = init_mode))
    model.add(Activation('softmax')) # This special "softmax" a
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy', 'AUC']) 
    return model


In [106]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

In [107]:
model = KerasClassifier(build_fn = create_model)

optimizers = ['adam', 'rmsprop']
epochs = np.array([10, 20, 100])
batches = np.array([1, 32, 64])
init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
param_grid = dict(optimizer = optimizers, nb_epoch = epochs, init_mode = init_mode, batch_size = batches)

grid = GridSearchCV(estimator = model, param_grid = param_grid)
grid_result = grid.fit(X_train, y_train_categorical)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))











































Best: 0.845809 using {'batch_size': 1, 'init_mode': 'uniform', 'nb_epoch': 100, 'optimizer': 'adam'}


---

### Save model:

In [None]:
# Save the model
model.save("exoplanet_dnn_CDA.h5")

In [None]:
# Load the model
# from tensorflow.keras.models import load_model
# model = load_model("explanet_dnn_CDA.h5")