In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("cumulative.csv")
df = df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [3]:
print(df["koi_disposition"].unique())

['CONFIRMED' 'FALSE POSITIVE' 'CANDIDATE']


# Create a Train Test Split

Use `koi_disposition` for the y values

In [4]:
X = df.drop("koi_disposition", axis=1)
y = df["koi_disposition"].values
print(X.shape, y.shape)

(8744, 40) (8744,)


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
9013,1,0,0,0,288.678554,0.005932,-0.005932,143.8403,0.0146,-0.0146,...,-131.0,4.667,0.054,-0.027,0.575,0.045,-0.056,298.98932,48.077171,13.145
7027,0,0,0,0,5.244916,0.0001332,-0.0001332,136.0365,0.0212,-0.0212,...,-158.0,4.486,0.078,-0.182,0.878,0.231,-0.099,295.0809,43.953239,13.695
8203,0,1,0,0,2.666962,8.04e-07,-8.04e-07,133.719065,0.000254,-0.000254,...,-216.0,4.484,0.04,-0.229,0.974,0.358,-0.084,296.05377,38.89426,13.968
257,0,0,0,0,7.156816,1.201e-05,-1.201e-05,174.13511,0.00133,-0.00133,...,-75.0,4.494,0.077,-0.027,0.838,0.033,-0.066,289.59476,44.141949,15.46
6728,0,0,0,0,7.954458,3.496e-05,-3.496e-05,135.48462,0.00543,-0.00543,...,-259.0,4.306,0.132,-0.198,1.196,0.378,-0.204,295.90891,42.988209,14.406


# Pre-processing

Scale the data using the MinMaxScaler

In [6]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

  return self.partial_fit(X, y)


# Train the Support Vector Machine

In [7]:
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [8]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.8504117108874657
Testing Data Score: 0.8366880146386093


# Hyperparameter Tuning

Use `GridSearchCV` to tune the `C` and `gamma` parameters

In [9]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 2, 3, 10],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.5]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [10]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....... C=1, gamma=0.0001, score=0.850480109739369, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.8449222323879232, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.5s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.8324942791762013, total=   0.5s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ....... C=1, gamma=0.0005, score=0.850480109739369, total=   0.4s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ...... C=1, gamma=0.0005, score=0.8449222323879232, total=   0.4s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ...... C=1, gamma=0.0005, score=0.8324942791762013, total=   0.4s
[CV] C=1, gamma=0.001 ................................................
[CV] ........ C=1, gamma=0.001, score=0.850480109739369, total=   0.4s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.8449222323879232, total=   0.4s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.8324942791762013, total=   0.4s
[CV] C=1, gamma=0.005 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   41.0s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 2, 3, 10], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [11]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'gamma': 0.0001}
0.8700823421774931


In [12]:
predictions = grid.predict(X_test_scaled)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   ['CANDIDATE' 'FALSE POSITIVE' 'CONFIRMED' 'CONFIRMED' 'CONFIRMED'
 'FALSE POSITIVE' 'FALSE POSITIVE' 'CONFIRMED' 'CANDIDATE'
 'FALSE POSITIVE']
First 10 Actual labels: ['CANDIDATE', 'FALSE POSITIVE', 'CANDIDATE', 'CONFIRMED', 'CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', 'CONFIRMED', 'CANDIDATE', 'FALSE POSITIVE']


In [13]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=['CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE']))

                precision    recall  f1-score   support

     CONFIRMED       0.83      0.59      0.69       528
FALSE POSITIVE       0.70      0.87      0.77       568
     CANDIDATE       0.98      1.00      0.99      1090

     micro avg       0.86      0.86      0.86      2186
     macro avg       0.84      0.82      0.82      2186
  weighted avg       0.87      0.86      0.86      2186



## Basic Classifier

In [14]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [15]:
classifier.fit(X_train_scaled, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [16]:
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Testing Data Score: 0.8321134492223239


In [17]:
predictions = classifier.predict(X_test_scaled)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   ['CONFIRMED' 'FALSE POSITIVE' 'CONFIRMED' 'CONFIRMED' 'CONFIRMED'
 'FALSE POSITIVE' 'FALSE POSITIVE' 'CONFIRMED' 'CANDIDATE'
 'FALSE POSITIVE']
First 10 Actual labels: ['CANDIDATE', 'FALSE POSITIVE', 'CANDIDATE', 'CONFIRMED', 'CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', 'CONFIRMED', 'CANDIDATE', 'FALSE POSITIVE']


## Simple Neural Network

In [18]:
from tensorflow.keras.utils import to_categorical

In [19]:
# Label encode the disposition values
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [20]:
one_hot_y_train = to_categorical(encoded_y_train)
one_hot_y_test = to_categorical(encoded_y_test)

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=40))
model.add(Dense(units=3, activation='softmax'))

Instructions for updating:
Colocations handled automatically by placer.


In [22]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               4100      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 303       
Total params: 4,403
Trainable params: 4,403
Non-trainable params: 0
_________________________________________________________________


In [23]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [24]:
model.fit(
    X_train_scaled,
    one_hot_y_train,
    epochs=250,
    shuffle=True,
    verbose=2
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/250
 - 1s - loss: 0.7290 - acc: 0.6674
Epoch 2/250
 - 1s - loss: 0.4127 - acc: 0.8082
Epoch 3/250
 - 0s - loss: 0.3673 - acc: 0.8207
Epoch 4/250
 - 0s - loss: 0.3502 - acc: 0.8317
Epoch 5/250
 - 0s - loss: 0.3413 - acc: 0.8320
Epoch 6/250
 - 1s - loss: 0.3366 - acc: 0.8407
Epoch 7/250
 - 1s - loss: 0.3300 - acc: 0.8468
Epoch 8/250
 - 1s - loss: 0.3261 - acc: 0.8486
Epoch 9/250
 - 1s - loss: 0.3216 - acc: 0.8492
Epoch 10/250
 - 1s - loss: 0.3194 - acc: 0.8524
Epoch 11/250
 - 1s - loss: 0.3145 - acc: 0.8576
Epoch 12/250
 - 1s - loss: 0.3137 - acc: 0.8559
Epoch 13/250
 - 1s - loss: 0.3085 - acc: 0.8609
Epoch 14/250
 - 1s - loss: 0.3055 - acc: 0.8618
Epoch 15/250
 - 1s - loss: 0.3038 - acc: 0.8644
Epoch 16/250
 - 1s - loss: 0.3051 - acc: 0.8625
Epoch 17/250
 - 0s - loss: 0.2978 - acc: 0.8678
Epoch 18/250
 - 1s - loss: 0.2980 - acc: 0.8664
Epoch 19/250
 - 1s - loss: 0.2982 - acc: 0.8661
Epoch 20/250
 - 1s - loss: 0.2969 - acc: 0.8672
E

 - 1s - loss: 0.2322 - acc: 0.8975
Epoch 166/250
 - 0s - loss: 0.2323 - acc: 0.8987
Epoch 167/250
 - 0s - loss: 0.2281 - acc: 0.9016
Epoch 168/250
 - 1s - loss: 0.2269 - acc: 0.9016
Epoch 169/250
 - 0s - loss: 0.2297 - acc: 0.9015
Epoch 170/250
 - 1s - loss: 0.2281 - acc: 0.9033
Epoch 171/250
 - 1s - loss: 0.2310 - acc: 0.9003
Epoch 172/250
 - 1s - loss: 0.2285 - acc: 0.9000
Epoch 173/250
 - 0s - loss: 0.2270 - acc: 0.9039
Epoch 174/250
 - 1s - loss: 0.2295 - acc: 0.8978
Epoch 175/250
 - 1s - loss: 0.2298 - acc: 0.9001
Epoch 176/250
 - 1s - loss: 0.2299 - acc: 0.9003
Epoch 177/250
 - 1s - loss: 0.2269 - acc: 0.9041
Epoch 178/250
 - 0s - loss: 0.2269 - acc: 0.9032
Epoch 179/250
 - 0s - loss: 0.2273 - acc: 0.9029
Epoch 180/250
 - 1s - loss: 0.2292 - acc: 0.8997
Epoch 181/250
 - 0s - loss: 0.2269 - acc: 0.9013
Epoch 182/250
 - 0s - loss: 0.2272 - acc: 0.9030
Epoch 183/250
 - 0s - loss: 0.2270 - acc: 0.9023
Epoch 184/250
 - 1s - loss: 0.2260 - acc: 0.9053
Epoch 185/250
 - 0s - loss: 0.2275

<tensorflow.python.keras.callbacks.History at 0x1a4f07e780>

## Deep Neural Network

In [25]:
deep_model = Sequential()
deep_model.add(Dense(units=100, activation='relu', input_dim=40))
deep_model.add(Dense(units=100, activation='relu'))
deep_model.add(Dense(units=3, activation='softmax'))

In [26]:
deep_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 100)               4100      
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 303       
Total params: 14,503
Trainable params: 14,503
Non-trainable params: 0
_________________________________________________________________


In [27]:
deep_model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [28]:
deep_model.fit(
    X_train_scaled,
    one_hot_y_train,
    epochs=250,
    shuffle=True,
    verbose=2
)

Epoch 1/250
 - 1s - loss: 0.5112 - acc: 0.7540
Epoch 2/250
 - 1s - loss: 0.3628 - acc: 0.8068
Epoch 3/250
 - 1s - loss: 0.3464 - acc: 0.8262
Epoch 4/250
 - 1s - loss: 0.3404 - acc: 0.8358
Epoch 5/250
 - 1s - loss: 0.3362 - acc: 0.8317
Epoch 6/250
 - 1s - loss: 0.3238 - acc: 0.8474
Epoch 7/250
 - 1s - loss: 0.3223 - acc: 0.8457
Epoch 8/250
 - 1s - loss: 0.3157 - acc: 0.8507
Epoch 9/250
 - 1s - loss: 0.3104 - acc: 0.8591
Epoch 10/250
 - 1s - loss: 0.3084 - acc: 0.8590
Epoch 11/250
 - 1s - loss: 0.3078 - acc: 0.8596
Epoch 12/250
 - 1s - loss: 0.3018 - acc: 0.8594
Epoch 13/250
 - 1s - loss: 0.2995 - acc: 0.8664
Epoch 14/250
 - 1s - loss: 0.2970 - acc: 0.8690
Epoch 15/250
 - 1s - loss: 0.2966 - acc: 0.8679
Epoch 16/250
 - 1s - loss: 0.2901 - acc: 0.8692
Epoch 17/250
 - 1s - loss: 0.2880 - acc: 0.8751
Epoch 18/250
 - 1s - loss: 0.2852 - acc: 0.8771
Epoch 19/250
 - 1s - loss: 0.2855 - acc: 0.8719
Epoch 20/250
 - 1s - loss: 0.2786 - acc: 0.8803
Epoch 21/250
 - 1s - loss: 0.2812 - acc: 0.8763
E

Epoch 171/250
 - 1s - loss: 0.1902 - acc: 0.9155
Epoch 172/250
 - 1s - loss: 0.1959 - acc: 0.9123
Epoch 173/250
 - 1s - loss: 0.1886 - acc: 0.9184
Epoch 174/250
 - 1s - loss: 0.1900 - acc: 0.9135
Epoch 175/250
 - 1s - loss: 0.1961 - acc: 0.9091
Epoch 176/250
 - 1s - loss: 0.1863 - acc: 0.9181
Epoch 177/250
 - 1s - loss: 0.1856 - acc: 0.9169
Epoch 178/250
 - 1s - loss: 0.1891 - acc: 0.9166
Epoch 179/250
 - 1s - loss: 0.1884 - acc: 0.9174
Epoch 180/250
 - 1s - loss: 0.1938 - acc: 0.9138
Epoch 181/250
 - 1s - loss: 0.1888 - acc: 0.9137
Epoch 182/250
 - 1s - loss: 0.1900 - acc: 0.9119
Epoch 183/250
 - 1s - loss: 0.1844 - acc: 0.9190
Epoch 184/250
 - 1s - loss: 0.1878 - acc: 0.9158
Epoch 185/250
 - 1s - loss: 0.1867 - acc: 0.9166
Epoch 186/250
 - 1s - loss: 0.1875 - acc: 0.9201
Epoch 187/250
 - 1s - loss: 0.1874 - acc: 0.9161
Epoch 188/250
 - 1s - loss: 0.1882 - acc: 0.9145
Epoch 189/250
 - 1s - loss: 0.1841 - acc: 0.9190
Epoch 190/250
 - 1s - loss: 0.1839 - acc: 0.9212
Epoch 191/250
 - 1s 

<tensorflow.python.keras.callbacks.History at 0x1a50188be0>

## Compare the Models

In [29]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, one_hot_y_test, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

 - 0s - loss: 0.2647 - acc: 0.8838
Normal Neural Network - Loss: 0.26471680331862746, Accuracy: 0.883806049823761


In [30]:
model_loss, model_accuracy = deep_model.evaluate(
    X_test_scaled, one_hot_y_test, verbose=2)
print(f"Deep Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

 - 0s - loss: 0.3382 - acc: 0.8888
Deep Neural Network - Loss: 0.3381931368953464, Accuracy: 0.8888380527496338
