In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [None]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib

In [1]:
import sys
import pandas as pd
import pandas_profiling as pds
import numpy as np
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("../data/raw/exoplanet_data.csv")
# Drop the null columns where all values are null
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [3]:
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6991 entries, 0 to 6990
Data columns (total 41 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   koi_disposition    6991 non-null   object 
 1   koi_fpflag_nt      6991 non-null   int64  
 2   koi_fpflag_ss      6991 non-null   int64  
 3   koi_fpflag_co      6991 non-null   int64  
 4   koi_fpflag_ec      6991 non-null   int64  
 5   koi_period         6991 non-null   float64
 6   koi_period_err1    6991 non-null   float64
 7   koi_period_err2    6991 non-null   float64
 8   koi_time0bk        6991 non-null   float64
 9   koi_time0bk_err1   6991 non-null   float64
 10  koi_time0bk_err2   6991 non-null   float64
 11  koi_impact         6991 non-null   float64
 12  koi_impact_err1    6991 non-null   float64
 13  koi_impact_err2    6991 non-null   float64
 14  koi_duration       6991 non-null   float64
 15  koi_duration_err1  6991 non-null   float64
 16  koi_duration_err2  6991 

In [5]:
# profile = ProfileReport(df, title="Pandas Profiling Report")
# profile
# Select your features (columns)
# Set features. This will also be used as your x values.
# selected_features = df[['names', 'of', 'selected', 'features', 'here']]

In [6]:
# Set features. This will also be used as your x values.
selected_features = df[['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec','koi_period','koi_time0bk','koi_impact','koi_duration','koi_duration','koi_depth','koi_prad','koi_teq','koi_insol','koi_model_snr','koi_tce_plnt_num','koi_steff','koi_slogg','koi_srad','ra','dec','koi_kepmag','koi_disposition']]

# Create a Train Test Split

Use `koi_disposition` for the y values

In [7]:
y = selected_features['koi_disposition'].values
data = selected_features.values
X = data[:, 0:20]
# X = X.reshape(-1,1)
y = y.reshape(-1,1)

In [8]:
X.shape

(6991, 20)

In [9]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)
encoded_y.shape

  return f(**kwargs)


(6991,)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, encoded_y, random_state=1)

In [11]:
y_train

array([0, 1, 0, ..., 2, 1, 1])

In [12]:
# for label, original_class in zip(encoded_y, y):
#     print('Original Class: ' + str(original_class))
#     print('Encoded Label: ' + str(label))
#     print('-' * 12)



# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [13]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# One-hot encoding
from tensorflow.keras.utils import to_categorical
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)
y_train_categorical

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)

In [15]:
# first, create a normal neural network with 3 inputs, 20 hidden nodes, and 3 outputs
from tensorflow.keras.models import Sequential
model = Sequential()

In [16]:
from tensorflow.keras.layers import Dense
# number_inputs = 6
number_hidden_nodes = 20
model.add(Dense(units=number_hidden_nodes,
                activation='relu', input_dim=20))

In [17]:
number_classes = 3
model.add(Dense(units=number_classes, activation='softmax'))

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 20)                420       
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 63        
Total params: 483
Trainable params: 483
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the Model



In [20]:
# Fit the model to the training data
# Fit (train) the model
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=1000,
    shuffle=True,
    verbose=2
)

Epoch 1/1000
164/164 - 0s - loss: 0.8568 - accuracy: 0.5363
Epoch 2/1000
164/164 - 0s - loss: 0.5600 - accuracy: 0.7463
Epoch 3/1000
164/164 - 0s - loss: 0.4520 - accuracy: 0.7675
Epoch 4/1000
164/164 - 0s - loss: 0.4166 - accuracy: 0.7795
Epoch 5/1000
164/164 - 0s - loss: 0.4027 - accuracy: 0.7854
Epoch 6/1000
164/164 - 0s - loss: 0.3951 - accuracy: 0.7885
Epoch 7/1000
164/164 - 0s - loss: 0.3907 - accuracy: 0.7921
Epoch 8/1000
164/164 - 0s - loss: 0.3872 - accuracy: 0.7950
Epoch 9/1000
164/164 - 0s - loss: 0.3844 - accuracy: 0.7936
Epoch 10/1000
164/164 - 0s - loss: 0.3828 - accuracy: 0.7940
Epoch 11/1000
164/164 - 0s - loss: 0.3808 - accuracy: 0.7982
Epoch 12/1000
164/164 - 0s - loss: 0.3796 - accuracy: 0.7959
Epoch 13/1000
164/164 - 0s - loss: 0.3780 - accuracy: 0.7980
Epoch 14/1000
164/164 - 0s - loss: 0.3768 - accuracy: 0.7959
Epoch 15/1000
164/164 - 0s - loss: 0.3757 - accuracy: 0.7986
Epoch 16/1000
164/164 - 0s - loss: 0.3747 - accuracy: 0.7994
Epoch 17/1000
164/164 - 0s - loss

<tensorflow.python.keras.callbacks.History at 0x1d18b8bda00>

In [21]:
model_loss, model_accuracy = model.evaluate(
    X_train_scaled, y_train_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

164/164 - 0s - loss: 0.2194 - accuracy: 0.9086
Normal Neural Network - Loss: 0.21943208575248718, Accuracy: 0.9086400866508484


In [22]:
# Create the SVC Model
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train_scaled,y_train
         )
predictions = model.predict(X_test_scaled)
model

SVC(kernel='linear')

In [23]:
from sklearn import svm, datasets

In [24]:
def plotSVC(title):
    # create a mesh to plot in
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    h = (x_max / x_min)/100
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h))
    plt.subplot(1, 1, 1)
    Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    plt.xlim(xx.min(), xx.max())
    plt.title(title)
    plt.savefig(f'../reports/figures/{title}.png')
    plt.show()
   

In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions,target_names=['confirmed','false positive','candidate']))

                precision    recall  f1-score   support

     confirmed       0.77      0.35      0.48       404
false positive       0.60      0.89      0.71       435
     candidate       0.99      1.00      0.99       909

      accuracy                           0.82      1748
     macro avg       0.78      0.74      0.73      1748
  weighted avg       0.84      0.82      0.81      1748



In [26]:
# kernels = ['linear', 'rbf', 'poly']
# for kernel in kernels:
#     svc = svm.SVC(kernel=kernel).fit(X, encoded_y)
#     plotSVC('kernel=' + str(kernel))

In [29]:
# print(f"Training Data Score: {model.score(X_train_scaled, y_train_categorical)}")
# print(f"Testing Data Score: {model.score(X_test_scaled, y_test_categorical)}")

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [30]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 3, 6, 12],
              'gamma': [0.01, 0.03, 0.6, 0.9]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [None]:
# Train the model with GridSearch

# Fit the model using the grid search estimator. 
# This will take the SVC model and try each combination of parameters
grid.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=1, gamma=0.01 .................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
print(grid2.best_params_)
print(grid2.best_score_)

In [None]:
# gammas = [0.01, 0.03, 0.6, 0.9]
# for gamma in gammas:
#     svc = svm.SVC(kernel='rbf', gamma=gamma).fit(X, encoded_y)
#     plotSVC('gamma=' + str(gamma))

# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'exoplanet_SVC.sav'
joblib.dump(your_model, filename)