In [None]:
# Update sklearn to prevent version mismatches
#!pip install sklearn --upgrade

In [None]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
#!pip install joblib

In [1]:
import pandas as pd

# Hide warning messages in notebook
import warnings
warnings.filterwarnings('ignore')

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [3]:
print(df["koi_disposition"].unique())

['CONFIRMED' 'FALSE POSITIVE' 'CANDIDATE']


# Select your features (columns)

In [4]:
# Set X equal to the entire data set, except for the first column
X = df.iloc[:,1:]

# Set y equal to the first column
y = df.iloc[:,0]

In [5]:
# Search for top 10 features according to feature importances
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(X,y)
model.feature_importances_

array([0.13137241, 0.1705081 , 0.12486014, 0.056001  , 0.01745946,
       0.01478536, 0.01559122, 0.01220385, 0.02194748, 0.02369641,
       0.01180889, 0.01175076, 0.00975019, 0.01729161, 0.02765839,
       0.02601119, 0.02046664, 0.01207435, 0.01190952, 0.0134864 ,
       0.01125992, 0.01111993, 0.01292479, 0.00877106, 0.00914276,
       0.00864065, 0.03022476, 0.00913711, 0.0102686 , 0.02396508,
       0.01858717, 0.01171065, 0.00977524, 0.01394186, 0.00923474,
       0.00985086, 0.00965082, 0.0109733 , 0.01012794, 0.01005938])

In [6]:
# Store the top 10 features as a series, using the column headers as the index
feat_imp = pd.Series(model.feature_importances_, index=X.columns).nlargest(10)
feat_imp

koi_fpflag_ss        0.170508
koi_fpflag_nt        0.131372
koi_fpflag_co        0.124860
koi_fpflag_ec        0.056001
koi_model_snr        0.030225
koi_duration_err1    0.027658
koi_duration_err2    0.026011
koi_steff_err1       0.023965
koi_time0bk_err2     0.023696
koi_time0bk_err1     0.021947
dtype: float64

In [7]:
# Set features based on feature importances
X = df[feat_imp.index]

# Use koi_disposition for y values
y = df['koi_disposition']

# Create a Train Test Split

Use `koi_disposition` for the y values

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [9]:
X_train.head()

Unnamed: 0,koi_fpflag_ss,koi_fpflag_nt,koi_fpflag_co,koi_fpflag_ec,koi_model_snr,koi_duration_err1,koi_duration_err2,koi_steff_err1,koi_time0bk_err2,koi_time0bk_err1
1744,0,0,0,0,18.9,0.0843,-0.0843,82,-0.00247,0.00247
5570,0,0,0,0,8.5,0.335,-0.335,182,-0.00756,0.00756
5071,0,0,0,0,9.8,0.507,-0.507,151,-0.0179,0.0179
3562,0,0,0,0,14.4,0.273,-0.273,166,-0.00964,0.00964
4712,0,0,0,0,10.6,0.854,-0.854,79,-0.0267,0.0267


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Create scaler object
X_scaler = MinMaxScaler().fit(X_train)

# Scale training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model



In [17]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y = label_encoder.transform(y_train)
encoded_y

array([0, 0, 0, ..., 2, 0, 1])

In [18]:
for label, original_class in zip(encoded_y, y):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CO

------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class:

Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded

------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Origi

Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded 

Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------

Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CANDIDATE
Enc

In [19]:
from keras.utils import to_categorical

# Step 2: One-hot encoding
one_hot_y = to_categorical(encoded_y)
one_hot_y

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]], dtype=float32)

In [20]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
tencoded_y = label_encoder.transform(y_test)
tencoded_y

array([1, 2, 2, ..., 0, 1, 2])

In [21]:
for label, original_class in zip(tencoded_y, y):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CO

Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
-----------

Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Ori

In [22]:
from keras.utils import to_categorical

# Step 2: One-hot encoding
tone_hot_y = to_categorical(tencoded_y)
tone_hot_y

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]], dtype=float32)

In [23]:
from sklearn.linear_model import Ridge
# Initialize model
model = Ridge(alpha=1).fit(X_train_scaled, one_hot_y)

In [24]:
print(f"Training Data Score: {model.score(X_train_scaled, one_hot_y)}")
print(f"Testing Data Score: {model.score(X_test_scaled, tone_hot_y)}")

Training Data Score: 0.4689730441842516
Testing Data Score: 0.4680676433386983


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [31]:
# find optimal alpha with grid search
from sklearn.model_selection import GridSearchCV
import numpy as np

alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = dict(alpha=alpha)

grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='r2', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, one_hot_y)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
Best Score:  0.4666373079524724
Best Params:  {'alpha': 0.1}


In [34]:
# Train the model with GridSearch
ridge = Ridge(alpha=0.1).fit(X_train, one_hot_y)


print('Train Score: ', ridge.score(X_train, one_hot_y))
print('Test Score: ', ridge.score(X_test, tone_hot_y))


Train Score:  0.4690607997561667
Test Score:  0.4676133907183728


In [36]:
ridge.intercept_

array([ 0.30144988,  0.7330134 , -0.03446328])

In [37]:
ridge.coef_

array([[-3.13936986e-01, -4.52048824e-01, -2.43435559e-01,
        -8.42374366e-02, -3.77306774e-05,  3.42052196e-02,
        -3.42052196e-02,  8.48926239e-04, -7.25206724e-01,
         7.25206724e-01],
       [-3.16490758e-01, -2.79431919e-01, -2.15730888e-01,
        -6.60564463e-02, -2.39681008e-05, -3.76248520e-02,
         3.76248520e-02, -1.78773366e-03,  8.00691111e-01,
        -8.00691111e-01],
       [ 6.30427745e-01,  7.31480743e-01,  4.59166447e-01,
         1.50293883e-01,  6.16987782e-05,  3.41963241e-03,
        -3.41963241e-03,  9.38807419e-04, -7.54843870e-02,
         7.54843870e-02]])