In [1]:
# Update sklearn to prevent version mismatches
#!pip install sklearn --upgrade

In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
#!pip install joblib

In [3]:
import pandas as pd

# Hide warning messages in notebook
import warnings
warnings.filterwarnings('ignore')

# Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [5]:
print(df["koi_disposition"].unique())

['CONFIRMED' 'FALSE POSITIVE' 'CANDIDATE']


# Select your features (columns)

In [6]:
# Set X equal to the entire data set, except for the first column
X = df.iloc[:,1:]

# Set y equal to the first column
y = df.iloc[:,0]

In [7]:
# Search for top 10 features according to feature importances
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(X,y)
model.feature_importances_

array([0.13290192, 0.1654655 , 0.12224298, 0.05799246, 0.01692732,
       0.01313587, 0.01534428, 0.01233949, 0.02169887, 0.01871759,
       0.01236644, 0.01145002, 0.00997208, 0.01807782, 0.0295692 ,
       0.03165504, 0.02065896, 0.01092502, 0.01224748, 0.01457682,
       0.01025723, 0.01148387, 0.01368063, 0.00892005, 0.00955023,
       0.00834809, 0.03475614, 0.01101526, 0.01101255, 0.02027276,
       0.01793694, 0.01064332, 0.01023408, 0.01342747, 0.00871335,
       0.01073666, 0.0093852 , 0.01118465, 0.0102706 , 0.00990576])

In [8]:
# Store the top 10 features as a series, using the column headers as the index
feat_imp = pd.Series(model.feature_importances_, index=X.columns).nlargest(10)
feat_imp

koi_fpflag_ss        0.165465
koi_fpflag_nt        0.132902
koi_fpflag_co        0.122243
koi_fpflag_ec        0.057992
koi_model_snr        0.034756
koi_duration_err2    0.031655
koi_duration_err1    0.029569
koi_time0bk_err1     0.021699
koi_depth            0.020659
koi_steff_err1       0.020273
dtype: float64

In [9]:
# Set features based on feature importances
X = df[feat_imp.index]

# Use koi_disposition for y values
y = df['koi_disposition']

# Create a Train Test Split

Use `koi_disposition` for the y values

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [11]:
X_train.head()

Unnamed: 0,koi_fpflag_ss,koi_fpflag_nt,koi_fpflag_co,koi_fpflag_ec,koi_model_snr,koi_duration_err2,koi_duration_err1,koi_time0bk_err1,koi_depth,koi_steff_err1
4388,0,1,0,0,8.4,-0.404,0.404,0.0123,464.3,72
2385,0,1,0,0,331.6,-0.0566,0.0566,0.000353,548.9,302
4149,0,0,1,1,29.8,-0.195,0.195,0.0125,82.0,78
2643,0,0,1,1,26.7,-0.201,0.201,0.00606,250.2,164
2782,0,1,0,0,55.9,-0.513,0.513,0.0123,839.6,192


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [12]:
from sklearn.preprocessing import MinMaxScaler

# Create scaler object
X_scaler = MinMaxScaler().fit(X_train)

# Scale training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model



In [13]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y = label_encoder.transform(y_train)
encoded_y

array([2, 2, 2, ..., 1, 2, 1])

In [14]:
for label, original_class in zip(encoded_y, y):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CO

Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 0
------

Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 1
-

Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original

Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 0
------

------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSI

------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Origi

In [15]:
from keras.utils import to_categorical

# Step 2: One-hot encoding
one_hot_y = to_categorical(encoded_y)
one_hot_y

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.]], dtype=float32)

In [16]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
tencoded_y = label_encoder.transform(y_test)
tencoded_y

array([2, 1, 2, ..., 1, 1, 0])

In [17]:
for label, original_class in zip(tencoded_y, y):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CO

Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: FALSE P

Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded 

In [18]:
from keras.utils import to_categorical

# Step 2: One-hot encoding
tone_hot_y = to_categorical(tencoded_y)
tone_hot_y

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.]], dtype=float32)

In [22]:
from sklearn.linear_model import Lasso
# Initialize model
model = Lasso(alpha=1).fit(X_train_scaled, one_hot_y)

In [23]:
print(f"Training Data Score: {model.score(X_train_scaled, one_hot_y)}")
print(f"Testing Data Score: {model.score(X_test_scaled, tone_hot_y)}")

Training Data Score: 2.5087757908141366e-08
Testing Data Score: -7.432891117413003e-08


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [24]:
# find optimal alpha with grid search
from sklearn.model_selection import GridSearchCV
import numpy as np

alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = dict(alpha=alpha)

grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='r2', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, one_hot_y)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
Best Score:  0.46545227774613007
Best Params:  {'alpha': 0.001}


In [26]:
# Train the model with GridSearch
lasso = Lasso(alpha=0.1).fit(X_train, one_hot_y)


print('Train Score: ', lasso.score(X_train, one_hot_y))
print('Test Score: ', lasso.score(X_test, tone_hot_y))


Train Score:  0.1138941583367028
Test Score:  0.12459695147534344


In [27]:
lasso.intercept_

array([ 0.32510254,  0.69427939, -0.02602132])

In [28]:
lasso.coef_

array([[-0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00, -5.37911013e-05, -0.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -4.25439721e-07,
        -4.10135227e-04],
       [-0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00, -1.34070473e-05,  0.00000000e+00,
        -0.00000000e+00, -0.00000000e+00, -5.13838903e-07,
        -2.92752601e-03],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  6.73659808e-05, -0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  9.33882685e-07,
         3.38436126e-03]])