In [1]:
import pandas as pd

# Hide warning messages in notebook
import warnings
warnings.filterwarnings('ignore')

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [3]:
print(df["koi_disposition"].unique())

['CONFIRMED' 'FALSE POSITIVE' 'CANDIDATE']


# Select your features (columns)

In [4]:
# Set X equal to the entire data set, except for the first column
X = df.iloc[:,1:]

# Set y equal to the first column
y = df.iloc[:,0]

In [5]:
# Search for top 10 features according to feature importances
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(X,y)
model.feature_importances_

array([0.12900363, 0.1650932 , 0.12142846, 0.05620227, 0.01988945,
       0.01402254, 0.01507926, 0.01280889, 0.02161156, 0.02291334,
       0.01201135, 0.01199269, 0.01036573, 0.01714174, 0.02793848,
       0.0292363 , 0.02125933, 0.01315237, 0.01144604, 0.01290239,
       0.0126778 , 0.00966832, 0.01632456, 0.00884981, 0.00930302,
       0.00994504, 0.03076502, 0.01011641, 0.01092874, 0.02323911,
       0.01672494, 0.01124529, 0.01125708, 0.01319552, 0.00876683,
       0.01117794, 0.00943869, 0.01104288, 0.00995509, 0.00987886])

In [6]:
# Store the top 10 features as a series, using the column headers as the index
feat_imp = pd.Series(model.feature_importances_, index=X.columns).nlargest(10)
feat_imp

koi_fpflag_ss        0.165093
koi_fpflag_nt        0.129004
koi_fpflag_co        0.121428
koi_fpflag_ec        0.056202
koi_model_snr        0.030765
koi_duration_err2    0.029236
koi_duration_err1    0.027938
koi_steff_err1       0.023239
koi_time0bk_err2     0.022913
koi_time0bk_err1     0.021612
dtype: float64

In [7]:
# Set features based on feature importances
X = df[feat_imp.index]

# Use koi_disposition for y values
y = df['koi_disposition']

# Create a Train Test Split

Use `koi_disposition` for the y values

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [9]:
X_train.head()

Unnamed: 0,koi_fpflag_ss,koi_fpflag_nt,koi_fpflag_co,koi_fpflag_ec,koi_model_snr,koi_duration_err2,koi_duration_err1,koi_steff_err1,koi_time0bk_err2,koi_time0bk_err1
1278,0,0,0,0,32.2,-0.0876,0.0876,135,-0.00305,0.00305
1957,0,0,0,0,50.4,-0.0304,0.0304,171,-0.000828,0.000828
1107,0,0,0,0,43.1,-0.0457,0.0457,77,-0.00138,0.00138
1592,0,0,0,0,175.2,-0.101,0.101,124,-0.00131,0.00131
2175,0,0,1,0,52.9,-0.0385,0.0385,196,-0.00113,0.00113


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Create scaler object
X_scaler = MinMaxScaler().fit(X_train)

# Scale training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model



In [11]:
from sklearn.linear_model import LogisticRegression

# Initialize model
model = LogisticRegression()

# Train the model
model.fit(X_train_scaled, y_train)

LogisticRegression()

In [12]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.8420751478161358
Testing Data Score: 0.8243707093821511


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [13]:
predictions = model.predict(X_test_scaled)
df_pred = pd.DataFrame({"Actual":y_test, "Predicted":predictions}) 
df_pred.head()

Unnamed: 0,Actual,Predicted
5027,FALSE POSITIVE,FALSE POSITIVE
1190,CONFIRMED,CONFIRMED
2927,CONFIRMED,CANDIDATE
5564,CANDIDATE,CONFIRMED
2006,FALSE POSITIVE,FALSE POSITIVE


In [14]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
import numpy as np

# Create the GridSearchCV model
param_grid = {'C':np.logspace(-4, 4, 20),
             'penalty':['l1','l2']}

grid = GridSearchCV(model, param_grid, verbose=3)

In [15]:
# Train the model with GridSearch
# Train the model with GridSearch
best_model = grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END ................C=0.0001, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END ................C=0.0001, penalty=l1;, score=nan total time=   0.0s
[CV 3/5] END ................C=0.0001, penalty=l1;, score=nan total time=   0.0s
[CV 4/5] END ................C=0.0001, penalty=l1;, score=nan total time=   0.0s
[CV 5/5] END ................C=0.0001, penalty=l1;, score=nan total time=   0.0s
[CV 1/5] END ..............C=0.0001, penalty=l2;, score=0.501 total time=   0.0s
[CV 2/5] END ..............C=0.0001, penalty=l2;, score=0.501 total time=   0.0s
[CV 3/5] END ..............C=0.0001, penalty=l2;, score=0.501 total time=   0.0s
[CV 4/5] END ..............C=0.0001, penalty=l2;, score=0.501 total time=   0.0s
[CV 5/5] END ..............C=0.0001, penalty=l2;, score=0.501 total time=   0.0s
[CV 1/5] END C=0.00026366508987303583, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END C=0.00026366508987303583, penalty=

[CV 1/5] END ...C=1.623776739188721, penalty=l2;, score=0.835 total time=   0.0s
[CV 2/5] END ...C=1.623776739188721, penalty=l2;, score=0.844 total time=   0.0s
[CV 3/5] END ...C=1.623776739188721, penalty=l2;, score=0.857 total time=   0.0s
[CV 4/5] END ...C=1.623776739188721, penalty=l2;, score=0.848 total time=   0.0s
[CV 5/5] END ...C=1.623776739188721, penalty=l2;, score=0.845 total time=   0.0s
[CV 1/5] END .....C=4.281332398719396, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END .....C=4.281332398719396, penalty=l1;, score=nan total time=   0.0s
[CV 3/5] END .....C=4.281332398719396, penalty=l1;, score=nan total time=   0.0s
[CV 4/5] END .....C=4.281332398719396, penalty=l1;, score=nan total time=   0.0s
[CV 5/5] END .....C=4.281332398719396, penalty=l1;, score=nan total time=   0.0s
[CV 1/5] END ...C=4.281332398719396, penalty=l2;, score=0.842 total time=   0.0s
[CV 2/5] END ...C=4.281332398719396, penalty=l2;, score=0.850 total time=   0.1s
[CV 3/5] END ...C=4.28133239

In [16]:
print(best_model.best_params_)
print('Best Score: ', best_model.best_score_)

{'C': 10000.0, 'penalty': 'l2'}
Best Score:  0.8624857188598375


In [17]:
# Make predictions
grid_predictions = best_model.predict(X_test_scaled)
df_grid = pd.DataFrame({"Actual":y_test, "Predicted":grid_predictions}) 
df_grid.head()

Unnamed: 0,Actual,Predicted
5027,FALSE POSITIVE,FALSE POSITIVE
1190,CONFIRMED,CONFIRMED
2927,CONFIRMED,CANDIDATE
5564,CANDIDATE,CANDIDATE
2006,FALSE POSITIVE,FALSE POSITIVE


In [18]:
# Score the model
best_model.score(X_test_scaled, y_test)

0.8386727688787186

# Save the Model

In [19]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'LR00.sav'
joblib.dump(best_model, filename)

['LR00.sav']

In [20]:
# Print Classification Report.
from sklearn.metrics import classification_report
print(classification_report(y_test, grid_predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.72      0.59      0.65       422
     CONFIRMED       0.67      0.76      0.71       450
FALSE POSITIVE       0.98      1.00      0.99       876

      accuracy                           0.84      1748
     macro avg       0.79      0.78      0.78      1748
  weighted avg       0.84      0.84      0.84      1748

