In [1]:
import pandas as pd

# Hide warning messages in notebook
import warnings
warnings.filterwarnings('ignore')

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [3]:
print(df["koi_disposition"].unique())

['CONFIRMED' 'FALSE POSITIVE' 'CANDIDATE']


# Select your features (columns)

In [4]:
# Set X equal to the entire data set, except for the first column
X = df.iloc[:,1:]

# Set y equal to the first column
y = df.iloc[:,0]

In [5]:
# Search for top 10 features according to feature importances
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=250,learning_rate=1.0,max_depth=1,random_state=0)
model.fit(X,y)
model.feature_importances_

array([2.60688858e-01, 2.46980546e-01, 1.75985959e-01, 2.03134802e-02,
       2.32487160e-03, 4.59675115e-04, 1.85726096e-04, 5.00055556e-03,
       4.10157937e-04, 2.72422069e-04, 4.31920796e-03, 9.41308828e-04,
       2.38202743e-03, 8.88944750e-03, 3.83105489e-02, 2.31265322e-04,
       2.50125549e-03, 1.67951790e-03, 4.52128956e-04, 2.29894003e-02,
       9.63857808e-04, 5.42085677e-03, 1.26294956e-04, 2.86544032e-04,
       7.19421013e-04, 4.61592239e-04, 7.96938647e-02, 2.74682994e-03,
       9.71028723e-04, 9.81578136e-02, 1.08029809e-03, 8.15077613e-04,
       3.65355791e-04, 1.71499278e-03, 1.93201650e-03, 1.31314682e-03,
       9.42795086e-04, 2.23334203e-03, 1.79633572e-03, 2.94017676e-03])

In [6]:
# Store the top 10 features as a series, using the column headers as the index
feat_imp = pd.Series(model.feature_importances_, index=X.columns).nlargest(10)
feat_imp

koi_fpflag_nt        0.260689
koi_fpflag_ss        0.246981
koi_fpflag_co        0.175986
koi_steff_err1       0.098158
koi_model_snr        0.079694
koi_duration_err1    0.038311
koi_prad             0.022989
koi_fpflag_ec        0.020313
koi_duration         0.008889
koi_prad_err2        0.005421
dtype: float64

In [7]:
# Set features based on feature importances
X = df[feat_imp.index]

# Use koi_disposition for y values
y = df['koi_disposition']

# Create a Train Test Split

Use `koi_disposition` for the y values

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [9]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_steff_err1,koi_model_snr,koi_duration_err1,koi_prad,koi_fpflag_ec,koi_duration,koi_prad_err2
2476,0,0,0,158,139.8,0.0234,3.16,0,3.3603,-0.31
3872,0,1,1,155,37.8,0.0784,2.62,1,2.4404,-0.67
2181,0,0,0,104,61.7,0.0323,2.54,0,2.1417,-0.34
5593,0,1,0,216,943.7,0.00535,105.07,0,5.21948,-29.62
3744,0,0,0,195,17.4,0.143,1.41,0,3.762,-0.57


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Create scaler object
X_scaler = MinMaxScaler().fit(X_train)

# Scale training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model



In [11]:
from sklearn.linear_model import LogisticRegression

# Initialize model
model = LogisticRegression()

# Train the model
model.fit(X_train_scaled, y_train)

LogisticRegression()

In [12]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.8277703604806409
Testing Data Score: 0.8226544622425629


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [13]:
predictions = model.predict(X_test_scaled)
df_pred = pd.DataFrame({"Actual":y_test, "Predicted":predictions}) 
df_pred.head()

Unnamed: 0,Actual,Predicted
2077,CANDIDATE,CONFIRMED
5669,FALSE POSITIVE,FALSE POSITIVE
3852,FALSE POSITIVE,FALSE POSITIVE
5023,CANDIDATE,CANDIDATE
6777,FALSE POSITIVE,FALSE POSITIVE


In [14]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
import numpy as np

# Create the GridSearchCV model
param_grid = {'C':np.logspace(-4, 4, 20),
             'penalty':['l1','l2']}

grid = GridSearchCV(model, param_grid, verbose=3)

In [15]:
# Train the model with GridSearch
# Train the model with GridSearch
best_model = grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END ................C=0.0001, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END ................C=0.0001, penalty=l1;, score=nan total time=   0.0s
[CV 3/5] END ................C=0.0001, penalty=l1;, score=nan total time=   0.0s
[CV 4/5] END ................C=0.0001, penalty=l1;, score=nan total time=   0.0s
[CV 5/5] END ................C=0.0001, penalty=l1;, score=nan total time=   0.0s
[CV 1/5] END ..............C=0.0001, penalty=l2;, score=0.501 total time=   0.0s
[CV 2/5] END ..............C=0.0001, penalty=l2;, score=0.501 total time=   0.0s
[CV 3/5] END ..............C=0.0001, penalty=l2;, score=0.501 total time=   0.0s
[CV 4/5] END ..............C=0.0001, penalty=l2;, score=0.501 total time=   0.0s
[CV 5/5] END ..............C=0.0001, penalty=l2;, score=0.501 total time=   0.0s
[CV 1/5] END C=0.00026366508987303583, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END C=0.00026366508987303583, penalty=

[CV 2/5] END ...C=1.623776739188721, penalty=l2;, score=0.812 total time=   0.1s
[CV 3/5] END ...C=1.623776739188721, penalty=l2;, score=0.830 total time=   0.0s
[CV 4/5] END ...C=1.623776739188721, penalty=l2;, score=0.825 total time=   0.1s
[CV 5/5] END ...C=1.623776739188721, penalty=l2;, score=0.844 total time=   0.0s
[CV 1/5] END .....C=4.281332398719396, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END .....C=4.281332398719396, penalty=l1;, score=nan total time=   0.0s
[CV 3/5] END .....C=4.281332398719396, penalty=l1;, score=nan total time=   0.0s
[CV 4/5] END .....C=4.281332398719396, penalty=l1;, score=nan total time=   0.0s
[CV 5/5] END .....C=4.281332398719396, penalty=l1;, score=nan total time=   0.0s
[CV 1/5] END ...C=4.281332398719396, penalty=l2;, score=0.846 total time=   0.0s
[CV 2/5] END ...C=4.281332398719396, penalty=l2;, score=0.822 total time=   0.0s
[CV 3/5] END ...C=4.281332398719396, penalty=l2;, score=0.834 total time=   0.1s
[CV 4/5] END ...C=4.28133239

In [16]:
print(best_model.best_params_)
print('Best Score: ', best_model.best_score_)

{'C': 3792.690190732246, 'penalty': 'l2'}
Best Score:  0.8773617549247194


In [17]:
# Make predictions
grid_predictions = best_model.predict(X_test_scaled)
df_grid = pd.DataFrame({"Actual":y_test, "Predicted":grid_predictions}) 
df_grid.head()

Unnamed: 0,Actual,Predicted
2077,CANDIDATE,CONFIRMED
5669,FALSE POSITIVE,FALSE POSITIVE
3852,FALSE POSITIVE,FALSE POSITIVE
5023,CANDIDATE,CANDIDATE
6777,FALSE POSITIVE,FALSE POSITIVE


In [18]:
# Score the model
best_model.score(X_test_scaled, y_test)

0.8792906178489702

# Save the Model

In [19]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'LR_GradientBoost.sav'
joblib.dump(best_model, filename)

['LR_GradientBoost.sav']

In [20]:
# Print Classification Report.
from sklearn.metrics import classification_report
print(classification_report(y_test, grid_predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.82      0.67      0.74       422
     CONFIRMED       0.74      0.85      0.79       450
FALSE POSITIVE       0.98      1.00      0.99       876

      accuracy                           0.88      1748
     macro avg       0.85      0.84      0.84      1748
  weighted avg       0.88      0.88      0.88      1748

