In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade



In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [5]:
# Set features. This will also be used as your x values.
selected_features = df[['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_prad', 'koi_teq', 'koi_insol', 'koi_model_snr', 'koi_steff', 'koi_slogg', 'koi_srad', 'ra', 'dec', 'koi_kepmag']]

# Create a Train Test Split

Use `koi_disposition` for the y values

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(selected_features, df["koi_disposition"], random_state=1)

In [7]:
X_train.columns

Index(['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
       'koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth',
       'koi_prad', 'koi_teq', 'koi_insol', 'koi_model_snr', 'koi_steff',
       'koi_slogg', 'koi_srad', 'ra', 'dec', 'koi_kepmag'],
      dtype='object')

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [8]:
# Scale your data
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

X_test_scaled

array([[-0.42276709, -0.57125689, -0.50369272, ..., -0.03351199,
        -1.54207582,  1.27194921],
       [ 2.36536859, -0.57125689, -0.50369272, ...,  0.01304448,
        -0.80378962,  0.68911767],
       [ 2.36536859, -0.57125689, -0.50369272, ..., -0.04804041,
         1.64326243,  0.74925698],
       ...,
       [ 2.36536859, -0.57125689, -0.50369272, ..., -0.42375104,
        -0.74210221,  0.83389748],
       [-0.42276709,  1.75052595, -0.50369272, ...,  1.04918031,
         1.55855898, -0.91756696],
       [-0.42276709, -0.57125689, -0.50369272, ..., -1.14541338,
        -1.01054017, -1.68972594]])

In [9]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=1000)
classifier

LogisticRegression(max_iter=1000)

In [10]:
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=1000)

# Train the Model



In [11]:
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.8172801831012779
Testing Data Score: 0.8278032036613272


In [12]:
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   ['FALSE POSITIVE' 'FALSE POSITIVE' 'FALSE POSITIVE' 'FALSE POSITIVE'
 'FALSE POSITIVE' 'FALSE POSITIVE' 'FALSE POSITIVE' 'FALSE POSITIVE'
 'FALSE POSITIVE' 'FALSE POSITIVE']
First 10 Actual labels: ['CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', 'CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', 'FALSE POSITIVE', 'CONFIRMED', 'CANDIDATE', 'FALSE POSITIVE']


In [13]:
importance = classifier.coef_[0]

importance

array([-1.83245223, -1.1747566 , -1.83114194, -1.1711466 ,  0.40325675,
       -0.10109961,  0.40729874, -0.28223196,  1.22378886,  0.15341865,
        0.17587334, -0.10310015, -0.82019778,  0.0745493 ,  0.11303565,
        0.6629928 ,  0.01228367, -0.03861356,  0.00557112])

In [132]:
# Set features. This will also be used as your x values.
selected_features2 = df[['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_prad', 'koi_teq', 'koi_insol', 'koi_model_snr', 'koi_steff', 'koi_slogg', 'koi_srad', 'ra', 'dec']]

In [133]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(selected_features2, df["koi_disposition"], random_state=1)

In [134]:
# Scale your data
X_scaler2 = StandardScaler().fit(X_train2)

X_train_scaled2 = X_scaler2.transform(X_train2)
X_test_scaled2 = X_scaler2.transform(X_test2)

In [135]:
classifier2 = LogisticRegression(max_iter=1000)
classifier2

LogisticRegression(max_iter=1000)

In [136]:
classifier2.fit(X_train_scaled2, y_train2)

LogisticRegression(max_iter=1000)

In [137]:
print(f"Training Data Score: {classifier2.score(X_train_scaled2, y_train2)}")
print(f"Testing Data Score: {classifier2.score(X_test_scaled2, y_test2)}")

Training Data Score: 0.8159450696166317
Testing Data Score: 0.8352402745995423


In [138]:
importance2 = classifier2.coef_[0]

importance2

array([-1.83592239, -1.17385487, -1.8324881 , -1.17503479,  0.40412375,
       -0.10189208,  0.38725509, -0.27704219,  1.29906549,  0.14048835,
        0.17977133, -0.12641995, -0.85282597,  0.07792671,  0.12601464,
        0.7112338 ,  0.01534249, -0.03619433])

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [139]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.5, 1, 5, 10],
              'fit_intercept': [True, False],
             'solver': ["newton-cg", "lbfgs", "liblinear"]}
grid = GridSearchCV(classifier, param_grid, verbose=3)

In [140]:
# Train the model with GridSearch
grid.fit(X_train_scaled2, y_train2)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END ....C=0.5, fit_intercept=True, solver=newton-cg; total time=   0.0s
[CV 2/5] END ....C=0.5, fit_intercept=True, solver=newton-cg; total time=   0.0s
[CV 3/5] END ....C=0.5, fit_intercept=True, solver=newton-cg; total time=   0.0s
[CV 4/5] END ....C=0.5, fit_intercept=True, solver=newton-cg; total time=   0.0s
[CV 5/5] END ....C=0.5, fit_intercept=True, solver=newton-cg; total time=   0.0s
[CV 1/5] END ........C=0.5, fit_intercept=True, solver=lbfgs; total time=   0.0s
[CV 2/5] END ........C=0.5, fit_intercept=True, solver=lbfgs; total time=   0.0s
[CV 3/5] END ........C=0.5, fit_intercept=True, solver=lbfgs; total time=   0.0s
[CV 4/5] END ........C=0.5, fit_intercept=True, solver=lbfgs; total time=   0.0s
[CV 5/5] END ........C=0.5, fit_intercept=True, solver=lbfgs; total time=   0.0s
[CV 1/5] END ....C=0.5, fit_intercept=True, solver=liblinear; total time=   0.0s
[CV 2/5] END ....C=0.5, fit_intercept=True, sol

[CV 4/5] END .....C=10, fit_intercept=True, solver=liblinear; total time=   0.0s
[CV 5/5] END .....C=10, fit_intercept=True, solver=liblinear; total time=   0.0s
[CV 1/5] END ....C=10, fit_intercept=False, solver=newton-cg; total time=   0.1s
[CV 2/5] END ....C=10, fit_intercept=False, solver=newton-cg; total time=   0.0s
[CV 3/5] END ....C=10, fit_intercept=False, solver=newton-cg; total time=   0.0s
[CV 4/5] END ....C=10, fit_intercept=False, solver=newton-cg; total time=   0.1s
[CV 5/5] END ....C=10, fit_intercept=False, solver=newton-cg; total time=   0.1s
[CV 1/5] END ........C=10, fit_intercept=False, solver=lbfgs; total time=   0.1s
[CV 2/5] END ........C=10, fit_intercept=False, solver=lbfgs; total time=   0.0s
[CV 3/5] END ........C=10, fit_intercept=False, solver=lbfgs; total time=   0.1s
[CV 4/5] END ........C=10, fit_intercept=False, solver=lbfgs; total time=   0.1s
[CV 5/5] END ........C=10, fit_intercept=False, solver=lbfgs; total time=   0.1s
[CV 1/5] END ....C=10, fit_i

GridSearchCV(estimator=LogisticRegression(max_iter=1000),
             param_grid={'C': [0.5, 1, 5, 10], 'fit_intercept': [True, False],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear']},
             verbose=3)

In [141]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'fit_intercept': False, 'solver': 'newton-cg'}
0.8167038400803384


In [142]:
classifier3 = LogisticRegression(max_iter=1000, C=10, fit_intercept=True, solver="newton-cg")
classifier3.fit(X_train_scaled2, y_train2)
print(f"Training Data Score: {classifier3.score(X_train_scaled2, y_train2)}")
print(f"Testing Data Score: {classifier3.score(X_test_scaled2, y_test2)}")

Training Data Score: 0.8186152965859241
Testing Data Score: 0.8346681922196796


# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'stephen_eldridge_1.sav'
joblib.dump(classifier3, filename)