In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [None]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib

In [1]:
import pandas as pd

import warnings
warnings.simplefilter('ignore')

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [3]:
df["koi_disposition"].value_counts()

FALSE POSITIVE    3504
CONFIRMED         1800
CANDIDATE         1687
Name: koi_disposition, dtype: int64

In [4]:
# data = df.copy()

# data_binary_encoded = pd.get_dummies(data, columns=["koi_disposition"])
# data_binary_encoded.head()


# Select your features (columns)

In [5]:
# Set features. This will also be used as your x values.
selected_features = list(df.columns[1:])

# Create a Train Test Split

Use `koi_disposition` for the y values

In [6]:
y = df["koi_disposition"]

X = df[selected_features]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [7]:
y_train

3563         CANDIDATE
4099         CONFIRMED
5460         CANDIDATE
1091         CONFIRMED
5999         CANDIDATE
             ...      
905          CONFIRMED
5192    FALSE POSITIVE
3980    FALSE POSITIVE
235          CONFIRMED
5157         CONFIRMED
Name: koi_disposition, Length: 5243, dtype: object

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [8]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler
X_train_scaled = MinMaxScaler().fit_transform(X_train)
X_test_scaled = MinMaxScaler().fit_transform(X_test)

In [9]:
X_train_scaled 

array([[0.        , 0.        , 0.        , ..., 0.83497297, 0.51779124,
        0.5155798 ],
       [0.        , 0.        , 0.        , ..., 0.72693168, 0.38067188,
        0.70650467],
       [0.        , 0.        , 0.        , ..., 0.56436342, 0.80798012,
        0.69823952],
       ...,
       [0.        , 0.        , 1.        , ..., 0.88419373, 0.2724652 ,
        0.74055707],
       [0.        , 0.        , 0.        , ..., 0.38035748, 0.58629009,
        0.733697  ],
       [0.        , 0.        , 0.        , ..., 0.25722845, 0.72706515,
        0.59980164]])

# Train the Model



In [10]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()





model.fit(X_train, y_train)
training_score = model.score(X_train_scaled, y_train)
testing_score = model.score(X_test_scaled, y_test)



print(f"Training Data Score: {training_score}")
print(f"Testing Data Score: {testing_score}")

Training Data Score: 0.2607285905016212
Testing Data Score: 0.26887871853546913


In [11]:
import numpy as np

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [13]:
# Create the GridSearchCV model
# Create first pipeline for base without reducing features.
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV

pipe = Pipeline([('classifier' , RandomForestClassifier())])
# pipe = Pipeline([('classifier', RandomForestClassifier())])

# Create param grid.

param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__solver' : ['liblinear']},
    {'classifier' : [RandomForestClassifier()],
    'classifier__n_estimators' : list(range(10,101,40)),
    'classifier__max_features' : list(range(6,32,10))}
]

# Create grid search object

clf = GridSearchCV(pipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)

# Fit on data

# Train the models with GridSearch
best_clf = clf.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 49 candidates, totalling 245 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 236 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 245 out of 245 | elapsed:  7.6min finished


In [14]:
print(best_clf.best_params_)
print(best_clf.best_score_)

{'classifier': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=16,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=90,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False), 'classifier__max_features': 16, 'classifier__n_estimators': 90}
0.8966225558328906


# Save the Model

In [15]:
# save your models by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'best_clf.sav'
joblib.dump(best_clf, filename)

['best_clf.sav']