In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in /opt/anaconda3/lib/python3.7/site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [1]:
import pandas as pd

import numpy as np

import warnings
warnings.simplefilter('ignore', FutureWarning)

# Data Pre-Processing

In [2]:
# Load and Clean Data

df = pd.read_csv("exoplanet_data.csv")

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()


# Set features(X) and target(y) values
X = df.drop("koi_disposition", axis=1)
y = df["koi_disposition"]
print(X.shape, y.shape)


(6991, 40) (6991,)


In [3]:
# Train and Test split data
from sklearn.model_selection import train_test_split

# Using stratify for fixing proportion of values
# Source Link: https://towardsdatascience.com/6-amateur-mistakes-ive-made-working-with-train-test-splits-916fabb421bb
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)


X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
4002,0,0,1,0,99.673478,0.0003463,-0.0003463,219.33483,0.0023,-0.0023,...,-148,4.777,0.04,-0.027,0.492,0.026,-0.027,293.05801,45.248821,15.801
4246,0,1,0,0,0.592244,9e-08,-9e-08,131.654831,0.000124,-0.000124,...,-146,4.664,0.056,-0.032,0.591,0.045,-0.045,290.28094,45.46426,15.653
548,0,1,1,0,9.991625,5.36e-06,-5.36e-06,137.447816,0.000445,-0.000445,...,-176,4.338,0.153,-0.187,1.096,0.309,-0.206,301.04239,45.022888,14.039
3953,0,1,0,0,178.41299,3.1e-05,-3.1e-05,218.225235,0.000127,-0.000127,...,-134,4.346,0.084,-0.126,1.148,0.202,-0.124,288.32785,38.627621,13.944
2362,0,0,0,0,45.294223,5.6e-05,-5.6e-05,138.678725,0.000987,-0.000987,...,-68,4.347,0.03,-0.03,1.044,0.057,-0.042,285.67938,50.241299,10.961


In [4]:
# MinMaxScaler
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)



# Create and Train(Fit) Model

### First Classifier

In [5]:
# Create a Logistic Regression Model
from sklearn.linear_model import LogisticRegression
classifier1 = LogisticRegression()

# Fit (train) or model - First Classifier
classifier1.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [6]:
print(f"Training Data Score: {classifier1.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier1.score(X_test, y_test)}")

Training Data Score: 0.6620255578867061
Testing Data Score: 0.6653318077803204


### Second Classifier

In [7]:
# Create a Logistic Regression Model
from sklearn.linear_model import LogisticRegression
classifier2 = LogisticRegression()

# Fit (train) or model - Second Classifier
classifier2.fit(X_train_scaled, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [8]:
print(f"Training Data Score: {classifier2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier2.score(X_test_scaled, y_test)}")

Training Data Score: 0.8411214953271028
Testing Data Score: 0.8409610983981693


# Hyperparameter Tuning - GridSearchCV 

In [9]:
# Source Link: https://towardsdatascience.com/logistic-regression-model-tuning-with-scikit-learn-part-1-425142e01af5

from sklearn.model_selection import GridSearchCV

param_grid = {
     'penalty' : ['l1', 'l2'],
    'C': [1, 5, 10, 50]}

grid = GridSearchCV(classifier2, param_grid = param_grid, cv = 5, verbose=3, n_jobs=-1)


In [10]:
# Fit the model

grid.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  4.2min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [1, 5, 10, 50], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

## Quantify our Trained Model

In [11]:
# List the best parameters for this dataset
# List the best score

print(grid.best_params_)
print('Best Grid score: %.3f' % grid.best_score_)

{'C': 5, 'penalty': 'l1'}
Best Grid score: 0.885


In [12]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test)

print('Logistic Regression')
print('Test Accuracy: %.3f' % grid.score(X_test, y_test))

# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

Logistic Regression
Test Accuracy: 0.880
                precision    recall  f1-score   support

     CANDIDATE       0.82      0.69      0.75       422
     CONFIRMED       0.75      0.84      0.79       450
FALSE POSITIVE       0.98      0.99      0.98       876

      accuracy                           0.88      1748
     macro avg       0.85      0.84      0.84      1748
  weighted avg       0.88      0.88      0.88      1748



#  Save the Model (If the best)

In [17]:
## Create a file for your best model and push to GitHub
# import joblib
# filename = 'logistic-regression.sav'
# joblib.dump(grid, filename)
