In [1]:
import warnings
warnings.simplefilter('ignore')

# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Read the csv file into a pandas DataFrame
df = pd.read_csv('kepler-exoplanet.csv')
df.head()

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [3]:
#Created new dataset "Kepler" with varibles of interest only
kepler = df[["kepid","koi_disposition", "koi_score","koi_fpflag_nt", "koi_fpflag_ss", "koi_fpflag_co","koi_fpflag_ec","koi_period"]]
kepler.head()

Unnamed: 0,kepid,koi_disposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period
0,10797460,CONFIRMED,1.0,0,0,0,0,9.488036
1,10797460,CONFIRMED,0.969,0,0,0,0,54.418383
2,10811496,FALSE POSITIVE,0.0,0,1,0,0,19.89914
3,10848459,FALSE POSITIVE,0.0,0,1,0,0,1.736952
4,10854555,CONFIRMED,1.0,0,0,0,0,2.525592


In [4]:
pd.get_dummies(kepler, columns=["koi_disposition"])

Unnamed: 0,kepid,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_disposition_CANDIDATE,koi_disposition_CONFIRMED,koi_disposition_FALSE POSITIVE
0,10797460,1.000,0,0,0,0,9.488036,0,1,0
1,10797460,0.969,0,0,0,0,54.418383,0,1,0
2,10811496,0.000,0,1,0,0,19.899140,0,0,1
3,10848459,0.000,0,1,0,0,1.736952,0,0,1
4,10854555,1.000,0,0,0,0,2.525592,0,1,0
...,...,...,...,...,...,...,...,...,...,...
9559,10031643,0.000,0,0,0,1,8.589871,0,0,1
9560,10090151,0.000,0,1,1,0,0.527699,0,0,1
9561,10128825,0.497,0,0,0,0,1.739849,1,0,0
9562,10147276,0.021,0,0,1,0,0.681402,0,0,1


In [5]:
X = kepler[["koi_fpflag_nt", "koi_fpflag_ss", "koi_fpflag_co","koi_fpflag_ec","koi_period"]]
y = kepler["koi_disposition"].values.reshape(-1, 1)
print(X.shape, y.shape)

(9564, 5) (9564, 1)


In [6]:
# Split data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [7]:
# Create the SVC Model
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [8]:
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5],
              'gamma': [0.0001, 0.0005]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [9]:
# Fit the model using the grid search estimator. 
# This will take the SVC model and try each combination of parameters
grid.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.776, total= 6.3min
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.3min remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.782, total= 6.2min
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 12.5min remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.786, total= 7.3min
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.793, total= 7.1min
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.775, total= 6.0min
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.776, total= 6.1min
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.782, total= 5.8min
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.786, total= 6.8min
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.793, total= 6.8min
[CV] C=1, gamma=0.0005 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 118.2min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 5], 'gamma': [0.0001, 0.0005]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [10]:
# List the best parameters for this dataset
print(grid.best_params_)

{'C': 5, 'gamma': 0.0001}


In [11]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test)

In [12]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=["candidate", "confirmed", "false positive"]))

                precision    recall  f1-score   support

     candidate       0.70      0.16      0.27       567
     confirmed       0.52      0.92      0.66       574
false positive       0.98      0.97      0.98      1250

      accuracy                           0.77      2391
     macro avg       0.74      0.69      0.64      2391
  weighted avg       0.80      0.77      0.73      2391

