In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("cumulative.csv")
df = df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


# Create a Train Test Split

Use `koi_disposition` for the y values

In [4]:
X = df.drop('koi_disposition', axis = 1)
y = df['koi_disposition']

In [5]:
#Import module from sklearn
from sklearn.model_selection import train_test_split

#Just use these variables, they're a standard
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

# Pre-processing

Scale the data using the MinMaxScaler

In [6]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

  return self.partial_fit(X, y)


# Train the Support Vector Machine

In [8]:
from sklearn.svm import SVC

model = SVC(kernel = 'linear')

model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [10]:
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.8508691674290942
Testing Data Score: 0.8472095150960659


# Hyperparameter Tuning

Use `GridSearchCV` to tune the `C` and `gamma` parameters

In [18]:
# Create the GridSearchCV model

from sklearn.model_selection import GridSearchCV

param_grid = {'C' : [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [20]:
# Train the model with GridSearch
grid.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 11 candidates, totalling 33 fits
[CV] C=1 .............................................................
[CV] .................... C=1, score=0.8395061728395061, total=   1.5s
[CV] C=1 .............................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.7s remaining:    0.0s


[CV] .................... C=1, score=0.8394327538883806, total=   1.6s
[CV] C=1 .............................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.5s remaining:    0.0s


[CV] ..................... C=1, score=0.854462242562929, total=   1.7s
[CV] C=5 .............................................................
[CV] .................... C=5, score=0.8587105624142661, total=   2.5s
[CV] C=5 .............................................................
[CV] .................... C=5, score=0.8618481244281794, total=   1.5s
[CV] C=5 .............................................................
[CV] .................... C=5, score=0.8713958810068649, total=   1.6s
[CV] C=10 ............................................................
[CV] ................... C=10, score=0.8664837677183356, total=   1.9s
[CV] C=10 ............................................................
[CV] ................... C=10, score=0.8705397987191217, total=   1.6s
[CV] C=10 ............................................................
[CV] ................... C=10, score=0.8759725400457666, total=   1.7s
[CV] C=15 ............................................................
[CV] .

[Parallel(n_jobs=1)]: Done  33 out of  33 | elapsed:  1.7min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [21]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 40}
0.8810612991765783


In [27]:
predictions = grid.predict(X_test)

from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, predictions)


array([[ 348,  172,    9],
       [  58,  498,   12],
       [   2,    0, 1087]], dtype=int64)

In [31]:
from sklearn.metrics import mean_squared_error, r2_score

#mse = mean_squared_error(y_test, predictions)

r2 = r2_score(y, predictions)

print(mse)
print(r2)

ValueError: Found input variables with inconsistent numbers of samples: [8744, 2186]

In [33]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.85      0.66      0.74       529
     CONFIRMED       0.74      0.88      0.80       568
FALSE POSITIVE       0.98      1.00      0.99      1089

     micro avg       0.88      0.88      0.88      2186
     macro avg       0.86      0.84      0.85      2186
  weighted avg       0.89      0.88      0.88      2186



AttributeError: 'GridSearchCV' object has no attribute 'save'