<a href="https://colab.research.google.com/github/coralmaven/kepler-exoplanets/blob/master/Copy_of_svm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings
# Blanket filter: suppresses every warning category for the rest of the session,
# so warnings raised by libraries in later cells will not be displayed.
warnings.simplefilter('ignore')

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Load the cumulative table from the repository-relative data folder.
csv_path = "ipynb/data/cumulative.csv"
df_orig = pd.read_csv(csv_path)
# Preview the first five rows as a quick sanity check of the parse.
df_orig.head(5)

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [3]:
# `.size` is the total number of cells (rows x columns), not the row count.
df_orig.size

478200

In [5]:
# Columns excluded from the modelling frame before any null handling.
cols_to_drop = [
    "rowid", "kepid", "kepoi_name", "kepler_name",
    "koi_pdisposition", "koi_score", "koi_tce_delivname",
]
df = (
    df_orig
    .drop(columns=cols_to_drop)
    # Discard columns that contain no data at all
    .dropna(axis='columns', how='all')
    # Then discard every row that still has a missing value
    .dropna()
)
# Cell count (rows x columns) remaining after cleaning
df.size

358504

# Label Encoder: Transform Labels to Numbers

In [6]:
from sklearn.preprocessing import LabelEncoder

# Learn the string -> integer mapping for the target column.
# `fit` returns the encoder itself, so construction and fitting are chained.
label_encoder = LabelEncoder().fit(df['koi_disposition'])
# Display the learned classes; a label's position in this array is its integer code.
label_encoder.classes_

array(['CANDIDATE', 'CONFIRMED', 'FALSE POSITIVE'], dtype=object)

In [7]:
# Replace the disposition strings with their integer codes.
# Guarded so that re-running this cell is harmless: once the column is numeric,
# calling `transform` again would raise on the (now unseen) integer labels.
if not pd.api.types.is_numeric_dtype(df['koi_disposition']):
    df['koi_disposition'] = label_encoder.transform(df['koi_disposition'])

# Assign X values

In [8]:
# Features are every column except the target. Selecting by name (rather than
# by position) keeps the label out of X even if the column order changes.
X = df.drop(columns=['koi_disposition'])

# Use `koi_disposition` for the y values

In [9]:
# Encoded labels, reshaped into an (n_samples, 1) column vector.
labels = df['koi_disposition'].to_numpy()
y = labels.reshape(-1, 1)

In [10]:
# One-hot encode any categorical (object/category) columns in the feature frame.
# NOTE(review): presumably no string columns remain after the earlier column drop,
# which would make this a no-op — confirm.
X = pd.get_dummies(X)

# Create Train/Test Split

In [11]:
from sklearn.model_selection import train_test_split

# Stratify on the label so both splits keep the same class proportions;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

# Pre-processing

Scale the data using the MinMaxScaler

In [12]:
from sklearn.preprocessing import MinMaxScaler

# Fit the feature scaler on the training split only.
X_minmax = MinMaxScaler()
X_minmax.fit(X_train)

# A second scaler is fitted on the training labels.
# NOTE(review): `y_minmax` is not used by any later cell visible here — confirm it is needed.
y_minmax = MinMaxScaler()
y_minmax.fit(y_train)

In [13]:
# Apply the training-fitted scaler to both splits (train first, then test).
X_train_minmax, X_test_minmax = (
    X_minmax.transform(split) for split in (X_train, X_test)
)

# Train the Support Vector Machine

In [14]:
%%time
from sklearn.svm import SVC
model1 = SVC(kernel='linear')
model1.fit(X_train_minmax, y_train)
model1

CPU times: user 900 ms, sys: 54.9 ms, total: 955 ms
Wall time: 842 ms


In [15]:
# Mean accuracy of the baseline model on each split.
train_score = model1.score(X_train_minmax, y_train)
test_score = model1.score(X_test_minmax, y_test)

print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.8508691674290942
Testing Data Score: 0.8472095150960659


# Hyperparameter Tuning

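Use `GridSearchCV` to tune the `C` and `gamma` parameters. Note that `gamma` is a kernel coefficient for the `rbf`, `poly` and `sigmoid` kernels only; it is ignored by the linear kernel used for the model above.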

# Create the GridSearchCV model

In [None]:
%%time
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [50],
             'gamma': [0.0005]}
grid = GridSearchCV(model1, param_grid, verbose=3)

In [None]:
# Train the model with GridSearch.
# y_train is a column vector; flatten it to the 1-D shape scikit-learn expects
# so the fit does not depend on an implicit conversion that emits a warning.
grid.fit(X_train_minmax, np.ravel(y_train))

In [None]:
# Report the winning parameter combination, then its cross-validated score,
# one per line.
print(grid.best_params_, grid.best_score_, sep="\n")