# MACHINE LEARNING CHALLENGE
# Exoplanet Exploration: Model2
Note: This model uses a Support Vector Machine (SVM) classifier with a linear kernel.

In [1]:

# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade


Requirement already up-to-date: sklearn in c:\users\diazd\anaconda3\lib\site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:

# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np



# Read CSV & Perform Basic Data Cleaning

In [4]:
# Load cumulative.csv (obtained from Kaggle) into a DataFrame and report its
# dimensions as (rows, columns).
df = pd.read_csv('cumulative.csv')
df.shape

(9564, 50)

In [5]:
# List every column name in the freshly loaded table.
df.columns

Index(['rowid', 'kepid', 'kepoi_name', 'kepler_name', 'koi_disposition',
       'koi_pdisposition', 'koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss',
       'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_period_err1',
       'koi_period_err2', 'koi_time0bk', 'koi_time0bk_err1',
       'koi_time0bk_err2', 'koi_impact', 'koi_impact_err1', 'koi_impact_err2',
       'koi_duration', 'koi_duration_err1', 'koi_duration_err2', 'koi_depth',
       'koi_depth_err1', 'koi_depth_err2', 'koi_prad', 'koi_prad_err1',
       'koi_prad_err2', 'koi_teq', 'koi_teq_err1', 'koi_teq_err2', 'koi_insol',
       'koi_insol_err1', 'koi_insol_err2', 'koi_model_snr', 'koi_tce_plnt_num',
       'koi_tce_delivname', 'koi_steff', 'koi_steff_err1', 'koi_steff_err2',
       'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad',
       'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec', 'koi_kepmag'],
      dtype='object')

In [6]:

# Clean the table in one chained pass:
#   1. drop the seven listed columns (none of them is used as a feature or as the label),
#   2. drop every column whose values are all null,
#   3. drop every row that still contains a missing value.
# Note: the result is assigned back to `df`, so re-running this cell without
# reloading the CSV raises a KeyError because the columns are already gone.
df = (
    df.drop(columns=['rowid', 'kepid', 'kepoi_name', 'kepler_name', 'koi_disposition', 'koi_score', 'koi_tce_delivname'])
    .dropna(axis='columns', how='all')
    .dropna()
)

# Display the cleaned frame.
df

Unnamed: 0,koi_pdisposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CANDIDATE,0,0,0,0,9.488036,2.775000e-05,-2.775000e-05,170.538750,0.002160,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CANDIDATE,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.899140,1.494000e-05,-1.494000e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
4,CANDIDATE,0,0,0,0,2.525592,3.761000e-06,-3.761000e-06,171.595550,0.001130,...,-211.0,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,FALSE POSITIVE,0,0,0,1,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,...,-152.0,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
9560,FALSE POSITIVE,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,...,-166.0,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
9561,CANDIDATE,0,0,0,0,1.739849,1.780000e-05,-1.780000e-05,133.001270,0.007690,...,-220.0,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
9562,FALSE POSITIVE,0,0,1,0,0.681402,2.434000e-06,-2.434000e-06,132.181750,0.002850,...,-236.0,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385


In [7]:
# Remaining columns. Nine were removed: the seven dropped explicitly plus
# koi_teq_err1 and koi_teq_err2, which were null in every row.
df.columns 

Index(['koi_pdisposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec',
       'koi_kepmag'],
      dtype='object')


# Select your features (columns)



In [8]:



# Label (y): the koi_pdisposition column.
y = df["koi_pdisposition"]

# Features (X): every other column, i.e. the frame without the label.
# A hand-picked subset could be used instead by indexing with a list of
# column names, e.g. df[['koi_fpflag_nt', 'koi_period']].
X = df.drop(columns=["koi_pdisposition"])

# Show both shapes, one per line, to confirm the row counts agree.
print(X.shape, y.shape, sep="\n")

(8744, 40)
(8744,)


#  Create a Train Test Split 
Use `koi_pdisposition` for the y values (`koi_disposition` was dropped during data cleaning).


In [9]:
 
# split the data to train and test values

from sklearn.model_selection import train_test_split


In [10]:

 # random sample of label and features
# Split features and labels into train and test partitions using
# train_test_split's default split size. Stratifying on y keeps the class
# balance the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)



In [11]:
# Preview the first five rows of the training features.
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
3380,0,0,0,0,56.391062,0.004836,-0.004836,167.579,0.113,-0.113,...,-170.0,4.643,0.044,-0.032,0.628,0.051,-0.056,282.37042,48.259289,16.324
5360,0,1,0,0,2.192004,1.463e-06,-1.463e-06,132.218114,0.000761,-0.000761,...,-185.0,4.642,0.03,-0.096,0.687,0.108,-0.046,290.93845,37.701172,16.654
834,0,1,0,0,1.744101,6.45e-07,-6.45e-07,134.50867,0.000314,-0.000314,...,-145.0,4.593,0.049,-0.018,0.703,0.029,-0.054,294.91586,45.783871,15.702
7896,0,0,1,0,20.413812,0.0003221,-0.0003221,148.5092,0.0133,-0.0133,...,-166.0,4.569,0.038,-0.161,0.826,0.201,-0.067,293.90091,38.308689,15.38
2826,0,0,1,0,0.536335,8.81e-07,-8.81e-07,131.71447,0.00135,-0.00135,...,-180.0,4.569,0.029,-0.162,0.84,0.2,-0.067,298.65317,47.713711,15.546


# Pre-processing 
Use MinMaxScaler to scale the numerical data.


In [12]:
# Referenced: https://www.geeksforgeeks.org/data-preprocessing-machine-learning-python/

from sklearn.preprocessing import MinMaxScaler, LabelEncoder


In [13]:
# Scale the features with MinMaxScaler. The scaler is fitted on the training
# features only, and that same fitted scaler is then applied to both partitions.
X_scale = MinMaxScaler()
X_scale.fit(X_train)

X_train_scaled, X_test_scaled = X_scale.transform(X_train), X_scale.transform(X_test)

In [14]:

# Inspect the scaled test features (NumPy abbreviates large arrays when displayed).
X_test_scaled


array([[1.        , 0.        , 0.        , ..., 0.58909669, 0.80928728,
        0.55789735],
       [0.        , 0.        , 0.        , ..., 0.52956598, 0.32528598,
        0.54062319],
       [1.        , 0.        , 0.        , ..., 0.52391115, 0.38322744,
        0.69873543],
       ...,
       [0.        , 0.        , 0.        , ..., 0.76285163, 0.8381738 ,
        0.57062567],
       [0.        , 0.        , 1.        , ..., 0.61831056, 0.32029931,
        0.52822547],
       [0.        , 1.        , 0.        , ..., 0.87532765, 0.64948715,
        0.51243904]])

In [15]:

# Encode the string labels as integers. The encoder is fitted on the training
# labels and reused for the test labels, so both share one mapping.
# NOTE(review): the label -> integer mapping can be read from label_encoder.classes_.
label_encoder = LabelEncoder().fit(y_train)

encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Quick look at the encoded test labels.
print(encoded_y_test)


[1 0 1 ... 0 1 1]


# Train the Model

* SUPPORT VECTOR MACHINE (SVM) 


In [16]:
# Support vector machine (SVM) linear classifier
# Import Linear SVC (Support Vector Classifier) 

from sklearn.svm import SVC 


In [17]:

# Train a linear-kernel SVC on the scaled training features and encoded
# labels (fit returns the estimator itself), then predict the test set.
model = SVC(kernel='linear').fit(X_train_scaled, encoded_y_train)
predictions = model.predict(X_test_scaled)

# Display the fitted estimator.
model

SVC(kernel='linear')

In [18]:

# Report the model's score (mean accuracy for scikit-learn classifiers) on the
# training and test partitions.
# NOTE(review): both scores are ~0.99; presumably the koi_fpflag_* features
# nearly determine koi_pdisposition — confirm this is not label leakage.
print(f"Training Data Score: {model.score(X_train_scaled, encoded_y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, encoded_y_test)}")

Training Data Score: 0.9945105215004575
Testing Data Score: 0.9935956084172004


# Hyperparameter Tuning
When tuning hyperparameters, some models have parameters that depend on each other, and certain combinations will not create a valid model. Be sure to read through any warning messages and check the documentation.


In [None]:

from sklearn.model_selection import GridSearchCV


In [None]:

# Create the GridSearchCV model.
# Only the regularisation strength C is searched. max_iter is a hard cap on
# solver iterations rather than a model hyperparameter: caps such as 200-1000
# can stop the solver before it converges, so the search would end up ranking
# unconverged fits. SVC's default (max_iter=-1, no limit) is kept instead.
param_grid = {'C': [10, 50, 100]}

# verbose=3 makes the search log progress for each fit.
grid = GridSearchCV(model, param_grid, verbose=3)

In [None]:


# Train the model with GridSearch: evaluate every parameter combination in the
# grid on the scaled training features and encoded training labels.


grid.fit(X_train_scaled, encoded_y_train)


In [None]:


# Report the best parameter combination found and its cross-validated score,
# one per line.
print(grid.best_params_, grid.best_score_, sep="\n")


In [None]:
# Save the trained model to disk with joblib (be sure to turn the file in to BCS).
# If joblib fails to import, try running the install command in terminal/git-bash.
import joblib

# The save step is currently disabled; uncomment the two lines below to write the file.
# NOTE(review): as written this would persist the baseline `model`, not the
# tuned estimator from the grid search (grid.best_estimator_) — confirm which
# one is intended before enabling.
# filename = 'exoplanet_model2.sav'
# joblib.dump(model, filename)


In [None]:
# load the model from disk

# loaded_model = joblib.load(filename)
# result = loaded_model.score(X_test_scaled, encoded_y_test)
# print(result)