In [93]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in c:\programdata\anaconda3\lib\site-packages (0.0)


In [94]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_blobs
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold


from matplotlib import style
style.use("fivethirtyeight")

from matplotlib import rcParams
rcParams['figure.figsize'] = 10, 8

# Read the CSV and Perform Basic Data Cleaning

In [96]:
df = pd.read_csv("exoplanet_data.csv")

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [151]:
# nx.from_pandas_edgelist
# Set features. This will also be used as your x values.
# test_features = df[['koi_period', 'koi_time0bk', 'koi_slogg', 'koi_srad', 'ra', 'dec', 'koi_kepmag']]
# X = test_features

X = df.drop('koi_disposition', axis=1)
X = X.head(100)
X = X.values.tolist()
X

# X = df.drop('koi_disposition', axis=1)
# X = df.head(200)
# X.head()

[[0.0,
  0.0,
  0.0,
  0.0,
  54.4183827,
  0.0002479,
  -0.0002479,
  162.51384,
  0.00352,
  -0.00352,
  0.586,
  0.059000000000000004,
  -0.44299999999999995,
  4.507,
  0.11599999999999999,
  -0.11599999999999999,
  874.8,
  35.5,
  -35.5,
  2.83,
  0.32,
  -0.19,
  443.0,
  9.11,
  2.87,
  -1.62,
  25.8,
  2.0,
  5455.0,
  81.0,
  -81.0,
  4.467,
  0.064,
  -0.096,
  0.927,
  0.105,
  -0.061,
  291.93422999999996,
  48.141651,
  15.347000000000001],
 [0.0,
  1.0,
  0.0,
  0.0,
  19.89913995,
  1.49e-05,
  -1.49e-05,
  175.850252,
  0.0005809999999999999,
  -0.0005809999999999999,
  0.9690000000000001,
  5.126,
  -0.077,
  1.7822,
  0.0341,
  -0.0341,
  10829.0,
  171.0,
  -171.0,
  14.6,
  3.92,
  -1.31,
  638.0,
  39.3,
  31.04,
  -10.49,
  76.3,
  1.0,
  5853.0,
  158.0,
  -176.0,
  4.544,
  0.044000000000000004,
  -0.17600000000000002,
  0.868,
  0.233,
  -0.078,
  297.00482,
  48.134128999999994,
  15.436],
 [0.0,
  1.0,
  0.0,
  0.0,
  1.7369524530000002,
  2.63e-07,
  -2.63e

# Create a Train Test Split

Use `koi_disposition` for the y values

In [155]:


# Assign koi_disposition to y
Y = df[['koi_disposition']]


# # Assign koi_disposition to y and Remove spaces btw words
# Y = df['koi_disposition'].apply(lambda str : str.replace(" ", ""))

# 200 samples for test
Y = Y.head(100)

# LABEL ENCODE Y
# Import required module
from sklearn.preprocessing import LabelEncoder

# LabelEncoder
le = LabelEncoder()

# Create an object of the label encoder class
labelencoder = LabelEncoder()

# apply "le.fit_transform"
y = Y.apply(le.fit_transform)

# Change the shape of Y
# y=np.array(y['koi_disposition'])
# y=(y['koi_disposition'])

# y = y.reshape(1,-1)
y
# Apply labelencoder object on columns
# labelencoder.fit_transform(data.ix[:, 1:])   # First column does not need to be encoded

Unnamed: 0,koi_disposition
0,1
1,2
2,2
3,1
4,1
...,...
95,1
96,1
97,1
98,1


In [156]:
# Split data into train and test groups
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [100]:
X_train

[[0.0,
  0.0,
  0.0,
  0.0,
  46.1842039,
  0.0002654,
  -0.0002654,
  165.23729,
  0.0043,
  -0.0043,
  0.013000000000000001,
  0.415,
  -0.013000000000000001,
  4.758,
  0.13,
  -0.13,
  1394.7,
  56.4,
  -56.4,
  1.83,
  0.12,
  -0.15,
  273.0,
  1.32,
  0.29,
  -0.3,
  26.0,
  3.0,
  3950.0,
  70.0,
  -86.0,
  4.754,
  0.042,
  -0.031,
  0.493,
  0.033,
  -0.04,
  286.07912999999996,
  39.27832,
  15.954],
 [0.0,
  1.0,
  0.0,
  0.0,
  10.32803571,
  3.76e-06,
  -3.76e-06,
  174.581054,
  0.000306,
  -0.000306,
  0.281,
  0.04,
  -0.064,
  7.7915,
  0.0272,
  -0.0272,
  16918.0,
  31.8,
  -31.8,
  13.28,
  3.64,
  -2.41,
  843.0,
  119.31,
  98.49,
  -48.58,
  636.8,
  1.0,
  5627.0,
  169.0,
  -169.0,
  4.359,
  0.18,
  -0.18,
  1.024,
  0.28,
  -0.18600000000000003,
  287.69675,
  38.886452,
  15.645999999999999],
 [0.0,
  0.0,
  0.0,
  0.0,
  4.280963604,
  6.14e-06,
  -6.14e-06,
  171.89659,
  0.00115,
  -0.00115,
  0.424,
  0.11199999999999999,
  -0.35600000000000004,
  2.875,

In [157]:
y_train

Unnamed: 0,koi_disposition
68,1
75,2
21,1
94,1
8,1
...,...
6,1
26,1
60,1
50,0


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [173]:
# Scale your data
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

# Create variables to hold the scaled train & test data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [181]:
# # Reshape Y
# y_train_scaled
# # y_train_scaled=np.array(y_train_scaled)
# print(y_train_scaled.reshape(1,-1))
# y_train_scaled = y_train_scaled.reshape(-1,1)
# len(y_train_scaled)
# # array.reshape(-1, 1)
y_train

Unnamed: 0,koi_disposition
68,1
75,2
21,1
94,1
8,1
...,...
6,1
26,1
60,1
50,0


# Train the Model



In [174]:
model1 = SVC(kernel="linear")
model1


# print(f"Training Data Score: {model1.score(X_train_scaled, y_train)}")
# print(f"Testing Data Score: {model1.score(X_test_scaled, y_test)}")

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [175]:
# Create the GridSearchCV model
param_grid = {'C': [0.01, 1, 5],
              'gamma': [0.0005, 0.001, 0.005]}

cv = KFold(n_splits=2, shuffle=False, random_state=None)
grid = GridSearchCV(model1, param_grid, verbose=3, scoring='accuracy', cv=cv)


# # Create the GridSearchCV model
# param_grid = {'C': [0.01, 1, 5],
#               'gamma': [0.0005, 0.001, 0.005],
#              'n_splits_': [2]}
# kf = KFold(n_splits=2, shuffle=False, random_state=None)
# newxtrain=kf.split(np.array(X_train['koi_period']))
# #grid = GridSearchCV(model1, param_grid, verbose=3, scoring='f1', cv=cv)

In [177]:
X_train_scaled
len(y_train_scaled)

75

In [178]:
# Train the model with GridSearch
# grid.fit(X_train, y_train)
grid.fit(X_train_scaled, y_train_scaled)

Fitting 2 folds for each of 9 candidates, totalling 18 fits
[CV] C=0.01, gamma=0.0005 ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)


ValueError: Unknown label type: 'continuous'

In [None]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test)

In [None]:
# List best parameters and best score
print(grid.best_params_)
print(grid.best_score_)

# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'testModel1.sav'
joblib.dump(your_model, filename)