In [1]:
# Import dependencies
import pandas as pd

# Hide warning messages in notebook
import warnings
warnings.filterwarnings('ignore')

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [3]:
print(df["koi_disposition"].unique())

['CONFIRMED' 'FALSE POSITIVE' 'CANDIDATE']


# Select your features (columns)

In [4]:
# Set X equal to the entire data set, except for the first column
X = df.iloc[:,1:]

# Set y equal to the first column
y = df.iloc[:,0]

In [5]:
# Search for top 10 features according to feature importances
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(X,y)
model.feature_importances_

array([0.13341857, 0.17878343, 0.12915733, 0.05268131, 0.01855677,
       0.01319293, 0.01373238, 0.01333536, 0.02308064, 0.02358378,
       0.01140865, 0.01164963, 0.0096631 , 0.01738702, 0.02359143,
       0.02956089, 0.02052197, 0.01309452, 0.01171301, 0.01261766,
       0.0109779 , 0.00998817, 0.01150382, 0.00896661, 0.00989888,
       0.00810681, 0.02902602, 0.00981321, 0.01049444, 0.0190523 ,
       0.01660179, 0.01258491, 0.00967793, 0.01267332, 0.0087141 ,
       0.01129801, 0.00886548, 0.01090983, 0.01007607, 0.01004001])

In [6]:
# Store the top 10 features as a series, using the column headers as the index
feat_imp = pd.Series(model.feature_importances_, index=X.columns).nlargest(10)
feat_imp

koi_fpflag_ss        0.178783
koi_fpflag_nt        0.133419
koi_fpflag_co        0.129157
koi_fpflag_ec        0.052681
koi_duration_err2    0.029561
koi_model_snr        0.029026
koi_duration_err1    0.023591
koi_time0bk_err2     0.023584
koi_time0bk_err1     0.023081
koi_depth            0.020522
dtype: float64

In [7]:
# Set features based on feature importances
X = df[feat_imp.index]

# Use koi_disposition for y values
y = df['koi_disposition']

# Create a Train Test Split

Use `koi_disposition` for the y values

In [8]:
from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42)

In [9]:
X_train.head()

Unnamed: 0,koi_fpflag_ss,koi_fpflag_nt,koi_fpflag_co,koi_fpflag_ec,koi_duration_err2,koi_model_snr,koi_duration_err1,koi_time0bk_err2,koi_time0bk_err1,koi_depth
6947,0,1,0,0,-0.836,11.5,0.836,-0.0204,0.0204,523.2
1614,1,0,0,0,-0.1,16.9,0.1,-0.00296,0.00296,458.1
607,0,0,0,0,-0.158,17.4,0.158,-0.00638,0.00638,217.6
4051,0,1,0,0,-0.00403,628.2,0.00403,-0.00107,0.00107,431670.0
1093,0,0,0,0,-0.116,38.2,0.116,-0.00382,0.00382,336.6


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [10]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler

# Create scaler object
X_scaler = MinMaxScaler().fit(X_train)

# Scale training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model



In [11]:
from sklearn.svm import SVC

# Initialize model
model = SVC(kernel='poly')

# Train the model
model.fit(X_train_scaled, y_train)

SVC(kernel='poly')

In [12]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.8441170190049114
Testing Data Score: 0.837521663778163


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [13]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

# Create the GridSearchCV model
param_grid = [{'C':[1, 5, 10, 50,100], 'kernel':['poly']},
             {'C':[1, 5, 10, 50], 'kernel':['rbf'], 'gamma': [0.0001, 0.0005, 0.001, 0.005]}]

grid = GridSearchCV(model, param_grid, verbose=3)

In [14]:
# Train the new model with GridSearch
best_model = grid.fit(X_train_scaled, y_train)


Fitting 5 folds for each of 21 candidates, totalling 105 fits
[CV 1/5] END ..................C=1, kernel=poly;, score=0.858 total time=   0.1s
[CV 2/5] END ..................C=1, kernel=poly;, score=0.826 total time=   0.1s
[CV 3/5] END ..................C=1, kernel=poly;, score=0.853 total time=   0.1s
[CV 4/5] END ..................C=1, kernel=poly;, score=0.829 total time=   0.1s
[CV 5/5] END ..................C=1, kernel=poly;, score=0.835 total time=   0.1s
[CV 1/5] END ..................C=5, kernel=poly;, score=0.869 total time=   0.1s
[CV 2/5] END ..................C=5, kernel=poly;, score=0.844 total time=   0.1s
[CV 3/5] END ..................C=5, kernel=poly;, score=0.866 total time=   0.1s
[CV 4/5] END ..................C=5, kernel=poly;, score=0.845 total time=   0.1s
[CV 5/5] END ..................C=5, kernel=poly;, score=0.853 total time=   0.1s
[CV 1/5] END .................C=10, kernel=poly;, score=0.872 total time=   0.1s
[CV 2/5] END .................C=10, kernel=poly

[CV 2/5] END .....C=50, gamma=0.005, kernel=rbf;, score=0.781 total time=   0.3s
[CV 3/5] END .....C=50, gamma=0.005, kernel=rbf;, score=0.792 total time=   0.3s
[CV 4/5] END .....C=50, gamma=0.005, kernel=rbf;, score=0.785 total time=   0.3s
[CV 5/5] END .....C=50, gamma=0.005, kernel=rbf;, score=0.796 total time=   0.3s


In [15]:
print(best_model.best_params_)
print("Best Score: ", best_model.best_score_)

{'C': 100, 'kernel': 'poly'}
Best Score:  0.8603421539921007


In [16]:
# Predict with best_model
grid_predictions = best_model.predict(X_test_scaled)
df_grid = pd.DataFrame({"Actual":y_test, "Predicted":grid_predictions}) 
df_grid.head()

Unnamed: 0,Actual,Predicted
4982,FALSE POSITIVE,FALSE POSITIVE
4866,CANDIDATE,CANDIDATE
2934,FALSE POSITIVE,FALSE POSITIVE
5007,FALSE POSITIVE,FALSE POSITIVE
3869,FALSE POSITIVE,FALSE POSITIVE


In [17]:
# Score the model
best_model.score(X_test_scaled, y_test)

0.8548526863084922

# Save the Model

In [18]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'SVCTwoKernels.sav'
joblib.dump(best_model, filename)

['SVCTwoKernels.sav']

In [19]:
# Print Classification Report.
from sklearn.metrics import classification_report
print(classification_report(y_test, grid_predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.78      0.59      0.67       555
     CONFIRMED       0.70      0.83      0.76       628
FALSE POSITIVE       0.98      1.00      0.99      1125

      accuracy                           0.85      2308
     macro avg       0.82      0.81      0.81      2308
  weighted avg       0.86      0.85      0.85      2308

