In [76]:
# Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade
# install joblib to save model. 
# !pip install joblib

In [77]:
import pandas as pd
import tensorflow
import numpy as np

In [78]:
df = pd.read_csv("../Resources/cleaned_data.csv")
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Create a Train Test Split

Use `koi_disposition` for the y values

In [79]:
from sklearn.model_selection import train_test_split
y = df['koi_disposition']
X = df.drop('koi_disposition',axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1, stratify = y)
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
4002,0,0,1,0,99.673478,0.0003463,-0.0003463,219.33483,0.0023,-0.0023,...,-148,4.777,0.04,-0.027,0.492,0.026,-0.027,293.05801,45.248821,15.801
4246,0,1,0,0,0.592244,9e-08,-9e-08,131.654831,0.000124,-0.000124,...,-146,4.664,0.056,-0.032,0.591,0.045,-0.045,290.28094,45.46426,15.653
548,0,1,1,0,9.991625,5.36e-06,-5.36e-06,137.447816,0.000445,-0.000445,...,-176,4.338,0.153,-0.187,1.096,0.309,-0.206,301.04239,45.022888,14.039
3953,0,1,0,0,178.41299,3.1e-05,-3.1e-05,218.225235,0.000127,-0.000127,...,-134,4.346,0.084,-0.126,1.148,0.202,-0.124,288.32785,38.627621,13.944
2362,0,0,0,0,45.294223,5.6e-05,-5.6e-05,138.678725,0.000987,-0.000987,...,-68,4.347,0.03,-0.03,1.044,0.057,-0.042,285.67938,50.241299,10.961


# Pre-processing

Scale the data using the MinMaxScaler

In [80]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

Standardize the data using StandardScaler

In [81]:
from sklearn.preprocessing import StandardScaler

X_train_stand = X_train.copy()
X_test_stand = X_test.copy()

# only numerical features
num_cols = X.columns

# apply standardization on numerical features
for i in num_cols:
    scale = StandardScaler().fit(X_train_stand[[i]])
    X_train_stand[i] = scale.transform(X_train_stand[[i]])
    X_test_stand[i] = scale.transform(X_test_stand[[i]])

# Train the Model

In [82]:
from sklearn.svm import SVC
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)

SVC(kernel='linear')

In [83]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.8439824527942018
Testing Data Score: 0.8415331807780321


In [84]:
model.fit(X_train_stand, y_train)
print(f"Training Data Score: {model.score(X_train_stand, y_train)}")
print(f"Test Data Score: {model.score(X_test_stand, y_test)}")

Training Data Score: 0.895288956704177
Test Data Score: 0.88558352402746


Standardization produces higher test accuracy than normalization

In [85]:
model2 = SVC(kernel='rbf')
model2.fit(X_train_stand, y_train)
print(f"Training Data Score: {model2.score(X_train_stand, y_train)}")
print(f"Test Data Score: {model2.score(X_test_stand, y_test)}")

Training Data Score: 0.8893763112721724
Test Data Score: 0.8770022883295194


RBF kernel doesn't improve the model performance so we are supposed to consider the linear kernel.

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [86]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C':[1, 5, 10],
             'gamma':[0.0001, 0.0005, 0.001, 0.005, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [87]:
# Train the model with GridSearch
grid.fit(X_train_scaled,y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.856, total=   0.6s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.846, total=   0.4s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.839, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.841, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.825, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.856, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.846, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.839, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.841, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:   25.5s finished


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01]},
             verbose=3)

In [88]:
print(grid.best_params_)
print(grid.best_score_) # 86.80%

{'C': 10, 'gamma': 0.0001}
0.8680138845428944


In [89]:
predictions1 = grid.predict(X_test_scaled)
print('Test Acc: %.3f' % grid.score(X_test_scaled, y_test)) # 87.6%

Test Acc: 0.876


In [90]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.85      0.64      0.73       422
     CONFIRMED       0.72      0.86      0.79       450
FALSE POSITIVE       0.98      1.00      0.99       876

      accuracy                           0.88      1748
     macro avg       0.85      0.83      0.83      1748
  weighted avg       0.88      0.88      0.87      1748



In [91]:
grid.fit(X_train_stand, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.908, total=   0.7s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.886, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.2s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.873, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.898, total=   0.6s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.891, total=   0.6s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.908, total=   0.7s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.886, total=   0.6s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.873, total=   0.7s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.898, total=   0.9s
[CV] C=1, gamma=0.0005 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  1.6min finished


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01]},
             verbose=3)

In [92]:
print(grid.best_params_)
print(grid.best_score_) # 89.32%

{'C': 10, 'gamma': 0.0001}
0.8931921713882358


In [93]:
predictions2 = grid.predict(X_test_stand)
print('Test Acc: %.3f' % grid.score(X_test_stand, y_test)) # 88.2%

Test Acc: 0.882


In [94]:
print(classification_report(y_test, predictions2))

                precision    recall  f1-score   support

     CANDIDATE       0.80      0.73      0.77       422
     CONFIRMED       0.77      0.80      0.78       450
FALSE POSITIVE       0.98      1.00      0.99       876

      accuracy                           0.88      1748
     macro avg       0.85      0.84      0.85      1748
  weighted avg       0.88      0.88      0.88      1748



# Make predictions

In [95]:
prediction1_df = pd.DataFrame({"Actual": y_test, "Predicted":predictions1})
prediction1_df.head(20)

Unnamed: 0,Actual,Predicted
1981,CANDIDATE,CANDIDATE
5609,FALSE POSITIVE,FALSE POSITIVE
532,FALSE POSITIVE,FALSE POSITIVE
6558,CANDIDATE,CANDIDATE
1249,FALSE POSITIVE,FALSE POSITIVE
237,CONFIRMED,CONFIRMED
3247,FALSE POSITIVE,FALSE POSITIVE
6859,FALSE POSITIVE,FALSE POSITIVE
1687,CONFIRMED,CONFIRMED
1143,CONFIRMED,FALSE POSITIVE


In [96]:
prediction1_df['match'] = np.where(prediction1_df['Predicted'] == prediction1_df['Actual'], 1, 0)
print(f"The number of total predictions: {len(prediction1_df)}")
print(f"The number of correct predictions: {sum(prediction1_df['match'])}")
print(f"The test test accuracy: {round(sum(prediction1_df['match'])/len(prediction1_df)*100,2)}%") # 87.59%
prediction1_df[prediction1_df['match']==0].index

The number of total predictions: 1748
The number of correct predictions: 1531
The test test accuracy: 87.59%


Int64Index([1143, 6093, 1455, 1741, 4197, 3314, 4558, 2580, 6702, 6451,
            ...
            4063,  731, 3639, 1769, 3787, 3541, 1400, 2258, 3203,  729],
           dtype='int64', length=217)

In [97]:
prediction2_df = pd.DataFrame({"Actual": y_test, "Predicted":predictions2})
prediction2_df.head(20)

Unnamed: 0,Actual,Predicted
1981,CANDIDATE,CANDIDATE
5609,FALSE POSITIVE,FALSE POSITIVE
532,FALSE POSITIVE,FALSE POSITIVE
6558,CANDIDATE,CANDIDATE
1249,FALSE POSITIVE,FALSE POSITIVE
237,CONFIRMED,CONFIRMED
3247,FALSE POSITIVE,FALSE POSITIVE
6859,FALSE POSITIVE,FALSE POSITIVE
1687,CONFIRMED,CONFIRMED
1143,CONFIRMED,FALSE POSITIVE


In [98]:
prediction2_df['match'] = np.where(prediction2_df['Predicted'] == prediction2_df['Actual'], 1, 0)
print(f"The number of total predictions: {len(prediction2_df)}")
print(f"The number of correct predictions: {sum(prediction2_df['match'])}")
print(f"The test test accuracy: {round(sum(prediction2_df['match'])/len(prediction2_df)*100,2)}%") # 88.22%
prediction2_df[prediction2_df['match']==0].index

The number of total predictions: 1748
The number of correct predictions: 1542
The test test accuracy: 88.22%


Int64Index([1143, 3945, 6093, 1455, 1741,  478, 2236, 4197, 4558, 2580,
            ...
            2827, 3639, 1769, 3787, 1400, 2258, 2932, 5125, 3407, 1154],
           dtype='int64', length=206)

# Save the Model

In [100]:
import joblib
filename = 'SVM.sav'
joblib.dump(grid, filename)

['SVM.sav']