In [76]:
# Update sklearn to prevent version mismatches
# %pip install --upgrade scikit-learn   (the PyPI package is `scikit-learn`; `sklearn` is a deprecated alias)
# install joblib to save model. 
# !pip install joblib

In [None]:
import numpy as np
import pandas as pd
import tensorflow

from sklearn.model_selection import cross_val_score

In [None]:
# Load the dataset from the Resources folder and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../Resources/cleaned_data.csv")
df.head(n=5)

# Create a Train Test Split

Use `koi_disposition` for the y values

In [None]:
from sklearn.model_selection import train_test_split

# The target is the `koi_disposition` column; every other column is a feature.
X = df.drop(columns='koi_disposition')
y = df['koi_disposition']

# Stratify on y so both splits keep the class proportions of the full data;
# random_state=1 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)
X_train.head()

# Pre-processing

Scale the data using the MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training split only, then apply that same
# fitted scaler to both the training and the test features.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

Standardize the data using StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler

# Every column of X is treated as a numerical feature.
num_cols = X.columns

# StandardScaler standardizes each column independently, so one scaler fitted
# on all columns gives the same result as fitting a separate scaler per column.
# Keeping the single fitted object (instead of discarding one per loop
# iteration) lets the exact same transformation be re-applied to new data.
X_standardizer = StandardScaler().fit(X_train[num_cols])

# Rebuild DataFrames so the original column names and row index are preserved.
X_train_stand = pd.DataFrame(
    X_standardizer.transform(X_train[num_cols]),
    columns=num_cols,
    index=X_train.index,
)
X_test_stand = pd.DataFrame(
    X_standardizer.transform(X_test[num_cols]),
    columns=num_cols,
    index=X_test.index,
)

# Train the Model

In [None]:
from sklearn.svm import SVC

# Baseline classifier: a support vector classifier with a linear kernel.
model1 = SVC(kernel="linear")

In [13]:
# Mean 5-fold cross-validated accuracy of the linear SVC on the raw features.
# Author's note: this run failed to converge when the data was neither scaled
# nor standardized.
unprocessed_cv_scores = cross_val_score(
    model1, X_train, y_train, scoring='accuracy', cv=5
)
print(f"The cross-validated accuracy for unprocessed training set:\n{unprocessed_cv_scores.mean()}")

Training Data Score: 0.8439824527942018


In [None]:
# Mean 5-fold cross-validated accuracy of the linear SVC on the min-max-scaled features.
normalized_cv_scores = cross_val_score(
    model1, X_train_scaled, y_train, scoring='accuracy', cv=5
)
print(f"The cross-validated accuracy for normalized training set: \n{normalized_cv_scores.mean()}")

# Same evaluation on the standardized features.
standardized_cv_scores = cross_val_score(
    model1, X_train_stand, y_train, scoring='accuracy', cv=5
)
print(f"The cross-validated accuracy for standardized training set:\n{standardized_cv_scores.mean()}")

Standardization produces higher training accuracy than normalization

In [15]:
# Compare against an RBF-kernel SVC on the min-max-scaled training features,
# using the same 5-fold cross-validated accuracy as for the linear model.
model2 = SVC(kernel='rbf')
rbf_cv_scores = cross_val_score(
    model2, X_train_scaled, y_train, scoring='accuracy', cv=5
)
# Fixed: the separator was written as "/n", which printed a literal "/n"
# instead of a line break.
print(f"The cross-validated accuracy for normalized training set:\n{rbf_cv_scores.mean()}")

Training Data Score: 0.8893763112721724


The RBF kernel does not improve model performance, so we continue with the linear kernel.

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [16]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: 3 values of C x 5 values of gamma = 15 combinations.
# NOTE(review): SVC only uses `gamma` for the 'rbf', 'poly' and 'sigmoid'
# kernels, so with kernel='linear' every gamma value yields the same score and
# only C actually differentiates the candidates. TODO: drop `gamma` from the
# grid (or tune an RBF model) to avoid redundant fits.
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01]}

# Tune the linear SVC created earlier as `model1`. The name `model` that was
# used here before is never defined in this notebook and raises NameError on a
# fresh kernel.
grid1 = GridSearchCV(model1, param_grid, verbose=3)

In [17]:
# Run the grid search on the min-max-scaled training features.
grid1.fit(X=X_train_scaled, y=y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.856, total=   0.4s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.846, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.7s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.839, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.841, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.825, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.856, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.846, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.839, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.841, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:   26.7s finished


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01]},
             verbose=3)

In [18]:
# Best hyperparameter combination found on the scaled data, followed by its
# mean cross-validated score (about 86.80%).
print(grid1.best_params_, grid1.best_score_, sep="\n")

{'C': 10, 'gamma': 0.0001}
0.8680138845428944


In [21]:
from sklearn.metrics import classification_report

# Predict on the same scaled TRAINING features the grid search was fitted on,
# then print per-class precision / recall / F1 for those training predictions.
predictions1 = grid1.predict(X_train_scaled)
train_report1 = classification_report(y_train, predictions1)
print(train_report1)

                precision    recall  f1-score   support

     CANDIDATE       0.84      0.63      0.72      1265
     CONFIRMED       0.72      0.87      0.79      1350
FALSE POSITIVE       0.98      1.00      0.99      2628

      accuracy                           0.88      5243
     macro avg       0.85      0.83      0.83      5243
  weighted avg       0.88      0.88      0.87      5243



In [22]:
# Repeat the grid search with the same parameter grid, this time on the
# standardized training features. Uses `model1` (the linear SVC); the name
# `model` that was used here before is never defined in this notebook and
# raises NameError on a fresh kernel.
grid2 = GridSearchCV(model1, param_grid, verbose=3)
grid2.fit(X_train_stand, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.908, total=   0.8s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.886, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.4s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.873, total=   0.6s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.898, total=   0.6s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.891, total=   0.6s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.908, total=   0.7s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.886, total=   0.5s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.873, total=   0.5s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.898, total=   0.6s
[CV] C=1, gamma=0.0005 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  1.5min finished


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01]},
             verbose=3)

In [24]:
# Best hyperparameter combination found on the standardized data, followed by
# its mean cross-validated score (about 89.32%).
print(grid2.best_params_, grid2.best_score_, sep="\n")

# Per-class precision / recall / F1 on the standardized TRAINING features.
predictions2 = grid2.predict(X_train_stand)
train_report2 = classification_report(y_train, predictions2)
print(train_report2)

{'C': 10, 'gamma': 0.0001}
0.8931921713882358
                precision    recall  f1-score   support

     CANDIDATE       0.84      0.74      0.78      1265
     CONFIRMED       0.78      0.85      0.82      1350
FALSE POSITIVE       0.98      1.00      0.99      2628

      accuracy                           0.90      5243
     macro avg       0.87      0.86      0.86      5243
  weighted avg       0.90      0.90      0.90      5243



In [89]:
# predictions1 = grid.predict(X_test_scaled)
# print('Test Acc: %.3f' % grid.score(X_test_scaled, y_test)) # 87.6%

Test Acc: 0.876


In [90]:
# from sklearn.metrics import classification_report
# print(classification_report(y_test, predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.85      0.64      0.73       422
     CONFIRMED       0.72      0.86      0.79       450
FALSE POSITIVE       0.98      1.00      0.99       876

      accuracy                           0.88      1748
     macro avg       0.85      0.83      0.83      1748
  weighted avg       0.88      0.88      0.87      1748



In [91]:
# grid.fit(X_train_stand, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.908, total=   0.7s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.886, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.2s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.873, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.898, total=   0.6s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.891, total=   0.6s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.908, total=   0.7s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.886, total=   0.6s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.873, total=   0.7s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.898, total=   0.9s
[CV] C=1, gamma=0.0005 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  1.6min finished


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01]},
             verbose=3)

In [92]:
# Best parameters and mean cross-validated score of the search fitted on the
# standardized data. This previously referenced `grid`, which no active cell
# defines (its fit cell is commented out); `grid2` is the search fitted on
# X_train_stand.
print(grid2.best_params_)
print(grid2.best_score_) # 89.32%

{'C': 10, 'gamma': 0.0001}
0.8931921713882358


The tuned model with standardized data performs better than the tuned model with normalized data.

In [93]:
# predictions2 = grid.predict(X_test_stand)
# print('Test Acc: %.3f' % grid.score(X_test_stand, y_test)) # 88.2%

Test Acc: 0.882


In [94]:
# print(classification_report(y_test, predictions2))

                precision    recall  f1-score   support

     CANDIDATE       0.80      0.73      0.77       422
     CONFIRMED       0.77      0.80      0.78       450
FALSE POSITIVE       0.98      1.00      0.99       876

      accuracy                           0.88      1748
     macro avg       0.85      0.84      0.85      1748
  weighted avg       0.88      0.88      0.88      1748



# Make predictions

In [95]:
# Predict on the held-out, min-max-scaled TEST features. `predictions1` holds
# predictions for the training split, so pairing it with `y_test` mixes two
# different row counts and fails when the DataFrame is built.
test_predictions1 = grid1.predict(X_test_scaled)

# Side-by-side table of true vs. predicted labels, indexed like y_test.
prediction1_df = pd.DataFrame({"Actual": y_test, "Predicted": test_predictions1})
prediction1_df.head(20)

Unnamed: 0,Actual,Predicted
1981,CANDIDATE,CANDIDATE
5609,FALSE POSITIVE,FALSE POSITIVE
532,FALSE POSITIVE,FALSE POSITIVE
6558,CANDIDATE,CANDIDATE
1249,FALSE POSITIVE,FALSE POSITIVE
237,CONFIRMED,CONFIRMED
3247,FALSE POSITIVE,FALSE POSITIVE
6859,FALSE POSITIVE,FALSE POSITIVE
1687,CONFIRMED,CONFIRMED
1143,CONFIRMED,FALSE POSITIVE


In [96]:
# Flag each row: 1 when the predicted label equals the actual label, else 0.
is_correct1 = prediction1_df['Predicted'] == prediction1_df['Actual']
prediction1_df['match'] = np.where(is_correct1, 1, 0)

# Summarize the number of predictions, the number correct, and the accuracy in percent.
n_total1 = len(prediction1_df)
n_correct1 = sum(prediction1_df['match'])
print(f"The number of total predictions: {n_total1}")
print(f"The number of correct predictions: {n_correct1}")
print(f"The test test accuracy: {round(n_correct1/n_total1*100,2)}%") # 87.59%

# Row labels of the misclassified samples (shown as the cell output).
misclassified1 = prediction1_df['match'] == 0
prediction1_df.loc[misclassified1].index

The number of total predictions: 1748
The number of correct predictions: 1531
The test test accuracy: 87.59%


Int64Index([1143, 6093, 1455, 1741, 4197, 3314, 4558, 2580, 6702, 6451,
            ...
            4063,  731, 3639, 1769, 3787, 3541, 1400, 2258, 3203,  729],
           dtype='int64', length=217)

In [97]:
# Predict on the held-out, standardized TEST features. `predictions2` holds
# predictions for the training split, so pairing it with `y_test` mixes two
# different row counts and fails when the DataFrame is built.
test_predictions2 = grid2.predict(X_test_stand)

# Side-by-side table of true vs. predicted labels, indexed like y_test.
prediction2_df = pd.DataFrame({"Actual": y_test, "Predicted": test_predictions2})
prediction2_df.head(20)

Unnamed: 0,Actual,Predicted
1981,CANDIDATE,CANDIDATE
5609,FALSE POSITIVE,FALSE POSITIVE
532,FALSE POSITIVE,FALSE POSITIVE
6558,CANDIDATE,CANDIDATE
1249,FALSE POSITIVE,FALSE POSITIVE
237,CONFIRMED,CONFIRMED
3247,FALSE POSITIVE,FALSE POSITIVE
6859,FALSE POSITIVE,FALSE POSITIVE
1687,CONFIRMED,CONFIRMED
1143,CONFIRMED,FALSE POSITIVE


In [98]:
# Flag each row: 1 when the predicted label equals the actual label, else 0.
is_correct2 = prediction2_df['Predicted'] == prediction2_df['Actual']
prediction2_df['match'] = np.where(is_correct2, 1, 0)

# Summarize the number of predictions, the number correct, and the accuracy in percent.
n_total2 = len(prediction2_df)
n_correct2 = sum(prediction2_df['match'])
print(f"The number of total predictions: {n_total2}")
print(f"The number of correct predictions: {n_correct2}")
print(f"The test test accuracy: {round(n_correct2/n_total2*100,2)}%") # 88.22%

# Row labels of the misclassified samples (shown as the cell output).
misclassified2 = prediction2_df['match'] == 0
prediction2_df.loc[misclassified2].index

The number of total predictions: 1748
The number of correct predictions: 1542
The test test accuracy: 88.22%


Int64Index([1143, 3945, 6093, 1455, 1741,  478, 2236, 4197, 4558, 2580,
            ...
            2827, 3639, 1769, 3787, 1400, 2258, 2932, 5125, 3407, 1154],
           dtype='int64', length=206)

# Save the Model

In [25]:
import joblib

# Persist the grid search fitted on the standardized data to disk.
# joblib.dump returns the list of written file names, which is the cell output.
filename = 'SVM.sav'
joblib.dump(value=grid2, filename=filename)

['SVM.sav']