In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in /opt/anaconda3/lib/python3.7/site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd

import numpy as np

import warnings
warnings.simplefilter('ignore', FutureWarning)

# Data Pre-Processing

In [4]:
# Load and Clean Data

df = pd.read_csv("exoplanet_data.csv")

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()


# Set features(X) and target(y) values
X = df.drop("koi_disposition", axis=1)
y = df["koi_disposition"]
print(X.shape, y.shape)


(6991, 40) (6991,)


In [5]:
# Train and Test split data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
3563,0,0,0,0,10.548413,5.47e-05,-5.47e-05,139.06402,0.00411,-0.00411,...,-133,4.387,0.066,-0.123,1.092,0.181,-0.097,298.09543,44.737061,13.204
4099,0,0,0,0,24.754385,0.0001365,-0.0001365,140.20732,0.00446,-0.00446,...,-144,4.519,0.078,-0.052,0.804,0.056,-0.076,295.73535,42.576248,15.514
5460,0,0,0,0,1.057336,1.23e-07,-1.23e-07,131.792007,9.6e-05,-9.6e-05,...,-140,4.594,0.054,-0.027,0.683,0.054,-0.06,292.18417,49.31004,15.414
1091,0,0,0,0,201.118319,0.001461,-0.001461,187.56986,0.00529,-0.00529,...,-112,4.447,0.072,-0.108,0.954,0.135,-0.083,283.11377,48.13139,13.328
5999,0,0,0,0,91.649983,0.003181,-0.003181,175.7156,0.0286,-0.0286,...,-233,4.145,0.164,-0.164,1.608,0.905,-0.383,294.93198,39.81242,12.964


In [6]:
# MinMaxScaler
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_scaled = label_encoder.transform(y_train)
y_test_scaled = label_encoder.transform(y_test)


# Create and Train(Fit) Model

### Classifier

In [7]:
from sklearn.neighbors import KNeighborsClassifier


# Loop through different k values to see which has the highest accuracy
# Note: We only use odd numbers because we don't want any ties
train_scores = []
test_scores = []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")


k: 1, Train/Test Score: 1.000/0.787
k: 3, Train/Test Score: 0.895/0.805
k: 5, Train/Test Score: 0.871/0.811
k: 7, Train/Test Score: 0.862/0.818
k: 9, Train/Test Score: 0.858/0.820
k: 11, Train/Test Score: 0.848/0.830
k: 13, Train/Test Score: 0.848/0.830
k: 15, Train/Test Score: 0.842/0.830
k: 17, Train/Test Score: 0.839/0.830
k: 19, Train/Test Score: 0.836/0.831


In [8]:
# Note that k: 13 seems to be the best choice for this dataset
knn = KNeighborsClassifier(n_neighbors=13)
knn.fit(X_train_scaled, y_train)
print('k=13 Test Accuracy: %.3f' % knn.score(X_test_scaled, y_test))

k=13 Test Accuracy: 0.830


# Hyperparameter Tuning - GridSearchCV 

In [9]:
# Source Link: https://medium.com/datadriveninvestor/k-nearest-neighbors-in-python-hyperparameters-tuning-716734bc557f

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

knn = KNeighborsClassifier()

# Hyperparameters 
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p = [1,2]

hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)

grid = GridSearchCV(knn, hyperparameters, cv=10, verbose=1)


In [10]:
# Fit the model

grid.fit(X_train, y_train)

Fitting 10 folds for each of 2842 candidates, totalling 28420 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 28420 out of 28420 | elapsed: 28.1min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'leaf_size': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, 30, ...],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29],
                         'p': [1, 2]},
             pre_dispatch='2*n_jobs', refit=True, return_tra

## Quantify our Trained Model

In [11]:
# List the best parameters for this dataset
# List the best score

print(grid.best_params_)
print('Best Grid score: %.3f' % grid.best_score_)

{'leaf_size': 1, 'n_neighbors': 9, 'p': 1}
Best Grid score: 0.672


In [12]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test)

print('K-Nearest Neighbors(KNN)')
print('Test Accuracy: %.3f' % grid.score(X_test, y_test))

# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

K-Nearest Neighbors(KNN)
Test Accuracy: 0.660
                precision    recall  f1-score   support

     CANDIDATE       0.43      0.40      0.41       404
     CONFIRMED       0.61      0.70      0.65       435
FALSE POSITIVE       0.79      0.76      0.77       909

      accuracy                           0.66      1748
     macro avg       0.61      0.62      0.61      1748
  weighted avg       0.66      0.66      0.66      1748



#  Save the Model (If the best)

In [1]:
# ## Create a file for your best model and push to GitHub
# import joblib
# filename = 'KNN.sav'
# joblib.dump(knn, filename)
