# kNN: k-Nearest Neighbors

In [2]:
# Imports
import warnings
warnings.simplefilter('ignore')

# %matplotlib inline
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler


In [2]:
# Load the prepared exoplanet table from CSV and preview its first five rows
exoplanet_complete_kNN = pd.read_csv(
    filepath_or_buffer='exoplanet_complete_kNN.csv',
)
exoplanet_complete_kNN.head(n=5)

Unnamed: 0,koi_disposition,koi_score,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
0,1,1.0,9.488036,170.53875,0.146,2.9575,615.8,2.26,793,93.59,5455,4.467,0.927,291.93423,48.141651,15.347
1,1,0.969,54.418383,162.51384,0.586,4.507,874.8,2.83,443,9.11,5455,4.467,0.927,291.93423,48.141651,15.347
2,3,0.0,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638,39.3,5853,4.544,0.868,297.00482,48.134129,15.436
3,3,0.0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395,891.96,5805,4.564,0.791,285.53461,48.28521,15.597
4,1,1.0,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406,926.16,6031,4.438,1.046,288.75488,48.2262,15.509


### Key for koi_disposition:
- 1 = CONFIRMED
- 2 = CANDIDATE
- 3 = FALSE POSITIVE


In [3]:

# Features: every column except the target.
X = exoplanet_complete_kNN.drop(columns=["koi_disposition"])

# Target: the disposition codes, kept as an (n_samples, 1) column vector.
disposition = exoplanet_complete_kNN["koi_disposition"]
y = disposition.values.reshape(-1, 1)

# String labels for the three disposition codes (1, 2, 3).
target_names = ["1", "2", "3"]
print(X.shape, y.shape)


(7994, 15) (7994, 1)


In [4]:
# Label Encoder and Transform
#
# NOTE(review): this cell previously repeated a fit/transform pair for each of
# the 14 feature columns and discarded every transform result, so no encoded
# value ever reached `data`, `X`, or the model. scikit-learn documents
# LabelEncoder for target labels (y), not input features, and these columns are
# already numeric. The loop below keeps the cell's only observable effect:
# after the last iteration the encoder is fitted on 'koi_kepmag', which is what
# `classes_` displays.
label_encoder = LabelEncoder()

data = X.copy()

encoded_columns = [
    'koi_period',
    'koi_time0bk',
    'koi_impact',
    'koi_duration',
    'koi_depth',
    'koi_prad',
    'koi_teq',
    'koi_insol',
    'koi_steff',
    'koi_slogg',
    'koi_srad',
    'ra',
    'dec',
    'koi_kepmag',
]

# Each fit() replaces the previous fit, so only the last column's classes remain.
for column in encoded_columns:
    label_encoder.fit(data[column])

label_encoder.classes_

array([ 6.966,  6.974,  7.391, ..., 19.319, 19.485, 20.003])

In [5]:
# Split into train and test sets, stratified on the disposition labels so both
# splits keep the same class proportions; the seed is fixed for reproducibility.
data_train, data_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [6]:
# MinMaxScaler: fit on the training split only, then apply the same scaling to
# both splits so no information from the test set is used when fitting.
data_minmax = MinMaxScaler()
data_train_minmax = data_minmax.fit_transform(data_train)
data_test_minmax = data_minmax.transform(data_test)

# NOTE(review): the labels are scaled here as well, but the cells visible in
# this notebook fit and score on the unscaled y_train / y_test, so
# y_train_minmax / y_test_minmax appear unused. Confirm before removing.
y_minmax = MinMaxScaler()
y_train_minmax = y_minmax.fit_transform(y_train)
y_test_minmax = y_minmax.transform(y_test)

In [7]:
# Loop through different k values to see which has the highest accuracy.
# Only odd k values are tried. Note that an odd k rules out voting ties only
# for two classes; with the three disposition classes used here a tie is still
# possible (e.g. one neighbour of each class at k=3).
#
# NOTE(review): k is being compared on the test split, so the best test score
# is an optimistic estimate; consider a validation split or cross-validation.
train_scores = []
test_scores = []

# y_train / y_test are (n_samples, 1) column vectors. KNeighborsClassifier
# expects 1-D labels and otherwise emits a DataConversionWarning (hidden by the
# blanket warnings filter), so flatten them once up front.
y_train_1d = y_train.ravel()
y_test_1d = y_test.ravel()

for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(data_train_minmax, y_train_1d)
    train_score = knn.score(data_train_minmax, y_train_1d)
    test_score = knn.score(data_test_minmax, y_test_1d)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

k: 1, Train/Test Score: 1.000/0.789
k: 3, Train/Test Score: 0.890/0.795
k: 5, Train/Test Score: 0.867/0.796
k: 7, Train/Test Score: 0.847/0.801
k: 9, Train/Test Score: 0.840/0.794
k: 11, Train/Test Score: 0.832/0.790
k: 13, Train/Test Score: 0.830/0.798
k: 15, Train/Test Score: 0.827/0.800
k: 17, Train/Test Score: 0.825/0.800
k: 19, Train/Test Score: 0.821/0.801


In [9]:
# Refit the final classifier with k=19, one of the k values that reached the
# top test accuracy (0.801) in the sweep; k=7 scored the same.
knn = KNeighborsClassifier(n_neighbors=19)

# Labels are stored as (n_samples, 1) column vectors; pass them as 1-D arrays,
# which is the shape the classifier expects (avoids a DataConversionWarning).
knn.fit(data_train_minmax, y_train.ravel())
print('k=19 Test Acc: %.3f' % knn.score(data_test_minmax, y_test.ravel()))

k=19 Test Acc: 0.801


In [3]:
# Save model to file
# This cell needs `knn` from the training cell; running it on a fresh kernel
# before that cell raises NameError.
filename = 'kNN.sav'
joblib.dump(value=knn, filename=filename)

NameError: name 'knn' is not defined