# K NEAREST NEIGHBOR

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import os
import numpy as np

### Exoplanet Identification
#### The variables (columns) in this dataset have been scrubbed based on the sensitivity analysis does on each individual ability to predict whether a Kepler Object of Interest or "koi" is a potential exoplanet.

#### There will be some columns dropped from the dataset before training the model
#### koi_disposition is the "label" that identifies each koi as either a FALSE POSITIVE, CONFIRMED, or CANDIDATE


In [2]:
df = pd.read_csv("data/clean_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_model_snr,koi_tce_plnt_num,koi_srad
0,CONFIRMED,0,0,0,0,54.418383,162.51384,0.586,4.507,874.8,2.83,443,25.8,2,0.927
1,FALSE POSITIVE,0,1,0,0,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638,76.3,1,0.868
2,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395,505.6,1,0.791
3,CONFIRMED,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406,40.9,1,1.046
4,CONFIRMED,0,0,0,0,4.134435,172.97937,0.762,3.1402,686.0,2.77,1160,40.2,2,0.972


In [3]:
target = df["koi_disposition"]
target_names = ["confirmed","candidate","false_positive"]

In [4]:
data = df.drop("koi_disposition", axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_model_snr,koi_tce_plnt_num,koi_srad
0,0,0,0,0,54.418383,162.51384,0.586,4.507,874.8,2.83,443,25.8,2,0.927
1,0,1,0,0,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638,76.3,1,0.868
2,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395,505.6,1,0.791
3,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406,40.9,1,1.046
4,0,0,0,0,4.134435,172.97937,0.762,3.1402,686.0,2.77,1160,40.2,2,0.972


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [6]:
# Support vector machine linear classifier
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model

SVC(kernel='linear')

In [7]:
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [None]:
# Fit the model using the grid search estimator. 
# This will take the SVC model and try each combination of parameters
grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [None]:
# List the best parameters for this dataset
print(grid.best_params_)

In [None]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test)

In [None]:
print('Test Acc: %.3f' % grid.score(X_test, y_test))

In [None]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=["blue", "red"]))

# EVALUATION OF MODEL

#### This model required a tremendous amount of processing. It provided the same level of accurary as the KNN.