<a href="https://colab.research.google.com/github/christinaxliu/research/blob/main/Caltech-JPL-Intern/ExoplanetClassifiers/KNNClassifier/Exoplanet_KNN_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
from __future__ import division

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, normalize
from scipy import ndimage
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pickle
from google.colab import drive

In [27]:
warnings.filterwarnings('ignore')
pd.options.mode.copy_on_write = True
plt.style.use('fivethirtyeight')

In [28]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
exoplanets_filename = '/content/drive/My Drive/Colab Notebooks/ExoplanetClassifiers/KNNClassifier/TESSExoplanetCandidates_2024.06.23.csv'
exoplanets_data = pd.read_table(exoplanets_filename,
                                skiprows=54,
                                sep=',',
                                header=None,
                                index_col=None,
                                names = ['toi', 'tid', 'tfopwg_disp', 'ra', 'dec', 'st_pmra', 'st_pmraerr1', 'st_pmraerr2',
                                          'st_pmdec', 'st_pmdecerr1', 'st_pmdecerr2', 'pl_tranmid', 'pl_tranmiderr1', 'pl_tranmiderr2',
                                          'pl_orbper', 'pl_orbpererr1', 'pl_orbpererr2', 'pl_trandurh', 'pl_trandurherr1', 'pl_trandurherr2',
                                          'pl_trandep', 'pl_trandeperr1', 'pl_trandeperr2', 'pl_rade', 'pl_radeerr1', 'pl_radeerr2',
                                          'pl_insol', 'pl_insolerr1', 'pl_insolerr2', 'pl_eqt', 'pl_eqterr1', 'pl_eqterr2',
                                          'st_tmag', 'st_tmagerr1', 'st_tmagerr2', 'st_dist', 'st_disterr1', 'st_disterr2',
                                          'st_teff', 'st_tefferr1', 'st_tefferr2', 'st_logg', 'st_loggerr1', 'st_loggerr2',
                                          'st_rad', 'st_raderr1', 'st_raderr2'],
                                skipfooter=0,
                                engine='python')

In [30]:
# Sanity check the first several rows of data to ensure data is read correctly
exoplanets_data.head()

Unnamed: 0,toi,tid,tfopwg_disp,ra,dec,st_pmra,st_pmraerr1,st_pmraerr2,st_pmdec,st_pmdecerr1,...,st_disterr2,st_teff,st_tefferr1,st_tefferr2,st_logg,st_loggerr1,st_loggerr2,st_rad,st_raderr1,st_raderr2
0,1000.01,50365310,FP,112.357708,-12.69596,-5.964,0.085,-0.085,-0.076,0.072,...,-11.9515,10249.0,264.7,-264.7,4.19,0.07,-0.07,2.16986,0.072573,-0.072573
1,1001.01,88863718,PC,122.580465,-5.513852,-4.956,0.102,-0.102,-15.555,0.072,...,-5.91,7070.0,126.4,-126.4,4.03,0.09,-0.09,2.01,0.09,-0.09
2,1002.01,124709665,FP,104.726966,-10.580455,-1.462,0.206,-0.206,-2.249,0.206,...,-106.333,8924.0,124.0,-124.0,,,,5.73,,
3,1003.01,106997505,FP,110.559945,-25.207017,-0.939,0.041,-0.041,1.64,0.055,...,-1899.57,5388.5,567.0,-567.0,4.15,1.64,-1.64,,,
4,1004.01,238597883,FP,122.178195,-48.802811,-4.496,0.069,-0.069,9.347,0.062,...,-4.6175,9219.0,171.1,-171.1,4.14,0.07,-0.07,2.15,0.06,-0.06


In [31]:
# Sanity check the last several rows of data to ensure data is read correctly.
exoplanets_data.tail()

Unnamed: 0,toi,tid,tfopwg_disp,ra,dec,st_pmra,st_pmraerr1,st_pmraerr2,st_pmdec,st_pmdecerr1,...,st_disterr2,st_teff,st_tefferr1,st_tefferr2,st_logg,st_loggerr1,st_loggerr2,st_rad,st_raderr1,st_raderr2
7198,995.01,317951248,FP,110.811443,5.56285,2.061,0.405,-0.405,-7.082,0.336,...,-355.634,4805.2,46.5,-46.5,,,,,,
7199,996.01,142918609,FP,119.349948,-19.516015,-3.9,0.848,-0.848,3.866,1.158,...,,8007.0,,,,,,2.05,,
7200,997.01,341729521,FP,121.319521,-59.579798,-44.77,0.044,-0.044,-2.616,0.036,...,-0.2655,5786.0,132.9,-132.9,4.52,0.08,-0.08,0.926261,0.045789,-0.045789
7201,998.01,54390047,FP,118.319555,-14.218823,-1.706,0.069,-0.069,-1.714,0.061,...,-11.1405,8322.0,130.7,-130.7,4.01,0.07,-0.07,2.34986,0.091578,-0.091578
7202,999.01,341186896,FP,118.864086,-58.22206,-17.19,0.042,-0.042,12.272,0.039,...,-0.6635,6635.0,130.1,-130.1,4.35,0.09,-0.09,1.3,0.05,-0.05


In [32]:
# Get the statistics about the data
exoplanets_data.describe()

Unnamed: 0,toi,tid,ra,dec,st_pmra,st_pmraerr1,st_pmraerr2,st_pmdec,st_pmdecerr1,st_pmdecerr2,...,st_disterr2,st_teff,st_tefferr1,st_tefferr2,st_logg,st_loggerr1,st_loggerr2,st_rad,st_raderr1,st_raderr2
count,7203.0,7203.0,7203.0,7203.0,7084.0,7084.0,7084.0,7084.0,7084.0,7084.0,...,6559.0,7072.0,6776.0,6776.0,6408.0,5117.0,5117.0,6733.0,5402.0,5402.0
mean,3514.03644,244820900.0,173.954915,-0.650568,-0.884084,0.227741,-0.227741,-9.083052,0.221717,-0.221717,...,-20.059656,5797.592605,210.114839,-210.114839,4.305612,0.180526,-0.180526,1.406487,0.075384,-0.075384
std,2013.842205,163517900.0,101.316743,47.86805,78.855921,0.635495,0.635495,68.206643,0.625434,0.625434,...,141.650242,1502.824027,567.587345,567.587345,0.305031,0.361619,0.361619,1.642425,0.085912,0.085912
min,101.01,2876.0,0.08468,-89.471513,-1624.05,0.015,-8.0,-1230.62,0.016,-8.0,...,-4020.6,2808.0,7.82429,-7000.0,0.1,0.000391,-2.01084,0.114827,0.003357,-1.72302
25%,1756.51,130382500.0,93.891644,-44.950389,-11.083,0.038,-0.08,-14.749,0.038,-0.071,...,-11.87525,5203.925,122.0,-157.0,4.13,0.08,-0.095776,0.89,0.05,-0.08
50%,3494.01,245462400.0,153.779587,0.444704,-1.787,0.051,-0.051,-3.3315,0.049,-0.049,...,-4.1545,5801.0,129.5575,-129.5575,4.33,0.085884,-0.085884,1.23369,0.06,-0.06
75%,5262.51,353092500.0,272.132619,42.672596,8.13475,0.08,-0.038,5.154,0.071,-0.038,...,-0.9885,6301.475,157.0,-122.0,4.5,0.095776,-0.08,1.66,0.08,-0.05
max,7031.01,2041563000.0,359.933006,89.086923,2074.52,8.0,-0.015,1048.84,8.0,-0.016,...,-0.002835,50000.0,7000.0,-7.82429,5.96065,2.01084,-0.000391,102.03,1.72302,-0.003357


In [33]:
# Cast numerical values to floats
exoplanets_data['ra'] = exoplanets_data['ra'].astype(float)
exoplanets_data['dec'] = exoplanets_data['dec'].astype(float)
exoplanets_data['st_pmra'] = exoplanets_data['st_pmra'].astype(float)
exoplanets_data['st_pmraerr1'] = exoplanets_data['st_pmraerr1'].astype(float)
exoplanets_data['st_pmraerr2'] = exoplanets_data['st_pmraerr2'].astype(float)
exoplanets_data['st_pmdec'] = exoplanets_data['st_pmdec'].astype(float)
exoplanets_data['st_pmdecerr1'] = exoplanets_data['st_pmdecerr1'].astype(float)
exoplanets_data['st_pmdecerr2'] = exoplanets_data['st_pmdecerr2'].astype(float)
exoplanets_data['pl_tranmid'] = exoplanets_data['pl_tranmid'].astype(float)
exoplanets_data['pl_tranmiderr1'] = exoplanets_data['pl_tranmiderr1'].astype(float)
exoplanets_data['pl_tranmiderr2'] = exoplanets_data['pl_tranmiderr2'].astype(float)
exoplanets_data['pl_orbper'] = exoplanets_data['pl_orbper'].astype(float)
exoplanets_data['pl_orbpererr1'] = exoplanets_data['pl_orbpererr1'].astype(float)
exoplanets_data['pl_orbpererr2'] = exoplanets_data['pl_orbpererr2'].astype(float)
exoplanets_data['pl_trandurh'] = exoplanets_data['pl_trandurh'].astype(float)
exoplanets_data['pl_trandurherr1'] = exoplanets_data['pl_trandurherr1'].astype(float)
exoplanets_data['pl_trandurherr2'] = exoplanets_data['pl_trandurherr2'].astype(float)
exoplanets_data['pl_trandep'] = exoplanets_data['pl_trandep'].astype(float)
exoplanets_data['pl_trandeperr1'] = exoplanets_data['pl_trandeperr1'].astype(float)
exoplanets_data['pl_trandeperr2'] = exoplanets_data['pl_trandeperr2'].astype(float)
exoplanets_data['pl_rade'] = exoplanets_data['pl_rade'].astype(float)
exoplanets_data['pl_radeerr1'] = exoplanets_data['pl_radeerr1'].astype(float)
exoplanets_data['pl_radeerr2'] = exoplanets_data['pl_radeerr2'].astype(float)
exoplanets_data['pl_insol'] = exoplanets_data['pl_insol'].astype(float)
exoplanets_data['pl_insolerr1'] = exoplanets_data['pl_insolerr1'].astype(float)
exoplanets_data['pl_insolerr2'] = exoplanets_data['pl_insolerr2'].astype(float)
exoplanets_data['pl_eqt'] = exoplanets_data['pl_eqt'].astype(float)
exoplanets_data['pl_eqterr1'] = exoplanets_data['pl_eqterr1'].astype(float)
exoplanets_data['pl_eqterr2'] = exoplanets_data['pl_eqterr2'].astype(float)
exoplanets_data['st_tmag'] = exoplanets_data['st_tmag'].astype(float)
exoplanets_data['st_tmagerr1'] = exoplanets_data['st_tmagerr1'].astype(float)
exoplanets_data['st_tmagerr2'] = exoplanets_data['st_tmagerr2'].astype(float)
exoplanets_data['st_dist'] = exoplanets_data['st_dist'].astype(float)
exoplanets_data['st_disterr1'] = exoplanets_data['st_disterr1'].astype(float)
exoplanets_data['st_disterr2'] = exoplanets_data['st_disterr2'].astype(float)
exoplanets_data['st_teff'] = exoplanets_data['st_teff'].astype(float)
exoplanets_data['st_tefferr1'] = exoplanets_data['st_tefferr1'].astype(float)
exoplanets_data['st_tefferr2'] = exoplanets_data['st_tefferr2'].astype(float)
exoplanets_data['st_logg'] = exoplanets_data['st_logg'].astype(float)
exoplanets_data['st_loggerr1'] = exoplanets_data['st_loggerr1'].astype(float)
exoplanets_data['st_loggerr2'] = exoplanets_data['st_loggerr2'].astype(float)
exoplanets_data['st_rad'] = exoplanets_data['st_rad'].astype(float)
exoplanets_data['st_raderr1'] = exoplanets_data['st_raderr1'].astype(float)
exoplanets_data['st_raderr2'] = exoplanets_data['st_raderr2'].astype(float)

In [34]:
# Change any fields with empty spaces to NaN values so we could clean them up later on
exoplanets_data = exoplanets_data.applymap(lambda x: np.nan if isinstance(x, str) and x.isspace() else x)

In [35]:
exoplanets_data['tfopwg_disp'].describe()

count     7201
unique       6
top         PC
freq      4657
Name: tfopwg_disp, dtype: object

In [36]:
exoplanets_data['tfopwg_disp'].value_counts()

tfopwg_disp
PC     4657
FP     1034
KP      537
CP      459
APC     422
FA       92
Name: count, dtype: int64

In [37]:
exoplanets_data = exoplanets_data.dropna(subset=['tfopwg_disp'])
exoplanets_data.describe()

Unnamed: 0,toi,tid,ra,dec,st_pmra,st_pmraerr1,st_pmraerr2,st_pmdec,st_pmdecerr1,st_pmdecerr2,...,st_disterr2,st_teff,st_tefferr1,st_tefferr2,st_logg,st_loggerr1,st_loggerr2,st_rad,st_raderr1,st_raderr2
count,7201.0,7201.0,7201.0,7201.0,7082.0,7082.0,7082.0,7082.0,7082.0,7082.0,...,6557.0,7070.0,6774.0,6774.0,6406.0,5115.0,5115.0,6731.0,5400.0,5400.0
mean,3514.043808,244793700.0,173.97515,-0.641553,-0.894011,0.22775,-0.22775,-9.073147,0.221746,-0.221746,...,-20.065663,5797.879335,210.144294,-210.144294,4.305494,0.180483,-0.180483,1.406639,0.075399,-0.075399
std,2013.411388,163524200.0,101.310846,47.869494,78.855069,0.63558,0.63558,68.212533,0.625519,0.625519,...,141.671429,1502.62498,567.66849,567.66849,0.304901,0.36166,0.36166,1.642614,0.085923,0.085923
min,101.01,2876.0,0.08468,-89.471513,-1624.05,0.015,-8.0,-1230.62,0.016,-8.0,...,-4020.6,2808.0,7.82429,-7000.0,0.1,0.000391,-2.01084,0.114827,0.003357,-1.72302
25%,1757.01,130349700.0,93.921031,-44.938923,-11.07575,0.038,-0.08,-14.7225,0.038,-0.071,...,-11.876,5204.325,122.0,-157.0,4.13,0.08,-0.095659,0.89,0.05,-0.08
50%,3494.01,245462400.0,153.779587,0.458798,-1.787,0.051,-0.051,-3.3255,0.049,-0.049,...,-4.161,5801.0,129.5805,-129.5805,4.33,0.08575,-0.08575,1.23369,0.06,-0.06
75%,5262.01,353038700.0,272.20452,42.677739,8.12825,0.08,-0.038,5.154,0.071,-0.038,...,-0.9885,6301.225,157.0,-122.0,4.5,0.095659,-0.08,1.66,0.08,-0.05
max,7031.01,2041563000.0,359.933006,89.086923,2074.52,8.0,-0.015,1048.84,8.0,-0.016,...,-0.002835,50000.0,7000.0,-7.82429,5.96065,2.01084,-0.000391,102.03,1.72302,-0.003357


In [38]:
# Label CP (confirmed planet) and KP (known planet) as postive samples and FP (false positive) as negative samples
exoplanets_data.loc[((exoplanets_data['tfopwg_disp'] == "CP") | (exoplanets_data['tfopwg_disp'] == "KP")), 'label'] = 1
exoplanets_data.loc[(exoplanets_data['tfopwg_disp'] == "FP"), 'label'] = 0

In [39]:
exoplanets_data['label'].value_counts()

label
0.0    1034
1.0     996
Name: count, dtype: int64

In [40]:
# Drop data fields that are not relevant to training or don't provide value to training
exoplanet_features_data = exoplanets_data.drop(['toi'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['tid'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['tfopwg_disp'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['st_pmraerr1'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['st_pmraerr2'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['st_pmdecerr1'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['st_pmdecerr2'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['pl_tranmiderr1'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['pl_tranmiderr2'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['pl_orbpererr1'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['pl_orbpererr2'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['pl_trandurherr1'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['pl_trandurherr2'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['pl_trandeperr1'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['pl_trandeperr2'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['pl_radeerr1'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['pl_radeerr2'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['pl_insolerr1'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['pl_insolerr2'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['pl_eqterr1'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['pl_eqterr2'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['st_tmagerr1'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['st_tmagerr2'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['st_disterr1'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['st_disterr2'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['st_tefferr1'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['st_tefferr2'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['st_loggerr1'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['st_loggerr2'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['st_raderr1'], axis = 1)
exoplanet_features_data = exoplanet_features_data.drop(['st_raderr2'], axis = 1)
exoplanet_features_data.to_csv('/content/drive/My Drive/Colab Notebooks/ExoplanetClassifiers/KNNClassifier/FeaturesData_DropInrelevantFields_2024.06.23.csv')
exoplanet_features_data

Unnamed: 0,ra,dec,st_pmra,st_pmdec,pl_tranmid,pl_orbper,pl_trandurh,pl_trandep,pl_rade,pl_insol,pl_eqt,st_tmag,st_dist,st_teff,st_logg,st_rad,label
0,112.357708,-12.695960,-5.964,-0.076,2.459230e+06,2.171348,2.017220,656.886099,5.818163,22601.948581,3127.204052,9.604000,485.735,10249.0,4.19,2.169860,0.0
1,122.580465,-5.513852,-4.956,-15.555,2.459988e+06,1.931646,3.166000,1286.000000,11.215400,44464.500000,4045.000000,9.423440,295.862,7070.0,4.03,2.010000,
2,104.726966,-10.580455,-1.462,-2.249,2.459225e+06,1.867557,1.408000,1500.000000,23.752900,2860.610000,2037.000000,9.299501,943.109,8924.0,,5.730000,0.0
3,110.559945,-25.207017,-0.939,1.640,2.458493e+06,2.743230,3.167000,383.410000,,1177.360000,1631.000000,9.300300,7728.170,5388.5,4.15,,0.0
4,122.178195,-48.802811,-4.496,9.347,2.459987e+06,3.573014,3.370000,755.000000,11.311300,54679.300000,4260.000000,9.135500,356.437,9219.0,4.14,2.150000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7198,110.811443,5.562850,2.061,-7.082,2.458494e+06,3.443800,2.572000,7260.750000,,1413.670000,1708.000000,10.078100,1080.880,4805.2,,,0.0
7199,119.349948,-19.516015,-3.900,3.866,2.458495e+06,14.537800,6.826000,4040.000000,16.052300,,,9.792500,379.693,8007.0,,2.050000,0.0
7200,121.319521,-59.579798,-44.770,-2.616,2.459230e+06,8.413486,3.556833,17479.605331,20.510696,127.916421,857.731431,9.772700,113.667,5786.0,4.52,0.926261,0.0
7201,118.319555,-14.218823,-1.706,-1.714,2.459230e+06,0.941436,1.360700,339.912662,4.529209,41562.587811,3641.626449,9.695200,482.278,8322.0,4.01,2.349860,0.0


In [41]:
# Drop off data rows without labels to get training data
training_data = exoplanet_features_data.loc[~np.isnan(exoplanet_features_data['label'])]
training_data['label'].value_counts()
training_data.describe()

Unnamed: 0,ra,dec,st_pmra,st_pmdec,pl_tranmid,pl_orbper,pl_trandurh,pl_trandep,pl_rade,pl_insol,pl_eqt,st_tmag,st_dist,st_teff,st_logg,st_rad,label
count,2030.0,2030.0,2008.0,2008.0,2030.0,2007.0,2030.0,2030.0,1921.0,1972.0,1946.0,2030.0,1993.0,1997.0,1846.0,1921.0,2030.0
mean,176.529115,1.920753,1.83815,-14.920173,2459321.0,8.935517,3.007747,8624.570937,10.743431,4144.881653,1461.486768,10.933019,417.322841,5852.984552,4.308429,1.531377,0.49064
std,106.558725,46.260083,126.013408,96.837248,586.7907,51.123985,1.698272,22648.929982,13.006856,16019.719596,845.372887,1.706347,630.093878,1598.774543,0.322753,2.830962,0.500036
min,0.185606,-89.471513,-1624.05,-1230.62,2458325.0,0.247668,0.199,100.0,0.757588,0.018728,103.0,4.6278,6.53127,2828.0,2.36,0.114827,0.0
25%,90.54956,-39.586708,-13.209,-18.54075,2458766.0,1.901212,1.88164,1134.1725,4.026926,120.41225,895.695726,9.884435,140.225,5134.0,4.11,0.864173,0.0
50%,150.978382,2.229966,-1.67,-3.9275,2459323.0,3.522475,2.6755,3729.817355,10.2185,575.0365,1312.49824,10.7496,280.355,5817.3,4.34,1.25505,0.0
75%,287.596915,44.248476,11.76375,5.5815,2459824.0,6.879605,3.675418,10220.53279,14.2226,1978.81584,1797.75,12.098875,494.985,6327.16,4.51,1.72,1.0
max,359.900874,87.86853,2074.52,1048.84,2460406.0,1825.048364,24.052709,504062.379922,297.111726,280833.0,6413.0,18.3324,8341.8,31000.0,5.59998,102.03,1.0


In [42]:
# Apply simple imputation by filling in missing values with mean
training_data = training_data.fillna(training_data.mean())
training_data = training_data.dropna()
training_data.to_csv('/content/drive/My Drive/Colab Notebooks/ExoplanetClassifiers/KNNClassifier/TrainingData_Final_2024.06.23.csv')
training_data

Unnamed: 0,ra,dec,st_pmra,st_pmdec,pl_tranmid,pl_orbper,pl_trandurh,pl_trandep,pl_rade,pl_insol,pl_eqt,st_tmag,st_dist,st_teff,st_logg,st_rad,label
0,112.357708,-12.695960,-5.964,-0.076,2.459230e+06,2.171348,2.017220,656.886099,5.818163,22601.948581,3127.204052,9.604000,485.735,10249.0,4.190000,2.169860,0.0
2,104.726966,-10.580455,-1.462,-2.249,2.459225e+06,1.867557,1.408000,1500.000000,23.752900,2860.610000,2037.000000,9.299501,943.109,8924.0,4.308429,5.730000,0.0
3,110.559945,-25.207017,-0.939,1.640,2.458493e+06,2.743230,3.167000,383.410000,10.743431,1177.360000,1631.000000,9.300300,7728.170,5388.5,4.150000,1.531377,0.0
4,122.178195,-48.802811,-4.496,9.347,2.459987e+06,3.573014,3.370000,755.000000,11.311300,54679.300000,4260.000000,9.135500,356.437,9219.0,4.140000,2.150000,0.0
5,120.704811,-11.101521,-26.932,-2.901,2.458493e+06,4.550720,2.599000,3620.000000,6.544490,254.050000,1112.000000,9.130900,100.711,5613.0,4.308429,1.090000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7198,110.811443,5.562850,2.061,-7.082,2.458494e+06,3.443800,2.572000,7260.750000,10.743431,1413.670000,1708.000000,10.078100,1080.880,4805.2,4.308429,1.531377,0.0
7199,119.349948,-19.516015,-3.900,3.866,2.458495e+06,14.537800,6.826000,4040.000000,16.052300,4144.881653,1461.486768,9.792500,379.693,8007.0,4.308429,2.050000,0.0
7200,121.319521,-59.579798,-44.770,-2.616,2.459230e+06,8.413486,3.556833,17479.605331,20.510696,127.916421,857.731431,9.772700,113.667,5786.0,4.520000,0.926261,0.0
7201,118.319555,-14.218823,-1.706,-1.714,2.459230e+06,0.941436,1.360700,339.912662,4.529209,41562.587811,3641.626449,9.695200,482.278,8322.0,4.010000,2.349860,0.0


In [43]:
features = training_data.drop(['label'], axis = 1)
results = training_data.label

In [44]:
# Split data with 70% for training and 30% for testing
features_train, features_test, results_train, results_test = train_test_split(features, results, test_size = 0.3, random_state = 0)

In [45]:
# Standadize the scales of features
sc = StandardScaler()
features_train_sc = sc.fit_transform(features_train)
features_test_sc = sc.transform(features_test)

In [20]:
#features_train = normalized = normalize(features_train)
#features_test = normalize(features_test)

In [21]:
#features_train = filtered = ndimage.filters.gaussian_filter(features_train, sigma=10)
#features_test = ndimage.filters.gaussian_filter(features_test, sigma=10)

In [22]:
#std_scaler = StandardScaler()
#features_train_sc = scaled = std_scaler.fit_transform(features_train)
#features_test_sc = std_scaler.fit_transform(features_test)

In [46]:
# Train a KNN classifier with the training data
knn_classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knn_classifier.fit(features_train_sc, results_train)

In [47]:
# Call trained KNN classifier to get predictions for the testing data
results_pred = knn_classifier.predict(features_test_sc)

In [48]:
# Calculate accuracy, precision, recall, and F-1 scores for the KNN classifier
print("KNN Classifier Accuracy: ", accuracy_score(results_test, results_pred))
print()
print("KNN Classifier Classification Report :\n",(classification_report(results_test,results_pred)))

KNN Classifier Accuracy:  0.8095238095238095

KNN Classifier Classification Report :
               precision    recall  f1-score   support

         0.0       0.90      0.71      0.79       309
         1.0       0.75      0.92      0.83       300

    accuracy                           0.81       609
   macro avg       0.82      0.81      0.81       609
weighted avg       0.83      0.81      0.81       609



In [49]:
# Save the KNN classifier model onto disk
knnPickle = open('/content/drive/My Drive/Colab Notebooks/ExoplanetClassifiers/KNNClassifier/exoplanet_knn_classifier_v1', 'wb')
pickle.dump(knn_classifier, knnPickle)
knnPickle.close()

In [50]:
# Load the KNN classifier model from disk and test with test data
loaded_knn_classifier = pickle.load(open('/content/drive/My Drive/Colab Notebooks/ExoplanetClassifiers/KNNClassifier/exoplanet_knn_classifier_v1', 'rb'))
test_pred_results = loaded_knn_classifier.predict(features_test_sc)
test_pred_results

array([0., 0., 1., 1., 1., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1.,
       0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 0., 1.,
       1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0.,
       0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1.,
       1., 1., 0., 1., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1.,
       1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1.,
       1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0.,
       0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 0.,
       1., 0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 0., 1., 1.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1., 0., 1., 1.,
       1., 0., 1., 1., 0.

In [51]:
# The data rows with NaN labels are exoplanet candidates
candidate_exoplanets = exoplanet_features_data.loc[np.isnan(exoplanet_features_data['label'])]
candidate_exoplanets = candidate_exoplanets.drop(['label'], axis = 1)
candidate_exoplanets.describe()

Unnamed: 0,ra,dec,st_pmra,st_pmdec,pl_tranmid,pl_orbper,pl_trandurh,pl_trandep,pl_rade,pl_insol,pl_eqt,st_tmag,st_dist,st_teff,st_logg,st_rad
count,5171.0,5171.0,5074.0,5074.0,5171.0,5092.0,5171.0,5171.0,4810.0,5079.0,4980.0,5171.0,5001.0,5073.0,4560.0,4810.0
mean,172.97253,-1.647448,-1.975244,-6.759228,2459521.0,22.132351,3.06263,7916.216182,10.150587,1413.704845,1212.393324,11.751034,497.943562,5776.18702,4.304306,1.356821
std,99.167462,48.454063,48.918294,52.591379,503.2803,114.176194,1.894036,11359.058667,6.171677,6553.308426,590.686114,1.550294,541.577126,1462.637987,0.297398,0.753362
min,0.08468,-88.568495,-840.114,-1098.04,2457926.0,0.152076,0.101,23.45397,0.552507,0.000342,37.0,5.28906,10.454,2808.0,0.1,0.125969
25%,95.11591,-47.023823,-10.4395,-13.615,2459232.0,2.689045,1.824944,1557.270908,4.563078,75.644617,799.306351,10.6849,191.145,5225.91,4.13,0.9
50%,154.302943,-0.80334,-1.81,-3.1165,2459586.0,4.346767,2.748,5042.0,10.6191,310.868,1145.5,12.122,405.634,5792.8,4.33,1.23
75%,265.412802,42.205059,7.04325,5.025,2459910.0,8.252952,3.811241,10334.10964,14.021275,978.9045,1529.0,12.94545,688.605,6287.2,4.5,1.63
max,359.933006,89.086923,531.463,795.85,2460421.0,1837.889731,22.90265,225793.106096,140.194,238125.935921,5634.05944,16.1779,14728.3,50000.0,5.96065,17.34


In [52]:
candidate_exoplanets = candidate_exoplanets.fillna(candidate_exoplanets.mean())
candidate_exoplanets = candidate_exoplanets.dropna()
candidate_exoplanets.describe()

Unnamed: 0,ra,dec,st_pmra,st_pmdec,pl_tranmid,pl_orbper,pl_trandurh,pl_trandep,pl_rade,pl_insol,pl_eqt,st_tmag,st_dist,st_teff,st_logg,st_rad
count,5171.0,5171.0,5171.0,5171.0,5171.0,5171.0,5171.0,5171.0,5171.0,5171.0,5171.0,5171.0,5171.0,5171.0,5171.0,5171.0
mean,172.97253,-1.647448,-1.975244,-6.759228,2459521.0,22.132351,3.06263,7916.216182,10.150587,1413.704845,1212.393324,11.751034,497.943562,5776.18702,4.304306,1.356821
std,99.167462,48.454063,48.457217,52.09568,503.2803,113.300503,1.894036,11359.058667,5.952307,6494.738729,579.672307,1.550294,532.598629,1448.709138,0.279272,0.726584
min,0.08468,-88.568495,-840.114,-1098.04,2457926.0,0.152076,0.101,23.45397,0.552507,0.000342,37.0,5.28906,10.454,2808.0,0.1,0.125969
25%,95.11591,-47.023823,-10.1645,-13.199,2459232.0,2.731434,1.824944,1557.270908,5.100595,78.148314,809.161269,10.6849,197.187,5244.0,4.16,0.93
50%,154.302943,-0.80334,-1.975244,-3.401,2459586.0,4.416399,2.748,5042.0,10.150587,324.912095,1169.0,12.122,425.694,5780.0,4.304306,1.29
75%,265.412802,42.205059,6.73,4.8185,2459910.0,8.609472,3.811241,10334.10964,13.7204,1052.639871,1513.0,12.94545,677.752,6277.5,4.47,1.59
max,359.933006,89.086923,531.463,795.85,2460421.0,1837.889731,22.90265,225793.106096,140.194,238125.935921,5634.05944,16.1779,14728.3,50000.0,5.96065,17.34


In [53]:
candidate_exoplanets_sc = sc.transform(candidate_exoplanets)

In [54]:
candidate_pred_results = loaded_knn_classifier.predict(candidate_exoplanets_sc)
candidate_pred_results

array([0., 1., 0., ..., 0., 0., 0.])