In [1]:
#import libraries
import pandas as pd
import mpl_style
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_theme()
import warnings

# %pip install numpy==1.20.3


warnings.filterwarnings('ignore')
from scipy import stats

#import sklearn packages
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, auc

ModuleNotFoundError: No module named 'seaborn'

In [None]:
plt.style.use(mpl_style.style1)

In [None]:
# Load the exoplanet table from a CSV in the working directory.
# index_col=False stops pandas from using the first column as the index.
df = pd.read_csv('./exoplanets.csv', index_col=False)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Map the raw archive column codes to descriptive names.
# Columns absent from the frame are ignored by DataFrame.rename, so this cell
# is safe to re-run.
COLUMN_NAMES = {
    'kepid': 'KepID',
    'kepoi_name': 'KOIName',
    'kepler_name': 'KeplerName',
    'koi_disposition': 'ExoplanetArchiveDisposition',
    'koi_pdisposition': 'DispositionUsingKeplerData',
    'koi_score': 'DispositionScore',
    'koi_fpflag_nt': 'NotTransit-LikeFalsePositiveFlag',
    # Identity mapping kept on purpose: the column-drop cell further down
    # refers to this column by its raw name.
    'koi_fpflag_ss': 'koi_fpflag_ss',
    'koi_fpflag_co': 'CentroidOffsetFalsePositiveFlag',
    'koi_fpflag_ec': 'EphemerisMatchIndicatesContaminationFalsePositiveFlag',
    'koi_period': 'OrbitalPeriod[days]',
    'koi_period_err1': 'OrbitalPeriodUpperUnc.[days]',
    'koi_period_err2': 'OrbitalPeriodLowerUnc.[days]',
    'koi_time0bk': 'TransitEpoch[BKJD]',
    'koi_time0bk_err1': 'TransitEpochUpperUnc.[BKJD]',
    'koi_time0bk_err2': 'TransitEpochLowerUnc.[BKJD]',
    'koi_impact': 'ImpactParameter',
    'koi_impact_err1': 'ImpactParameterUpperUnc',
    'koi_impact_err2': 'ImpactParameterLowerUnc',
    'koi_duration': 'TransitDuration[hrs]',
    'koi_duration_err1': 'TransitDurationUpperUnc.[hrs]',
    'koi_duration_err2': 'TransitDurationLowerUnc.[hrs]',
    'koi_depth': 'TransitDepth[ppm]',
    'koi_depth_err1': 'TransitDepthUpperUnc.[ppm]',
    'koi_depth_err2': 'TransitDepthLowerUnc.[ppm]',
    'koi_prad': 'PlanetaryRadius[Earthradii]',
    'koi_prad_err1': 'PlanetaryRadiusUpperUnc.[Earthradii]',
    'koi_prad_err2': 'PlanetaryRadiusLowerUnc.[Earthradii]',
    # Closing bracket added so the name matches the other "[K]" columns.
    'koi_teq': 'EquilibriumTemperature[K]',
    'koi_teq_err1': 'EquilibriumTemperatureUpperUnc.[K]',
    'koi_teq_err2': 'EquilibriumTemperatureLowerUnc.[K]',
    'koi_insol': 'InsolationFlux[Earthflux]',
    'koi_insol_err1': 'InsolationFluxUpperUnc.[Earthflux]',
    'koi_insol_err2': 'InsolationFluxLowerUnc.[Earthflux]',
    'koi_model_snr': 'TransitSignal-to-Noise',
    'koi_tce_plnt_num': 'TCEPlanetNumber',
    'koi_tce_delivname': 'TCEDeliver',
    'koi_steff': 'StellarEffectiveTemperature[K]',
    'koi_steff_err1': 'StellarEffectiveTemperatureUpperUnc.[K]',
    'koi_steff_err2': 'StellarEffectiveTemperatureLowerUnc.[K]',
    'koi_slogg': 'StellarSurfaceGravity[log10(cm/s**2)]',
    'koi_slogg_err1': 'StellarSurfaceGravityUpperUnc.[log10(cm/s**2)]',
    'koi_slogg_err2': 'StellarSurfaceGravityLowerUnc.[log10(cm/s**2)]',
    'koi_srad': 'StellarRadius[Solarradii]',
    'koi_srad_err1': 'StellarRadiusUpperUnc.[Solarradii]',
    'koi_srad_err2': 'StellarRadiusLowerUnc.[Solarradii]',
    'ra': 'RA[decimaldegrees]',
    'dec': 'Dec[decimaldegrees]',
    'koi_kepmag': 'Kepler-band[mag]',
}
df = df.rename(columns=COLUMN_NAMES)
# Show a short preview instead of the whole table.
df.head()

In [None]:
df.describe()

In [None]:
# Binary target: 1 where the Kepler-data disposition is 'CANDIDATE', 0 otherwise.
kepler_disposition = df['DispositionUsingKeplerData']
df['ExoPlanetCandidate'] = (kepler_disposition == 'CANDIDATE').astype('int64')

# Three-level archive label: 2 = 'CONFIRMED', 1 = 'CANDIDATE', 0 = anything else.
archive_codes = {'CONFIRMED': 2, 'CANDIDATE': 1}
df['ExoPlanetConfirmed'] = (
    df['ExoplanetArchiveDisposition'].map(archive_codes).fillna(0).astype('int64')
)

In [None]:
df_dropped = df.copy()

In [None]:
# sns.set_style("darkgrid")
# sns.set(rc={'figure.figsize':(8.7,12.27)})
# sns.countplot(x='StellarEclipseFalsePositiveFlag', data=df_dropped, palette='muted')


In [None]:
# Columns excluded from the modelling frame: names/identifiers, the two
# disposition strings the label columns were derived from, the four
# false-positive flags, the TCE delivery name and the equilibrium-temperature
# uncertainty columns.
COLUMNS_TO_DROP = [
    'KeplerName', 'EquilibriumTemperatureUpperUnc.[K]', 'KOIName',
    'KepID', 'ExoplanetArchiveDisposition', 'TCEDeliver',
    'NotTransit-LikeFalsePositiveFlag',
    'EphemerisMatchIndicatesContaminationFalsePositiveFlag',
    'koi_fpflag_ss', 'CentroidOffsetFalsePositiveFlag',
    'DispositionUsingKeplerData',
    'EquilibriumTemperatureLowerUnc.[K]',
]
# Rebuild df_dropped from df rather than dropping in place, so re-running this
# cell does not raise KeyError on columns that were already removed.
df_dropped = df.drop(columns=COLUMNS_TO_DROP)

In [None]:
# Show which columns contain missing values. This must be printed explicitly:
# only the last expression of a cell is displayed automatically.
print(df_dropped.isna().any())
# Remove every ROW that has at least one NaN (dropna works on rows by default).
df_dropped = df_dropped.dropna()

In [None]:
df_dropped

In [None]:
# Report the span of right ascension covered by the cleaned table.
right_ascension = df_dropped['RA[decimaldegrees]']
print(f'minimum right ascension: {right_ascension.min()} degrees')
print(f'maximum right ascension: {right_ascension.max()} degrees')

In [None]:
# Histogram of stellar effective temperature with a Gaussian KDE on top.
teff = df_dropped['StellarEffectiveTemperature[K]']
kde = stats.gaussian_kde(teff)

fig, ax = plt.subplots(figsize=(8, 4))
# density=True puts the bars on the same (probability-density) scale as the
# KDE; with raw counts the KDE curve is indistinguishable from zero.
ax.hist(teff, bins=100, density=True, color='red', alpha=0.9)
# Evaluate the KDE over the observed range instead of a fixed 0-10000 grid.
xx = np.linspace(teff.min(), teff.max(), 1000)
ax.plot(xx, kde(xx), color='black', linewidth=2.5)
ax.set_xlabel('Stellar Effective Temperature [K]')
ax.set_ylabel('Density')

plt.show()

In [None]:
# Sky positions (RA/Dec) of the objects, split by disposition.
sns.set_style('darkgrid')
# The figure size has to be passed when the figure is created; assigning
# plt.figsize only sets an unused attribute on the pyplot module.
fig, ax = plt.subplots(figsize=(10, 10))

# (legend label, row mask, marker, marker size)
groups = [
    ('False Flag', df_dropped['ExoPlanetCandidate'] == 0, 'o', 1),
    ('Candidate', df_dropped['ExoPlanetCandidate'] == 1, '+', 3),
    ('Confirmed', df_dropped['ExoPlanetConfirmed'] == 2, '*', 3),
]
for label, mask, marker, size in groups:
    subset = df_dropped[mask]
    ax.plot(subset['RA[decimaldegrees]'], subset['Dec[decimaldegrees]'],
            marker, markersize=size, alpha=0.3, label=label)

# Raw strings: '\c' is not a valid escape sequence in a normal string literal.
ax.set_xlabel(r'Right Ascension [decimal degrees $^{\cdot}$]')
ax.set_ylabel(r'Declination [decimal degrees $^{\cdot}$]')
ax.set_title('Exoplanet Candidates and Confirmed Exoplanets')
# Labels come from the plot calls, so legend entries cannot drift out of order.
ax.legend(loc='lower right', fontsize='smaller')

In [None]:
# Keep only the features whose correlation with ExoPlanetCandidate exceeds 0.1,
# then remove the target's own row (its column is kept).
# NOTE(review): the threshold is one-sided, so negatively correlated features
# are filtered out as well - confirm that this is intended.
corr_full = df_dropped.corr()
selected = corr_full.index[corr_full['ExoPlanetCandidate'] > 0.1]
corr = corr_full.loc[selected, selected].drop(index='ExoPlanetCandidate')


In [None]:
# Heatmap of the filtered correlation matrix.
corr.head()  # not displayed: only a cell's final expression is echoed
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, annot=False, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# def clean_data(df):
#     assert isinstance(df, pd.DataFrame), "df needs to be the correct type"
#     df.dropna(inplace=True)
#     indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)


#     return df[indices_to_keep].astype(np.float64)

# clean_data(df_dropped)

def clean_data(df):
    """Return a float64 copy of ``df`` without NaN or infinite rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Rows containing NaN are removed from ``df`` itself
        (in place); rows containing +/-inf are only left out of the result.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with every column cast to ``numpy.float64``.

    Raises
    ------
    AssertionError
        If ``df`` is not a DataFrame.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be the correct type"
    # In-place on purpose: callers that ignore the return value still get the
    # NaN rows removed from their frame.
    df.dropna(inplace=True)

    # `axis` must be passed by keyword; pandas 2.x rejects a positional axis.
    # NaN no longer needs checking here because dropna already removed it.
    has_infinite = df.isin([np.inf, -np.inf]).any(axis=1)
    return df[~has_infinite].astype(np.float64)

# NOTE(review): the cleaned frame returned here is not assigned to anything;
# only the in-place dropna inside clean_data changes df_dropped.
clean_data(df_dropped)

In [None]:
# List the remaining column names, one per line.
column_names = [str(name) for name in df_dropped.columns]
if column_names:
    print('\n'.join(column_names))

In [None]:
# Distribution of stellar effective temperature.
# sns.distplot is deprecated; histplot + kdeplot is the supported equivalent
# and still allows the KDE line to be styled separately from the bars.
teff = df_dropped['StellarEffectiveTemperature[K]']
fig, ax = plt.subplots()
sns.histplot(teff, bins=100, color='blue', stat='density', ax=ax)
sns.kdeplot(teff, color='orange', linewidth=3, label='KDE', ax=ax)
ax.set_title('StellarEffectiveTemperature Distribution')
ax.set_xlabel('StellarEffectiveTemperature [K]')
ax.legend()

In [None]:
# Histogram of orbital period on a log-scaled x axis, split by candidate flag.
sns.set(rc={'figure.figsize':(8.7,12.27)})
ax = sns.histplot(x='OrbitalPeriod[days]', data=df_dropped, log_scale=True, palette="bright", hue='ExoPlanetCandidate', kde=True, bins=100, alpha=0.5)
# A log axis cannot start at 0; 0.1 days matches the lower bound used for the
# box plot of the same quantity.
ax.set_xlim(0.1, 1500)
ax.set_xlabel('Log Orbital Period [days]')
ax.set_title('Histogram of Orbital Periods')

In [None]:
# Box plot of orbital period (log y axis) for each ExoPlanetCandidate class;
# outliers are hidden.
period_grid = sns.catplot(data=df_dropped, x='ExoPlanetCandidate', y='OrbitalPeriod[days]',
                          kind='box', palette='muted', showfliers=False)
period_ax = period_grid.ax
period_ax.set_title('Orbital Periods of Exoplanet Candidates')
period_ax.set_yscale('log')
period_ax.set_ylabel('Log Orbital Period (days)')
period_ax.set_ylim(0.1, 1500)

In [None]:
# Apply feature selection to the data
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler

# Create a function to apply feature selection to the data
def feature_selection(df, k):
    """Standardise the features and keep the ``k`` best by ANOVA F-score.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the 'ExoPlanetCandidate'
        target column.
    k : int
        Number of features to select.

    Returns
    -------
    X : numpy.ndarray
        Scaled values of the ``k`` selected features.
    y : pandas.Series
        The 'ExoPlanetCandidate' column of ``df``.

    Raises
    ------
    AssertionError
        If ``df`` is not a DataFrame or ``k`` is not an int.
    KeyError
        If ``df`` has no 'ExoPlanetCandidate' column.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be the correct type"
    assert isinstance(k, int), "k needs to be the correct type" # k is the number of features to select
    y = df['ExoPlanetCandidate']
    # 'ExoPlanetConfirmed' is itself a disposition label, so it must not be
    # offered as a feature (the modelling cells exclude it too).
    # errors='ignore' tolerates frames that never had that column.
    X = df.drop(columns=['ExoPlanetCandidate', 'ExoPlanetConfirmed'], errors='ignore')
    X = StandardScaler().fit_transform(X)
    X = SelectKBest(f_classif, k=k).fit_transform(X, y)
    return X, y

# Apply feature selection to the data
#X, y = feature_selection(df_dropped, 10)


In [None]:
# Features: everything except the two label columns. Target: the binary flag.
label_columns = ['ExoPlanetCandidate', 'ExoPlanetConfirmed']
X = df_dropped.drop(columns=label_columns)
y = df_dropped['ExoPlanetCandidate']

In [None]:
# Hold out 20% of the rows as a test set; random_state=21 fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

In [None]:
rf = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=21)
rf.fit(X_train, y_train)


In [None]:
# Score the random forest on the held-out test set.
y_pred_rf = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f'Accuracy of random forest classifier on test set: {rf_accuracy:.2f}')

In [None]:
# Confusion matrix for the random forest predictions.
from sklearn.metrics import confusion_matrix

# Sorted class labels, passed to confusion_matrix explicitly so the tick
# labels are guaranteed to match the matrix rows/columns
# (Series.unique() returns values in order of appearance, not sorted).
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred_rf)]))
# Stored under its own name so the imported confusion_matrix function is not
# overwritten by its result.
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=class_labels)

# Reset sns style
sns.set(rc={'figure.figsize':(10,10)})
sns.set_style("darkgrid")
# Letting heatmap place the tick labels centres them on the cells.
ax = sns.heatmap(cm_rf, annot=True, cmap='Blues', fmt='g',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix for Random Forest Classifier')
plt.show()


In [None]:
# Logistic regression baseline (class-balanced, liblinear solver).
lr = LogisticRegression(C=100, max_iter=200, class_weight='balanced', solver='liblinear')

# Fit on the training split
lr.fit(X_train, y_train)

# Predict once and expose the result under both names this cell defines.
y_pred_lr = lr.predict(X_test)
y_pred = y_pred_lr

# The model is a logistic regression, so the message names it as such.
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(accuracy_score(y_test, y_pred_lr)))

In [None]:
# Per-class precision / recall / F1 for the random forest predictions
# (y_pred_rf), not for the logistic regression fitted in the previous cell.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_rf))


In [None]:
# Improve the model using a CV Grid Search
from sklearn.model_selection import GridSearchCV

# Exhaustive, cross-validated grid search over param_grid (GridSearchCV)
def grid_search(X, y, model, param_grid, cv, scoring=None):
    """Run a cross-validated grid search and return the best result.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features.
    y : pandas.Series
        Training target.
    model : estimator
        Estimator to tune; must provide ``fit``.
    param_grid : dict
        Parameter names mapped to the candidate values to try.
    cv : int
        Number of cross-validation folds.
    scoring : str, callable or None, optional
        Passed to ``GridSearchCV``. ``None`` (the default) uses the
        estimator's own ``score`` method.

    Returns
    -------
    tuple
        ``(best_score_, best_estimator_)`` of the fitted search.

    Raises
    ------
    AssertionError
        If an argument has the wrong type.
    """
    assert isinstance(X, pd.DataFrame), "X needs to be the correct type"
    assert isinstance(y, pd.Series), "y needs to be the correct type"
    # isinstance(model, object) is true for every value; check for the method
    # the search actually needs instead.
    assert hasattr(model, "fit"), "model needs to be the correct type"
    assert isinstance(param_grid, dict), "param_grid needs to be the correct type"
    assert isinstance(cv, int), "cv needs to be the correct type"
    # n_jobs=-1 uses all cores; verbose=2 logs each fit.
    grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv,
                        scoring=scoring, n_jobs=-1, verbose=2)
    grid.fit(X, y)
    return grid.best_score_, grid.best_estimator_

# Weighted-average F1 scorer.
# NOTE(review): `f1` is not passed to any of the searches in this notebook.
from sklearn.metrics import make_scorer, f1_score
f1 = make_scorer(f1_score , average='weighted')


# First tuning round: forest size and tree depth only
# (max_features and criterion are left for a later round).
param_grid = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[4, 5, 6, 7, 8],
)
best_score, best_model = grid_search(X_train, y_train, rf, param_grid, 5)
print(f'Best score: {best_score}')

In [None]:
# Output the best model
print(best_model)

In [None]:
# Save the best model
import pickle
# filename = 'best_rf_model_binary.sav'
# pickle.dump(best_model, open(filename, 'wb'))

In [None]:
# Second tuning round: n_estimators and max_depth are pinned, while
# max_features and criterion vary.
# 'auto' is omitted: for classifiers it was an alias of 'sqrt', and current
# scikit-learn releases reject it.
param_grid = {'n_estimators': [200],
              'max_features': ['sqrt', 'log2'],
              'max_depth': [6],
              'criterion': ['gini', 'entropy']}
best_score2, best_model2 = grid_search(X_train, y_train, best_model, param_grid, 5)


In [None]:
# Report the second-round score and the estimator that achieved it.
print(f'Best score for model 2: {best_score2}')
print(best_model2)

In [None]:
# Create an adaboost classifier
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(random_state=21)
ada.fit(X_train, y_train)

In [None]:
# Score AdaBoost on the held-out test set.
y_pred = ada.predict(X_test)
ada_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of AdaBoost classifier on test set: {ada_accuracy:.2f}')
# Worse than random forest classifier

In [None]:
from sklearn.tree import export_graphviz
import os
# Export one tree of the fitted forest (index 5) for inspection.
estimator = rf.estimators_[5]
# Export as dot file
export_graphviz(estimator, out_file='tree.dot',
                # Plain list of names: a pandas Index is not accepted by every
                # scikit-learn release.
                feature_names = list(X.columns),
                class_names = ['Not Candidate', 'Candidate'],
                rounded = True, proportion = False,
                precision = 2, filled = True)



In [None]:
# Convert tree.dot to tree.png with the `dot` command-line tool.
# os.system returns the command's exit status, which is this cell's output;
# a non-zero status does not raise.
os.system('dot -Tpng tree.dot -o tree.png')
#from subprocess import call
#call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

In [None]:
# Create a pipeline to apply a grid search to the data
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

# Pipeline: standardise -> PCA -> random forest
pipe = Pipeline([('scaler', StandardScaler()),
                 ('pca', PCA()),
                 ('classifier', RandomForestClassifier(random_state=21))])

# Search space for the forest step ('classifier__' routes to that step).
# 'auto' is omitted from max_features: for classifiers it was an alias of
# 'sqrt', and current scikit-learn releases reject it.
param_grid = {'classifier__n_estimators': [100, 200, 300, 400, 500],
              'classifier__max_features': ['sqrt', 'log2'],
              'classifier__max_depth': [4, 5, 6, 7, 8],
              'classifier__criterion': ['gini', 'entropy']}
# 5-fold exhaustive search on all cores
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)
# Best cross-validated score, fitted pipeline and parameter combination
print(grid.best_score_)
print(grid.best_estimator_)
print(grid.best_params_)
# Rebinds best_model to the tuned pipeline (replacing the earlier tuned forest)
best_model = grid.best_estimator_
# Check the accuracy of the model
y_pred = best_model.predict(X_test)
print('Accuracy of best model on test set: {:.2f}'.format(accuracy_score(y_test, y_pred)))
# Still worse than the random forest classifier

In [None]:
# Create a set of classifiers to compare the accuracy of the models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

In [None]:
# Create a list of classifiers
classifiers = [LogisticRegression(solver='liblinear'),
               DecisionTreeClassifier(),
               KNeighborsClassifier(),
               RandomForestClassifier(),
               AdaBoostClassifier(),
               GradientBoostingClassifier(),
               SVC(),
               GaussianNB()]

In [None]:
# Mean 5-fold cross-validation score for each candidate classifier
def compare_accuracy(X, y, classifiers):
    """Return the mean 5-fold CV score of every classifier, in input order.

    Each classifier is first fitted on all of ``X``/``y`` (so the objects in
    ``classifiers`` are left fitted), then scored with ``cross_val_score``.

    Parameters
    ----------
    X : pandas.DataFrame
        Features.
    y : pandas.Series
        Target.
    classifiers : list
        Estimators to evaluate.

    Returns
    -------
    list
        One mean score per classifier.
    """
    assert isinstance(X, pd.DataFrame), "X needs to be the correct type"
    assert isinstance(y, pd.Series), "y needs to be the correct type"
    assert isinstance(classifiers, list), "classifiers needs to be the correct type"

    def _mean_cv_score(estimator):
        estimator.fit(X, y)
        return cross_val_score(estimator, X, y, cv=5).mean()

    return [_mean_cv_score(estimator) for estimator in classifiers]

In [None]:
# Apply the function to the data
accuracy = compare_accuracy(X_train, y_train, classifiers)

In [None]:
# Table of mean CV scores, indexed by classifier class name.
# The names are taken from the `classifiers` list itself, so the index cannot
# fall out of step with the scores if the list is edited.
classifier_names = [type(clf).__name__ for clf in classifiers]
accuracy_df = pd.DataFrame(accuracy, index=classifier_names)


In [None]:
# Name the score column, order best-first, and print the table.
accuracy_df = (
    accuracy_df
    .rename(columns={0: 'Accuracy'})
    .sort_values(by='Accuracy', ascending=False)
)
print(accuracy_df)

In [None]:
# Multi-layer perceptron (three hidden layers of 30 units) on the features as
# they currently are.
from sklearn.neural_network import MLPClassifier
# random_state fixes the weight initialisation so the reported accuracy is
# reproducible; 21 matches the seed used by the other models.
mlp = MLPClassifier(hidden_layer_sizes=(30,30,30), random_state=21)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
print('Accuracy of MLP classifier on test set: {:.2f}'.format(accuracy_score(y_test, y_pred_mlp)))


In [None]:
# Retrain the MLP on standardised features.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit only to the training data
scaler.fit(X_train)
# NOTE: X_train / X_test are rebound to the scaled arrays here, so every cell
# that runs afterwards (including the voting ensemble) sees scaled features.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# random_state fixes the weight initialisation so the comparison with the
# unscaled run is reproducible.
mlp = MLPClassifier(hidden_layer_sizes=(30,30,30), random_state=21)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
print('Accuracy of MLP classifier on test set: {:.2f}'.format(accuracy_score(y_test, y_pred_mlp)))
# Far better than before

In [None]:
# Voting ensemble over logistic regression, random forest and the MLP.
from sklearn.ensemble import VotingClassifier
models = [('lr', lr), ('rf', rf), ('mlp', mlp)]
ensemble = VotingClassifier(estimators=models)
ensemble.fit(X_train, y_train)
# Score the ensemble on the test split
y_pred_ensemble = ensemble.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
print(f'Accuracy of ensemble classifier on test set: {ensemble_accuracy:.2f}')



In [None]:
# Create classification report for the ensemble
print(classification_report(y_test, y_pred_ensemble))
# Recall is the ability of the classifier to find all the positive samples
# Precision is the ability of the classifier not to label as positive a sample that is negative


In [None]:
# Create confusion matrix for the ensemble
# confusion_matrix_ensemble = confusion_matrix(y_test, y_pred_ensemble)

# # Reset sns style
# sns.set_style("darkgrid")
# # Plot the confusion matrix
# sns.heatmap(confusion_matrix_ensemble, annot=True, cmap='Blues', fmt='g')
# plt.xlabel('Predicted')
# plt.ylabel('True')
# #Apply actual labels to the confusion matrix axis
# tick_marks = np.arange(len(y_test.unique()))
# plt.xticks(tick_marks, y_test.unique())
# plt.yticks(tick_marks, y_test.unique())

# # Add a title
# plt.title('Confusion Matrix for Random Forest Classifier')
# plt.show()

In [None]:
# Create streamlit app
import streamlit as st

# Create a title
#st.title('Exoplanet Classification')
