### Processing the data

#### data wrangling and cleaning

In [47]:
# import dependencies
import pandas as pd

# Load the raw exoplanet table from disk.
df = pd.read_csv('resources/exoplanet_data.csv')

# Pass 1: discard columns in which every single value is missing.
column_clean = df.dropna(axis=1, how='all')

# Pass 2: discard every row that still contains at least one missing value.
df_clean = column_clean.dropna(axis=0, how='any')

# Preview the first nine cleaned rows.
df_clean.head(9)

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714
5,CONFIRMED,0,0,0,0,2.566589,1.78e-05,-1.78e-05,179.55437,0.00461,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714
6,CONFIRMED,0,0,0,0,16.068647,1.09e-05,-1.09e-05,173.621937,0.000517,...,-83,4.485,0.083,-0.028,0.848,0.033,-0.072,286.99948,48.37579,15.841
7,CONFIRMED,0,0,0,0,2.470613,2.7e-08,-2.7e-08,122.763305,9e-06,...,-78,4.457,0.024,-0.024,0.964,0.038,-0.038,286.80847,49.316399,11.338
8,CONFIRMED,0,1,0,0,2.204735,4.3e-08,-4.3e-08,121.358542,1.6e-05,...,-89,4.019,0.033,-0.027,1.952,0.099,-0.11,292.24728,47.969521,10.463


#### pre-processing data using sklearn

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# The label is the disposition column; the features are every other column.
y = df_clean['koi_disposition']
X = df_clean.drop(columns=['koi_disposition'])

# Hold out a test set (default split size), stratified on the label so both
# splits keep the same class proportions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

# Learn the per-column min/max from the training rows only, then apply the
# same scaling to both splits so no test information leaks into the scaler.
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Model building

#### model training using LR

In [53]:
# Build the baseline ("root") logistic-regression classifier.
from sklearn.linear_model import LogisticRegression

# `multi_class` is deliberately not passed: 'auto' is already the default, and
# spelling it out raises a FutureWarning on scikit-learn >= 1.5, where the
# parameter is deprecated and scheduled for removal.
classifier = LogisticRegression(solver='newton-cg')
classifier

LogisticRegression(solver='newton-cg')

In [54]:
# Fit the baseline classifier on the scaled training features.
# NOTE(review): this cell's execution count (In [54]) is later than the
# feature-selection cells (In [37]-In [44]) that rebind X_train_scaled, so the
# recorded output may come from a different matrix than a top-to-bottom run
# would use -- confirm with Restart & Run All.
classifier.fit(X_train_scaled, y_train)

LogisticRegression(solver='newton-cg')

In [55]:
# Mean accuracy of the baseline classifier on each split.
train_accuracy = classifier.score(X_train_scaled, y_train)
test_accuracy = classifier.score(X_test_scaled, y_test)

print(f'training data = {train_accuracy}')
print(f'testing data = {test_accuracy}')

training data = 0.8550448216669846
testing data = 0.8621281464530892


### Feature selection
##### This project uses recursive feature elimination with cross-validation (RFECV) to rank the features and select the most useful subset.

In [37]:
# Recursive feature elimination with cross-validation (RFECV). Reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
from sklearn.feature_selection import RFECV

# Feature names, in the same order as the columns of X.
fn = list(X.columns)

# Remove one feature per elimination round (step=1) and score each candidate
# subset with 5-fold cross-validation, using the baseline classifier as the
# estimator.
feature_select = RFECV(estimator=classifier, step=1, cv=5)

# fit() hands back the fitted selector, so both names refer to the same object.
selected_features = feature_select.fit(X_train_scaled, y_train)

In [38]:
# Pair every feature name with its RFECV rank and support flag, ordered by
# rank first and then by feature name (tuples compare element by element).
feature_searching = list(zip(feature_select.ranking_, fn, feature_select.support_))
feature_searching.sort()

# Tabulate the ordered triples keyed by feature name and preview the top rows.
features_sorted = pd.DataFrame(
    feature_searching,
    columns=['rank', 'feature', 'support'],
).set_index('feature')
features_sorted.head()

Unnamed: 0_level_0,rank,support
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
dec,1,True
koi_depth,1,True
koi_duration,1,True
koi_duration_err1,1,True
koi_duration_err2,1,True


In [39]:
# Keep the name of every feature whose RFECV rank is below 12, preserving the
# rank-then-name order of feature_searching.
# NOTE(review): this step was previously described as eliminating every
# feature ranked 2 or above, but a cutoff of 12 also keeps ranks 2-11 --
# confirm which behaviour is intended before changing it, since every later
# model and the saved artefact depend on this column list.
approved_features = [name for rank, name, _ in feature_searching if rank < 12]


In [40]:
# Retrain the logistic model on the approved feature subset only.
from sklearn.preprocessing import MinMaxScaler

# Restrict both splits to the approved columns (in approved_features order).
X_train_new = X_train[approved_features]
X_test_new = X_test[approved_features]

# Refit the scaler on the reduced training matrix, then transform both splits.
# NOTE: this rebinds X_scaler, X_train_scaled and X_test_scaled. From this cell
# onward those names refer to the approved-feature matrices, not to the
# full-feature matrices the baseline classifier was fitted on.
scaler = MinMaxScaler()
X_scaler = scaler.fit(X_train_new)

X_train_scaled = X_scaler.transform(X_train_new)
X_test_scaled = X_scaler.transform(X_test_new)

# `multi_class` is not passed: 'auto' is the default and the parameter is
# deprecated (FutureWarning) on scikit-learn >= 1.5.
new_model = LogisticRegression(solver='newton-cg')
new_model.fit(X_train_scaled, y_train)

print(f'training data = {new_model.score(X_train_scaled, y_train)}')
print(f'testing data = {new_model.score(X_test_scaled, y_test)}')


training data = 0.8550448216669846
testing data = 0.8621281464530892


### Model tuning process

In [41]:
# Set up a cross-validated grid search over the logistic model's
# regularisation settings.
from sklearn.model_selection import GridSearchCV
import numpy as np

# Base estimator to tune. `multi_class` is not passed: 'auto' is the default
# and the parameter is deprecated (FutureWarning) on scikit-learn >= 1.5.
model_2 = LogisticRegression(solver='newton-cg')

# Candidate values: ten log-spaced C values from 10**0 to 10**4, L2 penalty only.
param_grid = {
    'C': np.logspace(0, 4, 10),
    'penalty': ['l2'],
}

# Every candidate is scored with 5-fold cross-validation.
grid = GridSearchCV(model_2, param_grid, verbose=0, cv=5)

In [42]:
# Run the cross-validated grid search on the scaled training data.
# fit_model is bound to whatever grid.fit() returns.
fit_model=grid.fit(X_train_scaled, y_train)

In [43]:
# Train the final model with the hyper-parameters chosen by the grid search.
c_value = grid.best_params_['C']
penalty_value = grid.best_params_['penalty']

# `multi_class` is not passed: 'auto' is the default and the parameter is
# deprecated (FutureWarning) on scikit-learn >= 1.5.
t_model = LogisticRegression(solver='newton-cg', C=c_value, penalty=penalty_value)
t_model.fit(X_train_scaled, y_train)

print(f'training data_tuned = {t_model.score(X_train_scaled, y_train)}')
print(f'testing data_tuned = {t_model.score(X_test_scaled, y_test)}')

training data_tuned = 0.8870875452984932
testing data_tuned = 0.8947368421052632


### Model prediction

In [44]:
# Predicted labels from the tuned model for the held-out rows.
predictions = t_model.predict(X_test_scaled)

# Accuracy reported by the fitted grid-search object on the same rows.
grid_test_accuracy = grid.score(X_test_scaled, y_test)
print(f'test accuracy: {grid_test_accuracy:.3f}')

# Distinct class labels that occur in the test targets.
sorting_y_test_data = y_test.unique().tolist()

# Table of true labels next to predicted labels, indexed by the true label.
overall = {'real': y_test, 'predictions': predictions}
comparison_df = pd.DataFrame(overall)
comparison_df = comparison_df.set_index('real')

# Show the first nine pairs with 'real' restored as an ordinary column.
comparison_df.reset_index().head(9)

test accuracy: 0.895


Unnamed: 0,real,predictions
0,CANDIDATE,CANDIDATE
1,FALSE POSITIVE,FALSE POSITIVE
2,FALSE POSITIVE,FALSE POSITIVE
3,FALSE POSITIVE,FALSE POSITIVE
4,CANDIDATE,CANDIDATE
5,FALSE POSITIVE,FALSE POSITIVE
6,CANDIDATE,CANDIDATE
7,FALSE POSITIVE,FALSE POSITIVE
8,FALSE POSITIVE,FALSE POSITIVE


In [63]:
# Compare the held-out accuracy of the three models.
from sklearn.preprocessing import MinMaxScaler

# When the notebook runs top to bottom, the baseline `classifier` is fitted on
# the full-feature scaled matrix, but by this point X_test_scaled has been
# rebound to the approved-feature subset (different columns / column order).
# Rebuild the full-feature test matrix, with a scaler fitted on the training
# rows only, so the baseline is scored on inputs matching its training data.
X_test_scaled_full = MinMaxScaler().fit(X_train).transform(X_test)

# Named `model_scores` rather than `eval` so the builtin eval() is not shadowed.
model_scores = {
    'model tested': ['root model', 'selected features model', 'tuned model'],
    'accuracy': [
        f'{round(classifier.score(X_test_scaled_full, y_test)*100,2)}%',
        f'{round(new_model.score(X_test_scaled, y_test)*100,2)}%',
        f'{round(t_model.score(X_test_scaled, y_test)*100,2)}%',
    ],
}

# One row per model: name and accuracy formatted as a percentage string.
eval_df = pd.DataFrame(model_scores)
eval_df

Unnamed: 0,model tested,accuracy
0,root model,86.21%
1,selected features model,86.21%
2,tuned model,89.47%


In [64]:
import joblib

# Write the accuracy comparison table to CSV.
eval_df.to_csv(path_or_buf='resources/LR_scores.csv')

# Serialise the tuned model to disk with joblib.
# NOTE(review): only the estimator is dumped here; the fitted scaler and the
# approved_features list it depends on are not persisted by this cell.
filename = 'ML_models/LR.sav'
joblib.dump(value=t_model, filename=filename)


['ML_models/LR.sav']