In [None]:
# Update scikit-learn to prevent version mismatches
# %pip install --upgrade scikit-learn

In [None]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
# !pip install joblib

In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Load the exoplanet CSV, discard columns that are entirely null,
# then discard every row that still contains a missing value.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis="columns", how="all")
    .dropna()
)

df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [None]:
'''
Column information:

Err column 1 is positive error, Err column 2 is negative error

koi_disposition: Disposition Using Kepler Data. The disposition in the literature towards this exoplanet candidate. 
One of CANDIDATE, FALSE POSITIVE, NOT DISPOSITIONED or CONFIRMED.

koi_fpflag_nt: Not Transit-Like Flag. The light curve is not consistent with that of a transiting planet.

koi_fpflag_ss: Stellar Eclipse Flag. A KOI that is observed to have a significant secondary event, transit shape, 
or out-of-eclipse variability, which indicates that the transit-like event is most likely caused by an eclipsing binary.

koi_fpflag_co: Centroid Offset Flag. The source of the signal is from a nearby star, as inferred by measuring the 
centroid location of the image both in and out of transit.

koi_fpflag_ec: Ephemeris Match Indicates Contamination Flag. The KOI shares the same period and epoch as another object 
and is judged to be the result of flux contamination in the aperture or electronic crosstalk.

koi_period: Orbital Period (days). The interval between consecutive planetary transits.

koi_time0bk: Transit Epoch (BJD - 2,454,833.0). The time corresponding to the center of the first detected transit in 
Barycentric Julian Day (BJD) minus a constant offset of 2,454,833.0 days. 
The offset corresponds to 12:00 on Jan 1, 2009 UTC.

koi_impact: Impact Parameter. The sky-projected distance between the center of the stellar disc and the center of the 
planet disc at conjunction, normalized by the stellar radius.

koi_duration: Transit Duration (hours). The duration of the observed transits.

koi_depth: Transit Depth (parts per million). The fraction of stellar flux lost at the minimum of the planetary transit.

koi_prad: Planetary Radius (Earth radii). The radius of the planet. Planetary radius is the product of the planet star 
radius ratio and the stellar radius.

koi_teq: Equilibrium Temperature (Kelvin). Approximation for the temperature of the planet.

koi_insol: Insolation Flux [Earth flux]. Insolation flux is another way to give the equilibrium temperature.

koi_model_snr: Transit Signal-to-Noise. Transit depth normalized by the mean uncertainty in the flux during the transits.

koi_tce_plnt_num: TCE Planet Number. TCE Planet Number federated to the KOI.

koi_steff: Stellar Effective Temperature (Kelvin). The photospheric temperature of the star.

koi_slogg: Stellar Surface Gravity (log10(cm s^-2)). The base-10 logarithm of the acceleration due to gravity at the 
surface of the star.

koi_srad: Stellar Radius (solar radii). The photospheric radius of the star.

ra & dec: right ascension and declination (degrees).

koi_kepmag: Kepler-band (mag). The magnitude of the star in the Kepler band.
'''

# Select your features (columns)

In [3]:
# Model inputs (the X values): the four false-positive flags plus the planetary radius.
feature_columns = [
    "koi_fpflag_nt",
    "koi_fpflag_ss",
    "koi_fpflag_co",
    "koi_fpflag_ec",
    "koi_prad",
]
selected_features = df[feature_columns]

# Create a Train Test Split

Use `koi_disposition` for the y values

In [4]:
# Target labels: the disposition column of the cleaned frame.
y = df["koi_disposition"]

In [5]:
# Build the 1-D label array straight from the DataFrame column.
# Deriving it from `df` instead of from `y` keeps this cell safe to re-run: after the
# first run `y` is already a NumPy array and has no `.values` attribute.
# The previous reshape(-1, 1) followed by flatten() cancelled out, so it is dropped.
y = df["koi_disposition"].to_numpy()

In [6]:
# Hold out a test set from the selected features and labels.
# The fixed random_state makes the shuffle, and therefore the split, reproducible.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(selected_features, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [7]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_prad
3563,0,0,0,0,3.89
4099,0,0,0,0,2.1
5460,0,0,0,0,14.59
1091,0,0,0,0,2.28
5999,0,0,0,0,2.27


In [8]:
# Display the held-out test labels.
y_test

array(['CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', ...,
       'FALSE POSITIVE', 'FALSE POSITIVE', 'CANDIDATE'], dtype=object)

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [9]:
# Scale the features with a MinMaxScaler.
# The scaler is fitted on the training split only and then applied to both splits,
# so the test data never influences the fitted scaling.
from sklearn.preprocessing import MinMaxScaler

X_minmax = MinMaxScaler()
X_minmax.fit(X_train)

X_train_minmax, X_test_minmax = (
    X_minmax.transform(split) for split in (X_train, X_test)
)

# Train the Model



In [14]:
# Fit a logistic-regression classifier and report accuracy on both splits.
# NOTE(review): the model is fitted on the raw X_train / X_test, not on the MinMax-scaled
# X_train_minmax / X_test_minmax computed in the pre-processing cell. Confirm this is intended.
from sklearn.linear_model import LogisticRegression

# max_iter is raised to 6000 to give the solver more iterations than its default allows.
model1 = LogisticRegression(max_iter=6000)
model1.fit(X_train, y_train)

train_score = model1.score(X_train, y_train)
test_score = model1.score(X_test, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.7604424947549113
Testing Data Score: 0.7751716247139588


In [15]:
# Display the estimator that was fitted in the previous cell.
# Fitting it a second time on the same training data would only repeat the work.
model1

LogisticRegression(max_iter=6000)

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [16]:
# Build the grid search around the logistic-regression estimator.
from sklearn.model_selection import GridSearchCV

# Candidate values for C (inverse regularization strength), all with an L2 penalty.
param_grid = dict(C=[1, 5, 10, 50], penalty=["l2"])
grid = GridSearchCV(estimator=model1, param_grid=param_grid, verbose=3)

In [17]:
# Run the cross-validated search over every candidate in param_grid on the training split.
grid.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] C=1, penalty=l2 .................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..................... C=1, penalty=l2, score=0.763, total=   1.5s
[CV] C=1, penalty=l2 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV] ..................... C=1, penalty=l2, score=0.760, total=   1.8s
[CV] C=1, penalty=l2 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.2s remaining:    0.0s


[CV] ..................... C=1, penalty=l2, score=0.754, total=   3.4s
[CV] C=1, penalty=l2 .................................................
[CV] ..................... C=1, penalty=l2, score=0.755, total=   1.5s
[CV] C=1, penalty=l2 .................................................
[CV] ..................... C=1, penalty=l2, score=0.770, total=   1.4s
[CV] C=5, penalty=l2 .................................................
[CV] ..................... C=5, penalty=l2, score=0.763, total=   3.6s
[CV] C=5, penalty=l2 .................................................
[CV] ..................... C=5, penalty=l2, score=0.760, total=   2.4s
[CV] C=5, penalty=l2 .................................................
[CV] ..................... C=5, penalty=l2, score=0.754, total=   2.1s
[CV] C=5, penalty=l2 .................................................
[CV] ..................... C=5, penalty=l2, score=0.755, total=   1.7s
[CV] C=5, penalty=l2 .................................................
[CV] .

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:   29.8s finished


GridSearchCV(estimator=LogisticRegression(max_iter=6000),
             param_grid={'C': [1, 5, 10, 50], 'penalty': ['l2']}, verbose=3)

In [18]:
# Report the winning parameter combination and its mean cross-validated score, one per line.
print(grid.best_params_, grid.best_score_, sep="\n")

{'C': 1, 'penalty': 'l2'}
0.7602525851592574


In [19]:
# Predict with the tuned grid-search object rather than the untuned model1, so the
# predictions come from the same object whose best score is reported and which is saved.
# GridSearchCV refits the best parameter combination on the full training data by default
# (refit=True), and grid.predict delegates to that refitted estimator.
predictions = grid.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

# Side-by-side comparison of predicted and actual labels.
# Both columns are plain arrays, so the frame already has a fresh 0..n-1 index.
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

First 10 Predictions:   ['CONFIRMED' 'FALSE POSITIVE' 'FALSE POSITIVE' 'CONFIRMED'
 'FALSE POSITIVE' 'FALSE POSITIVE' 'FALSE POSITIVE' 'CONFIRMED'
 'CONFIRMED' 'FALSE POSITIVE']
First 10 Actual labels: ['CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', 'CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', 'FALSE POSITIVE', 'CONFIRMED', 'CANDIDATE', 'FALSE POSITIVE']


Unnamed: 0,Prediction,Actual
0,CONFIRMED,CONFIRMED
1,FALSE POSITIVE,FALSE POSITIVE
2,FALSE POSITIVE,FALSE POSITIVE
3,CONFIRMED,CONFIRMED
4,FALSE POSITIVE,FALSE POSITIVE
...,...,...
1743,FALSE POSITIVE,FALSE POSITIVE
1744,CONFIRMED,CONFIRMED
1745,FALSE POSITIVE,FALSE POSITIVE
1746,FALSE POSITIVE,FALSE POSITIVE


# Save the Model

In [20]:
# Persist the fitted grid-search object to disk with joblib.
# Course template reminders: put your own name in the file name, pass your own model
# variable to dump(), and be sure to turn the file in to BCS.
# If joblib fails to import, try running the install command in terminal/git-bash.
import joblib

filename = "edie_logistic_regression.sav"
joblib.dump(value=grid, filename=filename)

['edie_logistic_regression.sav']