In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("exoplanet_data_cleaned.csv", index_col = 0)

In [3]:
df.head()

Unnamed: 0,kepoi_name,koi_disposition,koi_pdisposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,K00752.01,y,m,negative,negative,negative,negative,9.488036,2.78e-05,-2.78e-05,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,K00752.02,y,m,negative,negative,negative,negative,54.418383,0.000248,-0.000248,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,K00753.01,n,n,negative,positive,negative,negative,19.89914,1.49e-05,-1.49e-05,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,K00754.01,n,n,negative,positive,negative,negative,1.736952,2.63e-07,-2.63e-07,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,K00755.01,y,m,negative,negative,negative,negative,2.525592,3.76e-06,-3.76e-06,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


---

#### Drop: "m" (= "CANDIDATE") from target "koi_disposition":

In [4]:
df_culled = df[df["koi_disposition"] != "m"]

#### Isolate target vector: "koi_disposition"; rename as y_disp.

In [5]:
y_disp = df_culled[["koi_disposition"]].reset_index(drop = True)

In [6]:
y_disp.value_counts()

koi_disposition
n                  5023
y                  2293
dtype: int64

In [7]:
df_culled.drop(columns = ["kepoi_name", "koi_disposition", "koi_pdisposition"], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [8]:
df_culled.reset_index(drop = True, inplace = True)

In [9]:
df_dummy = pd.get_dummies(df_culled)

In [10]:
df_dummy

Unnamed: 0,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,...,koi_fpflag_nt_positive,koi_fpflag_ss_negative,koi_fpflag_ss_positive,koi_fpflag_co_negative,koi_fpflag_co_positive,koi_fpflag_ec_negative,koi_fpflag_ec_positive,koi_tce_delivname_q1_q16_tce,koi_tce_delivname_q1_q17_dr24_tce,koi_tce_delivname_q1_q17_dr25_tce
0,9.488036,2.780000e-05,-2.780000e-05,170.538750,0.002160,-0.002160,0.146,0.318,-0.146,2.95750,...,0,1,0,1,0,1,0,0,0,1
1,54.418383,2.480000e-04,-2.480000e-04,162.513840,0.003520,-0.003520,0.586,0.059,-0.443,4.50700,...,0,1,0,1,0,1,0,0,0,1
2,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,-0.000581,0.969,5.126,-0.077,1.78220,...,0,0,1,1,0,1,0,0,0,1
3,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,-0.000115,1.276,0.115,-0.092,2.40641,...,0,0,1,1,0,1,0,0,0,1
4,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,-0.001130,0.701,0.235,-0.478,1.65450,...,0,1,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7311,23.627035,2.260000e-04,-2.260000e-04,150.036200,0.010900,-0.010900,1.096,38.210,-0.106,11.48100,...,0,0,1,1,0,1,0,0,0,1
7312,8.589871,1.850000e-04,-1.850000e-04,132.016100,0.015700,-0.015700,0.765,0.023,-0.541,4.80600,...,0,1,0,1,0,0,1,0,0,1
7313,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,-0.000170,1.252,0.051,-0.049,3.22210,...,0,0,1,0,1,1,0,0,0,1
7314,0.681402,2.430000e-06,-2.430000e-06,132.181750,0.002850,-0.002850,0.147,0.309,-0.147,0.86500,...,0,1,0,0,1,1,0,0,0,1


#### Impute the data using medians:

In [11]:
from sklearn.impute import SimpleImputer

In [12]:
impute_median = SimpleImputer(missing_values=np.nan, strategy = 'median')

In [13]:
impute_median.fit(df_dummy)

SimpleImputer(strategy='median')

In [14]:
df_imp = impute_median.transform(df_dummy)

#### Train, test, split the data:

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_imp, y_disp, random_state = 46)

#### Scale (by median):

In [16]:
from sklearn.preprocessing import MinMaxScaler

In [17]:
X_scaler = MinMaxScaler().fit(X_train)

In [18]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Fit logit model with LASSO ("L1" regularization):

In [19]:
from sklearn.linear_model import LogisticRegression

logit_1 = LogisticRegression(penalty='l1', C = 5, solver='liblinear')  # c = 5 is preferred (see below).
logit_1.fit(X_train_scaled, np.ravel(y_train, order = "c"))

LogisticRegression(C=5, penalty='l1', solver='liblinear')

In [20]:
from sklearn.metrics import roc_auc_score

In [21]:
pred_prob = logit_1.predict_proba(X_test_scaled)

In [22]:
roc_auc = roc_auc_score(y_test, pred_prob[:,1])
print(f"AUC: {round(roc_auc, 4)}")

AUC: 0.9982


In [23]:
accuracy = logit_1.score(X_test, y_test)

print(f"Accuracy: {accuracy}")

Accuracy: 0.6522689994532531


High AUC but low accuracy suggests good performance on positive class, but a high false negative rate.

#### GridSearch to optimize the regularization parameter:

In [24]:
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [10, 1, 0.1, 0.01, 0.001]}
grid = GridSearchCV(logit_1, param_grid, verbose=3)

In [25]:
grid.fit(X_train, np.ravel(y_train, order = "c"))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END ..............................C=10;, score=0.993 total time=   0.1s
[CV 2/5] END ..............................C=10;, score=0.989 total time=   0.0s
[CV 3/5] END ..............................C=10;, score=0.992 total time=   0.2s
[CV 4/5] END ..............................C=10;, score=0.996 total time=   0.0s
[CV 5/5] END ..............................C=10;, score=0.985 total time=   0.2s
[CV 1/5] END ...............................C=1;, score=0.991 total time=   0.0s
[CV 2/5] END ...............................C=1;, score=0.990 total time=   0.0s
[CV 3/5] END ...............................C=1;, score=0.991 total time=   0.0s
[CV 4/5] END ...............................C=1;, score=0.995 total time=   0.0s
[CV 5/5] END ...............................C=1;, score=0.987 total time=   0.1s
[CV 1/5] END .............................C=0.1;, score=0.989 total time=   0.0s
[CV 2/5] END .............................C=0.1;,

GridSearchCV(estimator=LogisticRegression(C=5, penalty='l1',
                                          solver='liblinear'),
             param_grid={'C': [10, 1, 0.1, 0.01, 0.001]}, verbose=3)

In [26]:
print(grid.best_params_)

{'C': 1}


In [27]:
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]}
grid = GridSearchCV(logit_1, param_grid, verbose=3)

In [28]:
grid.fit(X_train, np.ravel(y_train, order = "c"))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ..............................C=10;, score=0.993 total time=   0.1s
[CV 2/5] END ..............................C=10;, score=0.989 total time=   0.0s
[CV 3/5] END ..............................C=10;, score=0.991 total time=   0.1s
[CV 4/5] END ..............................C=10;, score=0.996 total time=   0.1s
[CV 5/5] END ..............................C=10;, score=0.986 total time=   0.2s
[CV 1/5] END ...............................C=9;, score=0.994 total time=   0.1s
[CV 2/5] END ...............................C=9;, score=0.989 total time=   0.1s
[CV 3/5] END ...............................C=9;, score=0.991 total time=   0.1s
[CV 4/5] END ...............................C=9;, score=0.996 total time=   0.0s
[CV 5/5] END ...............................C=9;, score=0.987 total time=   0.1s
[CV 1/5] END ...............................C=8;, score=0.994 total time=   0.1s
[CV 2/5] END ...............................C=8;

GridSearchCV(estimator=LogisticRegression(C=5, penalty='l1',
                                          solver='liblinear'),
             param_grid={'C': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]}, verbose=3)

In [29]:
print(grid.best_params_)

{'C': 9}


#### Manual (custom) grid search via for loop to examine which features are dropped as C decreases

In [30]:
C = [10, 5, 1, 0.1, 0.001, 0.0001]

for c in C:
    logit_x = LogisticRegression(penalty='l1', C=c, solver='liblinear')
    logit_x.fit(X_train_scaled, np.ravel(y_train, order = "c"))
    print(f"C: {c}")
    print(f"Coefficient: {logit_x.coef_}")
    pred_prob_x = logit_x.predict_proba(X_test_scaled)
    roc_auc_x = roc_auc_score(y_test, pred_prob_x[:,1])
    print(f"AUC: {round(roc_auc_x, 4)}\n")

C: 10
Coefficient: [[ -0.3603078   -7.24655018   0.           0.          -1.28410575
    0.           0.          -9.63402039  -3.99324863  -0.15888492
  -14.97799653   0.         -56.5824079    0.          -1.31131222
    0.           0.          -1.62484184  -5.34241304   0.
    0.          -1.07606302   1.34893808   3.23371194   2.6975682
  -11.44970388   0.           1.94229306   0.71198405   9.17824404
   -7.99879535   0.           0.          -0.77291143   0.54759936
   -0.45763019   0.         -13.82074753   0.         -13.10635048
    0.         -14.98843696   0.         -14.95300657   0.
    6.01489455  12.60424111]]
AUC: 0.9983

C: 5
Coefficient: [[ -0.2340422   -4.65916719   0.           0.          -1.33771906
    0.           0.          -6.76843111  -2.59894402  -0.11959241
   -7.46175667   0.         -31.40060892   0.          -0.99056102
    0.           0.          -1.06497015  -2.72427354   0.
    0.          -1.07046577   0.17768518   3.14980632   0.62027463
   -9.6

---

### Save model:

In [31]:
import pickle

In [32]:
filename = "explanet_logit_CDA.sav"
pickle.dump(logit_1, open(filename, 'wb'))

In [None]:
# To load back:

# logit_l1 = pickle.load(open("explanet_logit_CDA.sav", 'rb'))