# Model 1: Logistic Regression With Error

In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in d:\python\python38-32\lib\site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load the exoplanet table, then clean it in one chain:
#   1. drop columns in which every value is null,
#   2. drop any remaining row that contains a null.
df = (
    pd.read_csv("Resources/exoplanet_data.csv")
    .dropna(axis="columns", how="all")
    .dropna()
)
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [5]:
# Feature matrix: the subset of columns used as the model's x values.
selected_features = df.loc[
    :,
    [
        "koi_period",
        "koi_time0bk",
        "koi_slogg",
        "koi_srad",
        "ra",
        "dec",
        "koi_kepmag",
    ],
]

# Create a Train Test Split

Use `koi_disposition` for the y values

In [6]:
# Features come from the selection above; the target is the disposition label
# (kept as a 1-D Series).
X, y = selected_features, df["koi_disposition"]
print(X.shape, y.shape)

(6991, 7) (6991,)


In [7]:
from sklearn.model_selection import train_test_split

# Hold out a test set; the fixed seed keeps the split reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

# Peek at the first rows of the training features.
X_train.head(5)

Unnamed: 0,koi_period,koi_time0bk,koi_slogg,koi_srad,ra,dec,koi_kepmag
6122,6.768901,133.07724,4.327,1.125,294.40472,39.351681,14.725
6370,0.733726,132.02005,4.578,0.797,284.50391,42.46386,15.77
2879,7.652707,134.46038,4.481,0.963,295.50211,38.98354,13.099
107,7.953547,174.66224,4.536,0.779,291.15878,40.750271,15.66
29,4.959319,172.258529,4.359,1.082,292.16705,48.727589,15.263


# Pre-processing

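Scale the data and label-encode the target. Note: the code below uses `StandardScaler` rather than `MinMaxScaler`, and the feature selection was already done in the section above.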

In [8]:
# Standardize the features
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
X_scaler = StandardScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [9]:
# Display the scaled training features (output of X_scaler.transform).
X_train_scaled

array([[-0.42255034, -0.46799876,  0.05414964, ...,  0.49936552,
        -1.24204895,  0.34371087],
       [-0.47343342, -0.48359522,  0.62499662, ..., -1.57636903,
        -0.37612563,  1.11050056],
       [-0.41509889, -0.44759365,  0.40439041, ...,  0.72943663,
        -1.34447939, -0.84939921],
       ...,
       [-0.42776623, -0.44814769, -0.92606968, ...,  1.0815803 ,
         0.60454317, -2.66694759],
       [-0.451432  , -0.44191797, -1.20580745, ...,  1.01466325,
        -0.74240821, -0.71952191],
       [-0.46046488, -0.44260516,  0.46124768, ...,  0.56843989,
        -0.82198371,  0.14852804]])

In [10]:
# Display the scaled test features (transformed with the scaler fit on X_train).
X_test_scaled

array([[ 0.26672986,  0.36108131,  1.08895193, ...,  0.74498869,
         1.41405143,  1.16259824],
       [ 0.38450878,  0.017841  ,  0.63636807, ...,  1.18276093,
         0.87064319,  0.64969298],
       [-0.46966102, -0.47761785,  0.63409378, ..., -0.14131938,
         2.07020604,  1.25798834],
       ...,
       [-0.13532238, -0.17595018,  0.34980744, ...,  0.64564042,
        -0.90240199, -0.1515839 ],
       [-0.41567459, -0.39763171, -0.38706277, ...,  0.53147807,
        -0.82502262,  0.2696001 ],
       [-0.47142522, -0.48145622,  0.35208173, ...,  0.0552932 ,
        -0.48502759,  0.17861261]])

In [11]:
# Turn the string dispositions into integer class ids.
# The encoder is fit on the training labels and reused for the test labels.
label_encoder = LabelEncoder().fit(y_train)
encoded_y_train, encoded_y_test = map(
    label_encoder.transform, (y_train, y_test)
)

In [12]:
# Display the integer-encoded test labels.
encoded_y_test

array([2, 0, 2, ..., 1, 1, 1])

In [13]:
# Display the integer-encoded training labels.
encoded_y_train

array([0, 2, 2, ..., 2, 2, 2])

# Train the Model



In [14]:
from sklearn.linear_model import LogisticRegression
# Build the classifier with no arguments (library defaults); the bare name on
# the last line displays its full configuration.
model1 = LogisticRegression()
model1

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
# Fit on the scaled training data and predict class ids for the test set.
# `fit` returns the estimator itself, so the two calls can be chained.
predictions = model1.fit(X_train_scaled, encoded_y_train).predict(X_test_scaled)
predictions

array([2, 2, 1, ..., 2, 2, 2])

In [16]:
# Score the fitted model on each split (training first, then testing).
training_score, testing_score = (
    model1.score(features, labels)
    for features, labels in (
        (X_train_scaled, encoded_y_train),
        (X_test_scaled, encoded_y_test),
    )
)

In [17]:
# Report both scores, one per line.
print(
    f"Training Score: {training_score}",
    f"Testing Score: {testing_score}",
    sep="\n",
)

Training Score: 0.5224108334922755
Testing Score: 0.5051487414187643


In [18]:
# Recompute and report the score on each split, one per line.
print(
    f"Training Data Score: {model1.score(X_train_scaled, encoded_y_train)}",
    f"Testing Data Score: {model1.score(X_test_scaled, encoded_y_test)}",
    sep="\n",
)

Training Data Score: 0.5224108334922755
Testing Data Score: 0.5051487414187643


In [19]:
from sklearn.metrics import mean_squared_error

# For a classifier, `score` is mean accuracy (fraction of correctly predicted
# labels), not R2. It is computed on the training split here, so it is
# labelled accordingly.
score = model1.score(X_train_scaled, encoded_y_train)
# NOTE(review): MSE over label-encoded class ids treats the classes as ordered
# numbers, which they are not; prefer the classification report for judging
# this model and read the MSE value with care.
MSE = mean_squared_error(encoded_y_test, predictions)
print(f"Training Accuracy: {score}")
print(f"MSE: {MSE}")

R2 Score: 0.5224108334922755
MSE: 1.0852402745995424


In [20]:
# Per-class precision / recall / F1 on the test set.
from sklearn.metrics import classification_report

# target_names are listed in the order of the encoded class ids 0, 1, 2.
print(
    classification_report(
        y_true=encoded_y_test,
        y_pred=predictions,
        target_names=["Candidate", "Confirmed", "False Positive"],
    )
)

                precision    recall  f1-score   support

     Candidate       0.00      0.00      0.00       411
     Confirmed       0.43      0.24      0.31       484
False Positive       0.52      0.90      0.66       853

      accuracy                           0.51      1748
     macro avg       0.32      0.38      0.32      1748
  weighted avg       0.37      0.51      0.41      1748



  _warn_prf(average, modifier, msg_start, len(result))


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [21]:
# Set up an exhaustive search over regularization strength (C) and the
# solver's iteration cap (max_iter); verbose=3 logs every fold.
from sklearn.model_selection import GridSearchCV

param_grid = dict(
    C=[20, 40, 120],
    max_iter=[200, 400, 1200],
)
grid = GridSearchCV(estimator=model1, param_grid=param_grid, verbose=3)

In [22]:
# List every parameter name the search object exposes, one per line
# (iterating the dict returned by get_params yields its keys).
for param in grid.get_params():
    print(param)

cv
error_score
estimator__C
estimator__class_weight
estimator__dual
estimator__fit_intercept
estimator__intercept_scaling
estimator__l1_ratio
estimator__max_iter
estimator__multi_class
estimator__n_jobs
estimator__penalty
estimator__random_state
estimator__solver
estimator__tol
estimator__verbose
estimator__warm_start
estimator
iid
n_jobs
param_grid
pre_dispatch
refit
return_train_score
scoring
verbose


In [23]:
# Train the model with GridSearch: cross-validates every combination in
# param_grid on the scaled training data (verbose=3 logs each fold's score).
grid.fit(X_train_scaled, encoded_y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=20, max_iter=200 ..............................................
[CV] .................. C=20, max_iter=200, score=0.516, total=   0.0s
[CV] C=20, max_iter=200 ..............................................
[CV] .................. C=20, max_iter=200, score=0.520, total=   0.0s
[CV] C=20, max_iter=200 ..............................................
[CV] .................. C=20, max_iter=200, score=0.528, total=   0.0s
[CV] C=20, max_iter=200 ..............................................
[CV] .................. C=20, max_iter=200, score=0.513, total=   0.0s
[CV] C=20, max_iter=200 ..............................................
[CV] .................. C=20, max_iter=200, score=0.532, total=   0.0s
[CV] C=20, max_iter=400 ..............................................
[CV] .................. C=20, max_iter=400, score=0.516, total=   0.0s
[CV] C=20, max_iter=400 ..............................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s



[CV] C=20, max_iter=400 ..............................................
[CV] .................. C=20, max_iter=400, score=0.528, total=   0.0s
[CV] C=20, max_iter=400 ..............................................
[CV] .................. C=20, max_iter=400, score=0.513, total=   0.0s
[CV] C=20, max_iter=400 ..............................................
[CV] .................. C=20, max_iter=400, score=0.532, total=   0.0s
[CV] C=20, max_iter=1200 .............................................
[CV] ................. C=20, max_iter=1200, score=0.516, total=   0.0s
[CV] C=20, max_iter=1200 .............................................
[CV] ................. C=20, max_iter=1200, score=0.520, total=   0.0s
[CV] C=20, max_iter=1200 .............................................
[CV] ................. C=20, max_iter=1200, score=0.528, total=   0.0s
[CV] C=20, max_iter=1200 .............................................
[CV] ................. C=20, max_iter=1200, score=0.513, total=   0.0s
[CV] 

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    1.2s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [20, 40, 120], 'max_iter': [200, 400, 1200]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [24]:
# Show the winning parameter combination and its mean cross-validated score.
print(grid.best_params_, grid.best_score_, sep="\n")

{'C': 20, 'max_iter': 200}
0.5220297047715381


# Save the Model

In [25]:
# Save the tuned model (be sure to turn this file in to BCS).
# With refit=True, GridSearchCV refits the best parameter combination on the
# full training data and exposes it as `best_estimator_`; `model1` itself was
# fit with the default hyperparameters, so saving it would discard the tuning.
# NOTE(review): the fitted `X_scaler` and `label_encoder` are not stored in
# this file; whoever loads it must scale inputs and decode labels the same way.
# If joblib fails to import, try running the install command in terminal/git-bash.
import joblib
filename = 'ey_Logistic_Regression_Model_with_error.sav'
joblib.dump(grid.best_estimator_, filename)

['ey_Logistic_Regression_Model_with_error.sav']