In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Load the table and clean it in one chain: discard the listed identifier and
# metadata columns, then remove null data column-wise and row-wise.
df = (
    pd.read_csv("cumulative.csv")
    .drop(
        columns=[
            "rowid",
            "kepid",
            "kepoi_name",
            "kepler_name",
            "koi_pdisposition",
            "koi_score",
            "koi_tce_delivname",
        ]
    )
    .dropna(axis="columns", how="all")  # columns in which every value is null
    .dropna()  # rows that still contain at least one null
)
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


# Create a Train Test Split

Use `koi_disposition` for the y values

In [3]:
# Target is the disposition label; the features are every other column.
Y = df["koi_disposition"]
X = df.drop(columns="koi_disposition")


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on Y and uses
# a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [5]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
3999,1,0,1,1,0.952378,6e-06,-6e-06,131.85999,0.00501,-0.00501,...,-264.0,4.25,0.124,-0.186,1.375,0.404,-0.235,290.70651,48.226559,13.541
1127,0,1,0,0,2.621707,8e-06,-8e-06,131.88227,0.0026,-0.0026,...,-192.0,4.436,0.155,-0.17,0.881,0.215,-0.132,297.38153,48.565041,15.399
2481,0,0,0,0,9.376637,3.2e-05,-3.2e-05,136.95319,0.00285,-0.00285,...,-74.0,4.136,0.033,-0.03,1.543,0.084,-0.075,288.10086,50.033718,10.421
419,0,0,0,0,20.497276,0.000334,-0.000334,147.43894,0.00968,-0.00968,...,-120.0,4.093,0.195,-0.105,1.458,0.226,-0.339,293.08463,41.135441,14.348
6880,1,0,0,0,88.523517,0.002337,-0.002337,189.2756,0.0232,-0.0232,...,-75.0,4.782,0.06,-0.07,0.462,0.05,-0.07,295.57629,48.897861,15.841


# Pre-processing

Scale the data using the MinMaxScaler

In [6]:
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler

# Train the Support Vector Machine

In [7]:
# Pipeline stages, in order: MinMaxScaler on the features, then an SVC.
steps = [
    ("scaler", MinMaxScaler()),
    ("SVM", SVC()),
]

In [8]:
# Chain the scaler and SVM stages into a single estimator and fit it on the
# training partition; the fitted pipeline is the value this cell displays.
from sklearn.pipeline import Pipeline

pipeline = Pipeline(steps=steps)
pipeline.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('SVM',
                 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3,
                     gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [9]:
# Report the SVM pipeline's score on the training and the test partition.
print(
    f"Training Data Score: {pipeline.score(X_train, y_train)}",
    f"Testing Data Score: {pipeline.score(X_test, y_test)}",
    sep="\n",
)

Training Data Score: 0.8110078627591136
Testing Data Score: 0.8153230417381361


# Hyperparameter Tuning

Use `GridSearchCV` to tune the `C` and `gamma` parameters

In [10]:
# Set up the grid search: a second pipeline built from the same `steps`, with
# the SVM's `C` and `gamma` each tried over five powers of ten under 5-fold
# cross-validation.
from sklearn.model_selection import GridSearchCV

pipeline2 = Pipeline(steps)
parameters = {
    "SVM__C": [0.1, 1, 10, 100, 1000],
    "SVM__gamma": [0.1, 1, 10, 100, 1000],
}
grid = GridSearchCV(estimator=pipeline2, param_grid=parameters, cv=5)

In [11]:
# Train the model with GridSearch, using the training partition only; the test
# partition is first touched by the scoring cell that follows.
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        MinMaxScaler(copy=True,
                                                     feature_range=(0, 1))),
                                       ('SVM',
                                        SVC(C=1.0, cache_size=200,
                                            class_weight=None, coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='auto_deprecated',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.001, verbose=False))],
                                verbose=False),
             iid='warn', n_jobs=None

In [12]:
# Report the tuned grid-search model's score on both partitions.
print(
    f"Training Data Score: {grid.score(X_train, y_train)}",
    f"Testing Data Score: {grid.score(X_test, y_test)}",
    sep="\n",
)

Training Data Score: 0.907505360972123
Testing Data Score: 0.8902229845626072


In [13]:
# Winning parameter combination, then its cross-validated score, one per line.
print(grid.best_params_, grid.best_score_, sep="\n")

{'SVM__C': 100, 'SVM__gamma': 1}
0.8827734095782702


# Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
# Same MinMaxScaler stage as the SVM pipeline, but with logistic regression
# as the classifier; fit on the training partition and display the result.
steps2 = [
    ("scaler", MinMaxScaler()),
    ("LogisticRegression", LogisticRegression()),
]
pipeline3 = Pipeline(steps=steps2)
pipeline3.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('LogisticRegression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [18]:
# Report the logistic-regression pipeline's score on both partitions.
print(
    f"Training Data Score: {pipeline3.score(X_train, y_train)}",
    f"Testing Data Score: {pipeline3.score(X_test, y_test)}",
    sep="\n",
)

Training Data Score: 0.843602573266619
Testing Data Score: 0.8404802744425386


# Logistic Regression CV

In [55]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# StandardScaler feeding a LogisticRegressionCV with default settings;
# make_pipeline names the steps automatically.
ss_lr_cv = make_pipeline(StandardScaler(), LogisticRegressionCV())

In [56]:
# Fit the StandardScaler + LogisticRegressionCV pipeline on the training partition.
ss_lr_cv.fit(X_train, y_train)





Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregressioncv',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv='warn',
                                      dual=False, fit_intercept=True,
                                      intercept_scaling=1.0, l1_ratios=None,
                                      max_iter=100, multi_class='warn',
                                      n_jobs=None, penalty='l2',
                                      random_state=None, refit=True,
                                      scoring=None, solver='lbfgs', tol=0.0001,
                                      verbose=0))],
         verbose=False)

In [57]:
# Following Ed's suggestion, this StandardScaler + LogisticRegressionCV pipeline
# is the model carried forward. Display its score on the held-out test partition.
ss_lr_cv.score(X_test, y_test)

0.884505431675243

In [62]:
# Regularisation strengths selected by the cross-validated classifier, looked
# up through the step name that make_pipeline assigned to it.
ss_lr_cv.named_steps["logisticregressioncv"].C_

array([1.29154967e+03, 1.29154967e+03, 3.59381366e-01])

In [73]:
# Final model: same two stages, with the candidate list pinned to one value.
# 1291.54967 is the number shown in the first two entries of the `C_` array
# displayed by the previous cell.
ss_lr_cv_final = make_pipeline(StandardScaler(), LogisticRegressionCV(Cs=[1291.54967]))

In [77]:
# Fit the pinned-C pipeline on the training partition so it can be checked
# against the test partition before the final refit.
ss_lr_cv_final.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregressioncv',
                 LogisticRegressionCV(Cs=[1291.54967], class_weight=None,
                                      cv='warn', dual=False, fit_intercept=True,
                                      intercept_scaling=1.0, l1_ratios=None,
                                      max_iter=100, multi_class='warn',
                                      n_jobs=None, penalty='l2',
                                      random_state=None, refit=True,
                                      scoring=None, solver='lbfgs', tol=0.0001,
                                      verbose=0))],
         verbose=False)

In [78]:
# Held-out score of the pinned-C pipeline (fitted on the training partition only).
ss_lr_cv_final.score(X_test, y_test)

0.8833619210977701

In [79]:
# Refit the final pipeline on the full data set (X, Y) before exporting it.
# NOTE: X contains the rows of X_test, so from this point on scoring this
# pipeline against X_test is no longer a held-out estimate.
ss_lr_cv_final.fit(X, Y)



Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregressioncv',
                 LogisticRegressionCV(Cs=[1291.54967], class_weight=None,
                                      cv='warn', dual=False, fit_intercept=True,
                                      intercept_scaling=1.0, l1_ratios=None,
                                      max_iter=100, multi_class='warn',
                                      n_jobs=None, penalty='l2',
                                      random_state=None, refit=True,
                                      scoring=None, solver='lbfgs', tol=0.0001,
                                      verbose=0))],
         verbose=False)

In [82]:
import pickle

# Export the fitted pipeline to a local pickle file.
# The `with` block closes the file handle even when `pickle.dump` raises, so a
# failed export can no longer leave the file open for the rest of the kernel
# session.
filename = 'ufk_lr_cv.pkl'
with open(filename, 'wb') as model_pkl:
    pickle.dump(ss_lr_cv_final, model_pkl)