In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in /opt/anaconda3/lib/python3.7/site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd

import numpy as np


import warnings
warnings.simplefilter('ignore', FutureWarning)

# Data Pre-Processing

In [4]:
# Load and Clean Data

df = pd.read_csv("exoplanet_data.csv")

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()


# Set features(X) and target(y) values
X = df.drop("koi_disposition", axis=1)
y = df["koi_disposition"]
print(X.shape, y.shape)


(6991, 40) (6991,)


In [5]:
# Train and Test split data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
3563,0,0,0,0,10.548413,5.47e-05,-5.47e-05,139.06402,0.00411,-0.00411,...,-133,4.387,0.066,-0.123,1.092,0.181,-0.097,298.09543,44.737061,13.204
4099,0,0,0,0,24.754385,0.0001365,-0.0001365,140.20732,0.00446,-0.00446,...,-144,4.519,0.078,-0.052,0.804,0.056,-0.076,295.73535,42.576248,15.514
5460,0,0,0,0,1.057336,1.23e-07,-1.23e-07,131.792007,9.6e-05,-9.6e-05,...,-140,4.594,0.054,-0.027,0.683,0.054,-0.06,292.18417,49.31004,15.414
1091,0,0,0,0,201.118319,0.001461,-0.001461,187.56986,0.00529,-0.00529,...,-112,4.447,0.072,-0.108,0.954,0.135,-0.083,283.11377,48.13139,13.328
5999,0,0,0,0,91.649983,0.003181,-0.003181,175.7156,0.0286,-0.0286,...,-233,4.145,0.164,-0.164,1.608,0.905,-0.383,294.93198,39.81242,12.964


In [6]:
# MinMaxScaler
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_scaled = label_encoder.transform(y_train)
y_test_scaled = label_encoder.transform(y_test)


# Create and Train(Fit) Model

### First Classifier

In [7]:
# Create a DecisionTree Model
from sklearn import tree
clf1 = tree.DecisionTreeClassifier()


In [8]:
# Fit (train) or model - First Classifier
clf1 = clf1.fit(X_train, y_train)
clf1.score(X_test, y_test)


0.8661327231121282

### Second Classifier

In [9]:
# Create a DecisionTree Model
from sklearn import tree
clf2 = tree.DecisionTreeClassifier()


In [10]:
# Fit (train) or model - Second Classifier
clf2 = clf2.fit(X_train_scaled, y_train)
clf2.score(X_train_scaled, y_train)


1.0

# Hyperparameter Tuning - GridSearchCV 

In [11]:
# Source Link: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

from sklearn.model_selection import GridSearchCV


param_grid = {
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12]}

grid = GridSearchCV(estimator = clf1, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [12]:
# Fit the model 

grid.fit(X_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed:    5.1s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [80, 90, 100, 110],
                         'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5],


## Quantify our Trained Model

In [13]:
# List the best parameters for this dataset
# List the best score

print(grid.best_params_)
print('Best Grid score: %.3f' % grid.best_score_)

{'max_depth': 90, 'max_features': 3, 'min_samples_leaf': 5, 'min_samples_split': 12}
Best Grid score: 0.790


In [16]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test)

print('Decision Tree')
print('Test Accuracy: %.3f' % grid.score(X_test, y_test))

# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

Decision Tree
Test Accuracy: 0.799
                precision    recall  f1-score   support

     CANDIDATE       0.59      0.63      0.61       404
     CONFIRMED       0.73      0.76      0.75       435
FALSE POSITIVE       0.94      0.89      0.91       909

      accuracy                           0.80      1748
     macro avg       0.75      0.76      0.76      1748
  weighted avg       0.81      0.80      0.80      1748



#  Save the Model (If the best)

In [17]:
# ## Create a file for your best model and push to GitHub
# import joblib
# filename = 'decision-tree.sav'
# joblib.dump(clf1, filename)

