# RandomForest Exoplanet Classification using Cumulative Object of Interest Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

pd.set_option('display.max_row', 100)
pd.set_option('display.max_columns', 75)

## RandomForest Classifier


In [2]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn import metrics as mt
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, cross_validate

Read pickle files from Data Prep.

In [3]:
#Read Pickle Files
#try:
#    del df_rfecvTrainFillX, df_rfecvTestFillX, df_rfecvTrainMiceX, df_rfecvTestMiceX, topExoTrainFillX, topExoTestFillX, topExoTrainMiceX, topExoTestMiceX
#except Exception as e:
#    print(e)
    
df_scaledTrainFillX = pd.read_pickle("./pkl/df_scaledTrainFillX.pkl")
df_scaledTestFillX = pd.read_pickle("./pkl/df_scaledTestFillX.pkl")
df_scaledTrainMiceX = pd.read_pickle("./pkl/df_scaledTrainMiceX.pkl")
df_scaledTestMiceX = pd.read_pickle("./pkl/df_scaledTestMiceX.pkl")

exoTestFillX = pd.read_pickle("./pkl/exoTestFillX.pkl")
exoTrainFillX = pd.read_pickle("./pkl/exoTrainFillX.pkl")
exoTestMiceX = pd.read_pickle("./pkl/exoTestMiceX.pkl")
exoTrainMiceX = pd.read_pickle("./pkl/exoTrainMiceX.pkl")
exoTrainX = pd.read_pickle('./pkl/exoTrainX.pkl')
exoTestX = pd.read_pickle('./pkl/exoTestX.pkl')

#Response Variables
exoTrainFillY = pd.read_pickle("./pkl/exoTrainFillY.pkl")
exoTestFillY = pd.read_pickle("./pkl/exoTestFillY.pkl")
exoTrainMiceY = pd.read_pickle("./pkl/exoTrainMiceY.pkl")
exoTestMiceY = pd.read_pickle("./pkl/exoTestMiceY.pkl")
exoTestY = pd.read_pickle("./pkl/exoTestY.pkl")
exoTrainY = pd.read_pickle("./pkl/exoTrainY.pkl")

In [4]:
#exoTestFillX.drop(['koi_count'], inplace=True, axis=1)
#exoTrainFillX.drop(['koi_count'], inplace=True, axis=1)

#### RandomForest Classifier Function

In [5]:
def runModelCV(X, y, cv, clf, varImp = True):
    imps = list()
    accList = list()
    f1List = list()
    recList = list()
    i = 1
    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]  
        clf.fit(X_train, y_train)
        acc = clf.score(X_test, y_test)
        accList.append(acc)
        print("Model", i, "Accuracy:", acc)
        imps.append(clf.feature_importances_)
        i += 1
    
    print("Mean Accuracy Across Models: ",np.mean(np.asarray(accList)))
    
#    y_hat = clf.predict(X_test)
#    print("======= RandomForest Classifier Scoring =======")
#    acc = mt.accuracy_score(y_test, y_hat)
#    conf = mt.confusion_matrix(y_test,y_hat)
#    prec = mt.precision_score(y_test, y_hat)
#    recall = mt.recall_score(y_test, y_hat)
#    f1 = mt.f1_score(y_test, y_hat)
#    rocAuc = mt.roc_auc_score(y_test, y_hat)
#    print('accuracy:', acc.round(4))
#    print('precision:', prec.round(4))
#    print('recall:', recall.round(4))
#    print('F1:', f1.round(4))
#    print('ROC AUC:', rocAuc.round(4))
#    print(conf)    
    
    if(varImp):
        fig, ax = plt.subplots(figsize=(15,10))
        im = ax.imshow(imps)

        ax.set_xticks(np.arange(X.shape[1]))
        ax.set_yticks(np.arange(i - 1))
        ax.set_xticklabels(list(X))
        ax.set_yticklabels(np.arange(i) + 1)
        ax.set_xlabel("Feature")
        ax.set_ylabel("Model")
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
                 rotation_mode="anchor")
        ax.set_title("Comparison of Variable Importance Across CV Runs")
        fig.tight_layout()
        plt.show()

#### GridSearchCV

In [None]:
%%time

#GridSearchCV SVC
from sklearn.model_selection import GridSearchCV

#Create Cross Validation Objects
#cvss = ShuffleSplit(n_splits = 10, test_size=0.20, random_state=0)
cvsss = StratifiedShuffleSplit(n_splits = 5, test_size=0.20, random_state=0)
cv = StratifiedKFold(n_splits=5)

rfEst = RandomForestClassifier()

parameters = { 'n_estimators':[100, 200, 300, 400, 500]
              ,'max_features': [None, "auto", "sqrt", "log2"]
              #,'min_samples_split': [20, 50, 100, 200]
              , 'max_depth': [5, 10, 15]
              , 'min_samples_leaf': [5, 10, 20]
              ,'random_state': [0]
             }

#Create a grid search object using the  
regGridSearch = GridSearchCV(estimator=rfEst
                   , n_jobs=-1 # jobs to run in parallel
                   , verbose=1 # low verbosity
                   , param_grid=parameters
                   #, cv=cv
                   , cv=cvsss
                   , scoring='recall') #'accuracy' 'f1' 'recall'


CPU times: user 204 µs, sys: 26 µs, total: 230 µs
Wall time: 234 µs


In [None]:
#Perform hyperparameter search to find the best combination of parameters for our data
regGridSearch.fit(exoTrainFillX, exoTrainFillY)
print(regGridSearch.best_params_, regGridSearch.best_score_)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


###### SAVED:  Stratified K Fold = 10 splits
Fitting 10 folds for each of 240 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 32.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 39.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 46.9min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 97.2min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 147.3min
[Parallel(n_jobs=-1)]: Done 2400 out of 2400 | elapsed: 169.9min finished

{'max_features': 'auto', 'max_depth': 10, 'min_samples_split': 20, 'n_estimators': 100, 'random_state': 0} 0.991454188848417

CPU times: user 34.9 s, sys: 3.07 s, total: 38 s
Wall time: 2h 49min 54s

##### SAVED:  StratifiedShuffleSplit 5 splits
Fitting 5 folds for each of 180 candidates, totalling 900 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 13.5min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 31.0min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 52.5min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 55.4min finished

{'min_samples_leaf': 5, 'max_features': None, 'max_depth': 15, 'random_state': 0, 'n_estimators': 400} 0.9856521739130435

CPU times: user 1min 4s, sys: 946 ms, total: 1min 5s
Wall time: 56min 16s

In [None]:
#regGridSearch.fit(exoTrainMiceX, exoTrainMiceY)
#print(regGridSearch.best_params_, regGridSearch.best_score_)
regGridSearch.best_params_.values()

### RandomForest Modeling for Different Missing Value Strategies

In [None]:
%%time
print("Zero Filled")
runModelCV(exoTrainFillX, exoTrainFillY, StratifiedShuffleSplit(n_splits = 10, test_size = 0.3, train_size=0.7, random_state=0),
          RandomForestClassifier(min_samples_leaf=10, max_features=None, n_estimators=400, max_depth=8, random_state=0, n_jobs=-1))

In [None]:
%%time
print("KNN Imputation")
runModelCV(exoTrainMiceX, exoTrainMiceY, StratifiedShuffleSplit(n_splits = 10, test_size = 0.3, train_size=0.7, random_state=0),
          RandomForestClassifier(min_samples_leaf=5, max_features=None, n_estimators=400, max_depth=15, random_state=0, n_jobs=-1))

In [None]:
%%time
print("Scaled Zero Filled")
runModelCV(df_scaledTrainFillX, exoTrainFillY, StratifiedShuffleSplit(n_splits = 10, test_size = 0.3, train_size=0.7, random_state=0),
          RandomForestClassifier(min_samples_leaf=5, max_features=None, n_estimators=400, max_depth=15, random_state=0, n_jobs=-1))

In [None]:
%%time
print("Scaled KNN Imputation")
runModelCV(df_scaledTrainMiceX, exoTrainMiceY, StratifiedShuffleSplit(n_splits = 10, test_size = 0.3, train_size=0.7, random_state=0),
          RandomForestClassifier(min_samples_leaf=5, max_features=None, n_estimators=400, max_depth=15, random_state=0, n_jobs=-1))

### Generate Graphics and Predictions

In [None]:
#From https://towardsdatascience.com/how-to-visualize-a-decision-tree-from-a-random-forest-in-python-using-scikit-learn-38ad2d75f21c

from sklearn.tree import export_graphviz

#rfClf = RandomForestClassifier(n_estimators=400, max_depth=9, min_samples_split=100, random_state=0,  n_jobs=-1)
#rfClf = RandomForestClassifier(max_features='auto', n_estimators=100, max_depth=10, min_samples_split=20, random_state=0,  n_jobs=-1)
rfClf = RandomForestClassifier(min_samples_leaf=10, max_features=None, n_estimators=300, max_depth=8, random_state=0, n_jobs=-1)

rfClf.fit(exoTrainFillX, exoTrainFillY)
rfYHat = rfClf.predict(exoTrainFillX)
acc = mt.accuracy_score(exoTrainFillY, rfYHat)
conf = mt.confusion_matrix(exoTrainFillY,rfYHat)
prec = mt.precision_score(exoTrainFillY, rfYHat)
recall = mt.recall_score(exoTrainFillY, rfYHat)
f1 = mt.f1_score(exoTrainFillY, rfYHat)
rocAuc = mt.roc_auc_score(exoTrainFillY, rfYHat)
print('accuracy:', acc.round(4))
print('precision:', prec.round(4))
print('recall:', recall.round(4))
print('F1:', f1.round(4))
print('ROC AUC:', rocAuc.round(4))
print(conf)    


In [None]:
#Feature Importance Bar Plot
fi = pd.DataFrame({'feature': list(exoTrainFillX.columns),
                   'importance': rfClf.feature_importances_}).\
                    sort_values('importance', ascending = False)

print(fi.describe())
fi = fi[fi.importance > 0.001194]
    
plt.figure(figsize=(10,8))
ax = sns.barplot(x='importance', y='feature', data=fi)

In [None]:
export_graphviz(rfClf.estimators_[5], out_file='tree.dot', feature_names=exoTrainFillX.columns, 
                class_names=["FALSE POSITIVE", "EXOPLANET"], rounded=True, proportion=False, precision=2, filled=True)

# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')

In [None]:
#rfClf.feature_importances_
rfYHat

In [None]:
yHat = rfClf.predict_proba(exoTestFillX)

In [None]:
yHat = pd.DataFrame(yHat)
print("Greater Than 90%", np.count_nonzero(np.where(yHat[1] > 0.9)))
print("Greater Than 95%", np.count_nonzero(np.where(yHat[1] > 0.95)))

In [None]:
yHat.describe()