In [1]:
################################################################################################
#  RUT-SOM-DATA-PT-06-2020-U-C                                                    Douglas High #
#   Machine-Learning-Challenge                                                November 7, 2020 #
#         > model_top3.ipynb                                                                   #
#    - run the top 3 models against the 1687 unknown 'candidate' records.                      #
#    - save each df with predictions to csv.                                                   #
#    > results                                                                                 #
#      - svc model only predicted 3 confirmed planets and 1684 false positive.                 #
#      - DecisionTree predicted 1665 confirmed and 22 false positive.                          #
#      - RandomForest predicted 1515 confirmed and 172 false positive.                         #
################################################################################################

In [2]:
#############################################################################
#00.UI     Update/Install                                                   #
#    a- install/check for update sklearn (to prevent version mismatches).   #
#    b- install joblib - used to save model (uncomment to install).         #
#       *** Restart Kernal after joblib install ***                         #
#############################################################################

#a
!pip install sklearn --upgrade

#b
# !pip install joblib

In [3]:
###########################
#00   I/O                 #
#   - import libraries.   #
#   - associate files.    #
###########################

import pandas as pd
from matplotlib import pyplot
import warnings; warnings.simplefilter('ignore')

data_in = "../data_in/exoplanet_data.csv"

In [4]:
##################################################################
#10     Read and remove                                          #
#   - read data into df.                                         #
#   - drop null columns if all values are null.                  #
#   - drop null rows.                                            #
#   - split data into known values (confirmed, false positive)   #
#     and unknown (candidate).                                   #
##################################################################

df = pd.read_csv(data_in)
df = df.dropna(axis='columns', how='all')
df = df.dropna()

known_df = df[df["koi_disposition"] != "CANDIDATE"]
unknown_df = df[df["koi_disposition"] == "CANDIDATE"]

In [5]:
##############################################################
#20     Set Features                                         #
#   a- select all columns to use as features                 #
#    - disposition is the dependent variable.                #
#   b- drop columns based on results from module0_compare.   #
##############################################################

#a
X = known_df.drop("koi_disposition", axis=1)
y = known_df["koi_disposition"]

#b
cols = [39,33,32,31,30,36,27,25,22,34,16,17,35,10,9]
dt_df = X.drop(X.columns[cols],axis=1)

cols = [12,20,7,9,29,5,23,8,31,27,13,21,37]
svc_df = X.drop(X.columns[cols],axis=1)

cols = [27,38,12,28,32,36,37,39,31,11,34,7,13,17,5]
rf_df = X.drop(X.columns[cols],axis=1)

In [6]:
#######################################################################
#30     Test/train split                                              #
#   - split known_df into training and testing data for each model.   #
#######################################################################

from sklearn.model_selection import train_test_split
dt_train, dt_test, y_train, y_test = train_test_split(dt_df, y, random_state=42)
svc_train, svc_test, y_train, y_test = train_test_split(svc_df, y, random_state=42)
rf_train, rf_test, y_train, y_test = train_test_split(rf_df, y, random_state=42)

In [7]:
#####################################################
#40     Scale                                       #
#   - create standard scaler data for dt and svc.   #
#####################################################

from sklearn.preprocessing import StandardScaler
dt_scaler = StandardScaler().fit(dt_train)
dt_train_scaled = dt_scaler.transform(dt_train)
dt_test_scaled = dt_scaler.transform(dt_test)

svc_scaler = StandardScaler().fit(svc_train)
svc_train_scaled = svc_scaler.transform(svc_train)
svc_test_scaled = svc_scaler.transform(svc_test)

In [8]:
#################################
#50    DecisionTree             #
#   a- create and fit model.    #
#   b- print score for model.   #
#################################

#a 
from sklearn import tree
clf = tree.DecisionTreeClassifier(ccp_alpha = 0.01, max_features = None, splitter = 'random')
clf = clf.fit(dt_train_scaled, y_train)

#b
train = clf.score(dt_train_scaled, y_train)
test = clf.score(dt_test_scaled, y_test)
avg = (train+test)/2
print (f"train:  {train}, test: {test}, avg {avg}")

train:  0.991955756661639, test: 0.9856711915535445, avg 0.9888134741075918


In [9]:
#################################
#60    RandomForest             #
#   a- create and fit model.    #
#   b- print score for model.   #
#################################

#a 
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(ccp_alpha = 0, max_features = None, n_estimators = 500, random_state = None)
rf = rf.fit(rf_train, y_train)

#b
train = rf.score(rf_train, y_train)
test = rf.score(rf_test, y_test)
avg = (train+test)/2
print (f"train:  {train}, test: {test}, avg {avg}")


train:  1.0, test: 0.9879336349924586, avg 0.9939668174962293


In [10]:
#################################
#70    SVC                      #
#   a- create and fit model.    #
#   b- print score for model.   #
#################################

#a 
from sklearn.svm import SVC 
model = SVC(C = 5, gamma = 0.0001, kernel = 'linear')
model.fit(svc_train_scaled, y_train)

#b
train = model.score(svc_train_scaled, y_train)
test = model.score(svc_test_scaled, y_test)
avg = (train+test)/2
print (f"train:  {train}, test: {test}, avg {avg}")

train:  0.9924585218702866, test: 0.9856711915535445, avg 0.9890648567119156


In [11]:
##################################################################
#150    DecisionTree vs Candidates                               #
#   a- scale unknown data and run model against it.              #
#   b- predict disposition and add column to df, write to csv.   #
##################################################################

#a
X_dt = unknown_df.drop("koi_disposition", axis=1)

cols = [39,33,32,31,30,36,27,25,22,34,16,17,35,10,9]
X_dt = X_dt.drop(X_dt.columns[cols],axis=1)

X_dt_scaler = StandardScaler().fit(X_dt)
X_dt_scaled = X_dt_scaler.transform(X_dt)

#b
predictions_dt = clf.predict(X_dt_scaled)
X_dt["koi_predicted"] = clf.predict(X_dt_scaled)
X_dt.to_csv("../data_out/dt.csv", index=False, header=True)

In [12]:
##################################################################
#160    RandomForest vs Candidates                               #
#   a- run model against unknown data.                           #
#   b- predict disposition and add column to df, write to csv.   #
##################################################################

#a
X_rf = unknown_df.drop("koi_disposition", axis=1)

cols = [27,38,12,28,32,36,37,39,31,11,34,7,13,17,5]
X_rf = X_rf.drop(X_rf.columns[cols],axis=1)

#b
predictions_rf = rf.predict(X_rf)
X_rf["koi_predicted"] = rf.predict(X_rf)
X_rf.to_csv("../data_out/rf.csv", index=False, header=True)

In [13]:
##################################################################
#150    SVC vs Candidates                                        #
#   a- scale unknown data and run model against it.              #
#   b- predict disposition and add column to df, write to csv.   #
##################################################################

#a
X_svc = unknown_df.drop("koi_disposition", axis=1)

cols = [12,20,7,9,29,5,23,8,31,27,13,21,37]
X_svc = X_svc.drop(X_svc.columns[cols],axis=1)

X_svc_scaler = StandardScaler().fit(X_svc)
X_svc_scaled = X_svc_scaler.transform(X_svc)

#b
predictions_svc = model.predict(X_svc_scaled)
X_svc["koi_predicted"] = model.predict(X_svc_scaled)
X_svc.to_csv("../data_out/svc.csv", index=False, header=True)