In [17]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade
# install joblib and restart the kernel after installing
!pip install joblib

Requirement already up-to-date: sklearn in /Users/Elliot/anaconda3/lib/python3.7/site-packages (0.0)


In [18]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Read the CSV and Perform Basic Data Cleaning

In [19]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [20]:
# Remove Space for `FALSE POSITIVE` category
new_df = df["koi_disposition"] == "FALSE POSITIVE"
df.loc[new_df, "koi_disposition"] = "False_Positive"
df["koi_disposition"]

0            CONFIRMED
1       False_Positive
2       False_Positive
3            CONFIRMED
4            CONFIRMED
5            CONFIRMED
6            CONFIRMED
7            CONFIRMED
8            CONFIRMED
9            CONFIRMED
10           CONFIRMED
11      False_Positive
12      False_Positive
13      False_Positive
14           CONFIRMED
15           CONFIRMED
16      False_Positive
17           CONFIRMED
18      False_Positive
19           CONFIRMED
20           CONFIRMED
21           CONFIRMED
22      False_Positive
23      False_Positive
24      False_Positive
25           CONFIRMED
26           CONFIRMED
27           CONFIRMED
28           CONFIRMED
29           CANDIDATE
             ...      
6961         CANDIDATE
6962    False_Positive
6963    False_Positive
6964    False_Positive
6965    False_Positive
6966    False_Positive
6967         CANDIDATE
6968    False_Positive
6969    False_Positive
6970    False_Positive
6971    False_Positive
6972         CONFIRMED
6973    Fal

# Create a Train Test Split

In [21]:
# Assign X and y values
y = df["koi_disposition"]
X = df.drop(columns=["koi_disposition"])

# Split the data into training and testing data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [22]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
4002,0,0,1,0,99.673478,0.0003463,-0.0003463,219.33483,0.0023,-0.0023,...,-148,4.777,0.04,-0.027,0.492,0.026,-0.027,293.05801,45.248821,15.801
4246,0,1,0,0,0.592244,9e-08,-9e-08,131.654831,0.000124,-0.000124,...,-146,4.664,0.056,-0.032,0.591,0.045,-0.045,290.28094,45.46426,15.653
548,0,1,1,0,9.991625,5.36e-06,-5.36e-06,137.447816,0.000445,-0.000445,...,-176,4.338,0.153,-0.187,1.096,0.309,-0.206,301.04239,45.022888,14.039
3953,0,1,0,0,178.41299,3.1e-05,-3.1e-05,218.225235,0.000127,-0.000127,...,-134,4.346,0.084,-0.126,1.148,0.202,-0.124,288.32785,38.627621,13.944
2362,0,0,0,0,45.294223,5.6e-05,-5.6e-05,138.678725,0.000987,-0.000987,...,-68,4.347,0.03,-0.03,1.044,0.057,-0.042,285.67938,50.241299,10.961


# Pre-processing

Scale the data using the MinMaxScaler

In [23]:
# Fit the data to the MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Random Forest Model

In [24]:
from sklearn.ensemble import RandomForestClassifier

# Create the model
model = RandomForestClassifier(n_estimators=300)

# Train the model with G
model.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [25]:
# Score the model
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.8947368421052632


# Hyperparameter Tuning
Use 'GridSearchCV' to tune model's parameters

In [26]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators':[200, 400, 600, 800, 1000],
             'max_depth':[5, 8, 15, 25, 30],
             'min_samples_split':[2, 5, 10],
             'min_samples_leaf':[1, 2, 4]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [27]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 225 candidates, totalling 675 fits
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, score=0.8655606407322655, total=   2.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.5s remaining:    0.0s


[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, score=0.8609839816933639, total=   2.3s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.0s remaining:    0.0s


[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, score=0.8637664567830566, total=   2.1s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=400 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=400, score=0.868421052631579, total=   3.1s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=400 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=400, score=0.8604118993135011, total=   3.3s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=400 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=400, score=0.8637664567830566, total=   2.6s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=600 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=600, score=0.868421052631579, total=   3.8s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=600 
[CV]  max_depth=5, min_samp

[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=1000, score=0.8631940469376074, total=   7.1s
[CV] max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=200, score=0.870137299771167, total=   1.6s
[CV] max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=200, score=0.8609839816933639, total=   1.5s
[CV] max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=200, score=0.86090440755581, total=   1.5s
[CV] max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=400 
[CV]  max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=400, score=0.8695652173913043, total=   2.8s
[CV] max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=400 
[CV]  max_depth=5, min_sam

[CV]  max_depth=5, min_samples_leaf=2, min_samples_split=10, n_estimators=800, score=0.864338866628506, total=   5.7s
[CV] max_depth=5, min_samples_leaf=2, min_samples_split=10, n_estimators=1000 
[CV]  max_depth=5, min_samples_leaf=2, min_samples_split=10, n_estimators=1000, score=0.8661327231121282, total=   7.0s
[CV] max_depth=5, min_samples_leaf=2, min_samples_split=10, n_estimators=1000 
[CV]  max_depth=5, min_samples_leaf=2, min_samples_split=10, n_estimators=1000, score=0.8604118993135011, total=   7.0s
[CV] max_depth=5, min_samples_leaf=2, min_samples_split=10, n_estimators=1000 
[CV]  max_depth=5, min_samples_leaf=2, min_samples_split=10, n_estimators=1000, score=0.8654836863194046, total=   6.8s
[CV] max_depth=5, min_samples_leaf=4, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=4, min_samples_split=2, n_estimators=200, score=0.8667048054919908, total=   1.4s
[CV] max_depth=5, min_samples_leaf=4, min_samples_split=2, n_estimators=200 
[CV]  max_dep

[CV]  max_depth=5, min_samples_leaf=4, min_samples_split=10, n_estimators=600, score=0.8649112764739554, total=   4.4s
[CV] max_depth=5, min_samples_leaf=4, min_samples_split=10, n_estimators=800 
[CV]  max_depth=5, min_samples_leaf=4, min_samples_split=10, n_estimators=800, score=0.8707093821510298, total=   5.6s
[CV] max_depth=5, min_samples_leaf=4, min_samples_split=10, n_estimators=800 
[CV]  max_depth=5, min_samples_leaf=4, min_samples_split=10, n_estimators=800, score=0.8615560640732265, total=   5.7s
[CV] max_depth=5, min_samples_leaf=4, min_samples_split=10, n_estimators=800 
[CV]  max_depth=5, min_samples_leaf=4, min_samples_split=10, n_estimators=800, score=0.8666285060103034, total=   5.6s
[CV] max_depth=5, min_samples_leaf=4, min_samples_split=10, n_estimators=1000 
[CV]  max_depth=5, min_samples_leaf=4, min_samples_split=10, n_estimators=1000, score=0.8655606407322655, total=   6.9s
[CV] max_depth=5, min_samples_leaf=4, min_samples_split=10, n_estimators=1000 
[CV]  max_de

[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=400, score=0.8866628506010303, total=   3.6s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=600 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=600, score=0.8724256292906178, total=   6.2s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=600 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=600, score=0.8781464530892449, total=   5.5s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=600 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=600, score=0.886090440755581, total=   5.4s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=800 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=800, score=0.8729977116704806, total=   7.1s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=800 
[CV]  max_depth=

[CV]  max_depth=8, min_samples_leaf=2, min_samples_split=10, n_estimators=200, score=0.8855180309101317, total=   1.9s
[CV] max_depth=8, min_samples_leaf=2, min_samples_split=10, n_estimators=400 
[CV]  max_depth=8, min_samples_leaf=2, min_samples_split=10, n_estimators=400, score=0.8752860411899314, total=   3.6s
[CV] max_depth=8, min_samples_leaf=2, min_samples_split=10, n_estimators=400 
[CV]  max_depth=8, min_samples_leaf=2, min_samples_split=10, n_estimators=400, score=0.881578947368421, total=   3.5s
[CV] max_depth=8, min_samples_leaf=2, min_samples_split=10, n_estimators=400 
[CV]  max_depth=8, min_samples_leaf=2, min_samples_split=10, n_estimators=400, score=0.8809387521465369, total=   3.6s
[CV] max_depth=8, min_samples_leaf=2, min_samples_split=10, n_estimators=600 
[CV]  max_depth=8, min_samples_leaf=2, min_samples_split=10, n_estimators=600, score=0.8747139588100686, total=   5.3s
[CV] max_depth=8, min_samples_leaf=2, min_samples_split=10, n_estimators=600 
[CV]  max_depth=

[CV]  max_depth=8, min_samples_leaf=4, min_samples_split=5, n_estimators=1000, score=0.8855180309101317, total=   9.1s
[CV] max_depth=8, min_samples_leaf=4, min_samples_split=10, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=4, min_samples_split=10, n_estimators=200, score=0.8770022883295194, total=   1.8s
[CV] max_depth=8, min_samples_leaf=4, min_samples_split=10, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=4, min_samples_split=10, n_estimators=200, score=0.8787185354691075, total=   1.7s
[CV] max_depth=8, min_samples_leaf=4, min_samples_split=10, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=4, min_samples_split=10, n_estimators=200, score=0.8849456210646823, total=   1.8s
[CV] max_depth=8, min_samples_leaf=4, min_samples_split=10, n_estimators=400 
[CV]  max_depth=8, min_samples_leaf=4, min_samples_split=10, n_estimators=400, score=0.8741418764302059, total=   3.6s
[CV] max_depth=8, min_samples_leaf=4, min_samples_split=10, n_estimators=400 
[CV]  max_depth

[CV]  max_depth=15, min_samples_leaf=1, min_samples_split=5, n_estimators=800, score=0.8969662278191185, total=   9.2s
[CV] max_depth=15, min_samples_leaf=1, min_samples_split=5, n_estimators=1000 
[CV]  max_depth=15, min_samples_leaf=1, min_samples_split=5, n_estimators=1000, score=0.8918764302059496, total=  11.7s
[CV] max_depth=15, min_samples_leaf=1, min_samples_split=5, n_estimators=1000 
[CV]  max_depth=15, min_samples_leaf=1, min_samples_split=5, n_estimators=1000, score=0.8930205949656751, total=  11.6s
[CV] max_depth=15, min_samples_leaf=1, min_samples_split=5, n_estimators=1000 
[CV]  max_depth=15, min_samples_leaf=1, min_samples_split=5, n_estimators=1000, score=0.8941041785918717, total=  11.5s
[CV] max_depth=15, min_samples_leaf=1, min_samples_split=10, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=1, min_samples_split=10, n_estimators=200, score=0.8838672768878718, total=   2.2s
[CV] max_depth=15, min_samples_leaf=1, min_samples_split=10, n_estimators=200 
[CV]  

[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=5, n_estimators=600, score=0.8941041785918717, total=   6.8s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=5, n_estimators=800 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=5, n_estimators=800, score=0.8895881006864989, total=   9.1s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=5, n_estimators=800 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=5, n_estimators=800, score=0.8907322654462243, total=   9.1s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=5, n_estimators=800 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=5, n_estimators=800, score=0.8958214081282198, total=   9.9s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=5, n_estimators=1000 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=5, n_estimators=1000, score=0.8901601830663616, total=  11.6s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=5, n_estimators=1000 
[CV]  max_de

[CV]  max_depth=15, min_samples_leaf=4, min_samples_split=5, n_estimators=400, score=0.8952489982827705, total=   4.7s
[CV] max_depth=15, min_samples_leaf=4, min_samples_split=5, n_estimators=600 
[CV]  max_depth=15, min_samples_leaf=4, min_samples_split=5, n_estimators=600, score=0.8850114416475973, total=   7.8s
[CV] max_depth=15, min_samples_leaf=4, min_samples_split=5, n_estimators=600 
[CV]  max_depth=15, min_samples_leaf=4, min_samples_split=5, n_estimators=600, score=0.8861556064073226, total=   6.7s
[CV] max_depth=15, min_samples_leaf=4, min_samples_split=5, n_estimators=600 
[CV]  max_depth=15, min_samples_leaf=4, min_samples_split=5, n_estimators=600, score=0.8975386376645679, total=   6.7s
[CV] max_depth=15, min_samples_leaf=4, min_samples_split=5, n_estimators=800 
[CV]  max_depth=15, min_samples_leaf=4, min_samples_split=5, n_estimators=800, score=0.8872997711670481, total=   8.6s
[CV] max_depth=15, min_samples_leaf=4, min_samples_split=5, n_estimators=800 
[CV]  max_depth

[CV]  max_depth=25, min_samples_leaf=1, min_samples_split=5, n_estimators=200, score=0.8946765884373211, total=   2.4s
[CV] max_depth=25, min_samples_leaf=1, min_samples_split=5, n_estimators=400 
[CV]  max_depth=25, min_samples_leaf=1, min_samples_split=5, n_estimators=400, score=0.8901601830663616, total=   4.8s
[CV] max_depth=25, min_samples_leaf=1, min_samples_split=5, n_estimators=400 
[CV]  max_depth=25, min_samples_leaf=1, min_samples_split=5, n_estimators=400, score=0.8935926773455377, total=   5.0s
[CV] max_depth=25, min_samples_leaf=1, min_samples_split=5, n_estimators=400 
[CV]  max_depth=25, min_samples_leaf=1, min_samples_split=5, n_estimators=400, score=0.9026903262736119, total=   5.2s
[CV] max_depth=25, min_samples_leaf=1, min_samples_split=5, n_estimators=600 
[CV]  max_depth=25, min_samples_leaf=1, min_samples_split=5, n_estimators=600, score=0.8941647597254004, total=   8.1s
[CV] max_depth=25, min_samples_leaf=1, min_samples_split=5, n_estimators=600 
[CV]  max_depth

[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=2, n_estimators=1000, score=0.8992558672009159, total=  12.0s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=5, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=5, n_estimators=200, score=0.8895881006864989, total=   2.4s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=5, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=5, n_estimators=200, score=0.8872997711670481, total=   2.4s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=5, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=5, n_estimators=200, score=0.8946765884373211, total=   2.4s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=5, n_estimators=400 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=5, n_estimators=400, score=0.8918764302059496, total=   4.7s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=5, n_estimators=400 
[CV]  max_dept

[CV]  max_depth=25, min_samples_leaf=4, min_samples_split=2, n_estimators=800, score=0.8998282770463651, total=   9.7s
[CV] max_depth=25, min_samples_leaf=4, min_samples_split=2, n_estimators=1000 
[CV]  max_depth=25, min_samples_leaf=4, min_samples_split=2, n_estimators=1000, score=0.8867276887871853, total=  10.9s
[CV] max_depth=25, min_samples_leaf=4, min_samples_split=2, n_estimators=1000 
[CV]  max_depth=25, min_samples_leaf=4, min_samples_split=2, n_estimators=1000, score=0.8832951945080092, total=  11.1s
[CV] max_depth=25, min_samples_leaf=4, min_samples_split=2, n_estimators=1000 
[CV]  max_depth=25, min_samples_leaf=4, min_samples_split=2, n_estimators=1000, score=0.8986834573554665, total=  10.9s
[CV] max_depth=25, min_samples_leaf=4, min_samples_split=5, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=4, min_samples_split=5, n_estimators=200, score=0.8924485125858124, total=   2.2s
[CV] max_depth=25, min_samples_leaf=4, min_samples_split=5, n_estimators=200 
[CV]  max

[CV]  max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=600, score=0.8981110475100171, total=   6.6s
[CV] max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=800 
[CV]  max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=800, score=0.8958810068649885, total=   8.8s
[CV] max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=800 
[CV]  max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=800, score=0.8947368421052632, total=   8.9s
[CV] max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=800 
[CV]  max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=800, score=0.8992558672009159, total=   8.9s
[CV] max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=1000 
[CV]  max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=1000, score=0.8935926773455377, total=  11.0s
[CV] max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=1000 
[CV]  max_de

[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=400, score=0.9015455065827133, total=   4.3s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=600 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=600, score=0.8895881006864989, total=   6.4s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=600 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=600, score=0.8867276887871853, total=   6.4s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=600 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=600, score=0.8992558672009159, total=   6.4s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=800 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=800, score=0.8924485125858124, total=   8.5s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=800 
[CV]  max_depth

[CV]  max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=200, score=0.8963938179736691, total=   2.0s
[CV] max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=400 
[CV]  max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=400, score=0.8884439359267735, total=   4.0s
[CV] max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=400 
[CV]  max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=400, score=0.88558352402746, total=   4.0s
[CV] max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=400 
[CV]  max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=400, score=0.8958214081282198, total=   4.0s
[CV] max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=600 
[CV]  max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=600, score=0.8890160183066361, total=   5.9s
[CV] max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=600 
[CV]  max_depth=3

[CV]  max_depth=30, min_samples_leaf=4, min_samples_split=10, n_estimators=1000, score=0.8935317687464225, total=  11.8s


[Parallel(n_jobs=1)]: Done 675 out of 675 | elapsed: 71.0min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [200, 400, 600, 800, 1000], 'max_depth': [5, 8, 15, 25, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

# Model Evaluation

In [28]:
print(grid.best_params_)
print(grid.best_score_)

{'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 800}
0.8966240701888232


# Model Score

In [29]:
# Score the model
grid.score(X_test_scaled, y_test)

0.8935926773455377

# Save the Model

In [30]:
# Save the fitted model to file
import joblib
filename = 'random_forest_model.sav'
joblib.dump(grid, filename)

['random_forest_model.sav']