In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the dataset from a CSV file; the path is relative to the notebook's
# working directory.
ds = pd.read_csv('./data/covtype.csv')

In [3]:
# Scaler instance (default settings) later fit on the first ten feature columns.
scale = StandardScaler()

In [4]:
# Feature matrix: every column of the dataset except the 'Cover_Type' target.
X = ds.drop(columns='Cover_Type')

In [5]:
# Standardize the first ten feature columns and wrap the result back into a
# DataFrame with the original column names.
# index=X.index keeps the scaled rows aligned with the remaining columns when
# they are concatenated column-wise; without it the new frame gets a fresh
# 0..n-1 index and the concat would misalign rows (introducing NaNs) whenever
# X's index is not the default one.
# NOTE(review): the scaler is fit on all rows before the train/test split, so
# test-set statistics feed into the transform — consider fitting on the
# training rows only.
scaled_num = pd.DataFrame(
    scale.fit_transform(X.iloc[:, :10]),
    columns=X.columns[:10],
    index=X.index,
)

In [6]:
# Reassemble the feature matrix: the ten scaled columns followed by the
# remaining (unscaled) columns, joined side by side.
X = pd.concat(
    objs=[scaled_num, X.iloc[:, 10:]],
    axis='columns',
)

In [7]:
# Target vector: the 'Cover_Type' column of the raw dataset.
y = ds.loc[:, 'Cover_Type']

In [11]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=650,
)

In [14]:
# Baseline random forest: default hyper-parameters, with a fixed seed so that
# training is repeatable.
rf = RandomForestClassifier(random_state=650)

In [40]:
# Show the forest's hyper-parameters (all defaults except random_state).
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 650,
 'verbose': 0,
 'warm_start': False}

In [33]:
# Train the baseline forest on the training split.
rf.fit(X_train, y_train)

In [36]:
# Predict labels for the held-out test rows with the baseline forest.
rf_pred = rf.predict(X_test)

In [38]:
# Test-set accuracy of the baseline forest.
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the value
# is the same either way, but passing the ground truth first follows the
# documented order and stays correct if an asymmetric metric is substituted.
accuracy_score(y_test, rf_pred)

0.952009133467964

In [41]:
# Hyper-parameter grid for the search: 1 x 2 x 2 x 1 = 4 candidates.
# 'bootstrap' is listed explicitly (at the estimator's default, True) so the
# grid matches the recorded search log and best_params_ output, both of which
# report bootstrap=True; the set of models evaluated is unchanged.
param_grid = {
    'bootstrap': [True],
    'min_samples_leaf': [1, 2],
    'min_samples_split': [2, 4],
    'n_estimators': [250],
}

In [42]:
# Exhaustive search over param_grid with 3-fold cross-validation, starting
# from the baseline forest; verbose=2 logs a line for each completed fit.
grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=3,
    verbose=2,
)

In [43]:
# Run the search on the training split: 4 candidates x 3 folds = 12 fits,
# each taking roughly two minutes in the recorded run.
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END bootstrap=True, min_samples_leaf=1, min_samples_split=2, n_estimators=250; total time= 1.9min
[CV] END bootstrap=True, min_samples_leaf=1, min_samples_split=2, n_estimators=250; total time= 1.9min
[CV] END bootstrap=True, min_samples_leaf=1, min_samples_split=2, n_estimators=250; total time= 1.9min
[CV] END bootstrap=True, min_samples_leaf=1, min_samples_split=4, n_estimators=250; total time= 1.8min
[CV] END bootstrap=True, min_samples_leaf=1, min_samples_split=4, n_estimators=250; total time= 1.9min
[CV] END bootstrap=True, min_samples_leaf=1, min_samples_split=4, n_estimators=250; total time= 2.1min
[CV] END bootstrap=True, min_samples_leaf=2, min_samples_split=2, n_estimators=250; total time= 2.0min
[CV] END bootstrap=True, min_samples_leaf=2, min_samples_split=2, n_estimators=250; total time= 1.8min
[CV] END bootstrap=True, min_samples_leaf=2, min_samples_split=2, n_estimators=250; total time= 1.7min
[CV] END boot

In [44]:
# Parameter combination with the best cross-validated score. In the recorded
# run the winning min_samples_leaf / min_samples_split equal the estimator
# defaults, so only n_estimators differs from the baseline forest.
grid_search.best_params_

{'bootstrap': True,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 250}

In [45]:
# Estimator built with the best parameters; with GridSearchCV's default
# refit=True it has been refit on the whole training split passed to fit().
best_rf = grid_search.best_estimator_

In [46]:
# Predict labels for the held-out test rows with the tuned forest.
best_rf_pred = best_rf.predict(X_test)

In [47]:
# Test-set accuracy of the tuned forest.
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the value
# is the same either way, but passing the ground truth first follows the
# documented order and stays correct if an asymmetric metric is substituted.
accuracy_score(y_test, best_rf_pred)

0.9525254727372866