# RandomForest HyperParameter Tuning

https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74



In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import f1_score
from sklearn.compose import ColumnTransformer

from pprint import pprint
import pandas as pd
import numpy as np
import random


forest = RandomForestClassifier(random_state = 91)

print('Defaults arameters:')
pprint(forest.get_params())


Defaults arameters:
{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 91,
 'verbose': 0,
 'warm_start': False}


**Parameters to tune**

- n_estimators = number of trees in the foreset
- max_features = max number of features considered for splitting a node
- max_depth = max number of levels in each decision tree
- min_samples_split = min number of data points placed in a node before the node is split
- min_samples_leaf = min number of data points allowed in a leaf node
- bootstrap = method for sampling data points (with or without replacement)


# Loading our data 

- Selected features and model from model exercice (see **model classif selection.ipynb**)


In [2]:
df = pd.read_csv("bigtable.csv")

def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep]#.astype(np.float64).

df = clean_dataset(df).reset_index()

features = df[[
    'x', 
    'y', 
    'dayofweek', 
    'sin_day', 
    'cos_day',
    'sin_year', 
    'cos_year', 
    'TEMP', 
    'cos_wind', 
    'sin_wind', 
    'Wind-Rate', 
    'DEW', 
    'SKY', 
    'VIS', 
    'ATM'
]].astype(np.float64)

features.loc[:,'dayofweek'] = features['dayofweek'].astype('category')

numericColumns = ['x','y',
    'dayofweek', 'sin_day', 'cos_day', 'sin_year', 'cos_year', 
    'TEMP', 'cos_wind', 'sin_wind', 'Wind-Rate', 'DEW', 'SKY', 'VIS',  'ATM'
    ]
categoricalColumns = ['dayofweek']

gs = df[['station_id']]

labels = df[[
    'pm25',
    'AQI_VALUE', #pm25 transformed using EPA methodology
    'AQI_class'  #pm25 transformed into EPA categorical class
]]

labels.loc[:,"polluted"] = (labels.loc[:,"AQI_class"] != "Good")

y=labels["polluted"]
X=features

def tts_gs(X, y, gs, test_size):
    stations = gs["station_id"].unique()
    nb_stations = len(stations) * test_size
    print(nb_stations)
    my_randoms = random.sample(list(stations), int(nb_stations))
    filters = gs["station_id"].isin(my_randoms)
    print('Test substations selected: ')
    print(my_randoms)
    return X[~filters], X[filters], y[~filters], y[filters]

Xtrain, Xtest, ytrain, ytest = tts_gs(X, y, gs, 0.25)

3.25
Test substations selected: 
['DC_EPA', 'McMillan 1', '14th & S ST NW A']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [3]:
y = LabelEncoder().fit_transform(y)

In [4]:

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 140, num = 12)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 5, 8]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 5]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'estimator__n_estimators': n_estimators,
               'estimator__max_features': max_features,
               'estimator__max_depth': max_depth,
               'estimator__min_samples_split': min_samples_split,
               'estimator__min_samples_leaf': min_samples_leaf,
               'estimator__bootstrap': bootstrap}

pprint(random_grid)

{'estimator__bootstrap': [True, False],
 'estimator__max_depth': [10,
                          21,
                          33,
                          45,
                          57,
                          69,
                          80,
                          92,
                          104,
                          116,
                          128,
                          140,
                          None],
 'estimator__max_features': ['auto', 'sqrt'],
 'estimator__min_samples_leaf': [1, 2, 3, 5],
 'estimator__min_samples_split': [2, 3, 5, 8],
 'estimator__n_estimators': [100,
                             222,
                             344,
                             466,
                             588,
                             711,
                             833,
                             955,
                             1077,
                             1200]}


In [5]:
# Use the random grid to search for best hyperparameters
numeric_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

#categorical_features = categoricalColumns
categorical_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

model = Pipeline([
     ("ColumnTransformer", ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numericColumns),
            ('cat', categorical_transformer, categoricalColumns)
        ])),
     ('estimator', RandomForestClassifier(random_state = 91))
])


# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 200, cv = 3, verbose=2, random_state=91, n_jobs = -1)

# Fit the random search model
rf_random.fit(Xtrain, ytrain)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 15.2min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 33.3min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed: 503.7min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('ColumnTransformer',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='drop',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('num',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('scaler',
                                                                                                StandardScaler(copy=True,
                                                                                          

In [6]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = sum(predictions != test_labels)
    trues = sum((test_labels == True))
    falses = sum((test_labels == False))
    true_positives = sum((predictions == test_labels) & (test_labels == True))
    true_negatives = sum((predictions == test_labels) & (test_labels == False))
    false_positives = sum((predictions != test_labels) & (test_labels == True))
    false_negatives = sum((predictions != test_labels) & (test_labels == False))
    
    print('Model Performance')
    print('Average Error: {:0.4f}.'.format(errors / len(test_labels)))
    print('Average true_positives: {:0.4f}.'.format(true_positives / len(test_labels)))
    print('Average true_negatives: {:0.4f}.'.format(true_negatives / len(test_labels)))
    print('Average false_positives: {:0.4f}.'.format(false_positives / len(test_labels)))
    print('Average false_negatives: {:0.4f}.'.format(errors / len(test_labels)))
    
    #Precision = TruePositives / (TruePositives + FalsePositives)
    trues_precision = (true_positives / (true_positives + false_positives))
    falses_precision = (true_negatives / (true_negatives + false_negatives))
    print('Precision for Trues (is Polluted) = {:0.4f}%.'.format(100*trues_precision))
    print('Precision for False (not Polluted) {:0.24}%.'.format(100*falses_precision))
    
    #Recall = TruePositives / (TruePositives + FalseNegatives)
    trues_recall = true_positives / (true_positives + false_negatives)
    falses_recall = true_negatives / (true_negatives + false_positives)
    print('(!)Precision for Trues (is Polluted) = {:0.4f}%.'.format(100*trues_recall))
    print('Precision for False (not Polluted) {:0.4f}%.'.format(100*falses_recall))
    
    #F-1 = (2 * Precision * Recall) / (Precision + Recall)
    trues_f1 = (2 * trues_precision * trues_recall) /(trues_precision + trues_recall)
    falses_f1 = (2 * falses_precision * falses_recall) /(falses_precision + falses_recall)
    print('(!)F1 for Trues (is Polluted) = {:0.4f}%.'.format(100*trues_f1))
    print('F1 for False (not Polluted) {:0.4f}%.'.format(100*falses_f1))
    
    # Compute and return F1 (harmonic mean of precision and recall)
    print("{}: {:0.4f}%".format(model.__class__.__name__, 100 * f1_score(test_labels, predictions)))
    
    return f1_score(test_labels, predictions)

In [7]:
#base_model = RandomForestClassifier(n_estimators = 10, random_state = 91)
model.fit(Xtrain, ytrain)
base_accuracy = evaluate(model, Xtest, ytest)

Model Performance
Average Error: 0.1933.
Average true_positives: 0.1433.
Average true_negatives: 0.6634.
Average false_positives: 0.0073.
Average false_negatives: 0.1933.
Precision for Trues (is Polluted) = 95.1629%.
Precision for False (not Polluted) 78.1033219871601573913722%.
(!)Precision for Trues (is Polluted) = 43.5145%.
Precision for False (not Polluted) 98.9141%.
(!)F1 for Trues (is Polluted) = 59.7208%.
F1 for False (not Polluted) 87.2854%.
Pipeline: 59.7208%


In [8]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, Xtest, ytest)

Model Performance
Average Error: 0.2073.
Average true_positives: 0.1319.
Average true_negatives: 0.6608.
Average false_positives: 0.0187.
Average false_negatives: 0.2073.
Precision for Trues (is Polluted) = 87.6033%.
Precision for False (not Polluted) 77.7974061786376012150868%.
(!)Precision for Trues (is Polluted) = 41.1556%.
Precision for False (not Polluted) 97.2530%.
(!)F1 for Trues (is Polluted) = 56.0019%.
F1 for False (not Polluted) 86.4440%.
Pipeline: 56.0019%


In [9]:
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Improvement of -6.23%.


In [11]:
rf_random.best_params_

{'estimator__n_estimators': 100,
 'estimator__min_samples_split': 8,
 'estimator__min_samples_leaf': 3,
 'estimator__max_features': 'auto',
 'estimator__max_depth': 116,
 'estimator__bootstrap': True}

In [14]:
model.get_params()

{'memory': None,
 'steps': [('ColumnTransformer',
   ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                     transformer_weights=None,
                     transformers=[('num',
                                    Pipeline(memory=None,
                                             steps=[('scaler',
                                                     StandardScaler(copy=True,
                                                                    with_mean=True,
                                                                    with_std=True))],
                                             verbose=False),
                                    ['x', 'y', 'dayofweek', 'sin_day', 'cos_day',
                                     'sin_year', 'cos_year', 'TEMP', 'cos_wind',
                                     'sin_wind', 'Wind-Rate', 'DEW', 'SKY', 'VIS',
                                     'ATM']),
                                   ('cat',
                    

In [None]:
from sklearn.externals import joblib
joblib.dump(rf_random.best_estimator_, 'best_random.model')
