# RandomForest HyperParameter Tuning

https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74



In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score

from pprint import pprint
import pandas as pd
import numpy as np
import random


forest = RandomForestClassifier(random_state = 91)

print('Defaults arameters:')
pprint(forest.get_params())


Defaults arameters:
{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 91,
 'verbose': 0,
 'warm_start': False}


**Parameters to tune**

- n_estimators = number of trees in the foreset
- max_features = max number of features considered for splitting a node
- max_depth = max number of levels in each decision tree
- min_samples_split = min number of data points placed in a node before the node is split
- min_samples_leaf = min number of data points allowed in a leaf node
- bootstrap = method for sampling data points (with or without replacement)


# Loading our data 

- Selected features and model from model exercice (see **model classif selection.ipynb**)


In [15]:
df = pd.read_csv("bigtable.csv")

def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep]#.astype(np.float64).

df = clean_dataset(df).reset_index()

features = df[[
    'x', 
    'y', 
    'dayofweek', 
    'sin_day', 
    'cos_day',
    'sin_year', 
    'cos_year', 
    'TEMP', 
    'cos_wind', 
    'sin_wind', 
    'Wind-Rate', 
    'DEW', 
    'SKY', 
    'VIS', 
    'ATM'
]].astype(np.float64)

#features.loc[:,'dayofweek'] = features['dayofweek'].astype('category')

gs = df[['station_id']]

labels = df[[
    'pm25',
    'AQI_VALUE', #pm25 transformed using EPA methodology
    'AQI_class'  #pm25 transformed into EPA categorical class
]]

labels.loc[:,"polluted"] = (labels.loc[:,"AQI_class"] != "Good")

y=labels["polluted"]
X=features

def tts_gs(X, y, gs, test_size):
    stations = gs["station_id"].unique()
    nb_stations = len(stations) * test_size
    print(nb_stations)
    my_randoms = random.sample(list(stations), int(nb_stations))
    filters = gs["station_id"].isin(my_randoms)
    print('Test substations selected: ')
    print(my_randoms)
    return X[~filters], X[filters], y[~filters], y[filters]

Xtrain, Xtest, ytrain, ytest = tts_gs(X, y, gs, 0.25)

3.25
Test substations selected: 
['14th & S ST NW A', '14th & S ST NW B', 'MA_EPA']


In [30]:
y

0         True
1         True
2        False
3        False
4        False
         ...  
97669    False
97670    False
97671    False
97672    False
97673    False
Name: polluted, Length: 97674, dtype: bool

In [33]:
y = LabelEncoder().fit_transform(y)

array([1, 1, 0, ..., 0, 0, 0], dtype=int64)

In [45]:

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 100, num = 8)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'estimator__n_estimators': n_estimators,
               'estimator__max_features': max_features,
               'estimator__max_depth': max_depth,
               'estimator__min_samples_split': min_samples_split,
               'estimator__min_samples_leaf': min_samples_leaf,
               'estimator__bootstrap': bootstrap}

pprint(random_grid)

{'estimator__bootstrap': [True, False],
 'estimator__max_depth': [10, 22, 35, 48, 61, 74, 87, 100, None],
 'estimator__max_features': ['auto', 'sqrt'],
 'estimator__min_samples_leaf': [1, 2, 4],
 'estimator__min_samples_split': [2, 3, 5],
 'estimator__n_estimators': [100, 325, 550, 775, 1000]}


In [46]:
# Use the random grid to search for best hyperparameters

model = Pipeline([
     ('preprocessor', StandardScaler()),
     ('estimator', RandomForestClassifier(random_state = 91))
])

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=91, n_jobs = -1)

# Fit the random search model
rf_random.fit(Xtrain, ytrain)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 14.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 28.2min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('preprocessor',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('estimator',
                                              RandomForestClassifier(bootstrap=True,
                                                                     ccp_alpha=0.0,
                                                                     class_weight=None,
                                                                     criterion='gini',
                                                                     max_depth=None,
                                                                     max_features='auto',
                                        

In [55]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = sum(predictions != test_labels)
    trues = sum((test_labels == True))
    falses = sum((test_labels == False))
    true_positives = sum((predictions == test_labels) & (test_labels == True))
    true_negatives = sum((predictions == test_labels) & (test_labels == False))
    false_positives = sum((predictions != test_labels) & (test_labels == True))
    false_negatives = sum((predictions != test_labels) & (test_labels == False))
    
    print('Model Performance')
    print('Average Error: {:0.4f}.'.format(errors / len(test_labels)))
    print('Average true_positives: {:0.4f}.'.format(true_positives / len(test_labels)))
    print('Average true_negatives: {:0.4f}.'.format(true_negatives / len(test_labels)))
    print('Average false_positives: {:0.4f}.'.format(false_positives / len(test_labels)))
    print('Average false_negatives: {:0.4f}.'.format(errors / len(test_labels)))
    
    #Precision = TruePositives / (TruePositives + FalsePositives)
    trues_precision = (true_positives / (true_positives + false_positives))
    falses_precision = (true_negatives / (true_negatives + false_negatives))
    print('Precision for Trues (is Polluted) = {:0.4f}%.'.format(100*trues_precision))
    print('Precision for False (not Polluted) {:0.24}%.'.format(100*falses_precision))
    
    #Recall = TruePositives / (TruePositives + FalseNegatives)
    trues_recall = true_positives / (true_positives + false_negatives)
    falses_recall = true_negatives / (true_negatives + false_positives)
    print('(!)Precision for Trues (is Polluted) = {:0.4f}%.'.format(100*trues_recall))
    print('Precision for False (not Polluted) {:0.4f}%.'.format(100*falses_recall))
    
    #F-1 = (2 * Precision * Recall) / (Precision + Recall)
    trues_f1 = (2 * trues_precision * trues_recall) /(trues_precision + trues_recall)
    falses_f1 = (2 * falses_precision * falses_recall) /(falses_precision + falses_recall)
    print('(!)F1 for Trues (is Polluted) = {:0.4f}%.'.format(100*trues_f1))
    print('F1 for False (not Polluted) {:0.4f}%.'.format(100*falses_f1))
    
    # Compute and return F1 (harmonic mean of precision and recall)
    print("{}: {:0.4f}%".format(model.__class__.__name__, 100 * f1_score(test_labels, predictions)))
    
    return f1_score(test_labels, predictions)

In [None]:
#base_model = RandomForestClassifier(n_estimators = 10, random_state = 91)
model.fit(Xtrain, ytrain)
base_accuracy = evaluate(model, Xtest, ytest)

In [56]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, Xtest, ytest)

Model Performance
Average Error: 0.1362.
Average true_positives: 0.1338.
Average true_negatives: 0.7300.
Average false_positives: 0.0379.
Average false_negatives: 0.1362.
Precision for Trues (is Polluted) = 77.9286%.
Precision for False (not Polluted) 88.1283533885502947669011%.
(!)Precision for Trues (is Polluted) = 57.6315%.
Precision for False (not Polluted) 95.0663%.
(!)F1 for Trues (is Polluted) = 66.2606%.
F1 for False (not Polluted) 91.4660%.
Pipeline: 66.2606%


In [57]:
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Improvement of 3.15%.
