# Implementação do Random Forest

In [1]:
TRAINING_DATASET_SOURCE = '../training_data.csv'  # Since we are one directory up, we should go down one directory to import the datasets
TEST_DATASET_SOURCE = '../test_data.csv'

In [2]:
import pandas as pd

train_df = pd.read_csv(TRAINING_DATASET_SOURCE)
test_df = pd.read_csv(TEST_DATASET_SOURCE)

# Definição dos dados de teste e de treino

In [3]:
print(train_df.shape, test_df.shape)

(5000, 13) (1206, 12)


In [4]:
train_df['incidents'].value_counts()

None         2028
High         1073
Low           718
Very_High     603
Medium        578
Name: incidents, dtype: int64

In [5]:
#count_class0, count_class1, count_class2, count_class3, count_class4 = train_df['incidents'].value_counts().to_frame()

incidents_count = train_df['incidents'].value_counts()

max_count = incidents_count.max()

print('Max value count:', max_count)

df_classes = []
for category, counts in zip(incidents_count.index, incidents_count):
    #print(category, counts)
    df_classes.append(train_df[train_df['incidents'] == category])

df_classes_over = []

for category in df_classes:
    df_classes_over.append(category.sample(max_count, replace=True))

df_test_over = pd.concat(df_classes_over, axis=0)

print(df_test_over['incidents'].value_counts())


Max value count: 2028
None         2028
High         2028
Low          2028
Very_High    2028
Medium       2028
Name: incidents, dtype: int64


In [6]:
dropped_columns = ['city_name', 'avg_precipitation', 'magnitude_of_delay', 'record_date', 'affected_roads']

X = df_test_over.drop([*dropped_columns, 'incidents'], axis=1)
y = df_test_over['incidents']

all_features = X.columns.tolist()

In [7]:
X

Unnamed: 0,delay_in_seconds,luminosity,avg_temperature,avg_atm_pressure,avg_humidity,avg_wind_speed,avg_rain
1931,0,DARK,8.0,1016.0,76.0,1.0,Sem Chuva
2869,0,LIGHT,22.0,1019.0,69.0,0.0,Sem Chuva
4801,0,LIGHT,13.0,1023.0,42.0,2.0,Sem Chuva
4139,0,DARK,16.0,1020.0,80.0,1.0,Sem Chuva
1625,0,LIGHT,16.0,1015.0,93.0,1.0,Sem Chuva
...,...,...,...,...,...,...,...
4860,210,LIGHT,6.0,1026.0,94.0,1.0,Sem Chuva
1406,0,DARK,13.0,1018.0,92.0,0.0,Sem Chuva
2782,0,LIGHT,12.0,1025.0,65.0,2.0,Sem Chuva
3640,0,DARK,11.0,1024.0,68.0,2.0,Sem Chuva


In [8]:
numerical_features = [column for column, dtype in zip(X.columns, X.dtypes)
                      if dtype.kind in ['i', 'f']]

categorical_features = [column for column, dtype in zip(X.columns, X.dtypes)
                        if dtype.kind not in ['i', 'f']]

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2000)

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

preprocessor = make_column_transformer(
    (make_pipeline(
        SimpleImputer(strategy='median'),
        MinMaxScaler()
    ), numerical_features),

    (make_pipeline(
        SimpleImputer(strategy='constant', fill_value='missing'),
        OneHotEncoder(categories='auto', handle_unknown='ignore')

    ), categorical_features)
)

In [14]:
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_classif

preprocessor_best = make_pipeline(preprocessor, VarianceThreshold(), SelectKBest(f_classif, k='all'))

In [15]:
from sklearn.ensemble import RandomForestClassifier

RF_Model = make_pipeline(preprocessor_best, RandomForestClassifier(n_estimators=100))

In [16]:
RF_Model.fit(X_train, y_train)
RF_Model.score(X_train, y_train)

0.9978867286559594

In [17]:
RF_Model.score(X_test, y_test)

0.9293228139381986

## Hyperparameter Tuning

In [24]:
import numpy as np

n_estimators = [int(x) for x in np.linspace(start=10, stop=20, num=3)]

max_features = ['auto', 'sqrt']

max_depth = [2, 6]

min_samples_split = [2, 5]

min_samples_leaf = [1, 2]

bootstrap = [True, False]

In [25]:
param_grid = {
    'randomforestclassifier__n_estimators': n_estimators,
    'randomforestclassifier__max_features': max_features,
    'randomforestclassifier__max_depth': max_depth,
    'randomforestclassifier__min_samples_split': min_samples_split,
    'randomforestclassifier__min_samples_leaf': min_samples_leaf,
    'randomforestclassifier__bootstrap': bootstrap,
}

param_grid

{'randomforestclassifier__n_estimators': [10, 15, 20],
 'randomforestclassifier__max_features': ['auto', 'sqrt'],
 'randomforestclassifier__max_depth': [2, 6],
 'randomforestclassifier__min_samples_split': [2, 5],
 'randomforestclassifier__min_samples_leaf': [1, 2],
 'randomforestclassifier__bootstrap': [True, False]}

In [32]:
from sklearn.model_selection import RandomizedSearchCV

rf_RandomGrid = RandomizedSearchCV(estimator=RF_Model, param_distributions=param_grid, cv=3, verbose=1, n_jobs=-1,
                                   n_iter=5, scoring='f1')

In [33]:
%%time
rf_RandomGrid.fit(X_train, y_train, average='weighted')

Fitting 3 folds for each of 5 candidates, totalling 15 fits


ValueError: 
All the 15 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\gonca\.conda\envs\DAA\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\gonca\.conda\envs\DAA\lib\site-packages\sklearn\pipeline.py", line 377, in fit
    fit_params_steps = self._check_fit_params(**fit_params)
  File "C:\Users\gonca\.conda\envs\DAA\lib\site-packages\sklearn\pipeline.py", line 300, in _check_fit_params
    raise ValueError(
ValueError: Pipeline.fit does not accept the average parameter. You can pass parameters to specific steps of your pipeline using the stepname__parameter format, e.g. `Pipeline.fit(X, y, logisticregression__sample_weight=sample_weight)`.


In [34]:
rf_RandomGrid.score(X_train, y_train)

NotFittedError: This RandomizedSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.