# Implementação do Random Forest

In [None]:
# Relative paths of the two CSV files loaded with pandas further down.
# NOTE(review): both paths resolve against the working directory; the
# submission file is written to "../submission.csv", so confirm which
# directory the notebook is expected to run from.
TRAINING_DATASET_SOURCE = 'datasets/training_data.csv'
TEST_DATASET_SOURCE = 'datasets/test_data.csv'

In [None]:
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from pandas import DataFrame
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

# Load both CSV files into DataFrames.
train_df = pd.read_csv(TRAINING_DATASET_SOURCE)
test_df = pd.read_csv(TEST_DATASET_SOURCE)

# Definição dos dados de teste e de treino

In [None]:
# Show (rows, columns) for the training and the test frame.
print(train_df.shape, test_df.shape)

In [None]:
# Number of training rows per value of the 'incidents' column.
train_df['incidents'].value_counts()

In [None]:
# Balance the classes by random oversampling: every class of 'incidents' is
# resampled with replacement up to the size of the most frequent class.
# NOTE(review): this happens before the train/test split further down, so
# copies of the same row can end up on both sides of the split and inflate
# the held-out score. Consider splitting first and oversampling only the
# training part.
OVERSAMPLING_SEED = 2000  # fixed so the balanced frame is identical on every run

incidents_count = train_df['incidents'].value_counts()

max_count = incidents_count.max()

print('Max value count:', max_count)

# One sub-frame per class, in the order reported by value_counts().
df_classes = [train_df[train_df['incidents'] == category] for category in incidents_count.index]

# Each class gets its own seed (base + position): the draws are reproducible
# without feeding the identical random stream to every class.
df_classes_over = [
    class_rows.sample(max_count, replace=True, random_state=OVERSAMPLING_SEED + position)
    for position, class_rows in enumerate(df_classes)
]

# Despite its name, this frame holds the oversampled *training* data.
df_test_over = pd.concat(df_classes_over, axis=0)

print(df_test_over['incidents'].value_counts())


In [None]:
# Separate the label column from the predictors of the balanced frame.
target = df_test_over['incidents']
features = df_test_over.drop(columns=['incidents'])

# Names of all predictor columns, in frame order.
all_features = list(features.columns)

In [None]:
# Display the predictor frame.
features

### Obtenção das features numéricas e categóricas

In [None]:
# Columns that get neither the numerical nor the categorical treatment.
dropped_columns = ['city_name', 'avg_precipitation', 'magnitude_of_delay', 'record_date']

# Sort the remaining columns by dtype in a single pass: integer/float columns
# are numerical; everything else is categorical, except 'affected_roads',
# which is left out of both lists.
numerical_features = []
categorical_features = []
for column, dtype in features.dtypes.items():
    if column in dropped_columns:
        continue
    if dtype.kind in ('i', 'f'):
        numerical_features.append(column)
    elif column != 'affected_roads':
        categorical_features.append(column)

### Divisão dos dados em dados de teste e treino

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=2000)

#### Criação de steps para tratar da remoção de features


In [None]:
# noinspection PyPep8Naming,PyMethodMayBeStatic
class ColumnsRemovalTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the columns named in ``dropped_columns``."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X: DataFrame, y=None):
        """Return a new frame without the ``dropped_columns``; ``X`` is not modified."""
        return X.drop(columns=dropped_columns)

#### Criação de steps para tratar da feature `record_date`

In [None]:
# noinspection PyPep8Naming,PyMethodMayBeStatic
class RecordDateTransformer(BaseEstimator, TransformerMixin):
    """Turn the 'record_date' column into scaled hour/day/month/weekday features.

    The median imputer and the min-max scaler are fitted once, in ``fit``,
    and only applied in ``transform``. Fitting them inside ``transform``
    would rescale every held-out frame with that frame's own statistics,
    which makes train and test features incomparable.
    """

    def __init__(self):
        pass

    @staticmethod
    def _date_parts(X: DataFrame) -> DataFrame:
        """Return a copy of ``X`` with 'record_date' replaced by its calendar parts."""
        X_ = X.copy()

        record_date = pd.DatetimeIndex(X_['record_date'])

        X_['hour'] = record_date.hour
        X_['day'] = record_date.day
        X_['month'] = record_date.month
        X_['weekday'] = record_date.weekday

        X_.drop(columns=['record_date'], inplace=True)

        return X_

    def fit(self, X, y=None):
        """Fit the imputer and the scaler on the calendar parts of ``X``.

        Returns:
            The fitted transformer.
        """
        parts = self._date_parts(X)

        # Trailing underscore: state learned during fit.
        self.imputer_ = SimpleImputer(strategy='median').fit(parts)
        self.scaler_ = MinMaxScaler().fit(self.imputer_.transform(parts))

        return self

    def transform(self, X: DataFrame, y=None):
        """Return the imputed and scaled calendar parts of ``X`` as an array.

        Uses the statistics learned in ``fit``; ``X`` itself is not modified.
        """
        parts = self._date_parts(X)

        return self.scaler_.transform(self.imputer_.transform(parts))

#### Criação de steps para tratar da feature `affected_roads`

In [None]:
# noinspection PyPep8Naming,PyMethodMayBeStatic
class AffectedRoadsTransformer(BaseEstimator, TransformerMixin):
    """Replace 'affected_roads' with a scaled count of distinct road names.

    The median imputer and the min-max scaler are fitted once, in ``fit``,
    and only applied in ``transform``, so held-out data is scaled with the
    statistics of the data the transformer was fitted on.
    """

    def __init__(self):
        pass

    @staticmethod
    def _count_roads(X: DataFrame) -> DataFrame:
        """Return a copy of ``X`` with 'affected_roads' replaced by 'num_affected_roads'.

        Each entry is split on commas; empty pieces are ignored and duplicates
        count once. A missing entry yields NaN (instead of being stringified
        to 'nan' and counted as one road) so the median imputer can fill it.
        """
        X_ = X.copy()

        road_quantity = []
        for entry in X_['affected_roads']:
            if pd.isna(entry):
                road_quantity.append(float('nan'))
                continue
            distinct_roads = {road for road in str(entry).split(',') if road != ''}
            road_quantity.append(len(distinct_roads))

        X_['num_affected_roads'] = road_quantity
        X_.drop(columns=['affected_roads'], inplace=True)

        return X_

    def fit(self, X, y=None):
        """Fit the imputer and the scaler on the road counts of ``X``.

        Returns:
            The fitted transformer.
        """
        counts = self._count_roads(X)

        # Trailing underscore: state learned during fit.
        self.imputer_ = SimpleImputer(strategy='median').fit(counts)
        self.scaler_ = MinMaxScaler().fit(self.imputer_.transform(counts))

        return self

    def transform(self, X: DataFrame, y=None):
        """Return the imputed and scaled road counts of ``X`` as an array.

        Uses the statistics learned in ``fit``; ``X`` itself is not modified.
        """
        counts = self._count_roads(X)

        return self.scaler_.transform(self.imputer_.transform(counts))

### Pipeline de preparação de dados

In [None]:
# Each entry pairs a sub-pipeline with the columns it receives.
_column_steps = [
    # Road list -> scaled count of distinct roads.
    (make_pipeline(AffectedRoadsTransformer()), ['affected_roads']),
    # Timestamp -> scaled hour/day/month/weekday.
    (make_pipeline(RecordDateTransformer()), ['record_date']),
    # Columns that are removed.
    (make_pipeline(ColumnsRemovalTransformer()), dropped_columns),
    # Numerical columns: median imputation, then min-max scaling.
    (make_pipeline(SimpleImputer(strategy='median'), MinMaxScaler()), numerical_features),
    # Categorical columns: fill gaps with 'missing', then one-hot encode.
    (
        make_pipeline(
            SimpleImputer(strategy='constant', fill_value='missing'),
            OneHotEncoder(categories='auto', handle_unknown='ignore'),
        ),
        categorical_features,
    ),
]

preprocessor = make_column_transformer(*_column_steps)

In [None]:
# Display the training predictors.
X_train

In [None]:
# Fit the preprocessing on the training predictors and show the transformed result.
preprocessor.fit(X_train).transform(X_train)

In [None]:
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_classif

# Preprocessing followed by a VarianceThreshold step with its default threshold.
# The SelectKBest step at the end of the line is commented out.
preprocessor_best = make_pipeline(preprocessor, VarianceThreshold())#, SelectKBest(f_classif, k='all'))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: preprocessing + variance filter, then a 100-tree random forest.
# NOTE(review): no random_state is passed to the forest, so its scores can
# differ from run to run.
RF_Model = make_pipeline(preprocessor_best, RandomForestClassifier(n_estimators=100))

In [None]:
# Fit the baseline pipeline, then show its score on the data it was fitted on.
RF_Model.fit(X_train, y_train)
RF_Model.score(X_train, y_train)

In [None]:
# Score of the baseline pipeline on the held-out split.
RF_Model.score(X_test, y_test)

## Hyperparameter Tuning

In [None]:
import numpy as np

# Candidate values for each random-forest hyperparameter.

# Five evenly spaced tree counts between 10 and 500, truncated to integers.
n_estimators = np.linspace(start=10, stop=500, num=5).astype(int).tolist()

# Number of features considered at each split.
max_features = ['sqrt']

# Maximum tree depth.
max_depth = [2, 6, 12, 20, 30, 40, 50]

# Minimum samples needed to split a node / to remain in a leaf.
min_samples_split = [2, 5, 30]
min_samples_leaf = [1, 2, 50]

# Whether each tree is built on a bootstrap sample.
bootstrap = [True, False]

# Split-quality criteria.
criterions = ["gini", "entropy", "log_loss"]

In [None]:
# Search space for the tuning step. Every key is prefixed with
# 'randomforestclassifier__' so the value is routed to the forest step of
# the pipeline.
param_grid = dict([
    ('randomforestclassifier__n_estimators', n_estimators),
    ('randomforestclassifier__max_features', max_features),
    ('randomforestclassifier__max_depth', max_depth),
    ('randomforestclassifier__min_samples_split', min_samples_split),
    ('randomforestclassifier__min_samples_leaf', min_samples_leaf),
    ('randomforestclassifier__bootstrap', bootstrap),
    ('randomforestclassifier__criterion', criterions),
])

param_grid

In [None]:
from sklearn import metrics

# Print the scorer names known to scikit-learn.
print(metrics.get_scorer_names())

In [None]:
from sklearn.model_selection import GridSearchCV

# Randomized search over param_grid: 50 sampled combinations (n_iter), each
# evaluated with 20-fold cross-validation and ranked by balanced accuracy.
# n_jobs=-1 runs the fits in parallel; random_state fixes which combinations
# are sampled.
rf_RandomGrid = RandomizedSearchCV(estimator=RF_Model, param_distributions=param_grid, cv=20, verbose=1, n_jobs=-1,
                                   n_iter=50, scoring='balanced_accuracy', random_state=1000)


# Exhaustive grid-search alternative, currently commented out:
#rf_RandomGrid = GridSearchCV(estimator=RF_Model, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1, scoring='balanced_accuracy')

In [None]:
%%time
# Run the hyperparameter search on the training split (the cell magic above
# must stay on the first line of the cell).
rf_RandomGrid.fit(X_train, y_train)

In [None]:
# Score of the fitted search object on the training split.
rf_RandomGrid.score(X_train, y_train)

In [None]:
# Cross-validation results recorded by the search.
rf_RandomGrid.cv_results_

In [None]:
# Best cross-validated score found by the search.
rf_RandomGrid.best_score_

In [None]:
# Parameter combination that achieved the best score.
rf_RandomGrid.best_params_

In [None]:
# Pipeline configured with the best parameter combination.
rf_RandomGrid.best_estimator_

In [None]:
# Compare the tuned model's score on the training split and on the held-out
# split, each printed with three decimals.
print(f'Train: {rf_RandomGrid.score(X_train, y_train):.3f}')
print(f'Test: {rf_RandomGrid.score(X_test, y_test):.3f}')

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

predictions = rf_RandomGrid.predict(X_test)

# Use the classes known to the fitted model for both the matrix and the axis
# ticks, so every row/column is labelled with its class name instead of a
# positional index.
class_labels = rf_RandomGrid.classes_

# Rows are the true classes, columns the predicted classes; the diagonal
# holds the correctly classified rows of each class.
cm = confusion_matrix(y_test, predictions, labels=class_labels)
disp = ConfusionMatrixDisplay(cm, display_labels=class_labels)

disp.plot(cmap='inferno')

### Obtenção das previsões do dataset de submissão

In [None]:
# Predict the submission set with the tuned model.
# The fitted search object runs its own preprocessing inside predict(), so the
# raw frame is passed in directly. The standalone preprocessor is deliberately
# NOT refitted here: doing so would learn imputation/scaling statistics from
# the submission data and overwrite the fitted state shared with RF_Model,
# while its transformed output was never used.
features = test_df.copy()

predictions = rf_RandomGrid.predict(features)

# One prediction per row, numbered from 1 in the 'RowId' column.
predictions_df = pd.DataFrame(predictions)
predictions_df.index += 1
predictions_df.to_csv("../submission.csv", header=['Incidents'], index_label='RowId')