# Implementação do Random Forest

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from pandas import DataFrame
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV

TRAINING_DATASET_SOURCE = '../training_data.csv'  # Since we are one directory up, we should go down one directory to import the datasets
TEST_DATASET_SOURCE = '../test_data.csv'

train_df = pd.read_csv(TRAINING_DATASET_SOURCE)
test_df = pd.read_csv(TEST_DATASET_SOURCE)

# Definição dos dados de teste e de treino

In [None]:
print(train_df.shape, test_df.shape)
print(train_df['incidents'].value_counts())

incidents_count = train_df['incidents'].value_counts()

max_count = incidents_count.max()
print('Max value count:', max_count)

df_classes = []
for category, counts in zip(incidents_count.index, incidents_count):
    #print(category, counts)
    df_classes.append(train_df[train_df['incidents'] == category])

df_classes_over = []

for category in df_classes:
    df_classes_over.append(category.sample(max_count, replace=True))

df_test_over = pd.concat(df_classes_over, axis=0)

print(df_test_over['incidents'].value_counts())


In [None]:
features = df_test_over.drop(['incidents'], axis=1)
target = df_test_over['incidents']

all_features = features.columns.tolist()

features[:5]

### Obtenção das features numericas e categoricas

In [None]:
from sklearn.preprocessing import StandardScaler

categorical_to_numerical = {
    'luminosity': {
        'LOW_LIGHT': 0,
        'LIGHT': 1,
        'DARK': 2
    },
    'avg_rain': {
        'Sem Chuva': 0,
        'chuva fraca': 1,
        'chuva moderada': 2,
        'chuva forte': 3,
    }
}


def decision_tree_data_preparation(df: DataFrame) -> DataFrame:
    prep_df = df.copy()

    dropped_columns = ['city_name', 'avg_precipitation', 'magnitude_of_delay']

    numerical_features = [column for column, dtype in zip(features.columns, features.dtypes) if
                          dtype.kind in ['i', 'f'] and column not in dropped_columns]

    numerical_features = ['avg_temperature', 'avg_atm_pressure', 'avg_humidity', 'avg_wind_speed', 'luminosity']
    #assert numerical_features == ['avg_temperature', 'avg_atm_pressure', 'avg_humidity', 'avg_wind_speed', 'luminosity']

    categorical_features = [column for column, dtype in zip(features.columns, features.dtypes) if
                            dtype.kind not in ['i', 'f'] and column not in dropped_columns]

    prep_df = prep_df.drop(dropped_columns, axis=1)
    prep_df.drop_duplicates()

    ### Converter as features categoricas em numericas
    prep_df.replace(categorical_to_numerical, inplace=True)

    ### Extrair a hora e dia da semana da feature 'record_date'
    record_date = pd.DatetimeIndex(prep_df['record_date'])

    prep_df['hour'] = record_date.hour
    prep_df['day'] = record_date.day
    prep_df['month'] = record_date.month
    prep_df['weekday'] = record_date.weekday

    prep_df.drop(columns=['record_date'], inplace=True)

    #train_df['affected_roads'] = train_df['affected_roads'].fillna(train_df['affected_roads'].mode().iloc[0])

    num_affected_roads = []
    for line in prep_df['affected_roads']:
        unique_roads = set(str(line).split(','))
        valid_roads = [elem for elem in unique_roads if elem != '']
        count = len(valid_roads)
        num_affected_roads.append(count)

    prep_df['num_affected_roads'] = num_affected_roads

    ### Ao analisar o resultado pós-tratamento, verificámos que a feature 'affected_roads' tinha alta correlação com 'delay_in_minutes'
    prep_df.drop(columns=['affected_roads'], inplace=True)

    ### Converter a feature 'delay_in_seconds' para 'delay_in_minutes' de modo a reduzir o intervalo de valores
    delay_in_minutes = prep_df['delay_in_seconds'].map(lambda seconds: seconds / 60)

    prep_df.drop(columns=['delay_in_seconds'], inplace=True)
    prep_df['delay_in_minutes'] = delay_in_minutes

    ### Limites superior e inferior (sem outliers) dos diagramas de caixa
    #numerical_features.remove('delay_in_seconds')
    #numerical_features.append('delay_in_minutes')

    return prep_df

### Divisão dos dados em dados de teste e treino

In [None]:
target_num = pd.DataFrame(target.map({'None': 0, 'Low': 1, 'Medium': 2, 'High': 3, 'Very_High': 4}))
target_num.reset_index(drop=True, inplace=True)
target_num

In [None]:
X_train, X_test, y_train, y_test = train_test_split(decision_tree_data_preparation(features), target_num, test_size=0.3,
                                                    random_state=2000)

### XGBoost Model

In [None]:
from xgboost import XGBClassifier

boost_model = XGBClassifier(n_estimators=500, early_stopping_rounds=5, random_state=22)
boost_model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

In [None]:
from sklearn.metrics import accuracy_score

print("Train accuracy:", boost_model.score(X_train, y_train))

predictions = boost_model.predict(X_test)
print("Test accuracy: " + str(accuracy_score(predictions, y_test)))

In [44]:
from scipy import stats

clf_xgb = XGBClassifier(objective='multi:softmax')

param_dist = {
    'n_estimators': stats.randint(150, 500),
    'learning_rate': stats.uniform(0.01, 1),
    'subsample': stats.uniform(0.3, 0.7),
    'max_depth': [3, 5, 8],
    'colsample_bytree': stats.uniform(0.5, 0.45),
    'min_child_weight': [1, 3]
}

clf = RandomizedSearchCV(clf_xgb, param_distributions=param_dist, verbose=3,
                         cv=20, n_iter=50, scoring='accuracy', error_score=0,
                         random_state=1,
                         n_jobs=-1)

clf.fit(X_train, y_train)

print("Cross Validation results:", clf.cv_results_)

print("RandomGrid best score:", clf.best_score_)

print(f'Train: {clf.score(X_train, y_train):.3f}')
print(f'Test: {clf.score(X_test, y_test):.3f}')

clf.best_estimator_

Fitting 20 folds for each of 50 candidates, totalling 1000 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Cross Validation results: {'mean_fit_time': array([ 9.25274947, 17.09005015,  7.62104981,  9.09464887,  8.52954987,
        7.31574947,  5.07199948,  7.87439961, 10.58844969,  9.39970008,
       21.45329956,  8.7837495 , 13.69439992,  6.83869972, 11.10589986,
        8.62279927,  8.38169988, 10.62974981, 20.69694959,  4.10169971,
        9.73744975, 16.9145492 , 10.56314945,  6.81909953, 11.16275001,
        5.48114932, 11.95399995, 12.10465012,  9.8977995 ,  5.72714951,
        9.92684965, 11.63169951, 12.60144968, 12.29044933,  9.44079947,
        8.7102495 , 10.77184982, 12.11054993, 13.84569966,  7.61399953,
       12.11519952,  6.77154949,  8.45434974, 17.69334953,  9.5424492 ,
       19.00139964, 15.50444981, 15.49119928, 14.45489925,  7.48679872]), 'std_fit_time': array([0.25842747, 0.25842512, 0.25009331, 0.11348471, 0.12692124,
       0.08593316, 0.09772172, 0.05741843, 0.16162266, 0.14207715,
       0.41061408, 0.14704468, 0.09246712, 0.03716182, 0.2596253 ,
       0.09663162

In [None]:
def train_and_evaluate(train_features, train_target, val_features, val_target, **params):
    model = XGBClassifier(random_state=22, n_jobs=-1, **params)

    model.fit(train_features, train_target)
    train_accuracy = model.score(train_features, train_target)

    val_accuracy = model.score(val_features, val_target)

    return model, train_accuracy, val_accuracy

In [None]:
def test_params_kfold(n_splits, **params):
    train_accuracys, val_accuracys, models = [], [], []

    kfold = KFold(n_splits)

    kfold_features = features.copy()
    kfold_target = target_num.copy()

    kfold_features = decision_tree_data_preparation(kfold_features)

    for train_idxs, val_idxs in kfold.split(kfold_features):
        X_train, train_targets = kfold_features.iloc[train_idxs], kfold_target.iloc[train_idxs]
        X_val, val_targets = kfold_features.iloc[val_idxs], kfold_target.iloc[val_idxs]

        model, train_acc, val_acc = train_and_evaluate(X_train, train_targets, X_val, val_targets, **params)

        models.append(model)
        train_accuracys.append(train_acc)
        val_accuracys.append(val_acc)

    print("Train accuracys:", train_accuracys)
    print("Validation accuracys:", val_accuracys)
    print(f'Train accuracy: {np.mean(train_accuracys)}, Validation accuracy: {np.mean(val_accuracys)}')

    return models

In [None]:
% % time
test_params_kfold(5, n_estimators=500, max_depth=6, learning_rate=0.9)

## Hyperparameter Tuning

### Obtenção das previsões do dataset de submissão

In [45]:
test_data = test_df.copy()

test_data_prepared = decision_tree_data_preparation(test_data)

predictions = boost_model.predict(test_data_prepared)  #RF_Model.predict(test_data_prepared)
predictions_df = pd.DataFrame(predictions)
predictions_df = predictions_df[0].map(
    {0: 'None', 1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very_High'})
predictions_df.index += 1
predictions_df.to_csv("../submission_v2.csv", header=['Incidents'], index_label='RowId')