# Implementação do Random Forest

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from pandas import DataFrame
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV

TRAINING_DATASET_SOURCE = '../training_data.csv'  # Since we are one directory up, we should go down one directory to import the datasets
TEST_DATASET_SOURCE = '../test_data.csv'

train_df = pd.read_csv(TRAINING_DATASET_SOURCE)
test_df = pd.read_csv(TEST_DATASET_SOURCE)

# Definição dos dados de teste e de treino

In [2]:
print(train_df.shape, test_df.shape)
print(train_df['incidents'].value_counts())

incidents_count = train_df['incidents'].value_counts()

max_count = incidents_count.max()
print('Max value count:', max_count)

df_classes = []
for category, counts in zip(incidents_count.index, incidents_count):
    #print(category, counts)
    df_classes.append(train_df[train_df['incidents'] == category])

df_classes_over = []

for category in df_classes:
    df_classes_over.append(category.sample(max_count, replace=True))

df_test_over = pd.concat(df_classes_over, axis=0)

print(df_test_over['incidents'].value_counts())


(5000, 13) (1206, 12)
None         2028
High         1073
Low           718
Very_High     603
Medium        578
Name: incidents, dtype: int64
Max value count: 2028
None         2028
High         2028
Low          2028
Very_High    2028
Medium       2028
Name: incidents, dtype: int64


In [3]:
features = df_test_over.drop(['incidents'], axis=1)
target = df_test_over['incidents']

all_features = features.columns.tolist()

features[:5]

Unnamed: 0,city_name,magnitude_of_delay,delay_in_seconds,affected_roads,record_date,luminosity,avg_temperature,avg_atm_pressure,avg_humidity,avg_wind_speed,avg_precipitation,avg_rain
3123,Guimaraes,UNDEFINED,0,",",2021-01-22 22:00,DARK,8.0,1017.0,92.0,3.0,0.0,Sem Chuva
10,Guimaraes,UNDEFINED,0,"N101,N101,N101,N101,N101",2021-07-27 20:00,LIGHT,23.0,1015.0,67.0,0.0,0.0,Sem Chuva
400,Guimaraes,UNDEFINED,0,"N101,N101,N101,N101,N101",2021-10-07 21:00,DARK,20.0,1020.0,74.0,1.0,0.0,Sem Chuva
4050,Guimaraes,UNDEFINED,0,"N101,N101,N101,N101,N101",2021-09-20 01:00,DARK,15.0,1022.0,85.0,1.0,0.0,Sem Chuva
1172,Guimaraes,UNDEFINED,0,"N101,N101,N101,N101,N101",2021-07-25 09:00,LIGHT,17.0,1014.0,83.0,0.0,0.0,Sem Chuva


### Obtenção das features numericas e categoricas

In [4]:
from sklearn.preprocessing import StandardScaler

categorical_to_numerical = {
    'luminosity': {
        'LOW_LIGHT': 0,
        'LIGHT': 1,
        'DARK': 2
    },
    'avg_rain': {
        'Sem Chuva': 0,
        'chuva fraca': 1,
        'chuva moderada': 2,
        'chuva forte': 3,
    }
}


def decision_tree_data_preparation(df: DataFrame) -> DataFrame:
    prep_df = df.copy()

    dropped_columns = ['city_name', 'avg_precipitation', 'magnitude_of_delay']

    numerical_features = [column for column, dtype in zip(features.columns, features.dtypes) if
                          dtype.kind in ['i', 'f'] and column not in dropped_columns]

    numerical_features = ['avg_temperature', 'avg_atm_pressure', 'avg_humidity', 'avg_wind_speed', 'luminosity']
    #assert numerical_features == ['avg_temperature', 'avg_atm_pressure', 'avg_humidity', 'avg_wind_speed', 'luminosity']

    categorical_features = [column for column, dtype in zip(features.columns, features.dtypes) if
                            dtype.kind not in ['i', 'f'] and column not in dropped_columns]

    prep_df = prep_df.drop(dropped_columns, axis=1)
    prep_df.drop_duplicates()

    ### Converter as features categoricas em numericas
    prep_df.replace(categorical_to_numerical, inplace=True)

    ### Extrair a hora e dia da semana da feature 'record_date'
    record_date = pd.DatetimeIndex(prep_df['record_date'])

    prep_df['hour'] = record_date.hour
    prep_df['day'] = record_date.day
    prep_df['month'] = record_date.month
    prep_df['weekday'] = record_date.weekday

    prep_df.drop(columns=['record_date'], inplace=True)

    #train_df['affected_roads'] = train_df['affected_roads'].fillna(train_df['affected_roads'].mode().iloc[0])

    num_affected_roads = []
    for line in prep_df['affected_roads']:
        unique_roads = set(str(line).split(','))
        valid_roads = [elem for elem in unique_roads if elem != '']
        count = len(valid_roads)
        num_affected_roads.append(count)

    prep_df['num_affected_roads'] = num_affected_roads

    ### Ao analisar o resultado pós-tratamento, verificámos que a feature 'affected_roads' tinha alta correlação com 'delay_in_minutes'
    prep_df.drop(columns=['affected_roads'], inplace=True)

    ### Converter a feature 'delay_in_seconds' para 'delay_in_minutes' de modo a reduzir o intervalo de valores
    delay_in_minutes = prep_df['delay_in_seconds'].map(lambda seconds: seconds / 60)

    prep_df.drop(columns=['delay_in_seconds'], inplace=True)
    prep_df['delay_in_minutes'] = delay_in_minutes

    ### Limites superior e inferior (sem outliers) dos diagramas de caixa
    #numerical_features.remove('delay_in_seconds')
    #numerical_features.append('delay_in_minutes')

    return prep_df

### Divisão dos dados em dados de teste e treino

In [44]:
target_num = pd.DataFrame(target.map({'None': 0, 'Low': 1, 'Medium': 2, 'High': 3, 'Very_High': 4}))
target_num.reset_index(drop=True ,inplace=True)
target_num

Unnamed: 0,incidents
0,0
1,0
2,0
3,0
4,0
...,...
10135,2
10136,2
10137,2
10138,2


In [45]:
X_train, X_test, y_train, y_test = train_test_split(decision_tree_data_preparation(features), target_num, test_size=0.3,
                                                    random_state=2000)

### XGBoost Model

In [46]:
from xgboost import XGBClassifier

boost_model = XGBClassifier(n_estimators=500, early_stopping_rounds=5, random_state=22)
boost_model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-mlogloss:1.09885
[1]	validation_0-mlogloss:0.83764
[2]	validation_0-mlogloss:0.67255
[3]	validation_0-mlogloss:0.55742
[4]	validation_0-mlogloss:0.47885
[5]	validation_0-mlogloss:0.41586
[6]	validation_0-mlogloss:0.37156
[7]	validation_0-mlogloss:0.33635
[8]	validation_0-mlogloss:0.30662
[9]	validation_0-mlogloss:0.28544
[10]	validation_0-mlogloss:0.26645
[11]	validation_0-mlogloss:0.25199
[12]	validation_0-mlogloss:0.23595
[13]	validation_0-mlogloss:0.22458
[14]	validation_0-mlogloss:0.21135
[15]	validation_0-mlogloss:0.20068
[16]	validation_0-mlogloss:0.19279
[17]	validation_0-mlogloss:0.18583
[18]	validation_0-mlogloss:0.18001
[19]	validation_0

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[31]	validation_0-mlogloss:0.12101
[32]	validation_0-mlogloss:0.11720
[33]	validation_0-mlogloss:0.11483
[34]	validation_0-mlogloss:0.11236
[35]	validation_0-mlogloss:0.10957
[36]	validation_0-mlogloss:0.10588
[37]	validation_0-mlogloss:0.10276
[38]	validation_0-mlogloss:0.10092
[39]	validation_0-mlogloss:0.09861
[40]	validation_0-mlogloss:0.09581
[41]	validation_0-mlogloss:0.09333
[42]	validation_0-mlogloss:0.09230
[43]	validation_0-mlogloss:0.09090
[44]	validation_0-mlogloss:0.08969
[45]	validation_0-mlogloss:0.08751
[46]	validation_0-mlogloss:0.08546
[47]	validation_0-mlogloss:0.08416
[48]	validation_0-mlogloss:0.08270
[49]	validation_0-mlogloss:0.08167
[50]	validation_0-mlogloss:0.08078
[51]	validation_0-mlogloss:0.07938
[52]	validation_0-mlogloss:0.07834
[53]	validation_0-mlogloss:0.07765
[54]	validation_0-mlogloss:0.07661
[55]	validation_0-mlogloss:0.07577
[56]	validation_0-mlogloss:0.07479
[57]	validation_0-mlogloss:0.07420
[58]	validation_0-mlogloss:0.07312
[59]	validation_0-ml

In [47]:
from sklearn.metrics import accuracy_score

print("Train accuracy:", boost_model.score(X_train, y_train))

predictions = boost_model.predict(X_test)
print("Test accuracy: " + str(accuracy_score(predictions, y_test)))

Train accuracy: 1.0
Test accuracy: 0.9842209072978304


In [48]:
def train_and_evaluate(train_features, train_target, val_features, val_target, **params):
    model = make_pipeline(
        XGBClassifier(use_label_encoder=False, random_state=22, n_jobs=-1, **params)
    )

    model.fit(train_features, train_target)
    train_accuracy = model.score(train_features, train_target)
    val_accuracy = model.score(val_features, val_target)

    return model, train_accuracy, val_accuracy

In [52]:
def test_params_kfold(n_splits, **params):
    train_accuracys, val_accuracys, models = [], [], []

    kfold = KFold(n_splits)

    for train_idxs, val_idxs in kfold.split(decision_tree_data_preparation(features)):
        X_train, train_targets = features.iloc[train_idxs], target_num.iloc[train_idxs]
        X_val, val_targets = features.iloc[val_idxs], target_num.iloc[val_idxs]

        model, train_acc, val_acc = train_and_evaluate(X_train, train_targets, X_val, val_targets, **params)

        models.append(model)
        train_accuracys.append(train_acc)
        val_accuracys.append(val_acc)

    print(f'Train accuracy: {np.mean(train_accuracys)}, Validation accuracy: {np.mean(val_accuracys)}')

    return models

In [53]:
%%time
test_params_kfold(5, n_estimators=500, max_depth=6, learning_rate=0.9)

ValueError: The label must consist of integer labels of form 0, 1, 2, ..., [num_class - 1].

## Hyperparameter Tuning

### Obtenção das previsões do dataset de submissão

In [29]:
test_data = test_df.copy()

test_data_prepared = decision_tree_data_preparation(test_data)

predictions = boost_model.predict(test_data_prepared)  #RF_Model.predict(test_data_prepared)
predictions_df = pd.DataFrame(predictions)
predictions_df = predictions_df[0].map(
    {0: 'None', 1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very_High'})
predictions_df.index += 1
predictions_df.to_csv("../submission_v2.csv", header=['Incidents'], index_label='RowId')