# Implementação do Random Forest

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from pandas import DataFrame
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV

TRAINING_DATASET_SOURCE = '../training_data.csv'  # Since we are one directory up, we should go down one directory to import the datasets
TEST_DATASET_SOURCE = '../test_data.csv'

train_df = pd.read_csv(TRAINING_DATASET_SOURCE)
test_df = pd.read_csv(TEST_DATASET_SOURCE)

# Definição dos dados de teste e de treino

In [12]:
print(train_df.shape, test_df.shape)
print(train_df['incidents'].value_counts())

incidents_count = train_df['incidents'].value_counts()

max_count = incidents_count.max()
print('Max value count:', max_count)

df_classes = []
for category, counts in zip(incidents_count.index, incidents_count):
    #print(category, counts)
    df_classes.append(train_df[train_df['incidents'] == category])

df_classes_over = []

for category in df_classes:
    df_classes_over.append(category.sample(max_count, replace=True))

df_test_over = pd.concat(df_classes_over, axis=0)

print(df_test_over['incidents'].value_counts())


(5000, 13) (1206, 12)
None         2028
High         1073
Low           718
Very_High     603
Medium        578
Name: incidents, dtype: int64
Max value count: 2028
None         2028
High         2028
Low          2028
Very_High    2028
Medium       2028
Name: incidents, dtype: int64


In [13]:
features = df_test_over.drop(['incidents'], axis=1)
target = df_test_over['incidents']

all_features = features.columns.tolist()

features[:5]

Unnamed: 0,city_name,magnitude_of_delay,delay_in_seconds,affected_roads,record_date,luminosity,avg_temperature,avg_atm_pressure,avg_humidity,avg_wind_speed,avg_precipitation,avg_rain
804,Guimaraes,UNDEFINED,0,"N101,N101,N101,N101,N101",2021-07-27 09:00,LIGHT,18.0,1016.0,85.0,0.0,0.0,Sem Chuva
3620,Guimaraes,UNDEFINED,0,"N101,N101,N101,N101,N101",2021-10-09 14:00,LIGHT,24.0,1019.0,65.0,0.0,0.0,Sem Chuva
470,Guimaraes,UNDEFINED,0,",",2021-12-27 08:00,LOW_LIGHT,12.0,1014.0,94.0,2.0,0.0,Sem Chuva
2645,Guimaraes,UNDEFINED,0,"N101,N101,N101",2021-12-28 22:00,DARK,13.0,1024.0,93.0,1.0,0.0,Sem Chuva
402,Guimaraes,UNDEFINED,0,"N101,N101,N101,N101,N101",2021-11-12 01:00,DARK,10.0,1019.0,93.0,1.0,0.0,Sem Chuva


### Obtenção das features numericas e categoricas

In [14]:
from sklearn.preprocessing import StandardScaler

categorical_to_numerical = {
    'luminosity': {
        'LOW_LIGHT': 0,
        'LIGHT': 1,
        'DARK': 2
    },
    'avg_rain': {
        'Sem Chuva': 0,
        'chuva fraca': 1,
        'chuva moderada': 2,
        'chuva forte': 3,
    }
}


def decision_tree_data_preparation(df: DataFrame) -> DataFrame:
    prep_df = df.copy()

    dropped_columns = ['city_name', 'avg_precipitation', 'magnitude_of_delay']

    numerical_features = [column for column, dtype in zip(features.columns, features.dtypes) if
                          dtype.kind in ['i', 'f'] and column not in dropped_columns]

    numerical_features = ['avg_temperature', 'avg_atm_pressure', 'avg_humidity', 'avg_wind_speed', 'luminosity']
    #assert numerical_features == ['avg_temperature', 'avg_atm_pressure', 'avg_humidity', 'avg_wind_speed', 'luminosity']

    categorical_features = [column for column, dtype in zip(features.columns, features.dtypes) if
                            dtype.kind not in ['i', 'f'] and column not in dropped_columns]

    prep_df = prep_df.drop(dropped_columns, axis=1)
    prep_df.drop_duplicates()

    ### Converter as features categoricas em numericas
    prep_df.replace(categorical_to_numerical, inplace=True)

    ### Extrair a hora e dia da semana da feature 'record_date'
    record_date = pd.DatetimeIndex(prep_df['record_date'])

    prep_df['hour'] = record_date.hour
    prep_df['day'] = record_date.day
    prep_df['month'] = record_date.month
    prep_df['weekday'] = record_date.weekday

    prep_df.drop(columns=['record_date'], inplace=True)

    #train_df['affected_roads'] = train_df['affected_roads'].fillna(train_df['affected_roads'].mode().iloc[0])

    num_affected_roads = []
    for line in prep_df['affected_roads']:
        unique_roads = set(str(line).split(','))
        valid_roads = [elem for elem in unique_roads if elem != '']
        count = len(valid_roads)
        num_affected_roads.append(count)

    prep_df['num_affected_roads'] = num_affected_roads

    ### Ao analisar o resultado pós-tratamento, verificámos que a feature 'affected_roads' tinha alta correlação com 'delay_in_minutes'
    prep_df.drop(columns=['affected_roads'], inplace=True)

    ### Converter a feature 'delay_in_seconds' para 'delay_in_minutes' de modo a reduzir o intervalo de valores
    delay_in_minutes = prep_df['delay_in_seconds'].map(lambda seconds: seconds / 60)

    prep_df.drop(columns=['delay_in_seconds'], inplace=True)
    prep_df['delay_in_minutes'] = delay_in_minutes

    ### Limites superior e inferior (sem outliers) dos diagramas de caixa
    #numerical_features.remove('delay_in_seconds')
    #numerical_features.append('delay_in_minutes')

    return prep_df

### Divisão dos dados em dados de teste e treino

In [15]:
target_num = pd.DataFrame(target.map({'None': 0, 'Low': 1, 'Medium': 2, 'High': 3, 'Very_High': 4}))
target_num.reset_index(drop=True, inplace=True)
target_num

Unnamed: 0,incidents
0,0
1,0
2,0
3,0
4,0
...,...
10135,2
10136,2
10137,2
10138,2


In [16]:
X_train, X_test, y_train, y_test = train_test_split(decision_tree_data_preparation(features), target_num, test_size=0.3,
                                                    random_state=2000)

### XGBoost Model

In [17]:
from xgboost import XGBClassifier
import xgboost as xgb

print(y_train)
print(y_test)
#boost_model = XGBClassifier(n_estimators=500, early_stopping_rounds=5, random_state=22)
#boost_model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

      incidents
8721          2
5465          1
2527          3
5321          1
3987          3
...         ...
9628          2
4380          1
1590          0
4045          3
4936          1

[7098 rows x 1 columns]
      incidents
1564          0
9052          2
8684          2
7491          4
389           0
...         ...
1288          0
4606          1
4999          1
6644          4
2501          3

[3042 rows x 1 columns]


In [18]:
from sklearn.metrics import accuracy_score

#print("Train accuracy:", boost_model.score(X_train, y_train))

#predictions = boost_model.predict(X_test)
#print("Test accuracy: " + str(accuracy_score(predictions, y_test)))

In [26]:
from scipy import stats
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import f1_score

X = decision_tree_data_preparation(features)
y = pd.DataFrame(target)
print(type(y_test))
#y = pd.concat([y_test, y_train])
clf_xgb = XGBClassifier(objective='binary:logistic')

param_dist = {
    'n_estimators': stats.randint(150, 500),
    'learning_rate': stats.uniform(0.01, 0.07),
    'subsample': stats.uniform(0.3, 0.7),
    'max_depth': [3, 5, 8],
    'colsample_bytree': stats.uniform(0.5, 0.45),
    'min_child_weight': [1, 3]
}

clf = RandomizedSearchCV(clf_xgb, param_distributions=param_dist, verbose=3,
                         cv=2, n_iter=5, scoring='accuracy', error_score=0,
                         n_jobs=-1)
numFolds = 2
folds = KFold(n_splits=numFolds, shuffle=True)

estimators = []
results = np.zeros(len(X))
score = 0.0
lista_resultados = []
for train_index, test_index in folds.split(X):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index].values.ravel(), y.iloc[test_index].values.ravel()
    clf.fit(X_train, y_train)

    estimators.append(clf.best_estimator_)
    results[test_index] = clf.predict(X_test)
    score += accuracy_score(results[test_index], y_test)
    lista_resultados.append(accuracy_score(results[test_index], y_test))
    #score += f1_score(y_test, results[test_index])
score /= numFolds

print(f"Minimo é: {min(lista_resultados)}")
'''
def hyper_param(model, params):
    print("Modelo atual: " , model)
    
    # cv = cross-validation generator
    # verbose = quanto é apresentado
    tuning_model=GridSearchCV(model,param_grid=params,scoring='neg_mean_squared_error',cv=3)
    tuning_model.fit(X_train, y_train)
    
    return tuning_model.best_params_

params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:squarederror',
}
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=999,
    seed=42,
    nfold=5,
    metrics = 'error',
    early_stopping_rounds=10
)
print(cv_results)
'''

<class 'numpy.ndarray'>
Fitting 2 folds for each of 5 candidates, totalling 10 fits


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




ValueError: could not convert string to float: 'None'

## Hyperparameter Tuning

### Obtenção das previsões do dataset de submissão

In [None]:
test_data = test_df.copy()

test_data_prepared = decision_tree_data_preparation(test_data)

predictions = boost_model.predict(test_data_prepared)  #RF_Model.predict(test_data_prepared)
predictions_df = pd.DataFrame(predictions)
predictions_df = predictions_df[0].map(
    {0: 'None', 1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very_High'})
predictions_df.index += 1
predictions_df.to_csv("../submission_v2.csv", header=['Incidents'], index_label='RowId')