# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Dropout
from tensorflow.python.keras.losses import SparseCategoricalCrossentropy
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

%matplotlib inline
print(tf.__version__)

2.9.1


# Carregamento de dados

In [2]:
TRAINING_DATASET_SOURCE = 'training_data.csv'
TEST_DATASET_SOURCE = 'test_data.csv'

train_df = pd.read_csv(TRAINING_DATASET_SOURCE)
test_df = pd.read_csv(TEST_DATASET_SOURCE)

# SEED utilizada

In [3]:
SEED = 2022

# Preparação dos dados

In [4]:
categorical_to_numerical = {
    'avg_rain': {
        'Sem Chuva': 0,
        'chuva fraca': 1,
        'chuva moderada': 2,
        'chuva forte': 3
    },
    'luminosity': {
        'LOW_LIGHT': 0,
        'LIGHT': 1,
        'DARK': 2,
    }
}

incidents_to_numerical = {
    'incidents': {
        'None': 0,
        'Low': 1,
        'Medium': 2,
        'High': 3,
        'Very_High': 4,
    }
}

In [5]:
def neural_network_data_preparation(df: pd.DataFrame) -> pd.DataFrame:
    dropped_columns = ['city_name', 'magnitude_of_delay', 'avg_precipitation']

    prep_df = df.drop(dropped_columns, axis=1)

    ### Extrair a hora e dia da semana da feature 'record_date'
    record_date = pd.DatetimeIndex(prep_df['record_date'])

    prep_df['record_date_hour'] = record_date.hour
    prep_df['record_date_day'] = record_date.day
    prep_df['record_date_month'] = record_date.month
    prep_df['record_date_weekday'] = record_date.weekday

    prep_df.drop(columns=['record_date'], inplace=True)

    ### Quantificar a feature 'affected_roads' para o número único de estradas afetadas
    road_quantity = []
    for line in prep_df['affected_roads']:
        res = set(str(line).split(','))
        res2 = [elem for elem in res if elem != '']
        count = len(res2)
        road_quantity.append(count)

    prep_df['affected_roads'] = road_quantity

    prep_df.replace(categorical_to_numerical, inplace=True)

    ### Target
    if 'incidents' in prep_df.columns:
        prep_df.replace(incidents_to_numerical, inplace=True)

    return prep_df

In [6]:
X = neural_network_data_preparation(train_df)

In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   delay_in_seconds     5000 non-null   int64  
 1   affected_roads       5000 non-null   int64  
 2   luminosity           5000 non-null   int64  
 3   avg_temperature      5000 non-null   float64
 4   avg_atm_pressure     5000 non-null   float64
 5   avg_humidity         5000 non-null   float64
 6   avg_wind_speed       5000 non-null   float64
 7   avg_rain             5000 non-null   int64  
 8   incidents            5000 non-null   int64  
 9   record_date_hour     5000 non-null   int64  
 10  record_date_day      5000 non-null   int64  
 11  record_date_month    5000 non-null   int64  
 12  record_date_weekday  5000 non-null   int64  
dtypes: float64(4), int64(9)
memory usage: 507.9 KB


In [8]:
y = X['incidents']

X.drop(columns=['incidents'], inplace=True)

In [9]:
scaler_X = MinMaxScaler(feature_range=(0, 1)).fit(X)
X_scaled = pd.DataFrame(scaler_X.transform(X[X.columns]), columns=X.columns)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=SEED)

# Construção da estrutura da rede neuronal

In [11]:
from keras import regularizers
from tensorflow import keras

optimizer = keras.optimizers.Adam(learning_rate=0.001)


def build_model():
    model = keras.Sequential()

    model.add(Dense(16, input_dim=12, activation='relu',
                    kernel_regularizer=regularizers.l2(0.001),
                    ))  # Input Layer

    model.add(Dense(8, activation="relu",
                    kernel_regularizer=regularizers.l2(0.001),
                    ))

    model.add(Dense(5, activation='softmax',
                    kernel_regularizer=regularizers.l2(0.001),
                    ))  # Output Layer

    # Model compilation
    model.compile(loss=SparseCategoricalCrossentropy(), optimizer=optimizer,
                  #keras.optimizers.Adam(learning_rate=0.001),
                  metrics=['accuracy'])

    return model


In [12]:
model = build_model()

In [13]:
EPOCHS = 500
BATCH_SIZE = 100

In [14]:
history = model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

-------
-------
-------

In [15]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)



In [16]:
predicts = model.predict(X_test)

categories_predicted = [np.argmax(pred) for pred in predicts]



In [17]:
predicts[:5]

array([[6.24764025e-01, 1.89585939e-01, 1.09240174e-01, 7.62948319e-02,
        1.15002462e-04],
       [7.51121645e-14, 3.46249677e-02, 6.37950888e-03, 2.24686176e-01,
        7.34309196e-01],
       [9.30186272e-01, 6.34086952e-02, 6.39894418e-03, 6.15401814e-06,
        2.49290199e-13],
       [9.53964531e-01, 4.08694856e-02, 5.15167601e-03, 1.43478701e-05,
        1.48755643e-11],
       [8.00024867e-01, 1.92097053e-01, 7.79068610e-03, 8.73290483e-05,
        1.33183664e-09]], dtype=float32)

In [18]:
categories_predicted

[0,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 3,
 0,
 0,
 3,
 4,
 2,
 3,
 0,
 0,
 3,
 4,
 0,
 3,
 0,
 4,
 1,
 0,
 0,
 0,
 0,
 3,
 4,
 4,
 4,
 3,
 0,
 3,
 0,
 2,
 1,
 0,
 0,
 0,
 0,
 0,
 3,
 2,
 0,
 4,
 4,
 0,
 3,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 3,
 3,
 3,
 4,
 1,
 3,
 4,
 0,
 0,
 0,
 1,
 0,
 3,
 3,
 3,
 3,
 2,
 0,
 0,
 4,
 0,
 0,
 3,
 3,
 2,
 4,
 0,
 0,
 0,
 0,
 1,
 3,
 1,
 3,
 4,
 3,
 1,
 2,
 3,
 0,
 0,
 3,
 3,
 2,
 0,
 4,
 0,
 0,
 4,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 3,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 1,
 3,
 3,
 4,
 0,
 3,
 1,
 2,
 3,
 4,
 3,
 0,
 3,
 0,
 2,
 2,
 0,
 3,
 3,
 0,
 0,
 0,
 1,
 4,
 0,
 0,
 2,
 0,
 3,
 1,
 0,
 0,
 3,
 4,
 3,
 3,
 4,
 0,
 0,
 2,
 4,
 0,
 3,
 2,
 2,
 0,
 0,
 0,
 1,
 2,
 3,
 3,
 3,
 0,
 3,
 1,
 0,
 1,
 0,
 4,
 0,
 3,
 3,
 2,
 0,
 1,
 2,
 3,
 1,
 3,
 2,
 0,
 4,
 0,
 3,
 3,
 3,
 0,
 0,
 2,
 0,
 2,
 2,
 0,
 3,
 3,
 3,
 0,
 0,
 3,
 3,
 0,
 0,
 2,
 0,
 0,
 4,
 0,
 0,
 3,
 3,
 3,
 0,
 0,
 3,
 0,
 4,
 1,
 3,
 0,
 4,
 0,
 3,
 0,
 3,
 0,
 0,


In [19]:
y_test

3419    1
178     4
3721    0
3087    0
3685    1
       ..
3884    4
3855    0
2070    4
1482    0
2602    0
Name: incidents, Length: 1000, dtype: int64

# Obter as previsões no dataset de submissão

In [20]:
X = neural_network_data_preparation(test_df)

scaler_X = MinMaxScaler(feature_range=(0, 1)).fit(X)
X_scaled = pd.DataFrame(scaler_X.transform(X[X.columns]), columns=X.columns)

X_scaled

Unnamed: 0,delay_in_seconds,affected_roads,luminosity,avg_temperature,avg_atm_pressure,avg_humidity,avg_wind_speed,avg_rain,record_date_hour,record_date_day,record_date_month,record_date_weekday
0,0.081461,0.142857,0.5,0.464286,0.59375,0.784946,0.000000,0.0,0.826087,0.400000,0.272727,0.166667
1,0.000000,0.142857,1.0,0.500000,0.59375,0.365591,0.222222,0.0,0.173913,0.400000,0.818182,0.333333
2,0.000000,0.142857,0.5,0.714286,0.56250,0.688172,0.000000,0.0,0.826087,0.566667,0.545455,1.000000
3,0.009417,0.285714,0.5,0.571429,0.28125,0.795699,0.333333,0.0,0.652174,0.966667,0.818182,0.833333
4,0.000000,0.142857,0.5,0.642857,0.71875,0.849462,0.000000,0.0,0.434783,0.566667,0.818182,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
1201,0.000000,0.142857,0.5,0.357143,0.81250,0.677419,0.222222,0.0,0.434783,1.000000,1.000000,0.666667
1202,0.002960,0.142857,1.0,0.785714,0.53125,0.344086,0.222222,0.0,0.000000,0.500000,0.545455,0.666667
1203,0.084824,0.142857,0.5,0.392857,0.59375,0.569892,0.000000,0.0,0.652174,0.600000,1.000000,1.000000
1204,0.016548,0.142857,0.5,0.464286,0.50000,0.462366,0.111111,0.0,0.739130,0.233333,0.181818,0.000000


In [21]:
categories_prob_predictions = model.predict(X_scaled)

categories_prob_predictions



array([[6.1506165e-22, 3.3568775e-03, 6.2292762e-04, 3.3943556e-02,
        9.6207666e-01],
       [6.8406546e-01, 3.0793300e-01, 7.8484528e-03, 1.5309476e-04,
        4.2707646e-09],
       [7.4936759e-01, 1.9306409e-01, 4.1502073e-02, 1.6062908e-02,
        3.2123401e-06],
       ...,
       [9.8039036e-13, 4.9568065e-05, 2.6055204e-02, 8.5858166e-01,
        1.1531359e-01],
       [2.6146513e-10, 2.6017028e-01, 7.4400930e-03, 2.0831950e-01,
        5.2407014e-01],
       [3.3860633e-03, 8.7390877e-02, 1.7650658e-01, 7.2259378e-01,
        1.0122718e-02]], dtype=float32)

In [22]:
numerical_predictions = [np.argmax(pred) for pred in categories_prob_predictions]

numerical_predictions[:10]

[4, 0, 0, 1, 0, 4, 4, 0, 3, 1]

In [23]:
numerical_predictions_df = pd.DataFrame(numerical_predictions)

incidents_categories = {
    0: 'None',
    1: 'Low',
    2: 'Medium',
    3: 'High',
    4: 'Very_High',
}

predictions_df = numerical_predictions_df.replace(incidents_categories)

In [24]:
predictions_df.index += 1

predictions_df.to_csv("submission.csv", header=['Incidents'], index_label='RowId')