# Competição ML @SBS/DAA - 5ª Edição (2022/2023)

Esta competição está relacionada com o Trabalho Prático de Grupo das UCs do perfil Machine Learning: Fundamentos e Aplicações da Uminho e da UC Dados e Aprendizagem Automática, tendo, como destinatários, alunos do Mestrado em Engenharia Informática, do Mestrado em Matemática e Computação, e do Mestrado em Engenharia de Sistemas.

# Descrição da Competição
A previsão da quantidade de incidentes rodoviários é um conhecido problema de características estocásticas, não-lineares. Tem, contudo, aparecido na literatura um conjunto de modelos que demonstram um potencial assinalável neste tipo de previsões. Com isso em consideração, foi construído um dataset que contém dados referentes à quantidade e características dos incidentes rodoviários que ocorreram na cidade de Guimarães em 2021 (o dataset cobre um período que vai desde o dia 01 de Janeiro de 2021 até ao dia 31 de Dezembro do mesmo ano).

Com esta competição espera-se que os alunos desenvolvam e otimizem modelos de Machine Learning que sejam capazes de prever o número de incidentes rodoviários que irão acontecer na cidade de Guimarães a uma determinada hora.

## Imports utilizados

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error


# matplotlib inline

### Mudar o modo que a Interactive shell imprime as variáveis

In [None]:
from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every expression statement in a cell instead of only the
# last one; several cells below rely on this to show more than one result.
InteractiveShell.ast_node_interactivity = "all"

## Carregar o dataset de treino

In [None]:
# CSV file names of the labelled training data and of the competition test data.
# They carry no directory part, so they are resolved against the working directory.
TRAINING_DATASET_SOURCE = 'training_data.csv'
TEST_DATASET_SOURCE = 'test_data.csv'

In [None]:
# Load both CSV files into DataFrames using the default read_csv options.
train_df = pd.read_csv(TRAINING_DATASET_SOURCE)
test_df = pd.read_csv(TEST_DATASET_SOURCE)

## Seed utilizada

In [None]:
# Seed passed as random_state to the train/test split and to the decision tree.
SEED = 101

## Exploração de dados

In [None]:
# First rows of the training data.
train_df.head()

# Summary statistics of the numeric columns.
train_df.describe()

# Column names, non-null counts and dtypes.
train_df.info()

In [None]:
# Compute the frequency table of every column. The result is a bare expression,
# so whether it is shown depends on the notebook's ast_node_interactivity setting.
for column in train_df.columns:
    train_df[column].value_counts()

## Visualização gráfica dos dados

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
sns.heatmap(train_df.corr(numeric_only=True))

In [None]:
# Heatmap of the missing-value mask (one row per record, one column per feature).
sns.heatmap(train_df.isnull(), yticklabels=False, cbar=False, cmap='viridis')

In [None]:
# Histogram of the 'luminosity' column.
sns.histplot(train_df['luminosity'])

In [None]:
# Histogram of the 'avg_rain' column.
sns.histplot(train_df['avg_rain'])

In [None]:
# Distribution of the target column 'incidents', with a kernel density estimate overlaid.
sns.displot(train_df['incidents'], kde=True)

## Relação entre Features

In [None]:
# For every observed (avg_rain, incidents) pair, count the non-null values of
# each remaining column.
train_df.groupby(by=['avg_rain', 'incidents']).count()

## Preparação de Dados

### Features removidas

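- `avg_humidity` - Relaciona-se bastante com a feature `avg_temperature`
- `city_name`, `magnitude_of_delay`, `avg_rain` e `avg_precipitation` - Também são removidas na função de preparação de dados abaixo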

In [None]:
# X = decision_tree_data_preparation(train_df)

# noinspection PyPep8Naming
def decision_tree_data_preparation(df: pd.DataFrame) -> pd.DataFrame:
    """Build the numeric feature table used by the decision tree model.

    The input frame is left untouched; a new frame is returned.

    Transformations applied:

    * drop ``city_name``, ``magnitude_of_delay``, ``avg_rain``,
      ``avg_humidity`` and ``avg_precipitation``;
    * ``delay_in_seconds`` -> ``delay``: 0 when the delay is zero, 1 when it
      is at least one second;
    * ``luminosity`` -> ``luminosity_binned``: LOW_LIGHT=0, LIGHT=1, DARK=2
      (any other value becomes NaN);
    * ``record_date`` -> ``record_date_hour`` and ``record_date_weekday``
      (Monday is 0);
    * ``avg_temperature`` -> ``temperature``: 0 below 10, 1 from 10 to 19
      inclusive, 2 above 19 (a missing temperature stays NaN);
    * ``affected_roads`` -> number of distinct road names in the
      comma-separated value, 0 when the value is missing.

    Every other column (``incidents`` in the training data, for instance) is
    passed through unchanged.

    Args:
        df: Raw data containing the columns listed above.

    Returns:
        A new DataFrame with the engineered features.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
    """
    dropped_columns = ['city_name', 'magnitude_of_delay', 'avg_rain', 'avg_humidity', 'avg_precipitation']
    prep_df = df.drop(columns=dropped_columns)

    # Two bins for 'delay_in_seconds': 0 = no delay, 1 = delayed.
    delay_in_seconds = prep_df.pop('delay_in_seconds')
    prep_df.loc[delay_in_seconds == 0, 'delay'] = 0
    prep_df.loc[delay_in_seconds >= 1, 'delay'] = 1

    # Three ordinal codes for 'luminosity'.
    luminosity_codes = {'LOW_LIGHT': 0, 'LIGHT': 1, 'DARK': 2}
    prep_df['luminosity_binned'] = prep_df.pop('luminosity').map(luminosity_codes)

    # Hour of the day and day of the week extracted from 'record_date'.
    record_date = pd.DatetimeIndex(prep_df.pop('record_date'))
    prep_df['record_date_hour'] = record_date.hour
    prep_df['record_date_weekday'] = record_date.weekday

    # Three bins for 'avg_temperature'. The outer bins are open-ended so that
    # readings at or below 0 and above 35 still get a bin instead of NaN.
    avg_temperature = prep_df.pop('avg_temperature')
    prep_df.loc[avg_temperature < 10, 'temperature'] = 0  # LOW
    prep_df.loc[avg_temperature.between(10, 19), 'temperature'] = 1  # MEAN
    prep_df.loc[avg_temperature > 19, 'temperature'] = 2  # HIGH

    # Replace the road list by the number of distinct roads it mentions.
    prep_df['affected_roads'] = [
        _count_unique_roads(roads) for roads in prep_df['affected_roads']
    ]

    # NOTE: removing 'avg_wind_speed' outliers was considered but is not applied.

    return prep_df


def _count_unique_roads(roads: object) -> int:
    """Return how many distinct, non-empty road names a comma-separated value holds."""
    # A missing value means no road was reported; without this guard it would
    # be stringified to 'nan' and counted as one road.
    if pd.isna(roads):
        return 0
    return len({name.strip() for name in str(roads).split(',')} - {''})

### Divisão do dataset de treino em dados de treino e de teste

In [None]:
from sklearn.model_selection import train_test_split

# Engineer the features, then move the target column out of the feature table.
X = decision_tree_data_preparation(train_df)
y = X.pop('incidents')

# Hold out 20% of the rows for evaluation; the split is reproducible via SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=SEED, test_size=0.2
)

### Modelo de Árvore de decisão

In [None]:
from sklearn.svm import SVC

# Fit the decision tree on the training split; `clf` is reused by later cells.
clf = DecisionTreeClassifier(random_state=SEED)

clf.fit(X_train, y_train)

# 5-fold cross-validation of the decision tree over the whole prepared training set.
scores = cross_val_score(clf, X, y, cv=5)

scores

print("Accuracy de %0.2f com um desvio padrão de %0.2f" % (scores.mean(), scores.std()))

# Baseline for comparison: 2-fold cross-validation of an SVC with default
# hyper-parameters. Its result is kept in a separate variable so that it no
# longer overwrites the decision tree scores reported above.
cross_valid_model = SVC(random_state=SEED)
svc_scores = cross_val_score(cross_valid_model, X, np.ravel(y), cv=2)

svc_scores

print("SVC: accuracy de %0.2f com um desvio padrão de %0.2f" % (svc_scores.mean(), svc_scores.std()))

#### Obter as previsões

In [None]:
# Predict the class of every held-out sample with the fitted decision tree.
predictions = clf.predict(X_test)

In [None]:
# Build and draw the confusion matrix in one step so that the axes are labelled
# with the class names instead of positional indices.
# Layout: rows are the true classes, columns are the predicted classes.
disp = ConfusionMatrixDisplay.from_predictions(y_test, predictions)

# Raw count matrix, kept under its previous name.
cm = disp.confusion_matrix

In [None]:
# Fraction of held-out samples whose predicted class equals the true class.
accuracy_score(y_test, predictions)

# Macro average: the metric is computed per class and the classes are then
# averaged with equal weight. Micro-averaging single-label predictions would
# only reproduce the accuracy above. zero_division=0 scores a class with no
# predicted samples as 0 without emitting a warning.
precision_score(y_test, predictions, average='macro', zero_division=0)

# Per class: correctly predicted samples / samples that truly belong to the class.
recall_score(y_test, predictions, average='macro', zero_division=0)

# Reading the test dataset

In [89]:
# Inspect the competition test data: first rows, column dtypes / non-null
# counts, and summary statistics of the numeric columns.
test_df.head()
test_df.info()
test_df.describe()

Unnamed: 0,city_name,magnitude_of_delay,delay_in_seconds,affected_roads,record_date,luminosity,avg_temperature,avg_atm_pressure,avg_humidity,avg_wind_speed,avg_precipitation,avg_rain
0,Guimaraes,UNDEFINED,1211,"N101,N101,N101,N101,N101,N101,",2021-04-13 19:00,LIGHT,15.0,1018.0,80.0,0.0,0.0,Sem Chuva
1,Guimaraes,UNDEFINED,0,"N101,N101,N101,N101,N101",2021-10-13 04:00,DARK,16.0,1018.0,41.0,2.0,0.0,Sem Chuva
2,Guimaraes,UNDEFINED,0,"N101,N101,N101,N101,N101",2021-07-18 19:00,LIGHT,22.0,1017.0,71.0,0.0,0.0,Sem Chuva
3,Guimaraes,UNDEFINED,140,"N101,R206,N101,N101,N101,N101",2021-10-30 15:00,LIGHT,18.0,1008.0,81.0,3.0,0.0,Sem Chuva
4,Guimaraes,UNDEFINED,0,"N101,N101,N101,N101,N101",2021-10-18 10:00,LIGHT,20.0,1022.0,86.0,0.0,0.0,Sem Chuva


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1206 entries, 0 to 1205
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   city_name           1206 non-null   object 
 1   magnitude_of_delay  1206 non-null   object 
 2   delay_in_seconds    1206 non-null   int64  
 3   affected_roads      1184 non-null   object 
 4   record_date         1206 non-null   object 
 5   luminosity          1206 non-null   object 
 6   avg_temperature     1206 non-null   float64
 7   avg_atm_pressure    1206 non-null   float64
 8   avg_humidity        1206 non-null   float64
 9   avg_wind_speed      1206 non-null   float64
 10  avg_precipitation   1206 non-null   float64
 11  avg_rain            1206 non-null   object 
dtypes: float64(5), int64(1), object(6)
memory usage: 113.2+ KB


Unnamed: 0,delay_in_seconds,avg_temperature,avg_atm_pressure,avg_humidity,avg_wind_speed,avg_precipitation
count,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0
mean,565.080431,14.649254,1018.024876,73.865672,1.332504,0.0
std,1584.544376,4.69118,5.338753,17.202341,1.33751,0.0
min,0.0,2.0,999.0,7.0,0.0,0.0
25%,0.0,11.0,1015.0,62.0,0.0,0.0
50%,0.0,14.0,1018.0,77.0,1.0,0.0
75%,268.75,18.0,1022.0,90.0,2.0,0.0
max,14866.0,30.0,1031.0,100.0,9.0,0.0


In [94]:
# Apply the same feature engineering to the competition test data.
# NOTE: this rebinds X, which previously held the training feature table.
X = decision_tree_data_preparation(test_df)

# Getting the predictions

In [1]:
# Predict the competition test set and write the submission file, numbering
# the rows from 1 in the 'RowId' column.
predictions = clf.predict(X)

row_ids = pd.RangeIndex(start=1, stop=len(predictions) + 1)
predictions_df = pd.DataFrame(predictions, index=row_ids)

predictions_df.to_csv("submission.csv", index_label='RowId', header=['Incidents'])

NameError: name 'clf' is not defined