# **Análise de Dados Pré-Respetivo Tratamento**

## Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# matplotlib inline

## Carregar os datasets de treino e de teste

In [2]:
# File names of the training and test CSV files; being bare relative names,
# they are resolved against the notebook's current working directory.
TRAINING_DATASET_SOURCE = 'training_data.csv'
TEST_DATASET_SOURCE = 'test_data.csv'

In [3]:
# Load both datasets using pandas' default CSV parsing options.
# NOTE(review): the `incidents` column is later treated as containing the
# literal label 'None'. Recent pandas releases (2.0+, to be confirmed against
# the installed version) add "None" to read_csv's default NA markers, which
# would turn that label into NaN. If that applies, pass explicit
# `keep_default_na` / `na_values` arguments here.
train_df = pd.read_csv(TRAINING_DATASET_SOURCE)
test_df = pd.read_csv(TEST_DATASET_SOURCE)

## Exploração de dados

In [4]:
# Preview the first five rows of the training set.
train_df.head()

Unnamed: 0,city_name,magnitude_of_delay,delay_in_seconds,affected_roads,record_date,luminosity,avg_temperature,avg_atm_pressure,avg_humidity,avg_wind_speed,avg_precipitation,avg_rain,incidents
0,Guimaraes,UNDEFINED,0,",",2021-03-15 23:00,DARK,12.0,1013.0,70.0,1.0,0.0,Sem Chuva,
1,Guimaraes,UNDEFINED,385,"N101,",2021-12-25 18:00,DARK,12.0,1007.0,91.0,1.0,0.0,Sem Chuva,
2,Guimaraes,UNDEFINED,69,",",2021-03-12 15:00,LIGHT,14.0,1025.0,64.0,0.0,0.0,Sem Chuva,Low
3,Guimaraes,MAJOR,2297,"N101,R206,N105,N101,N101,N101,N101,N101,N101,N...",2021-09-29 09:00,LIGHT,15.0,1028.0,75.0,1.0,0.0,Sem Chuva,Very_High
4,Guimaraes,UNDEFINED,0,"N101,N101,N101,N101,N101,",2021-06-13 11:00,LIGHT,27.0,1020.0,52.0,1.0,0.0,Sem Chuva,High


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train_df.describe()

Unnamed: 0,delay_in_seconds,avg_temperature,avg_atm_pressure,avg_humidity,avg_wind_speed,avg_precipitation
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,560.567,14.583,1018.145,74.455,1.2535,0.0
std,1686.859581,4.820514,5.174372,17.204638,1.269847,0.0
min,0.0,1.0,997.0,6.0,0.0,0.0
25%,0.0,11.0,1015.0,63.0,0.0,0.0
50%,0.0,14.0,1019.0,78.0,1.0,0.0
75%,234.0,18.0,1022.0,90.0,2.0,0.0
max,31083.0,35.0,1032.0,100.0,10.0,0.0


In [6]:
# Column dtypes and non-null counts; used to spot columns with missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   city_name           5000 non-null   object 
 1   magnitude_of_delay  5000 non-null   object 
 2   delay_in_seconds    5000 non-null   int64  
 3   affected_roads      4915 non-null   object 
 4   record_date         5000 non-null   object 
 5   luminosity          5000 non-null   object 
 6   avg_temperature     5000 non-null   float64
 7   avg_atm_pressure    5000 non-null   float64
 8   avg_humidity        5000 non-null   float64
 9   avg_wind_speed      5000 non-null   float64
 10  avg_precipitation   5000 non-null   float64
 11  avg_rain            5000 non-null   object 
 12  incidents           5000 non-null   object 
dtypes: float64(5), int64(1), object(7)
memory usage: 507.9+ KB


- Quantidade de valores únicos de cada feature

In [7]:
# Show the frequency table of every column.
# Inside a loop the value of an expression is not echoed by the notebook, so
# each table has to be printed explicitly; otherwise the cell displays nothing.
for column in train_df.columns:
    print(train_df[column].value_counts(), end='\n\n')

- Análise dos valores da feature **avg_precipitation**

In [8]:
# Frequency of each distinct avg_precipitation value (the recorded output
# shows a single value, 0.0, for all 5000 rows).
train_df['avg_precipitation'].value_counts()

0.0    5000
Name: avg_precipitation, dtype: int64

- Definição do primeiro e do terceiro quartis
- Definição do valor máximo (*upper*) e mínimo (*lower*)

In [None]:
# Feature names grouped by how they are analysed in the cells that follow.
# The target column `incidents` is intentionally absent from both lists.

# Features handled as numeric (quartiles, box plots).
numeric_columns = [
    'avg_temperature',
    'avg_atm_pressure',
    'avg_humidity',
    'avg_wind_speed',
    'delay_in_seconds',
    'avg_precipitation',
]

# Features handled as categorical (histograms of raw values).
categorical_columns = [
    'city_name',
    'magnitude_of_delay',
    'luminosity',
    'avg_rain',
    'affected_roads',
    'record_date',
]

In [None]:
def _iqr_fences(series):
    """Return the (lower, upper) box-plot fences of `series`.

    The fences sit 1.5 interquartile ranges below the first quartile and
    above the third quartile, respectively.
    """
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile
    return first_quartile - 1.5 * spread, third_quartile + 1.5 * spread


# One (feature, lower, upper) tuple per numeric feature, in list order.
whiskers = [(feature, *_iqr_fences(train_df[feature])) for feature in numeric_columns]

print(whiskers)

## Visualização gráfica dos dados

- Matriz de correlação para valores numéricos

In [None]:
# Pearson correlation between the numeric features only.
# The numeric columns are selected explicitly because `DataFrame.corr()` on a
# frame that still contains object columns raises on pandas >= 2.0; older
# versions silently dropped them, so the resulting matrix is the same.
correlationMatrix = train_df.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(6, 8))
# Fix the colour scale to the full correlation range [-1, 1] and draw on the
# axes created above.
sns.heatmap(correlationMatrix, vmin=-1, vmax=1, square=True, annot=True, ax=ax)

- Distribuições bivariadas de pares num conjunto de dados

In [None]:
# Grid of pairwise relationship plots for train_df, using seaborn's defaults.
sns.pairplot(train_df)

- Diagrama de caixa para cada feature numérica

In [None]:
# One horizontal box plot per numeric feature.
for num in numeric_columns:
    fig = plt.figure(figsize=(10, 5))
    # Pass the vector as `x=` explicitly: a positional argument is deprecated
    # in seaborn 0.11 and is interpreted as `data` (not `x`) from 0.12 on.
    sns.boxplot(x=train_df[num])
    plt.title('Box Plot: Feature ' + num, fontsize=15)
    plt.show()

In [None]:
# Box plot of delay_in_seconds for each value of the target `incidents`.
sns.catplot(x='incidents', y='delay_in_seconds', data=train_df, kind='box', aspect=1.5)

- Quantidade de valores nulos em cada feature

In [None]:
# Heatmap of the boolean null mask (one row per record, one column per
# feature); row labels and the colour bar are hidden.
sns.heatmap(train_df.isnull(), yticklabels=False, cbar=False, cmap='viridis')

- Histogramas para cada feature categórica

In [None]:
# Draw each categorical feature's histogram on its own 10x5 figure.
for categ in categorical_columns:
    fig = plt.figure(figsize=(10, 5))
    # Target the new figure's axes explicitly rather than the implicit
    # "current axes".
    sns.histplot(train_df[categ], ax=fig.gca())

###### Observação do número de vezes que cada categoria da feature **avg_rain** aparece

In [None]:
# Number of rows for each avg_rain category.
train_df['avg_rain'].value_counts()

- Histograma de um conjunto univariante de observações da feature **incidents** (target)

In [None]:
# Show the labels present in the target column. The value must be printed
# because only a cell's last expression is echoed by the notebook.
print(train_df['incidents'].unique())

# Re-encode the target as a categorical with an explicit category order so
# plots follow this sequence instead of order of appearance. Any label not in
# this list becomes NaN.
train_df['incidents'] = pd.Categorical(
    train_df['incidents'],
    categories=['None', 'Low', 'Medium', 'High', 'Very_High'],
)
sns.displot(train_df['incidents'], kde=True)

- Histograma de um conjunto univariante de observações da feature **avg_wind_speed**

In [None]:
# Distribution of avg_wind_speed with a KDE curve overlaid.
sns.displot(train_df['avg_wind_speed'], kde=True)

- Histograma e outras informações úteis de um conjunto univariante de observações da feature **delay_in_seconds**

In [None]:
# Distribution of delay_in_seconds with a KDE curve overlaid.
delay_grid = sns.displot(train_df['delay_in_seconds'], kde=True)

# Cap the y-axis at 150 on the grid's single axes.
delay_grid.ax.set_ylim(0, 150)

In [None]:
# Summary statistics of delay_in_seconds alone.
train_df['delay_in_seconds'].describe()

In [None]:
# Keep only the delays above 10000 seconds.
long_delays = train_df.loc[train_df['delay_in_seconds'] > 10000, 'delay_in_seconds']
long_delay_grid = sns.displot(long_delays, kde=True)

# Cap the y-axis at 50 on the grid's single axes.
long_delay_grid.ax.set_ylim(0, 50)

## Relação entre Features

- Relação entre a feature avg_rain e a feature incidents (target)

In [None]:
# Non-null counts of every remaining column for each (avg_rain, incidents) pair.
# NOTE(review): `.count()` repeats essentially the same number in every
# column; `.size()` would give one row count per pair — consider switching.
train_df.groupby(by=['avg_rain', 'incidents']).count()

- Relação entre a feature magnitude_of_delay e a feature incidents (target)

In [None]:
# Non-null counts of every remaining column for each
# (magnitude_of_delay, incidents) pair.
# NOTE(review): `.count()` repeats essentially the same number in every
# column; `.size()` would give one row count per pair — consider switching.
train_df.groupby(by=['magnitude_of_delay', 'incidents']).count()

In [None]:
# Mean of the numeric columns for each (delay_in_seconds, incidents) pair.
# `numeric_only=True` is required on pandas >= 2.0, where averaging the object
# columns raises; older versions dropped those columns implicitly, so the
# result is unchanged there.
# NOTE(review): grouping by the raw delay_in_seconds value creates one group
# per distinct delay; if the goal is the mean delay per incident level,
# group by 'incidents' only — confirm the intent.
train_df.groupby(by=['delay_in_seconds', 'incidents']).mean(numeric_only=True)

# Análise do dataset de teste

In [None]:
# Preview the first five rows of the test set.
test_df.head()

In [None]:
# Column dtypes and non-null counts of the test set.
test_df.info()

In [None]:
# Summary statistics of the test set's numeric columns.
test_df.describe()

In [None]:
# Number of missing values in each column of the training set.
train_df.isna().sum()

In [None]:
# Number of missing values in each column of the test set.
test_df.isna().sum()

In [None]:
# Parse the timestamp strings once, then add one column to train_df for each
# calendar component read from the parsed index.
record_date = pd.DatetimeIndex(train_df['record_date'])

for date_part in ('hour', 'day', 'month', 'weekday'):
    train_df[date_part] = getattr(record_date, date_part)

In [None]:
# Number of records for each value of the derived `hour` column.
train_df['hour'].value_counts()

In [None]:
# Number of records for each value of the derived `day` column.
train_df['day'].value_counts()

In [None]:
# Number of records for each value of the derived `month` column.
train_df['month'].value_counts()

In [None]:
# Number of records for each value of the derived `weekday` column.
train_df['weekday'].value_counts()

## Comparação dos dados do dataset de treino com os dados do dataset de teste

In [None]:
# Training rows that exactly repeat an earlier row (first occurrence excluded).
train_df.loc[train_df.duplicated()]

In [None]:
# Test rows that exactly repeat an earlier row (first occurrence excluded).
test_df.loc[test_df.duplicated()]


### Obtenção das features numéricas

In [None]:
# Names of the columns whose dtype kind is signed integer ('i') or float ('f').
# Other dtype kinds (e.g. unsigned 'u', boolean 'b') are not matched by this filter.
[name for name, col_dtype in train_df.dtypes.items() if col_dtype.kind in ('i', 'f')]

### Obtenção das features categóricas

In [None]:
# Names of the columns whose dtype kind is neither signed integer ('i') nor
# float ('f'), i.e. everything the numeric filter leaves out.
[name for name, col_dtype in train_df.dtypes.items() if col_dtype.kind not in ('i', 'f')]