# Análise exploratória de dados

## 1. Descrição dos dados

### 1.1 Configurações iniciais

In [40]:
# importando bibliotecas
import pandas as pd
from pathlib import Path

### 1.2 Dicionário de dados

In [41]:
# Load the data dictionary: a semicolon-separated CSV with one row per
# variable (name, meaning, type and subtype).
data_path = Path('../data/external/dicionario.csv')
df_dict = pd.read_csv(filepath_or_buffer=data_path, sep=';')

df_dict

Unnamed: 0,Variável,Significado,Tipo,Subtipo
0,instant,índice do registro,Quantitativa,Discreta
1,dteday,data,Qualitativa,Ordinal
2,season,estação,Qualitativa,Ordinal
3,yr,ano,Qualitativa,Ordinal
4,mnth,mês,Qualitativa,Ordinal
5,hr,hora,Qualitativa,Ordinal
6,holiday,se o dia é feriado ou não,Qualitativa,Nominal
7,weekday,dia da semana,Qualitativa,Ordinal
8,workingday,se o dia não é fim de semana nem feriado,Qualitativa,Nominal
9,weathersit,situação climática,Qualitativa,Ordinal


### 1.3 Conjunto de dados

Leitura do conjunto de dados

In [42]:
# Load the raw dataset; the file uses the default comma separator.
data_path = Path('../data/raw/hour.csv')
df = pd.read_csv(filepath_or_buffer=data_path)

df

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,17375,2012-12-31,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,11,108,119
17375,17376,2012-12-31,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,8,81,89
17376,17377,2012-12-31,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,7,83,90
17377,17378,2012-12-31,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61


In [43]:
# Print a summary of the frame: per-column non-null counts and dtypes,
# plus the overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


### 1.4 Dados faltantes

Verificar se existe algum valor nulo (NaN).

In [44]:
# Count missing (NaN/None) entries per column; `isna` is the canonical
# spelling of `isnull` and yields the same boolean mask.
df.isna().sum()

instant       0
dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64

Nenhum valor nulo foi encontrado.

Verificar se outros valores foram utilizados para representar dados faltantes, inspecionando os valores únicos de cada variável.

In [51]:
# Collect the distinct values observed for every variable listed in the data
# dictionary, so that placeholder codes standing in for missing data can be
# spotted by eye.
# Iterating over the 'Variável' column directly replaces the row-wise
# `iterrows()` loop, which built a full Series per row only to read one field
# and left an unused `index` variable behind.
uniques = {
    'Variável': df_dict['Variável'],
    'Valores': [df[variable].unique() for variable in df_dict['Variável']],
}

df_uniques = pd.DataFrame.from_dict(uniques)
df_uniques

Unnamed: 0,Variável,Valores
0,instant,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
1,dteday,"[2011-01-01, 2011-01-02, 2011-01-03, 2011-01-0..."
2,season,"[1, 2, 3, 4]"
3,yr,"[0, 1]"
4,mnth,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]"
5,hr,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
6,holiday,"[0, 1]"
7,weekday,"[6, 0, 1, 2, 3, 4, 5]"
8,workingday,"[0, 1]"
9,weathersit,"[1, 2, 3, 4]"


## 2. Perguntas de partida e hipóteses

## 3. Insights