# Beer Consumption

## Libraries

In [1]:
import pandas as pd
import numpy as np

## Read dataset

In [2]:
# Relative path to the CSV file holding the dataset
data_path = "dataset/Consumo_cerveja.csv"

In [4]:
# Load the CSV into a DataFrame using pandas' default parsing options
raw_data = pd.read_csv(filepath_or_buffer=data_path)

In [5]:
# Display the freshly loaded frame (the notebook renders a truncated preview)
raw_data

Unnamed: 0,Data,Temperatura Media (C),Temperatura Minima (C),Temperatura Maxima (C),Precipitacao (mm),Final de Semana,Consumo de cerveja (litros)
0,2015-01-01,273,239,325,0,0.0,25.461
1,2015-01-02,2702,245,335,0,0.0,28.972
2,2015-01-03,2482,224,299,0,1.0,30.814
3,2015-01-04,2398,215,286,12,1.0,29.799
4,2015-01-05,2382,21,283,0,0.0,28.900
...,...,...,...,...,...,...,...
936,,,,,,,
937,,,,,,,
938,,,,,,,
939,,,,,,,


## Pre-processing

### Dropna

In [21]:
# Count the missing values in each column
raw_data.isnull().sum()

Data                           0
Temperatura Media (C)          0
Temperatura Minima (C)         0
Temperatura Maxima (C)         0
Precipitacao (mm)              0
Final de Semana                0
Consumo de cerveja (litros)    0
dtype: int64

In [10]:
# Remove rows in which every column is missing; raw_data is modified in place
raw_data.dropna(how='all', axis='index', inplace=True)

In [12]:
# Re-count the missing values per column after dropping the empty rows
raw_data.isnull().sum()

Data                           0
Temperatura Media (C)          0
Temperatura Minima (C)         0
Temperatura Maxima (C)         0
Precipitacao (mm)              0
Final de Semana                0
Consumo de cerveja (litros)    0
dtype: int64

### Check for zeros

In [22]:
# Count, per column, how many entries are equal to zero
raw_data.eq(0).sum()

Data                             0
Temperatura Media (C)            0
Temperatura Minima (C)           0
Temperatura Maxima (C)           0
Precipitacao (mm)                0
Final de Semana                261
Consumo de cerveja (litros)      0
dtype: int64

### Replace decimal comma with dot

In [26]:
def real_brazilian_to_float(numeric_string):
    """Convert a Brazilian-formatted number string (e.g. "1.234,56") to float.

    Dots are removed as thousands separators and the comma is then treated as
    the decimal mark, so "1.234,56" becomes 1234.56.

    Args:
        numeric_string: Value to convert, normally a str.

    Returns:
        The parsed float, or ``numeric_string`` unchanged when it has no usable
        ``replace`` method (e.g. it is already a number) or when the cleaned
        text cannot be parsed as a float.
    """
    try:
        return float(numeric_string.replace(".", "").replace(",", "."))
    # Catch only conversion failures: a bare ``except`` would also swallow
    # KeyboardInterrupt/SystemExit and hide unrelated bugs.
    except (AttributeError, TypeError, ValueError):
        return numeric_string

In [37]:
# Columns whose values are passed through real_brazilian_to_float
cols_to_dot = [
    'Temperatura Media (C)',
    'Temperatura Minima (C)',
    'Temperatura Maxima (C)',
    'Precipitacao (mm)',
]

In [38]:
# Convert each listed column element-wise and overwrite it in raw_data
for column_name in cols_to_dot:
    parsed_column = raw_data[column_name].apply(real_brazilian_to_float)
    raw_data[column_name] = parsed_column

### Convert 'Data' column type

In [46]:
# Inspect dtypes and non-null counts; 'Data' is still stored as object here
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 365 entries, 0 to 364
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Data                         365 non-null    object 
 1   Temperatura Media (C)        365 non-null    float64
 2   Temperatura Minima (C)       365 non-null    float64
 3   Temperatura Maxima (C)       365 non-null    float64
 4   Precipitacao (mm)            365 non-null    float64
 5   Final de Semana              365 non-null    float64
 6   Consumo de cerveja (litros)  365 non-null    float64
dtypes: float64(6), object(1)
memory usage: 22.8+ KB


In [50]:
# Parse the ISO-formatted date strings (e.g. "2015-01-01") into datetime64[ns].
# pd.to_datetime with an explicit format raises on malformed dates instead of
# relying on astype's implicit string parsing.
raw_data['Data'] = pd.to_datetime(raw_data['Data'], format='%Y-%m-%d')

In [51]:
# Confirm that 'Data' is now stored as datetime64[ns]
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 365 entries, 0 to 364
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Data                         365 non-null    datetime64[ns]
 1   Temperatura Media (C)        365 non-null    float64       
 2   Temperatura Minima (C)       365 non-null    float64       
 3   Temperatura Maxima (C)       365 non-null    float64       
 4   Precipitacao (mm)            365 non-null    float64       
 5   Final de Semana              365 non-null    float64       
 6   Consumo de cerveja (litros)  365 non-null    float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 22.8 KB


**Checkpoint**

In [52]:
# Checkpoint: continue on an independent deep copy of the pre-processed frame
df = raw_data.copy(deep=True)

In [54]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column
df.describe()

Unnamed: 0,Temperatura Media (C),Temperatura Minima (C),Temperatura Maxima (C),Precipitacao (mm),Final de Semana,Consumo de cerveja (litros)
count,365.0,365.0,365.0,365.0,365.0,365.0
mean,21.226356,17.46137,26.611507,5.196712,0.284932,25.401367
std,3.180108,2.826185,4.317366,12.417844,0.452001,4.399143
min,12.9,10.6,14.5,0.0,0.0,14.343
25%,19.02,15.3,23.8,0.0,0.0,22.008
50%,21.38,17.9,26.9,0.0,0.0,24.867
75%,23.28,19.6,29.4,3.2,1.0,28.631
max,28.86,24.5,36.5,94.8,1.0,37.937


---

## KNN Regressor implementation