In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [3]:
df = pd.read_csv('Consumo_cerveja.csv', sep=';')

# EDA and Data Manipulation

In [4]:
df.head()

Unnamed: 0,data,temp_media,temp_min,temp_max,chuva,fds,consumo
0,01/01/2015,27.3,23.9,32.5,0.0,0,25461
1,02/01/2015,27.02,24.5,33.5,0.0,0,28972
2,03/01/2015,24.82,22.4,29.9,0.0,1,30814
3,04/01/2015,23.98,21.5,28.6,1.2,1,29799
4,05/01/2015,23.82,21.0,28.3,0.0,0,28900


In [5]:
df.shape

(365, 7)

In [6]:
df.isnull().sum()

data          0
temp_media    0
temp_min      0
temp_max      0
chuva         0
fds           0
consumo       0
dtype: int64

In [7]:
for column in df.columns[1:]:
    print('Valores coluna: ', column)
    print(pd.cut(df[column], bins=3).value_counts())
    print('\n\n')

Valores coluna:  temp_media
(18.22, 23.54]     208
(23.54, 28.86]      86
(12.884, 18.22]     71
Name: temp_media, dtype: int64



Valores coluna:  temp_min
(15.233, 19.867]    203
(10.586, 15.233]     91
(19.867, 24.5]       71
Name: temp_min, dtype: int64



Valores coluna:  temp_max
(21.833, 29.167]    206
(29.167, 36.5]      102
(14.478, 21.833]     57
Name: temp_max, dtype: int64



Valores coluna:  chuva
(-0.0948, 31.6]    349
(31.6, 63.2]        12
(63.2, 94.8]         4
Name: chuva, dtype: int64



Valores coluna:  fds
(-0.001, 0.333]    261
(0.667, 1.0]       104
(0.333, 0.667]       0
Name: fds, dtype: int64



Valores coluna:  consumo
(22207.667, 30072.333]    206
(14319.406, 22207.667]     97
(30072.333, 37937.0]       62
Name: consumo, dtype: int64





In [8]:
def remove_outliars(df, column):
    Q1 = df[column].quantile(.25)
    Q3 = df[column].quantile(.75)

    IIQ = Q3 - Q1

    limite_inferior = Q1 - 1.5 * IIQ
    limite_superior = Q3 + 1.5 * IIQ
    
    print('Coluna: ', column)
    print('IIQ: ', IIQ, '\nQ1: ', Q1, '\nQ3: ', Q3)
    print('Limite inferior: ', limite_inferior, '\nLimite superior: ', limite_superior)
    print('\n\n')
    
    df_select = df[(df[column] >= limite_inferior) & (df[column] <= limite_superior)]
    return df_select

In [9]:
for column in df.columns[1:]:
    df = remove_outliars(df, column)

Coluna:  temp_media
IIQ:  4.260000000000002 
Q1:  19.02 
Q3:  23.28
Limite inferior:  12.629999999999997 
Limite superior:  29.67



Coluna:  temp_min
IIQ:  4.300000000000001 
Q1:  15.3 
Q3:  19.6
Limite inferior:  8.85 
Limite superior:  26.050000000000004



Coluna:  temp_max
IIQ:  5.599999999999998 
Q1:  23.8 
Q3:  29.4
Limite inferior:  15.400000000000004 
Limite superior:  37.8



Coluna:  chuva
IIQ:  3.05 
Q1:  0.0 
Q3:  3.05
Limite inferior:  -4.574999999999999 
Limite superior:  7.624999999999999



Coluna:  fds
IIQ:  1.0 
Q1:  0.0 
Q3:  1.0
Limite inferior:  -1.5 
Limite superior:  2.5



Coluna:  consumo
IIQ:  6610.0 
Q1:  22365.5 
Q3:  28975.5
Limite inferior:  12450.5 
Limite superior:  38890.5





In [10]:
df.shape

(299, 7)

# Preprocessing the data and utilizing a model

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [12]:
features = ['temp_min', 'temp_max', 'chuva', 'fds']
X = df[features]
y = df.consumo

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=0)

In [14]:
numerical_cols = ['temp_min', 'temp_max', 'chuva']

In [15]:
numerical_transform = Pipeline(steps = [('minmax_scaler', MinMaxScaler())]) 

In [16]:
preprocessor = ColumnTransformer(transformers = [('num', numerical_transform, numerical_cols)])

In [18]:
lrmodel = LinearRegression()

In [19]:
pipe = Pipeline(steps=[('preprocessor', preprocessor), ('lr_model', lrmodel)])

In [20]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('minmax_scaler',
                                                                   MinMaxScaler())]),
                                                  ['temp_min', 'temp_max',
                                                   'chuva'])])),
                ('lr_model', LinearRegression())])

In [21]:
y_pred = pipe.predict(X_test)

In [22]:
mean_squared_error(y_test, y_pred)

9909766.108123701

In [23]:
lrmodel.coef_

array([ -554.02557498, 14384.71812546,  -891.93380312])

In [24]:
lrmodel2 = LinearRegression()
model = lrmodel2.fit(X_train, y_train)

In [28]:
preds = model.predict(X_test)

In [29]:
mean_squared_error(preds, y_test)

5840875.561181966

Standardizing the data makes the model worse for this specific case.