# Treinamento de um modelo de Regressão Linear
## Etapa de análise exploratória
Não repetiremos aqui a análise exploratória feita anteriormente. Faremos apenas a carga dos dados.

In [46]:
import pandas as pd
import numpy as np

In [47]:
# Load `day.csv` (path relative to the current working directory) into a DataFrame.
df = pd.read_csv('day.csv')

In [48]:
# Summarize the frame: number of rows, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    int64  
 3   yr          731 non-null    int64  
 4   mnth        731 non-null    int64  
 5   holiday     731 non-null    int64  
 6   weekday     731 non-null    int64  
 7   workingday  731 non-null    int64  
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  atemp       731 non-null    float64
 11  hum         731 non-null    float64
 12  windspeed   731 non-null    float64
 13  casual      731 non-null    int64  
 14  registered  731 non-null    int64  
 15  cnt         731 non-null    int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 91.5+ KB


## Etapa de pré-processamento

### Separando os conjuntos de treino e teste

In [49]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
df_treino, df_teste = train_test_split(df, random_state=42, test_size=0.2)

# For each subset, separate the target column (`cnt`) from the remaining columns.
# The right-hand side is fully evaluated before assignment, so the labels are
# copied from the frame while it still contains `cnt`.
df_treino_labels, df_treino = df_treino['cnt'].copy(), df_treino.drop('cnt', axis=1)
df_teste_labels, df_teste = df_teste['cnt'].copy(), df_teste.drop('cnt', axis=1)

### Automação do processo de pré-processamento

In [50]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [51]:
# Column groups consumed by the preprocessing step; each group is handled differently.

# Continuous measurements (float64 columns).
nomes_atributos_numericos = [
    'temp',
    'hum',
    'windspeed',
]

# Integer-coded categories.
nomes_atributos_categoricos = [
    'season',
    'mnth',
    'weekday',
    'weathersit',
]

# Binary flag columns.
nomes_atributos_binarios = [
    'holiday',
    'workingday',
]

In [52]:
# Numeric features: fill any missing value with the column mean, then
# standardize. Both steps learn their statistics when the pipeline is fitted.
pipeline_atr_numericos = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler',  StandardScaler()),
])

# Complete preprocessing, applied per column group. Columns that belong to no
# group are dropped (ColumnTransformer's default `remainder='drop'`).
preproc_completo = ColumnTransformer([
    ('numericos',   pipeline_atr_numericos, nomes_atributos_numericos),
    ('binarios',    'passthrough',          nomes_atributos_binarios),
    # handle_unknown='ignore': a category value that was not present when the
    # encoder was fitted is encoded as an all-zero group instead of raising at
    # transform time, so predictions on new rows do not crash on a new level.
    ('categoricos', OneHotEncoder(handle_unknown='ignore'), nomes_atributos_categoricos),
    ],
    # Always return a dense ndarray, even though the one-hot output is sparse.
    sparse_threshold=0)

In [53]:
# Training set: learn the preprocessing parameters and apply them in one call.
X_treino = preproc_completo.fit_transform(df_treino)

# Test set: only apply the transformer fitted above; nothing is re-learned here.
X_teste = preproc_completo.transform(df_teste)

# Targets reshaped into column vectors of shape (n_samples, 1).
y_treino = df_treino_labels.to_numpy().reshape(-1, 1)
y_teste = df_teste_labels.to_numpy().reshape(-1, 1)

In [54]:
# Inspect the first two preprocessed training rows.
X_treino[:2]

array([[-0.86461022,  0.22342426,  1.9570449 ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [ 0.73728848,  2.17101774,  0.0267188 ,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ]])

In [55]:
# Inspect the first two preprocessed test rows.
X_teste[:2]

array([[-0.13416911,  0.72208642, -0.21401277,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ],
       [-1.72667406, -1.36120952,  1.12576527,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ]])

## Etapa de Treinamento
### Treinando um modelo de regressão linear

In [56]:
from sklearn.linear_model import LinearRegression

# Create and train the model in a single expression: `fit` estimates the
# parameters for this dataset and returns the fitted estimator itself.
lin_reg = LinearRegression().fit(X_treino, y_treino)

# Display the learned intercept and the coefficient of each input column.
(lin_reg.intercept_, lin_reg.coef_)

(array([3869.04213892]),
 array([[ 1299.82199804,  -428.74357039,  -231.32409507,  -458.32000039,
           227.78682898,  -796.04171218,   178.08848691,  -124.38369077,
           742.33691603,   205.4843596 ,   254.76022527,   600.83471373,
           -66.2420565 ,   103.11654716,  -447.16443333, -1022.804465  ,
          -548.32603409,   557.49090887,   341.31942056,   -94.56113089,
           116.09194462,  -127.16916386,    -3.6838876 ,  -102.74004255,
           -72.01973881,  -128.04854328,    75.95904084,   357.70233527,
           697.51584691,   443.85650952, -1141.37235643]]))

## Etapa de avaliação do desempenho

In [57]:
from sklearn.metrics import mean_squared_error

In [58]:
# `predict` only produces predictions from the already-trained model.
y_teste_previsto = lin_reg.predict(X_teste)

# Root mean squared error between the true and predicted test targets.
rmse = np.sqrt(
    mean_squared_error(y_true=y_teste, y_pred=y_teste_previsto)
)
rmse

1368.2854784725805

## Realizando previsões

In [59]:
# Show the first three rows of the training features.
df_treino.iloc[:3]

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered
682,683,2012-11-13,4,1,11,0,2,1,2,0.343333,0.323225,0.662917,0.342046,327,3767
250,251,2011-09-08,3,0,9,0,4,1,3,0.633913,0.555361,0.939565,0.192748,153,1689
336,337,2011-12-03,4,0,12,0,6,0,1,0.299167,0.310604,0.612917,0.095783,706,2908


In [60]:
# New dataset containing only the data for tomorrow (a single row).
# Built from one column -> value record so each value sits next to its column name.
amanha = pd.DataFrame([{
    'instant': 732,
    'dteday': '2021-05-11',
    'season': 2,
    'yr': 3,
    'mnth': 5,
    'holiday': 0,
    'weekday': 2,
    'workingday': 1,
    'weathersit': 1,
    'temp': 0.85,
    'atemp': 0.80,
    'hum': 0.90,
    'windspeed': 0.1,
    'casual': 0,
    'registered': 0,
}])
amanha

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered
0,732,2021-05-11,2,3,5,0,2,1,1,0.85,0.8,0.9,0.1,0,0


In [61]:
# Apply the already-fitted preprocessing to the new row (transform only, no refit).
X_amanha = preproc_completo.transform(amanha)
X_amanha

array([[ 1.92852492,  1.89248135, -1.17245256,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ]])

In [62]:
# `predict` only makes predictions, based on the training done earlier with `fit`.
y_predict_amanha = lin_reg.predict(X_amanha)
y_predict_amanha

array([[6939.37623358]])

In [63]:
# Report the single predicted value, rounded to the nearest whole number.
print(
    "Alugueis de bicicletas previstos para amanhã: ",
    np.round(y_predict_amanha[0, 0]),
)

Alugueis de bicicletas previstos para amanhã:  6939.0
