In [1]:
import pandas as pd
import inflection
import math
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import datetime

# Carregando os dados
df_sales_raw = pd.read_csv('train.csv', low_memory=False)
df_store_raw = pd.read_csv('store.csv', low_memory=False)

# Merge dos dataframes
df_raw = pd.merge(df_sales_raw, df_store_raw, how='left', on='Store')
df1 = df_raw.copy()

# Renomeando colunas para snake_case
cols_old = ['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
            'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
            'CompetitionDistance', 'CompetitionOpenSinceMonth',
            'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
            'Promo2SinceYear', 'PromoInterval']
snakecase = lambda x: inflection.underscore(x)
cols_new = list(map(snakecase, cols_old))
df1.columns = cols_new

# Conversão de tipos e tratamento de valores nulos
df1['date'] = pd.to_datetime(df1['date'])
df1['competition_distance'] = df1['competition_distance'].fillna(200000.0)
df1['competition_open_since_month'] = df1['competition_open_since_month'].fillna(df1['date'].dt.month)
df1['competition_open_since_year'] = df1['competition_open_since_year'].fillna(df1['date'].dt.year)
df1['promo2_since_week'] = df1['promo2_since_week'].fillna(df1['date'].dt.isocalendar().week)
df1['promo2_since_year'] = df1['promo2_since_year'].fillna(df1['date'].dt.year)

# Mapeamento de meses
month_map = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
df1['month_map'] = df1['date'].dt.month.map(month_map)

# Tratamento de promoção
df1['promo_interval'] = df1['promo_interval'].fillna(0)
df1['is_promo'] = df1.apply(lambda x: 0 if x['promo_interval'] == 0 else 1 if x['month_map'] in x['promo_interval'].split(',') else 0, axis=1)

# Conversão de tipos
df1['competition_open_since_month'] = df1['competition_open_since_month'].astype(int)
df1['competition_open_since_year'] = df1['competition_open_since_year'].astype(int)
df1['promo2_since_week'] = df1['promo2_since_week'].astype(int)
df1['promo2_since_year'] = df1['promo2_since_year'].astype(int)

# Criação de novas features relacionadas a datas
df2 = df1.copy()
df2['year'] = df2['date'].dt.year
df2['week_of_year'] = df2['date'].dt.isocalendar().week
df2['day'] = df2['date'].dt.day
df2['year_week'] = df2['date'].dt.strftime('%Y-%W')
df2['competition_since'] = pd.to_datetime(df2['competition_open_since_year'].astype(str) + '-' + df2['competition_open_since_month'].astype(str) + '-01')
df2['competition_time_month'] = ((df2['date'] - df2['competition_since']) / 30).dt.days.astype(int)
df2['promo_since'] = pd.to_datetime(df2['promo2_since_year'].astype(str) + '-' + df2['promo2_since_week'].astype(str) + '-1', format='%Y-%W-%w') - pd.DateOffset(weeks=1)
df2['promo_time_week'] = ((df2['date'] - df2['promo_since']) / 7).dt.days.astype(int)

# Tratamento de variáveis categóricas
df2['assortment'] = df2['assortment'].map({'a': 'basic', 'b': 'extra', 'c': 'extended'})
df2['state_holiday'] = df2['state_holiday'].map({'a': 'public_holiday', 'b': 'easter_holiday', 'c': 'christmas'})
df2['month'] = df2['date'].dt.month  # Adicionando a coluna 'month'
df3 = df2.copy()  # Copiando o DataFrame df2 para df3

# Remoção de colunas e tratamento de valores nulos e zeros
df3 = df2[(df2['open'] != 0) & (df2['sales'] > 0)]
cols_drop = ['customers', 'open', 'promo_interval', 'month_map']
df3 = df3.drop(columns=cols_drop)

# Escalonamento de features numéricas
from sklearn.preprocessing import RobustScaler, MinMaxScaler
rs = RobustScaler()
df3['competition_distance'] = rs.fit_transform(df3[['competition_distance']])
df3['competition_time_month'] = rs.fit_transform(df3[['competition_time_month']])
mms = MinMaxScaler()
df3['promo_time_week'] = mms.fit_transform(df3[['promo_time_week']])
mms1 = MinMaxScaler()
df3['year'] = mms1.fit_transform(df3[['year']])

# Codificação de variáveis categóricas
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Codificação de variáveis categóricas
df3 = pd.get_dummies(df3, columns=['state_holiday'], dtype=int)
df3['store_type'] = LabelEncoder().fit_transform(df3['store_type'])
assortment_dict = {'basic': 1, 'extra': 2, 'extended': 3}
df3['assortment'] = df3['assortment'].map(assortment_dict)

# Transformação logarítmica na variável alvo
df3['sales'] = np.log1p(df3['sales'])

# Criação de variáveis trigonométricas
df3['month_sin'] = np.sin(2 * np.pi * df3['month'] / 12)
df3['month_cos'] = np.cos(2 * np.pi * df3['month'] / 12)
df3['day_sin'] = np.sin(2 * np.pi * df3['day'] / 30)
df3['day_cos'] = np.cos(2 * np.pi * df3['day'] / 30)
df3['week_of_year_sin'] = np.sin(2 * np.pi * df3['week_of_year'] / 52)
df3['week_of_year_cos'] = np.cos(2 * np.pi * df3['week_of_year'] / 52)
df3['day_of_week_sin'] = np.sin(2 * np.pi * df3['day_of_week'] / 7)
df3['day_of_week_cos'] = np.cos(2 * np.pi * df3['day_of_week'] / 7)



Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


KeyboardInterrupt: 

In [6]:
df6 = df3.copy()

In [7]:
cols_drop = ['week_of_year','day','month','day_of_week','promo_since','competition_since','year_week']
df6 = df6.drop(cols_drop,axis=1)

In [8]:
aux1 = df6.loc[:,['store','date']].groupby('store').min()
aux1

Unnamed: 0_level_0,date
store,Unnamed: 1_level_1
1,2013-01-02
2,2013-01-02
3,2013-01-02
4,2013-01-02
5,2013-01-02
...,...
1111,2013-01-02
1112,2013-01-02
1113,2013-01-02
1114,2013-01-02


In [9]:
aux1 = df6.loc[:,['store','date']].groupby('store').max()
aux1

Unnamed: 0_level_0,date
store,Unnamed: 1_level_1
1,2015-07-31
2,2015-07-31
3,2015-07-31
4,2015-07-31
5,2015-07-31
...,...
1111,2015-07-31
1112,2015-07-31
1113,2015-07-31
1114,2015-07-31


In [10]:
aux1 = df6.loc[:,['store','date']].groupby('store').max().reset_index()['date'][0]
aux1

Timestamp('2015-07-31 00:00:00')

In [11]:
aux1 = df6.loc[:,['store','date']].groupby('store').max().reset_index()['date'][0]-datetime.timedelta(days=6*7)
aux1

Timestamp('2015-06-19 00:00:00')

In [12]:
# Selecionar linhas anteriores a '2015-06-19' para treinamento
X_train = df6[df6['date'] < '2015-06-19']
y_train = X_train['sales']

# Selecionar linhas a partir de '2015-06-19' para teste
X_test = df6[df6['date'] >= '2015-06-19']
y_test = X_test['sales']

print('training min date {}'.format(X_train['date'].min()))
print('training max date {}'.format(X_train['date'].max()))

print('\ntest min date {}'.format(X_test['date'].min()))
print('test max date {}'.format(X_test['date'].max()))

# ultimas 6 semanas para test
#treinar com o passado

training min date 2013-01-01 00:00:00
training max date 2015-06-18 00:00:00

test min date 2015-06-19 00:00:00
test max date 2015-07-31 00:00:00


In [13]:
!pip install boruta



In [14]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

In [15]:
X_train_n = X_train.drop(['date', 'sales'], axis=1).values
#ravel trasnformar em vetor e nao em dataframe
y_train_n = y_train.values.ravel()

In [None]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

# Substitua esta parte pelos seus próprios dados de treino e teste
X_train_n = X_train.drop(['date', 'sales'], axis=1).values
y_train_n = y_train.values.ravel()

# Criar um classificador RandomForest simples com menos estimadores
rf = RandomForestClassifier(n_estimators=10, max_depth=10, random_state=42)

#passando o random forest para escolher as features mais importntes , passando auto para ele escolher como vai fazer
#verbose para plotar , n precisa  
#randomforest para fazer o processo varias vezes
#fit , n pode ser data frame tem que ser numpy
# Passar o RandomForestClassifier para o BorutaPy
boruta = BorutaPy(rf, n_estimators=10, random_state=20)

# Treinar o BorutaPy
boruta.fit(X_train_n, y_train_n)

In [None]:
#vai fazer o rank de elevaçã ds features
cols_selected = boruta.support_.tolist()
#ver qual a melhor feature
X_train_fs = X_train.drop( ['date', 'sales'], axis=1 )
#.colums pega só coluna
#fazer nova coluna pq x_train virou um vetor por conta do boruta
cols_selected_boruta = X_train_fs.iloc[:, cols_selected].columns.to_list()#ver oq ele nao selecionou(ver a diferença )
cols_not_selected_boruta = list( np.setdiff1d( X_train_fs.columns,cols_selected_boruta ) )

In [None]:
#boruta n ta dando muito certo , mas eles selecionaria essas colunas
# mas ao invexz de manual sozinho q é aas colunas que é importante para o modelo
cols_selected_boruta = df6[['store', 'promo', 'store_type', 'assortment', 'competition_distance', 'competition_open_since_month', 'competition_open_since_year', 'promo2', 'promo2_since_week', 'promo2_since_year', 'competition_time_month', 'promo_time_week', 'day_of_week_sin', 'day_of_week_cos', 'month_cos', 'day_sin', 'day_cos', 'week_of_year_cos']]
list(cols_selected_boruta)

In [None]:
cols_selected_borutaa = df6[['store','promo','store_type','assortment','competition_distance','competition_open_since_month','competition_open_since_year','promo2','promo2_since_week','promo2_since_year','competition_time_month','promo_time_week','day_of_week_sin','day_of_week_cos','month_cos','month_sin','day_sin','day_cos','week_of_year_cos','week_of_year_sin']]
cols_selected_borutaa

#collumns to ad
#adicionar essas duas colunas que eu tinha tirado 
cols_selected_boruta = cols_selected_borutaa.columns.tolist()

# Adicionar as colunas 'date' e 'sales' à lista
cols_selected_boruta.extend(['date', 'sales'])

#final features
cols_selected_boruta.extend(feat_to_add)
cols_selected_boruta

In [None]:
cols_not_selected_boruta = df6[['is_promo', 'month_sin', 'school_holiday', 'state_holiday_christmas', 'state_holiday_easter_holiday', 'state_holiday_public_holiday', 'state_holiday_regular_day', 'week_of_year_sin', 'year']]
list(cols_not_selected_boruta)

In [23]:
df6

Unnamed: 0,store,date,sales,promo,school_holiday,store_type,assortment,competition_distance,competition_open_since_month,competition_open_since_year,...,state_holiday_easter_holiday,state_holiday_public_holiday,month_sin,month_cos,day_sin,day_cos,week_of_year_sin,week_of_year_cos,day_of_week_sin,day_of_week_cos
0,1,2015-07-31,8.568646,1,1,2,1,-0.170968,9,2008,...,0,0,-0.5,-0.866025,0.207912,0.978148,-0.568065,-0.822984,-0.974928,-0.222521
1,2,2015-07-31,8.710290,1,1,0,1,-0.283871,11,2007,...,0,0,-0.5,-0.866025,0.207912,0.978148,-0.568065,-0.822984,-0.974928,-0.222521
2,3,2015-07-31,9.025816,1,1,0,1,1.903226,12,2006,...,0,0,-0.5,-0.866025,0.207912,0.978148,-0.568065,-0.822984,-0.974928,-0.222521
3,4,2015-07-31,9.546527,1,1,2,3,-0.275806,9,2009,...,0,0,-0.5,-0.866025,0.207912,0.978148,-0.568065,-0.822984,-0.974928,-0.222521
4,5,2015-07-31,8.481151,1,1,0,1,4.448387,4,2015,...,0,0,-0.5,-0.866025,0.207912,0.978148,-0.568065,-0.822984,-0.974928,-0.222521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1016776,682,2013-01-01,8.124447,0,1,1,1,-0.351613,9,2006,...,0,1,0.5,0.866025,0.207912,0.978148,0.120537,0.992709,0.974928,-0.222521
1016827,733,2013-01-01,9.284148,0,1,1,2,-0.237097,10,1999,...,0,1,0.5,0.866025,0.207912,0.978148,0.120537,0.992709,0.974928,-0.222521
1016863,769,2013-01-01,8.524367,0,1,1,2,-0.240323,1,2013,...,0,1,0.5,0.866025,0.207912,0.978148,0.120537,0.992709,0.974928,-0.222521
1017042,948,2013-01-01,8.410053,0,1,1,2,-0.145161,1,2013,...,0,1,0.5,0.866025,0.207912,0.978148,0.120537,0.992709,0.974928,-0.222521


# ML

In [None]:
# classificação e regressão e series temporais 
#deep leraning é classificação
quando quer fazer previsão isso se chama predição

In [2]:
# nao supervisionado - Clusterização
# agrupamento/clusterização
!pip install pandas



In [None]:
# semi-supervisionado
#ação     > <       ambiente

#ação faz ação e recebe uma recompensa
#e depois de um tempo quando ele só recebe recompensa positiva ele só faz aquela ação
EX: igual capa de serie de netflix
quando vc clica é a recompensa e ai acaba que vai aprendendo em qual vc mais clicka é a melhor


# machine learning moddeling

In [24]:
#selecionando as colunas mais relevantes para xtrain e xtest

x_train = X_train[cols_selected_boruta_full]
x_test = X_test[cols_selected_boruta_full]

## 1 Modelo de média

In [None]:
para ter uma base doq é bom ou ruim , se eu fazer um ML e for pior que a média o algortmo nao está bom


In [35]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error


In [32]:
def ml_error(model_name,y,y_hat):
    mae = mean_absolute_error(y,y_hat)
    mape = mean_absolute_percentage_error(y,y_hat)
    rmse = np.sqrt(mean_squared_error(y,y_hat))
    return pd.DataFrame({'Model Name' : model_name,
                         'MAE' : mae,
                         'MAPE' : mape,
                         'rmse' : rmse }, index=[0])
    #index 0 n sei pq

In [33]:
aux1 = x_test.copy()
aux1['sales'] = y_test.copy()
#agora temos todas as predicoes medias por média por loja agora anexar no conjunto de dados original
aux2 = aux1[['store', 'sales']].groupby( 'store' ).mean().reset_index().rename(columns={'sales': 'predictions'} )
aux1 = pd.merge( aux1, aux2, how='left', on='store' )
#predição 
yhat_baseline = aux1['predictions']
#performace
#usamos logaritmo para modularizar a variavel(colocar na variavel certa)
#usamos log na variavel resposta sales(exponencial para voltar na variavel certa)(expm1)
baseline_result = ml_error( 'Average Model', np.expm1( y_test ), np.expm1(yhat_baseline ) )
baseline_result

Unnamed: 0,Model Name,MAE,MAPE,rmse
0,Average Model,1354.800353,0.2064,1835.135542


In [34]:
df6.columns

Index(['store', 'date', 'sales', 'promo', 'school_holiday', 'store_type',
       'assortment', 'competition_distance', 'competition_open_since_month',
       'competition_open_since_year', 'promo2', 'promo2_since_week',
       'promo2_since_year', 'is_promo', 'year', 'competition_time_month',
       'promo_time_week', 'state_holiday_christmas',
       'state_holiday_easter_holiday', 'state_holiday_public_holiday',
       'month_sin', 'month_cos', 'day_sin', 'day_cos', 'week_of_year_sin',
       'week_of_year_cos', 'day_of_week_sin', 'day_of_week_cos'],
      dtype='object')

# 2 - regressão linear

In [36]:
X_train = X_train.drop(columns=['date'])
X_test = X_test.drop(columns=['date'])
from sklearn.linear_model import LinearRegression


#model 
lr = LinearRegression().fit(x_train, y_train)


#prediction
y_hat_lr = lr.predict(x_test)

#performace 
lr_result = ml_error('Linear Regressior', np.expm1(y_test) , np.expm1(y_hat_lr))
lr_result

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- date
Feature names seen at fit time, yet now missing:
- is_promo
- school_holiday
- state_holiday_christmas
- state_holiday_easter_holiday
- state_holiday_public_holiday
- ...


# 3 - regressão linear regularização - Lasso

In [37]:
from sklearn.linear_model import LinearRegression,Lasso


#model 
lrr = Lasso(alpha = 0.01).fit(x_train, y_train)


#prediction
y_hat_lrr = lrr.predict(x_test)

#performace 
lrr_result = ml_error('Lasso Regressior', np.expm1(y_test) , np.expm1(y_hat_lrr))
lrr_result

{'Model': 'Lasso Regressior',
 'MAE': 1890.5097178841665,
 'MAPE': 0.28978990131613386,
 'RMSE': 2739.365070708826}

In [None]:
# notase que os modelos lieares tiveram performace pior doq a média entao isso quer dizer que nosso modelo
# nao é linear ele nao é simples ele é mais complexos , entao os proximos passo é testar modelos de regressao nao lineares


# 4 - random forest regressor

In [26]:
from sklearn.ensemble import RandomForestRegressor
# model 
#esti = quantas arvores aleatoria vai criar, criar em paralelo , random = origem dos numero aleatorio principamente na hora de escolher as features
#crio uma mesma origem aleatoria
rf = RandomForestRegressor(n_estimators=37,n_jobs=-1,random_state=42).fit(x_train,y_train)


#predict
yhat_rf = rf.predict(x_test)

#performace
rf_result = ml_error('Random Forest Regressor',np.expm1(y_test),np.expm1(yhat_rf))
rf_result

Unnamed: 0,Model Name,MAE,MAPE,rmse
0,Random Forest Regressor,688.51303,0.101249,1025.568281


In [7]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:02[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.3


# 5 - XGBoost Regressor

In [23]:
import xgboost as xgb
# model 
#obj usar no problema de regressao
#eta quantas passagens usa pra fzer o aprendizado
#max maxima profundida da arvore
# sub quantas amostra ou % das variaveis que quero pega
# col varias arvores e quero pega varias arvores
#
#
model_xgb = xgb.XGBRegressor(objective='reg:squarederror',
                             n_estimators=100,
                             eta=0.01,
                             max_depth=10,
                             subsample=0.7,
                             colsample_bytee=0.9,
                                ).fit(x_train,y_train)


#predict
yhat_xgb = model_xgb.predict(x_test)

#performace
xgb_result = ml_error('XGBoost Regressor',np.expm1(y_test),np.expm1(yhat_xgb))
xgb_result

Parameters: { "colsample_bytee" } are not used.



Unnamed: 0,Model Name,MAE,MAPE,rmse
0,XGBoost Regressor,1695.767243,0.251781,2478.381381


# comparar models performace

In [24]:
modelling_result = pd.concat([baseline_result,lr_result,lrr_result,rf_result,xgb_result])

In [25]:
modelling_result.sort_values('rmse')

Unnamed: 0,Model Name,MAE,MAPE,rmse
0,Random Forest Regressor,693.652884,0.101926,1031.71697
0,Average Model,1354.800353,0.2064,1835.135542
0,XGBoost Regressor,1695.767243,0.251781,2478.381381
0,Linear Regressior,1867.089774,0.292694,2671.049215
0,Lasso Regressior,1891.704881,0.289106,2744.451737


In [None]:
# os modelos nao lineares estao com performace melhro que media
# esse erro nao pode usar pq o modelo foi treinado em vendas nas ultimas semanas
#tem que fazer a separação do treino e teste
#vamo aprender cross validatition

# Croos Validatition

In [None]:
# regresssao e clasificação 

pegar varias fatias do dataset para usar como treino e teste
cada vez que faz o cross validatition pega uma parte diferente do data set

cada vez que faz o cross validatition tem um erro(1,2,3)

In [None]:
# time series

a diferença é que nao pode fazer a separação aleatoria mas sim
respeitar o periodo  de tempo 

pegar do inicio até certo tempo depois 
pega inicio+certo tempo até outro tempo
pega inici+certotempo+outrotempo ate o final e etc


nosso algortimo é time series

# implementando cross validatitiomn time series

In [42]:
# Verificar se as colunas 'date' e 'sales' estão presentes em x_train
if 'date' not in x_train.columns or 'sales' not in x_train.columns:
    # Se não estiverem presentes, adicione-as ao DataFrame x_train
    x_train['date'] = df6['date']
    x_train['sales'] = df6['sales']

# Selecionar as colunas desejadas do DataFrame original
x_training = x_train[cols_selected_boruta_full]

# Verificar as primeiras linhas do novo DataFrame
x_training.head()

Unnamed: 0,store,promo,store_type,assortment,competition_distance,competition_open_since_month,competition_open_since_year,promo2,promo2_since_week,promo2_since_year,...,day_sin,day_cos,week_of_year_cos,week_of_year_sin,date,sales,date.1,sales.1,date.2,sales.2
47945,1,1,2,1,-0.170968,9,2008,0,25,2015,...,-0.587785,-0.809017,-0.992709,0.120537,2015-06-18,8.443762,2015-06-18,8.443762,2015-06-18,8.443762
47946,2,1,0,1,-0.283871,11,2007,1,13,2010,...,-0.587785,-0.809017,-0.992709,0.120537,2015-06-18,8.547722,2015-06-18,8.547722,2015-06-18,8.547722
47947,3,1,0,1,1.903226,12,2006,1,14,2011,...,-0.587785,-0.809017,-0.992709,0.120537,2015-06-18,8.927712,2015-06-18,8.927712,2015-06-18,8.927712
47948,4,1,2,3,-0.275806,9,2009,0,25,2015,...,-0.587785,-0.809017,-0.992709,0.120537,2015-06-18,9.091669,2015-06-18,9.091669,2015-06-18,9.091669
47949,5,1,0,1,4.448387,4,2015,0,25,2015,...,-0.587785,-0.809017,-0.992709,0.120537,2015-06-18,8.50208,2015-06-18,8.50208,2015-06-18,8.50208


In [42]:
#fazer primeira parte que é pegar os dados do começo
#.timedelta fez eu pegar as primeiras 7 semanas
def cross_validation( x_training, kfold, model_name, model, verbose=False ):
    mae_list = []
    mape_list = []
    rmse_list = []
    for k in reversed( range( 1, kfold+1 ) ):
        if verbose:
            print( '\nKFold Number: {}'.format( k ) )
            # start and end date for validation
            validation_start_date = x_training['date'].max() - datetime.timedelta(days=k*6*7)
            validation_end_date = x_training['date'].max() - datetime.timedelta(days=(k-1)*6*7)
            # filtering dataset
            training = x_training[x_training['date'] < validation_start_date]
            validation = x_training[(x_training['date'] >= validation_start_date) &(x_training['date'] <= validation_end_date)]
            # training and validation dataset
            # training
            xtraining = training.drop( ['date', 'sales'], axis=1 )
            ytraining = training['sales']
            # validation
            xvalidation = validation.drop( ['date', 'sales'], axis=1 )
            yvalidation = validation['sales']
            # model
            m = model.fit( xtraining, ytraining )
            # prediction
            yhat = m.predict( xvalidation )
            # performance
            m_result = ml_error( model_name, np.expm1( yvalidation ), np.expm1(yhat ) )
            # store performance of each kfold iteration
            mae_list.append( m_result['MAE'] )
            mape_list.append( m_result['MAPE'] )
            rmse_list.append( m_result['RMSE'] )
            return pd.DataFrame( {'Model Name': model_name,
            'MAE CV': np.round( np.mean( mae_list ), 2 ).astype(str ) + ' +/- ' + np.round( np.std( mae_list ), 2 ).astype( str ),
            'MAPE CV': np.round( np.mean( mape_list ), 2 ).astype( str ) + ' +/- ' + np.round( np.std( mape_list ), 2 ).astype( str ),
            'RMSE CV': np.round( np.mean( rmse_list ), 2 ).astype( str ) + ' +/- ' + np.round( np.std( rmse_list ), 2 ).astype( str )}, index=[0] )


In [None]:
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
import pandas as pd
import datetime

def cross_validation(x_training, kfold, model_name, model, verbose=False):
    mae_list = []
    mape_list = []
    rmse_list = []
    
    tscv = TimeSeriesSplit(n_splits=kfold)
    for train_index, test_index in tscv.split(x_training):
        if verbose:
            print('\nKFold Number: {}'.format(kfold))
            # Training and validation dataset
            # Training
            xtraining = x_training.iloc[train_index].drop(['date', 'sales'], axis=1)
            ytraining = x_training.iloc[train_index]['sales']
            # Validation
            xvalidation = x_training.iloc[test_index].drop(['date', 'sales'], axis=1)
            yvalidation = x_training.iloc[test_index]['sales']
            # Model
            m = model.fit(xtraining, ytraining)
            # Prediction
            yhat = m.predict(xvalidation)
            # Performance
            m_result = ml_error(model_name, np.expm1(yvalidation), np.expm1(yhat))
            # Store performance of each kfold iteration
            mae_list.append(m_result['MAE'])
            mape_list.append(m_result['MAPE'])
            rmse_list.append(m_result['RMSE'])

    return pd.DataFrame({'Model Name': model_name,
                         'MAE CV': np.round(np.mean(mae_list), 2).astype(str) + ' +/- ' + np.round(np.std(mae_list), 2).astype(str),
                         'MAPE CV': np.round(np.mean(mape_list), 2).astype(str) + ' +/- ' + np.round(np.std(mape_list), 2).astype(str),
                         'RMSE CV': np.round(np.mean(rmse_list), 2).astype(str) + ' +/- ' + np.round(np.std(rmse_list), 2).astype(str)}, index=[0])


In [31]:
x_training.columns


Index(['store', 'promo', 'store_type', 'assortment', 'competition_distance',
       'competition_open_since_month', 'competition_open_since_year', 'promo2',
       'promo2_since_week', 'promo2_since_year', 'competition_time_month',
       'promo_time_week', 'day_of_week_sin', 'day_of_week_cos', 'month_sin',
       'month_cos', 'day_sin', 'day_cos', 'week_of_year_sin',
       'week_of_year_cos', 'date', 'sales'],
      dtype='object')

In [None]:
# Selecionar as colunas desejadas do DataFrame original
x_training = ['store', 'promo', 'store_type', 'assortment', 'competition_distance',
       'competition_open_since_month', 'competition_open_since_year', 'promo2',
       'promo2_since_week', 'promo2_since_year', 'competition_time_month',
       'promo_time_week', 'day_of_week_sin', 'day_of_week_cos', 'month_sin',
       'month_cos', 'day_sin', 'day_cos', 'week_of_year_sin',
       'week_of_year_cos', 'date', 'sales']
fazer separação de traing 
e validaçãlo para fazer as separações
xtraining n pode ter data e sales
y train é o sales
xvalidation tbm nao pode ter data sales
yvalidation  é sales
ytraining é só o sales

fazer a divisao tbm do xval e yval 
 mostrar mae mape e rmse 
chamar uma funcao chamada ml_error

# Cross validatition nos modelos de ML

In [43]:
#def cross_validation(x_training, kfold, model_name, model, verbose=False):



lr_result_cv = cross_validation(x_training, 5 , 'Linear Regressor', lr,verbose=False)
lr_result_cv

In [None]:
#def cross_validation(x_training, kfold, model_name, model, verbose=False):



lrr_result_cv = cross_validation(x_training, 5 , 'Lasso Regressor', lrr,verbose=False)



In [None]:
#def cross_validation(x_training, kfold, model_name, model, verbose=False):

lrr_result_cv = cross_validation(x_training, 5 , 'Lasso Regressor', lrr,verbose=False)





In [None]:
#def cross_validation(x_training, kfold, model_name, model, verbose=False):


rf_result_cv = cross_validation(x_training, 5 , 'Random Forest Regressor', rf,verbose=False)




In [None]:
#def cross_validation(x_training, kfold, model_name, model, verbose=False):


xgb_result_cv = cross_validation(x_training, 5 , 'XGB', model_xgb,verbose=False)


In [None]:
# real Result - Cross Validatition


modelling_result = pd.concat([baseline_result,lr_result_cv,lrr_result_cv,rf_result_cv,xgb_result_cv])
modelling_result.sort_values('rmse')

In [None]:
# usar o random FOrest

#mas vamos continuar com XGB para treinar 