<a href="https://colab.research.google.com/github/flaviohds/analise_superstore/blob/main/superstore_ML_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
################################################################################
#
#  Training of the Machine Learning model for time-series forecasting.
#  Dataset: https://www.kaggle.com/datasets/rohitsahoo/sales-forecasting
#  Project GitHub: https://github.com/flaviohds/analise_superstore
#  This is a training-only notebook. The study of the dataset, including the
#  exploratory analysis and the search for the hyperparameters used here, can
#  be found in the notebooks at https://github.com/flaviohds/MVP_Vendas_Analise
#  and in part 1 of https://github.com/flaviohds/MVP_Machine_Learning
#
################################################################################

import pandas as pd
from numpy import pi, cos, sin
from joblib import dump
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.pipeline import Pipeline

# Download the raw Superstore sales CSV from the project repository, using the
# file's first column as the DataFrame index.
url = 'https://raw.githubusercontent.com/flaviohds/analise_superstore/refs/heads/main/superstore_sales.csv'
df_raw = pd.read_csv(filepath_or_buffer=url, index_col=0)

# Preview the first rows of the raw table
df_raw.head()

Unnamed: 0_level_0,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales
Row ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,CA-2017-152156,08/11/2017,11/11/2017,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96
2,CA-2017-152156,08/11/2017,11/11/2017,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94
3,CA-2017-138688,12/06/2017,16/06/2017,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036.0,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62
4,US-2016-108966,11/10/2016,18/10/2016,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311.0,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775
5,US-2016-108966,11/10/2016,18/10/2016,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311.0,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368


In [2]:
# Parse the order dates with an explicit day/month/year format (the layout
# shown in the raw data preview, e.g. 08/11/2017). `dayfirst=True` is only a
# parsing preference, whereas an explicit format fails loudly on any value
# that does not match. The parsed dates are kept in their own variable, leaving
# df_raw untouched, so this cell can be re-run without reloading the data.
order_dates = pd.to_datetime(df_raw['Order Date'], format='%d/%m/%Y')

# Total sales per order date, reindexed to a continuous daily calendar: days
# without any order become explicit zero-sales rows, so that consecutive rows
# are always consecutive days (required for the lag features below).
daily_sales = (
    df_raw['Sales']
    .groupby(order_dates)
    .sum()
    .asfreq('D', fill_value=0)
)

# Calendar features derived from each day's date. The weekday (0-6) is encoded
# as sine/cosine of its angle on a 7-step circle instead of as a raw integer.
dates = daily_sales.index
weekday_angle = dates.weekday.to_numpy() * (2 * pi / 7)

# NOTE: the column order defines the feature order the model is trained on.
df_by_date = pd.DataFrame({
    'soma das vendas': daily_sales.to_numpy(),
    'Month': dates.month.to_numpy(),
    'Year': dates.year.to_numpy(),
    'Day': dates.day.to_numpy(),
    'day_of_week_sin': sin(weekday_angle),
    'day_of_week_cos': cos(weekday_angle),
})

# Number of previous days whose sales totals are used as lag features
N_LAGS = 7

# Column 'vendas_t-k' holds the sales total of the row k positions (days) earlier
for lag in range(1, N_LAGS + 1):
    df_by_date[f'vendas_t-{lag}'] = df_by_date['soma das vendas'].shift(lag)

# Drop the leading rows whose lag columns are still NaN (incomplete history)
df_by_date = df_by_date.dropna(axis='index', how='any')

df_by_date.head(8)

Unnamed: 0,soma das vendas,Month,Year,Day,day_of_week_sin,day_of_week_cos,vendas_t-1,vendas_t-2,vendas_t-3,vendas_t-4,vendas_t-5,vendas_t-6,vendas_t-7
7,54.83,1,2015,10,-0.974928,-0.222521,40.544,0.0,87.158,4407.1,19.536,288.06,16.448
8,9.94,1,2015,11,-0.781831,0.62349,54.83,40.544,0.0,87.158,4407.1,19.536,288.06
9,0.0,1,2015,12,0.0,1.0,9.94,54.83,40.544,0.0,87.158,4407.1,19.536
10,3553.795,1,2015,13,0.781831,0.62349,0.0,9.94,54.83,40.544,0.0,87.158,4407.1
11,61.96,1,2015,14,0.974928,-0.222521,3553.795,0.0,9.94,54.83,40.544,0.0,87.158
12,149.95,1,2015,15,0.433884,-0.900969,61.96,3553.795,0.0,9.94,54.83,40.544,0.0
13,299.964,1,2015,16,-0.433884,-0.900969,149.95,61.96,3553.795,0.0,9.94,54.83,40.544
14,0.0,1,2015,17,-0.974928,-0.222521,299.964,149.95,61.96,3553.795,0.0,9.94,54.83


In [3]:
# Fixed seed for the random forest, which is the estimator here that draws
# random numbers during fitting. Without it every run of the notebook would
# train (and later save) a slightly different model.
RANDOM_STATE = 42

# Split the training table into the target (daily sales total) and the feature
# matrix (every other column). Plain NumPy arrays are passed to the model, so
# the fitted pipeline depends on the column order, not on column names.
y = df_by_date['soma das vendas'].to_numpy()
x = df_by_date.drop(columns='soma das vendas').to_numpy()

# Base regressors with the chosen hyperparameters
model1 = KNeighborsRegressor(metric='manhattan', n_neighbors=28,
                             weights='distance')
model2 = ElasticNet(alpha=0.07, l1_ratio=0.8)
model3 = RandomForestRegressor(max_depth=10, max_features='sqrt',
                               min_samples_leaf=6, n_estimators=55,
                               min_samples_split=2,
                               random_state=RANDOM_STATE)

base_models = [('KNN', model1), ('EN', model2), ('RF', model3)]

# Weighted-average ensemble of the three regressors (weights follow the order
# of base_models), preceded by min-max scaling of the features. Keeping the
# scaler inside the pipeline means it is fitted and saved together with the
# model.
voting = ('Voting', VotingRegressor(base_models, weights=[0.2, 0.2, 0.6]))
min_max_scaler = ('MinMaxScaler', MinMaxScaler())
ensemble = Pipeline([min_max_scaler, voting])

# Fit the whole pipeline on all available rows
ensemble.fit(x, y)

In [4]:
# Serialize the fitted pipeline (scaler + voting ensemble) to a file in the
# current working directory.
dump(value=ensemble, filename='superstore_model.pkl')

['superstore_model.pkl']