In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from unicodedata import normalize
from datetime import datetime, timedelta
import time
from workalendar.america import Brazil

from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

---

# MODELLING

In [2]:
def erro(y_true, y_pred):
    ind_consider = np.sum(y_true, axis='columns') >= 10
    error = ((np.abs(np.sum(y_true, axis='columns') - np.sum(y_pred, axis='columns')))/(np.sum(y_true, axis='columns')))[ind_consider]

    error = np.sum(error)
    return error

---

In [3]:
df = pd.read_csv('model data//model_data.csv', parse_dates=['mes_referencia'])
df = df.set_index(['mes_referencia', 'id'])

In [4]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,TPV_mensal,work_days,TPV_mensal/work_day,1,2,3,4,5,6,7,...,RJ,RN,RO,RR,RS,SC,SE,SP,TO,covid
mes_referencia,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-12-31,1,10107.90,20,505.395000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019-01-31,1,6023.15,22,273.779545,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019-02-28,1,4347.30,20,217.365000,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019-03-31,1,9769.75,21,465.226190,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019-04-30,1,6701.70,22,304.622727,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-07-31,206329,60213.99,23,2617.999565,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2020-04-30,206330,104.50,21,4.976190,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2020-05-31,206330,18335.62,20,916.781000,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2020-06-30,206330,15098.04,22,686.274545,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3066772 entries, (Timestamp('2018-12-31 00:00:00'), 1) to (Timestamp('2020-07-31 00:00:00'), 206330)
Columns: 156 entries, TPV_mensal to covid
dtypes: float64(4), int64(152)
memory usage: 3.6 GB


In [6]:
print('columns: ', list(df.columns))

columns:  ['TPV_mensal', 'work_days', 'TPV_mensal/work_day', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '2017', '2018', '2019', '2020', 'TPVEstimate normalized', 'TPVEstimate sqrt', 'Academias e Clubes', 'Acessórios, Bolsas e Bijuterias', 'Alimentação Rápida', 'Anúncios', 'Armarinhos e Tecido', 'Artigos Esportivos', 'Artigos Religiosos e Antiguidades', 'Artigos de Decoração', 'Associação', 'Autopeças e Acessórios', 'Autopeças e Serviços Automotivos - Outros', 'Bares e Restaurantes', 'Calçados', 'Cama, Mesa e Banho', 'Casa e Decoração - Outros', 'Cias Aéreas', 'Clinicas de Estética e Massagem', 'Comércio de Alimentos', 'Comércio de Bebidas', 'Comércio de Veículos', 'Conserto de Produtos e Reparos de Peças', 'Consultorias', 'Cosméticos e Perfumaria', 'Dedetização e Desinfecção', 'Delivery e Entrega', 'Drogarias e Farmácias', 'Eletrodomésticos', 'Eletrônicos', 'Ensino Básico', 'Ensino Superior e Técnico', 'Entretenimento e Turismo', 'Equipamentos de Uso Comercial e Ind

In [94]:
y = df[['TPV_mensal', 'TPV_mensal/work_day']]
X = df.drop(['TPV_mensal', 'TPV_mensal/work_day'], axis='columns')

In [95]:
drop_columns = ['1', '2017', 'Outro', 'Outro.1', '0-2.5k', 'MEI', 'NAN']
X = X.drop(drop_columns, axis='columns')

In [96]:
X['TPVEstimate sqrt'] = X['TPVEstimate sqrt'].fillna(0.)

---

In [97]:
X_test = X.loc[(df.index.get_level_values('mes_referencia') > '2020-06')]
X_train = X.loc[(df.index.get_level_values('mes_referencia') < '2020-06')]

In [98]:
y_test = y.loc[(df.index.get_level_values('mes_referencia') > '2020-06')]
y_train = y.loc[(df.index.get_level_values('mes_referencia') < '2020-06')]

---

In [12]:
def model_predict(model, predictor='TPV_mensal', X_train=X_train, X_test=X_test, 
                      y_train=y_train, y_test=y_test):
    
    model.fit(X_train, y_train[[predictor]])
    
    y_pred = model.predict(X_test)
    y_pred = pd.DataFrame(data=y_pred, index=y_test.index, columns=[predictor])
    y_pred = pd.pivot_table(y_pred.reset_index(), index=['id'], columns=['mes_referencia'], 
                            values=[predictor])
    y_pred[y_pred <= 0] = 0
    
    y_true = pd.pivot_table(y_test[[predictor]].reset_index(), 
                            index=['id'], columns=['mes_referencia'], 
                            values=[predictor])
    
    return y_true, y_pred

---

In [13]:
from sklearn.linear_model import LinearRegression

In [14]:
y_true, y_pred = model_predict(LinearRegression(normalize=True))
print('erro absoluto relativo: ', erro(y_true, y_pred))

erro absoluto relativo:  217701.6536780853


In [15]:
y_true, y_pred = model_predict(LinearRegression(normalize=True), predictor='TPV_mensal/work_day')
print('erro absoluto relativo: ', erro(y_true, y_pred))

erro absoluto relativo:  219902.00276564888


In [16]:
y_true, y_pred = model_predict(LinearRegression(normalize=True, positive=True))
print('erro absoluto relativo: ', erro(y_true, y_pred))

erro absoluto relativo:  240830.11071480592


---

In [17]:
from sklearn.linear_model import Ridge

In [18]:
y_true, y_pred = model_predict(Ridge(alpha=1.0, normalize=True, random_state=42))
print('erro absoluto relativo: ', erro(y_true, y_pred))

erro absoluto relativo:  271896.87979362736


In [19]:
y_true, y_pred = model_predict(Ridge(alpha=2.5, normalize=True, random_state=42))
print('erro absoluto relativo: ', erro(y_true, y_pred))

erro absoluto relativo:  337577.6216378274


In [20]:
y_true, y_pred = model_predict(Ridge(alpha=5., normalize=True, random_state=42))
print('erro absoluto relativo: ', erro(y_true, y_pred))

erro absoluto relativo:  401211.69957931695


In [21]:
y_true, y_pred = model_predict(Ridge(alpha=10., normalize=True, random_state=42))
print('erro absoluto relativo: ', erro(y_true, y_pred))

erro absoluto relativo:  458834.6529859386


---

In [24]:
from sklearn.linear_model import SGDRegressor

In [28]:
pipeline_model = make_pipeline(StandardScaler(), 
                               SGDRegressor(max_iter=500, tol=1e-3, random_state=42))
y_true, y_pred = model_predict(pipeline_model)
print('erro absoluto relativo: ', erro(y_true, y_pred))

  return f(*args, **kwargs)


erro absoluto relativo:  184662612.7472871


---

In [29]:
from sklearn.linear_model import ElasticNet

In [30]:
y_true, y_pred = model_predict(ElasticNet(random_state=42))
print('erro absoluto relativo: ', erro(y_true, y_pred))

erro absoluto relativo:  345559.9181563495


In [31]:
y_true, y_pred = model_predict(ElasticNet(alpha=0.75, random_state=42))
print('erro absoluto relativo: ', erro(y_true, y_pred))

erro absoluto relativo:  327320.948880468


In [32]:
y_true, y_pred = model_predict(ElasticNet(alpha=2., random_state=42))
print('erro absoluto relativo: ', erro(y_true, y_pred))

erro absoluto relativo:  396760.71968183934


In [33]:
y_true, y_pred = model_predict(ElasticNet(alpha=1., random_state=42, 
                                          normalize=True))
print('erro absoluto relativo: ', erro(y_true, y_pred))

erro absoluto relativo:  557469.475085908


In [34]:
y_true, y_pred = model_predict(ElasticNet(alpha=1., random_state=42, 
                                          normalize=True, positive=True))
print('erro absoluto relativo: ', erro(y_true, y_pred))

erro absoluto relativo:  557469.7902666472


In [36]:
y_true, y_pred = model_predict(ElasticNet(alpha=1., random_state=42, 
                                          normalize=True, positive=True), predictor='TPV_mensal/work_day')
print('erro absoluto relativo: ', erro(y_true, y_pred))

erro absoluto relativo:  600110.3999913863


---

In [38]:
from sklearn.linear_model import BayesianRidge

In [39]:
y_true, y_pred = model_predict(BayesianRidge())
print('erro absoluto relativo: ', erro(y_true, y_pred))

  return f(*args, **kwargs)


erro absoluto relativo:  217727.14711544669


In [40]:
y_true, y_pred = model_predict(BayesianRidge(normalize=True))
print('erro absoluto relativo: ', erro(y_true, y_pred))

  return f(*args, **kwargs)


erro absoluto relativo:  217792.8518536692


---

In [41]:
from sklearn.ensemble import RandomForestRegressor

In [42]:
y_true, y_pred = model_predict(RandomForestRegressor(max_depth=2, random_state=0))
print('erro absoluto relativo: ', erro(y_true, y_pred))

  after removing the cwd from sys.path.


erro absoluto relativo:  447955.22509528097


In [43]:
y_true, y_pred = model_predict(RandomForestRegressor(max_depth=5, 
                                                     random_state=0, 
                                                    n_jobs=-1))
print('erro absoluto relativo: ', erro(y_true, y_pred))

  after removing the cwd from sys.path.


erro absoluto relativo:  277563.78438051196


---

In [44]:
from sklearn.ensemble import AdaBoostRegressor

In [45]:
AdaBoostRegr = AdaBoostRegressor(base_estimator=BayesianRidge(), 
                                 random_state=0, n_estimators=100)
y_true, y_pred = model_predict(AdaBoostRegr)
print('erro absoluto relativo: ', erro(y_true, y_pred))

  return f(*args, **kwargs)


erro absoluto relativo:  1005566.4558667799


In [None]:
AdaBoostRegr = AdaBoostRegressor(base_estimator=BayesianRidge(), 
                                 random_state=0, n_estimators=100, 
                                 learning_rate=0.1)
y_true, y_pred = model_predict(AdaBoostRegr)
print('erro absoluto relativo: ', erro(y_true, y_pred))

  return f(*args, **kwargs)


---

In [60]:
from sklearn.ensemble import GradientBoostingRegressor

In [61]:
y_true, y_pred = model_predict(GradientBoostingRegressor(random_state=42))
print('erro absoluto relativo: ', erro(y_true, y_pred))

  return f(*args, **kwargs)


erro absoluto relativo:  224998.85400022045


---

In [100]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,work_days,2,3,4,5,6,7,8,9,10,...,RJ,RN,RO,RR,RS,SC,SE,SP,TO,covid
mes_referencia,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-12-31,1,20,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019-01-31,1,22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019-02-28,1,20,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019-03-31,1,21,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019-04-30,1,22,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-07-31,206329,23,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2020-04-30,206330,21,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2020-05-31,206330,20,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2020-06-30,206330,22,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [116]:
X8 = X.loc[(df.index.get_level_values('mes_referencia') == '2020-07-31')].reset_index()
X8['7'] = 0
X8['8'] = 1
X8['work_days'] = 25
X8['mes_referencia'] = datetime(2020,8,31).date()

X9 = X.loc[(df.index.get_level_values('mes_referencia') == '2020-07-31')].reset_index()
X9['7'] = 0
X9['9'] = 1
X9['work_days'] = 25
X9['mes_referencia'] = datetime(2020,9,30).date()

X10 = X.loc[(df.index.get_level_values('mes_referencia') == '2020-07-31')].reset_index()
X10['7'] = 0
X10['10'] = 1
X10['work_days'] = 25
X10['mes_referencia'] = datetime(2020,10,31).date()

X11 = X.loc[(df.index.get_level_values('mes_referencia') == '2020-07-31')].reset_index()
X11['7'] = 0
X11['11'] = 1
X11['work_days'] = 25
X11['mes_referencia'] = datetime(2020,11,30).date()

X12 = X.loc[(df.index.get_level_values('mes_referencia') == '2020-07-31')].reset_index()
X12['7'] = 0
X12['12'] = 1
X12['work_days'] = 25
X12['mes_referencia'] = datetime(2020,12,31).date()

In [117]:
X_output = X8.append([X9, X10, X11, X12])

In [121]:
X_output = X_output.set_index(['mes_referencia', 'id'])

In [122]:
from sklearn.linear_model import LinearRegression

In [125]:
model = LinearRegression(normalize=True)
predictor = 'TPV_mensal'
model.fit(X, y[[predictor]])

y_pred = model.predict(X_output)
y_pred = pd.DataFrame(data=y_pred, index=X_output.index, columns=[predictor])
y_pred = pd.pivot_table(y_pred.reset_index(), index=['id'], columns=['mes_referencia'], 
                        values=[predictor])
y_pred

Unnamed: 0_level_0,TPV_mensal,TPV_mensal,TPV_mensal,TPV_mensal,TPV_mensal
mes_referencia,2020-08-31,2020-09-30,2020-10-31,2020-11-30,2020-12-31
id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,45695.013329,45765.835309,46166.252656,48029.170372,51946.893836
2,11058.360085,11129.182066,11529.599413,13392.517128,17310.240592
3,1746.791188,1817.613169,2218.030516,4080.948231,7998.671695
4,41012.589025,41083.411006,41483.828353,43346.746068,47264.469532
5,6825.755812,6896.577793,7296.995140,9159.912856,13077.636320
...,...,...,...,...,...
206326,10050.484072,10121.306052,10521.723400,12384.641115,16302.364579
206327,14249.447541,14320.269522,14720.686869,16583.604585,20501.328049
206328,5391.528757,5462.350738,5862.768085,7725.685800,11643.409264
206329,45242.338147,45313.160128,45713.577475,47576.495190,51494.218654


In [143]:
len(set(df.reset_index()['id'].unique()) - set(y_pred.index))

63

In [146]:
miss_index = set(df.reset_index()['id'].unique()) - set(y_pred.index)
miss = pd.DataFrame(index=miss_index, columns=y_pred.columns)

In [164]:
miss

Unnamed: 0_level_0,TPV_mensal,TPV_mensal,TPV_mensal,TPV_mensal,TPV_mensal
mes_referencia,2020-08-31,2020-09-30,2020-10-31,2020-11-30,2020-12-31
112768,22229.381455,,,,
177920,22229.381455,,,,
107011,22229.381455,,,,
151942,22229.381455,,,,
158471,22229.381455,,,,
...,...,...,...,...,...
79993,22229.381455,,,,
72826,22229.381455,,,,
193147,22229.381455,,,,
194428,22229.381455,,,,


In [165]:
miss[('TPV_mensal', '2020-08-31')].fillna(y_pred['TPV_mensal']['2020-08-31'].mean(), inplace=True)
miss[('TPV_mensal', '2020-09-30')].fillna(y_pred['TPV_mensal']['2020-09-30'].mean(), inplace=True)
miss[('TPV_mensal', '2020-10-31')].fillna(y_pred['TPV_mensal']['2020-10-31'].mean(), inplace=True)
miss[('TPV_mensal', '2020-11-30')].fillna(y_pred['TPV_mensal']['2020-11-30'].mean(), inplace=True)
miss[('TPV_mensal', '2020-12-31')].fillna(y_pred['TPV_mensal']['2020-12-31'].mean(), inplace=True)

miss

Unnamed: 0_level_0,TPV_mensal,TPV_mensal,TPV_mensal,TPV_mensal,TPV_mensal
mes_referencia,2020-08-31,2020-09-30,2020-10-31,2020-11-30,2020-12-31
112768,22229.381455,22300.203436,22700.620783,24563.538498,28481.261962
177920,22229.381455,22300.203436,22700.620783,24563.538498,28481.261962
107011,22229.381455,22300.203436,22700.620783,24563.538498,28481.261962
151942,22229.381455,22300.203436,22700.620783,24563.538498,28481.261962
158471,22229.381455,22300.203436,22700.620783,24563.538498,28481.261962
...,...,...,...,...,...
79993,22229.381455,22300.203436,22700.620783,24563.538498,28481.261962
72826,22229.381455,22300.203436,22700.620783,24563.538498,28481.261962
193147,22229.381455,22300.203436,22700.620783,24563.538498,28481.261962
194428,22229.381455,22300.203436,22700.620783,24563.538498,28481.261962


In [193]:
y_output = y_pred.append(miss).sort_index()
y_output.columns.names = [None, 'id']
y_output.columns = [x[1] for x in y_output.columns]
y_output = y_output.rename({datetime(2020,8,31):'TPV agosto', 
                            datetime(2020,9,30):'TPV setembro', 
                            datetime(2020,10,31):'TPV outubro', 
                            datetime(2020,11,30):'TPV nobembro', 
                            datetime(2020,12,31):'TPV dezembro'}, axis='columns')
y_output.columns.name = 'id'
y_output

id,TPV agosto,TPV setembro,TPV outubro,TPV nobembro,TPV dezembro
1,45695.013329,45765.835309,46166.252656,48029.170372,51946.893836
2,11058.360085,11129.182066,11529.599413,13392.517128,17310.240592
3,1746.791188,1817.613169,2218.030516,4080.948231,7998.671695
4,41012.589025,41083.411006,41483.828353,43346.746068,47264.469532
5,6825.755812,6896.577793,7296.995140,9159.912856,13077.636320
...,...,...,...,...,...
206326,10050.484072,10121.306052,10521.723400,12384.641115,16302.364579
206327,14249.447541,14320.269522,14720.686869,16583.604585,20501.328049
206328,5391.528757,5462.350738,5862.768085,7725.685800,11643.409264
206329,45242.338147,45313.160128,45713.577475,47576.495190,51494.218654


In [194]:
y_output.to_csv('prediction.csv')

---

Dentre novas variáveis que podem ser adicionadas ao modelo, recomenda-se o uso de variáveis que indiquem a mobilidade urbana, principalmente, durante o período de pandemia. Além, podemos introduzir ao modelo variáveis de casos de covid, o que também poderia indicar, inversamente, a atividade do comércio e serviços.

Além, poderíamos adicionar variáveis que indicassem de modo mais granular as regiões dos estabelecimentos (capitais, centros comerciais, ruas tradicionais de comércio).

Incluímos no modelo variáveis que indicassem a quantidade de dias úteis. Dessa forma, também poderíamos adicionar variáveis que indicassem forte atividade no comércio, como Natal, dia das Mães, dia das Crianças.

Por fim, outras variáveis que poderiam ser incluídas, seriam macro-econômicas, que pudessem antecipar alguns movimentos no comércio e serviços, como produção, consumo de energia.

---