In [1]:
from pathlib import Path
import pandas as pd

# Mostrar floats com duas casas decimas
pd.set_option('display.float_format',  lambda x: '%.3g' % x)
pd.options.display.max_colwidth = 20
pd.options.display.max_columns = 20
pd.options.display.max_rows = 4

In [2]:
# Ler a base ajustada no S3
# df_stocks = pd.read_feather('s3://aq-dl/HistoricalQuotations/base_adj.feather')
df_stocks = pd.read_feather('/mnt/aq_disk/data/HistoricalQuotations/processed/base_adj.feather')
df_stocks

Unnamed: 0,datneg,codneg,nomres,codisi,especi,codbdi,tpmerc,preabe,premax,premin,...,preofv,preexe,totneg,quatot,voltot,dismes,datven,prazot,event2,ajuste
0,2022-02-03,5GTK11,INVESTO 5GTK,BR5GTKCTF000,CI,14,10,95,101,94.9,...,97.7,0,85,2.08e+04,2e+06,100,NaT,0,,1
1,2022-02-04,5GTK11,INVESTO 5GTK,BR5GTKCTF000,CI,14,10,97,98.3,90.5,...,95.8,0,50,1.11e+03,1.06e+05,100,NaT,0,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10907285,2013-12-18,ZNTE6L,FERR ZANETTE,BRZNTEACNPB8,PNB,52,17,0.82,0.82,0.82,...,0,0,1,7.44e+05,6.1e+05,104,NaT,0,,1
10907286,2012-12-13,ZNTE7L,FERR ZANETTE,BRZNTEACNPC6,PNC*,52,17,0.82,0.82,0.82,...,0,0,1,4.76e+08,3.9e+05,111,NaT,0,,1


In [3]:
# Filtrar somente cotações após 2011, lote padrão (bdi == 2) e ações ON ou PN
df_stocks.query('\
    codbdi == 2 and \
    datneg >= "2011.01.01" and \
    especi.str.contains("ON |PN ")'
    , inplace=True
)
df_stocks.reset_index(drop=True, inplace=True)
df_stocks

Unnamed: 0,datneg,codneg,nomres,codisi,especi,codbdi,tpmerc,preabe,premax,premin,...,preofv,preexe,totneg,quatot,voltot,dismes,datven,prazot,event2,ajuste
0,2016-10-28,AALR3,ALLIAR,BRAALRACNOR6,ON NM,2,10,19,19.5,18.7,...,19,0,4460,6.43e+06,1.22e+08,100,NaT,0,,0.986
1,2016-10-31,AALR3,ALLIAR,BRAALRACNOR6,ON NM,2,10,18.9,18.9,17.3,...,17.8,0,4238,2.56e+06,4.59e+07,100,NaT,0,,0.986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
576712,2022-06-03,YDUQ3,YDUQS PART,BRYDUQACNOR3,ON NM,2,10,16.6,16.6,15.6,...,15.7,0,7843,2.18e+06,3.46e+07,103,NaT,0,,1
576713,2022-06-06,YDUQ3,YDUQS PART,BRYDUQACNOR3,ON NM,2,10,15.8,15.8,15.1,...,15.2,0,5322,1.07e+06,1.65e+07,103,NaT,0,,1


In [4]:
# Manter somente as colunas que serão usadas para fazer o corte nas datas
cols = ['datneg', 'codneg', 'codisi', 'especi', 'premed', 'totneg']
df_stocks = df_stocks[cols].copy()
df_stocks

Unnamed: 0,datneg,codneg,codisi,especi,premed,totneg
0,2016-10-28,AALR3,BRAALRACNOR6,ON NM,19,4460
1,2016-10-31,AALR3,BRAALRACNOR6,ON NM,17.9,4238
...,...,...,...,...,...,...
576712,2022-06-03,YDUQ3,BRYDUQACNOR3,ON NM,15.8,7843
576713,2022-06-06,YDUQ3,BRYDUQACNOR3,ON NM,15.4,5322


In [5]:
# Inserir o cód. dos emissores -> 4 caracteres depois do dois primeiros do ISIN
df_stocks["codemi"] = df_stocks.codisi.str[2:6]
df_stocks

Unnamed: 0,datneg,codneg,codisi,especi,premed,totneg,codemi
0,2016-10-28,AALR3,BRAALRACNOR6,ON NM,19,4460,AALR
1,2016-10-31,AALR3,BRAALRACNOR6,ON NM,17.9,4238,AALR
...,...,...,...,...,...,...,...
576712,2022-06-03,YDUQ3,BRYDUQACNOR3,ON NM,15.8,7843,YDUQ
576713,2022-06-06,YDUQ3,BRYDUQACNOR3,ON NM,15.4,5322,YDUQ


In [6]:
print('Total number of companies in the backtesting period:', df_stocks.codemi.nunique())

Total number of companies in the backtesting period: 500


In [7]:
df_stocks['day_year'] = df_stocks.datneg.dt.day_of_year
df_stocks['year'] = df_stocks.datneg.dt.year
df_stocks

Unnamed: 0,datneg,codneg,codisi,especi,premed,totneg,codemi,day_year,year
0,2016-10-28,AALR3,BRAALRACNOR6,ON NM,19,4460,AALR,302,2016
1,2016-10-31,AALR3,BRAALRACNOR6,ON NM,17.9,4238,AALR,305,2016
...,...,...,...,...,...,...,...,...,...
576712,2022-06-03,YDUQ3,BRYDUQACNOR3,ON NM,15.8,7843,YDUQ,154,2022
576713,2022-06-06,YDUQ3,BRYDUQACNOR3,ON NM,15.4,5322,YDUQ,157,2022


In [8]:
df_stocks

Unnamed: 0,datneg,codneg,codisi,especi,premed,totneg,codemi,day_year,year
0,2016-10-28,AALR3,BRAALRACNOR6,ON NM,19,4460,AALR,302,2016
1,2016-10-31,AALR3,BRAALRACNOR6,ON NM,17.9,4238,AALR,305,2016
...,...,...,...,...,...,...,...,...,...
576712,2022-06-03,YDUQ3,BRYDUQACNOR3,ON NM,15.8,7843,YDUQ,154,2022
576713,2022-06-06,YDUQ3,BRYDUQACNOR3,ON NM,15.4,5322,YDUQ,157,2022


In [9]:
# Definir a data de corte e remover negociações posteriores ao corte
df_stocks.query('day_year >= 100', inplace=True)
df_stocks.reset_index(drop=True, inplace=True)
df_stocks

Unnamed: 0,datneg,codneg,codisi,especi,premed,totneg,codemi,day_year,year
0,2016-10-28,AALR3,BRAALRACNOR6,ON NM,19,4460,AALR,302,2016
1,2016-10-31,AALR3,BRAALRACNOR6,ON NM,17.9,4238,AALR,305,2016
...,...,...,...,...,...,...,...,...,...
412781,2022-06-03,YDUQ3,BRYDUQACNOR3,ON NM,15.8,7843,YDUQ,154,2022
412782,2022-06-06,YDUQ3,BRYDUQACNOR3,ON NM,15.4,5322,YDUQ,157,2022


In [10]:
# Criar dataframe com o dia mais próximo do corte para cada ano
df_corte = df_stocks.groupby(by=['year']).agg({'day_year':'min'}).reset_index()
df_corte

Unnamed: 0,year,day_year
0,2011,101
1,2012,100
...,...,...
10,2021,102
11,2022,101


In [11]:
# Usar o ano e o dia de corte como chaves para a operação de merge
df_stocks = df_stocks.merge(
    right=df_corte, how='inner', on=['year', 'day_year']
)
df_stocks

Unnamed: 0,datneg,codneg,codisi,especi,premed,totneg,codemi,day_year,year
0,2017-04-10,AALR3,BRAALRACNOR6,ON NM,15.1,315,AALR,100,2017
1,2017-04-10,ABCB4,BRABCBACNPR4,PN N2,13.6,1005,ABCB,100,2017
...,...,...,...,...,...,...,...,...,...
2465,2016-04-11,VVAR3,BRVVARACNOR1,ON N2,2.38,1,VVAR,102,2016
2466,2016-04-11,WEGE3,BRWEGEACNOR0,ON NM,4.74,9383,WEGE,102,2016


In [12]:
print('Total number of companies available in the cutoff dates:', df_stocks.codemi.nunique())

Total number of companies available in the cutoff dates: 377


In [13]:
# Excluir empresas financeiras e prestadoras de serviços públicos (utilities)
excluded_companies = pd.read_csv('../data/excluded_companies.csv')
excluded_companies = excluded_companies['company_code'].to_list()
df_stocks.query('codemi != @excluded_companies', inplace=True)
df_stocks

Unnamed: 0,datneg,codneg,codisi,especi,premed,totneg,codemi,day_year,year
0,2017-04-10,AALR3,BRAALRACNOR6,ON NM,15.1,315,AALR,100,2017
2,2017-04-10,AGRO3,BRAGROACNOR7,ON NM,8.98,199,AGRO,100,2017
...,...,...,...,...,...,...,...,...,...
2465,2016-04-11,VVAR3,BRVVARACNOR1,ON N2,2.38,1,VVAR,102,2016
2466,2016-04-11,WEGE3,BRWEGEACNOR0,ON NM,4.74,9383,WEGE,102,2016


In [14]:
print('Total number of companies available in the cutoff dates:', df_stocks.codemi.nunique())

Total number of companies available in the cutoff dates: 311


In [15]:
AQ_FOLDER = Path("/mnt/aq_disk/data/AQ/")
df_codes = pd.read_pickle(AQ_FOLDER / "codemi.pkl")
df_codes

Unnamed: 0,codcvm,cnpj,densoc,situac,codemi
0,60,18451005000104,ACOPALMA CIA IND...,CANCELADA,ZWVZ
1,94,92693019000189,PANATLANTICA SA,ATIVO,PATI
...,...,...,...,...,...
1762,26794,22902694000195,SELF IT ACADEMIA...,ATIVO,SLFT
1763,26808,04368898000106,COPEL DISTRIBUIÇ...,ATIVO,CPLD


In [16]:
# A chave do merge é o cód. do emissor (codemir) e somente o cód. CVM (codcvm)
# será inserido no merge
df_codes = df_codes[["codcvm", "codemi"]].copy()
df_codes

Unnamed: 0,codcvm,codemi
0,60,ZWVZ
1,94,PATI
...,...,...
1762,26794,SLFT
1763,26808,CPLD


In [17]:
df_stocks = df_stocks.merge(right=df_codes, how='inner', on='codemi')
df_stocks.reset_index(drop=True, inplace=True)
df_stocks

Unnamed: 0,datneg,codneg,codisi,especi,premed,totneg,codemi,day_year,year,codcvm
0,2017-04-10,AALR3,BRAALRACNOR6,ON NM,15.1,315,AALR,100,2017,24058
1,2018-04-10,AALR3,BRAALRACNOR6,ON NM,15,175,AALR,100,2018,24058
...,...,...,...,...,...,...,...,...,...,...
1797,2015-04-10,RUMO3,BRRUMOACNOR3,ON NM,16.5,14416,RUMO,100,2015,23450
1798,2016-04-11,RUMO3,BRRUMOACNOR3,ON NM,3.31,26340,RUMO,102,2016,23450


In [18]:
print('Total number of companies available in the cutoff dates:', df_stocks.codemi.nunique())

Total number of companies available in the cutoff dates: 277


In [19]:
df_financials = pd.read_csv(
    "../data/financials.csv",
    parse_dates=['doc_env', 'per_ini', 'per_fim']
)
# Renomear coluna com o código CVM para coincidir com as outras bases
df_financials.rename(columns={"cia_id": "codcvm"}, inplace=True)
df_financials

Unnamed: 0,codcvm,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,8.86e+06,-1.9e+07,2.48e+07,0.2
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,8.86e+06,-1.9e+07,2.11e+07,0.171
...,...,...,...,...,...,...,...,...,...
4056,80195,"G2D Investments,...",2021-03-12 18:33:08,2020-01-01,2020-12-31,6.6e+07,1.04e+08,9.22e+07,0.191
4057,90212,Multilaser Indus...,2018-05-30 15:43:03,2017-01-01,2017-12-31,2.16e+08,-7.08e+07,2.41e+08,0.306


In [20]:
# year = ano em que a informação será usada -> ano seguinte
df_financials['year'] = df_financials.per_fim.dt.year + 1
df_financials

Unnamed: 0,codcvm,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic,year
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,8.86e+06,-1.9e+07,2.48e+07,0.2,2011
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,8.86e+06,-1.9e+07,2.11e+07,0.171,2011
...,...,...,...,...,...,...,...,...,...,...
4056,80195,"G2D Investments,...",2021-03-12 18:33:08,2020-01-01,2020-12-31,6.6e+07,1.04e+08,9.22e+07,0.191,2021
4057,90212,Multilaser Indus...,2018-05-30 15:43:03,2017-01-01,2017-12-31,2.16e+08,-7.08e+07,2.41e+08,0.306,2018


In [21]:
# Incluir os dados contábeis em df_stocks 
df_stocks = df_stocks.merge(right=df_financials, how='inner', on=['year', 'codcvm'])
df_stocks

Unnamed: 0,datneg,codneg,codisi,especi,premed,totneg,codemi,day_year,year,codcvm,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic
0,2017-04-10,AALR3,BRAALRACNOR6,ON NM,15.1,315,AALR,100,2017,24058,CENTRO DE IMAGEM...,2017-03-22 23:42:13,2016-01-01,2016-12-31,1.15e+08,3.09e+08,1.01e+08,0.0662
1,2018-04-10,AALR3,BRAALRACNOR6,ON NM,15,175,AALR,100,2018,24058,CENTRO DE IMAGEM...,2018-03-28 20:20:31,2017-01-01,2017-12-31,1.18e+08,5.08e+08,7.03e+07,0.0396
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1949,2015-04-10,RUMO3,BRRUMOACNOR3,ON NM,16.5,14416,RUMO,100,2015,23450,RUMO LOGÍSTICA O...,2015-03-03 20:02:32,2014-01-01,2014-12-31,1.03e+09,6.99e+08,2.07e+08,0.102
1950,2016-04-11,RUMO3,BRRUMOACNOR3,ON NM,3.31,26340,RUMO,102,2016,23450,RUMO LOGÍSTICA O...,2016-02-25 20:38:25,2015-01-01,2015-12-31,2.99e+08,8e+09,1.05e+09,0.0887


In [22]:
df_stocks['market_cap'] = df_stocks['shares_outstanding'] * df_stocks['premed']
df_stocks['enterprise_value'] = df_stocks['market_cap'] - df_stocks['net_debt']
df_stocks['earnings_yield'] = df_stocks['ebit'] / df_stocks['enterprise_value']
df_stocks

Unnamed: 0,datneg,codneg,codisi,especi,premed,totneg,codemi,day_year,year,codcvm,...,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2017-04-10,AALR3,BRAALRACNOR6,ON NM,15.1,315,AALR,100,2017,24058,...,2017-03-22 23:42:13,2016-01-01,2016-12-31,1.15e+08,3.09e+08,1.01e+08,0.0662,1.74e+09,1.43e+09,0.0703
1,2018-04-10,AALR3,BRAALRACNOR6,ON NM,15,175,AALR,100,2018,24058,...,2018-03-28 20:20:31,2017-01-01,2017-12-31,1.18e+08,5.08e+08,7.03e+07,0.0396,1.77e+09,1.26e+09,0.0557
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1949,2015-04-10,RUMO3,BRRUMOACNOR3,ON NM,16.5,14416,RUMO,100,2015,23450,...,2015-03-03 20:02:32,2014-01-01,2014-12-31,1.03e+09,6.99e+08,2.07e+08,0.102,1.69e+10,1.62e+10,0.0127
1950,2016-04-11,RUMO3,BRRUMOACNOR3,ON NM,3.31,26340,RUMO,102,2016,23450,...,2016-02-25 20:38:25,2015-01-01,2015-12-31,2.99e+08,8e+09,1.05e+09,0.0887,9.9e+08,-7.01e+09,-0.15


In [23]:
# Remover revisões de DFPs publicadas no mesmo dia ou posteriores ao corte
# No livro, o corte é de uma semana
df_stocks.query('doc_env.dt.date < datneg')

Unnamed: 0,datneg,codneg,codisi,especi,premed,totneg,codemi,day_year,year,codcvm,...,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2017-04-10,AALR3,BRAALRACNOR6,ON NM,15.1,315,AALR,100,2017,24058,...,2017-03-22 23:42:13,2016-01-01,2016-12-31,1.15e+08,3.09e+08,1.01e+08,0.0662,1.74e+09,1.43e+09,0.0703
1,2018-04-10,AALR3,BRAALRACNOR6,ON NM,15,175,AALR,100,2018,24058,...,2018-03-28 20:20:31,2017-01-01,2017-12-31,1.18e+08,5.08e+08,7.03e+07,0.0396,1.77e+09,1.26e+09,0.0557
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1949,2015-04-10,RUMO3,BRRUMOACNOR3,ON NM,16.5,14416,RUMO,100,2015,23450,...,2015-03-03 20:02:32,2014-01-01,2014-12-31,1.03e+09,6.99e+08,2.07e+08,0.102,1.69e+10,1.62e+10,0.0127
1950,2016-04-11,RUMO3,BRRUMOACNOR3,ON NM,3.31,26340,RUMO,102,2016,23450,...,2016-02-25 20:38:25,2015-01-01,2015-12-31,2.99e+08,8e+09,1.05e+09,0.0887,9.9e+08,-7.01e+09,-0.15


In [24]:
# Manter somente a DFP mais recente ao corte para cada ativo
df_stocks.sort_values('doc_env', inplace=True)
df_stocks.drop_duplicates(subset=['codneg', 'year'], keep='last', inplace=True)
df_stocks

Unnamed: 0,datneg,codneg,codisi,especi,premed,totneg,codemi,day_year,year,codcvm,...,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
1349,2011-04-11,TOTS3,BRTOTSACNOR8,ON NM,8.73,244,TOTS,101,2011,19992,...,2011-01-31 19:05:59,2010-01-01,2010-12-31,3.15e+07,1.79e+08,2.12e+08,0.261,2.75e+08,9.53e+07,2.22
785,2011-04-11,LREN3,BRLRENACNOR1,ON NM,6.56,3604,LREN,101,2011,8133,...,2011-02-16 19:53:52,2010-01-01,2010-12-31,1.22e+08,-2.72e+07,4.04e+08,0.407,8.03e+08,8.3e+08,0.487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1644,2022-04-11,ALLD3,BRALLDACNOR3,ON NM,14.2,273,ALLD,101,2022,25330,...,2022-04-27 12:11:23,2021-01-01,2021-12-31,9.32e+07,-3.38e+08,4.08e+08,0.347,1.33e+09,1.66e+09,0.245
1785,2022-04-11,RECV3,BRRECVACNOR3,ON NM,23,2648,RECV,101,2022,25780,...,2022-06-02 17:08:52,2021-01-01,2021-12-31,2.49e+08,-6.15e+07,2.85e+08,0.157,5.7e+09,5.77e+09,0.0494


In [25]:
# Remover colunas intermediárias
df_stocks.drop(columns=["day_year"], inplace=True)
df_stocks

Unnamed: 0,datneg,codneg,codisi,especi,premed,totneg,codemi,year,codcvm,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
1349,2011-04-11,TOTS3,BRTOTSACNOR8,ON NM,8.73,244,TOTS,2011,19992,TOTVS S.A.,2011-01-31 19:05:59,2010-01-01,2010-12-31,3.15e+07,1.79e+08,2.12e+08,0.261,2.75e+08,9.53e+07,2.22
785,2011-04-11,LREN3,BRLRENACNOR1,ON NM,6.56,3604,LREN,2011,8133,LOJAS RENNER S.A.,2011-02-16 19:53:52,2010-01-01,2010-12-31,1.22e+08,-2.72e+07,4.04e+08,0.407,8.03e+08,8.3e+08,0.487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1644,2022-04-11,ALLD3,BRALLDACNOR3,ON NM,14.2,273,ALLD,2022,25330,ALLIED TECNOLOGI...,2022-04-27 12:11:23,2021-01-01,2021-12-31,9.32e+07,-3.38e+08,4.08e+08,0.347,1.33e+09,1.66e+09,0.245
1785,2022-04-11,RECV3,BRRECVACNOR3,ON NM,23,2648,RECV,2022,25780,PETRORECÔNCAVO S.A.,2022-06-02 17:08:52,2021-01-01,2021-12-31,2.49e+08,-6.15e+07,2.85e+08,0.157,5.7e+09,5.77e+09,0.0494


In [26]:
# Remover o ativo menos líquido da mesma empresa no ano pelo critério do núm.
# de negociações
df_stocks.sort_values(by=['year', 'codemi', 'totneg'], inplace=True)
df_stocks.drop_duplicates(
    subset=['codemi', 'year'], keep='last', inplace=True, ignore_index=True
)
df_stocks

Unnamed: 0,datneg,codneg,codisi,especi,premed,totneg,codemi,year,codcvm,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2011-04-11,AEDU3,BRAEDUACNOR9,ON NM,12.7,978,AEDU,2011,18961,ANHANGUERA EDUCA...,2011-05-26 11:54:58,2010-01-01,2010-12-31,1.46e+08,-5.71e+08,1.57e+08,0.11,1.85e+09,2.42e+09,0.065
1,2011-04-11,ALPA4,BRALPAACNPR7,PN N1,5.28,158,ALPA,2011,10456,ALPARGATAS S.A.,2011-03-30 16:55:23,2010-01-01,2010-12-31,3.53e+08,-3.59e+08,3.24e+08,0.328,1.87e+09,2.23e+09,0.146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1275,2022-04-11,WLMM4,BRWLMMACNPR3,PN EJ,34.2,14,WLMM,2022,11070,WLM PART. E COMÉ...,2022-03-22 22:18:18,2021-01-01,2021-12-31,3.64e+07,-1.52e+08,1.36e+08,0.349,1.25e+09,1.4e+09,0.0973
1276,2022-04-11,YDUQ3,BRYDUQACNOR3,ON NM,19.6,10252,YDUQ,2022,21016,YDUQS PARTICIPAC...,2022-03-15 18:09:20,2021-01-01,2021-12-31,3.09e+08,3.69e+09,5.47e+08,0.0788,6.05e+09,2.35e+09,0.232


In [27]:
print('Number of companies avaible for backtesting:', df_stocks.codemi.nunique())

Number of companies avaible for backtesting: 243


In [28]:
# O Livro fala em empresas com pelos menos USD 50 milhões de valor de mercado
df_stocks.query('market_cap > 250_000_000', inplace=True)
df_stocks

Unnamed: 0,datneg,codneg,codisi,especi,premed,totneg,codemi,year,codcvm,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2011-04-11,AEDU3,BRAEDUACNOR9,ON NM,12.7,978,AEDU,2011,18961,ANHANGUERA EDUCA...,2011-05-26 11:54:58,2010-01-01,2010-12-31,1.46e+08,-5.71e+08,1.57e+08,0.11,1.85e+09,2.42e+09,0.065
1,2011-04-11,ALPA4,BRALPAACNPR7,PN N1,5.28,158,ALPA,2011,10456,ALPARGATAS S.A.,2011-03-30 16:55:23,2010-01-01,2010-12-31,3.53e+08,-3.59e+08,3.24e+08,0.328,1.87e+09,2.23e+09,0.146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1275,2022-04-11,WLMM4,BRWLMMACNPR3,PN EJ,34.2,14,WLMM,2022,11070,WLM PART. E COMÉ...,2022-03-22 22:18:18,2021-01-01,2021-12-31,3.64e+07,-1.52e+08,1.36e+08,0.349,1.25e+09,1.4e+09,0.0973
1276,2022-04-11,YDUQ3,BRYDUQACNOR3,ON NM,19.6,10252,YDUQ,2022,21016,YDUQS PARTICIPAC...,2022-03-15 18:09:20,2021-01-01,2021-12-31,3.09e+08,3.69e+09,5.47e+08,0.0788,6.05e+09,2.35e+09,0.232


In [29]:
print('Number of companies avaible for backtesting:', df_stocks.codemi.nunique())

Number of companies avaible for backtesting: 236


In [30]:
df_stocks['rank_roic'] = (
    df_stocks.groupby(by=['year'])['roic']
             .rank(method='dense', ascending=False)
)
df_stocks['rank_ey'] = (
    df_stocks.groupby(by=['year'])['earnings_yield']
             .rank(method='dense', ascending=False)
)
df_stocks['ranks_sum'] = df_stocks['rank_roic'] + df_stocks['rank_ey']
df_stocks['rank_final'] = (
    df_stocks.groupby(by=['year'])['ranks_sum']
             .rank(method='first', ascending=True)
)
df_stocks

Unnamed: 0,datneg,codneg,codisi,especi,premed,totneg,codemi,year,codcvm,cia_nome,...,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_roic,rank_ey,ranks_sum,rank_final
0,2011-04-11,AEDU3,BRAEDUACNOR9,ON NM,12.7,978,AEDU,2011,18961,ANHANGUERA EDUCA...,...,-5.71e+08,1.57e+08,0.11,1.85e+09,2.42e+09,0.065,71,67,138,74
1,2011-04-11,ALPA4,BRALPAACNPR7,PN N1,5.28,158,ALPA,2011,10456,ALPARGATAS S.A.,...,-3.59e+08,3.24e+08,0.328,1.87e+09,2.23e+09,0.146,11,45,56,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1275,2022-04-11,WLMM4,BRWLMMACNPR3,PN EJ,34.2,14,WLMM,2022,11070,WLM PART. E COMÉ...,...,-1.52e+08,1.36e+08,0.349,1.25e+09,1.4e+09,0.0973,23,77,100,39
1276,2022-04-11,YDUQ3,BRYDUQACNOR3,ON NM,19.6,10252,YDUQ,2022,21016,YDUQS PARTICIPAC...,...,3.69e+09,5.47e+08,0.0788,6.05e+09,2.35e+09,0.232,113,48,161,82


In [31]:
df_stocks.sort_values(by=['year', 'rank_final'], inplace=True)
# Alterar o nome do datneg
df_stocks.rename(columns={"datneg": "cutoff_date"}, inplace=True)
df_stocks

Unnamed: 0,cutoff_date,codneg,codisi,especi,premed,totneg,codemi,year,codcvm,cia_nome,...,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_roic,rank_ey,ranks_sum,rank_final
57,2011-04-11,LREN3,BRLRENACNOR1,ON NM,6.56,3604,LREN,2011,8133,LOJAS RENNER S.A.,...,-2.72e+07,4.04e+08,0.407,8.03e+08,8.3e+08,0.487,5,7,12,1
93,2011-04-11,TOTS3,BRTOTSACNOR8,ON NM,8.73,244,TOTS,2011,19992,TOTVS S.A.,...,1.79e+08,2.12e+08,0.261,2.75e+08,9.53e+07,2.22,20,1,21,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1235,2022-04-11,RDNI3,BRRDNIACNOR9,ON NM,8.01,35,RDNI,2022,20451,RNI NEGÓCIOS IMO...,...,4.71e+08,1.98e+07,0.0179,3.51e+08,-1.21e+08,-0.164,151,145,296,156
1144,2022-04-11,COGN3,BRCOGNACNOR2,ON NM,2.69,38694,COGN,2022,17973,COGNA EDUCAÇÃO S.A.,...,5.87e+09,7.84e+07,0.00399,5.05e+09,-8.19e+08,-0.0957,156,142,298,157


In [32]:
df_stocks.to_csv("../data/magic_stocks.csv", index=False)

In [34]:
cols = ['year', 'codemi', 'cia_nome', 'doc_env', 'rank_roic', 'rank_ey', 'ranks_sum', 'rank_final']
df_stocks.query('year == 2022')[cols].head(20)

Unnamed: 0,year,codemi,cia_nome,doc_env,rank_roic,rank_ey,ranks_sum,rank_final
1135,2022,BRKM,BRASKEM S.A.,2022-03-16 19:43:41,4,4,8,1
1253,2022,SYNE,SYN PROP & TECH ...,2022-02-25 00:12:00,11,1,12,2
...,...,...,...,...,...,...,...,...
1229,2022,PTBL,PBG S/A,2022-03-17 22:16:43,30,26,56,19
1233,2022,RANI,IRANI PAPEL E EM...,2022-02-24 07:33:49,27,29,56,20


In [33]:
df_stocks.query('codneg == "PRIO3"')

Unnamed: 0,cutoff_date,codneg,codisi,especi,premed,totneg,codemi,year,codcvm,cia_nome,...,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_roic,rank_ey,ranks_sum,rank_final
949,2020-04-09,PRIO3,BRPRIOACNOR1,ON NM,5.31,26671,PRIO,2020,22187,PETRO RIO S.A.,...,1570000000.0,878000000.0,0.235,760000000.0,-812000000.0,-1.08,14,100,114,65
1075,2021-04-12,PRIO3,BRPRIOACNOR1,ON NM,19.3,27821,PRIO,2021,22187,PETRO RIO S.A.,...,1700000000.0,943000000.0,0.194,2790000000.0,1090000000.0,0.868,35,4,39,9
1227,2022-04-11,PRIO3,BRPRIOACNOR1,ON NM,23.5,20863,PRIO,2022,22187,PETRO RIO S.A.,...,-750000000.0,2120000000.0,0.361,20600000000.0,21400000000.0,0.0992,21,75,96,35
