df_fin -> input dataframe com os dados financeiros das empresas

df_cod -> input dataframe com os códigos CVM dos ativos

df_sel -> output dataframe que irá armazenar o resultado com o ranking das empresas

In [1]:
from pathlib import Path
import pandas as pd

# Mostrar floats com duas casas decimas
pd.set_option('display.float_format',  lambda x: '%.3g' % x)
pd.options.display.max_colwidth = 20
pd.options.display.max_columns = 20
pd.options.display.max_rows = 4

In [2]:
# Ler a base ajustada no S3
# df_sel = pd.read_feather('s3://aq-dl/HistoricalQuotations/base_adj.feather')
df_sel = pd.read_feather('/mnt/aq_disk/data/HistoricalQuotations/processed/base_adj.feather')
df_sel

Unnamed: 0,datneg,codneg,codisi,nomres,especi,codbdi,tpmerc,dismes,datven,prazot,...,premed,preult,preofc,preofv,preexe,totneg,quatot,voltot,evento,ajuste
0,2022-02-03,5GTK11,BR5GTKCTF000,INVESTO 5GTK,CI,14,10,100,NaT,0,...,95.7,94.9,94.9,97.7,0,85,2.08e+04,2e+06,,1
1,2022-02-04,5GTK11,BR5GTKCTF000,INVESTO 5GTK,CI,14,10,100,NaT,0,...,95.8,95.8,95.4,95.8,0,50,1.11e+03,1.06e+05,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10938222,2013-12-18,ZNTE6L,BRZNTEACNPB8,FERR ZANETTE,PNB,52,17,104,NaT,0,...,0.82,0.82,0,0,0,1,7.44e+05,6.1e+05,,1
10938223,2012-12-13,ZNTE7L,BRZNTEACNPC6,FERR ZANETTE,PNC*,52,17,111,NaT,0,...,0.82,0.82,0,0,0,1,4.76e+08,3.9e+05,,1


In [3]:
# Filtrar somente cotações após 2011, lote padrão (bdi == 2) e ações ON ou PN
df_sel.query('\
    codbdi == 2 and \
    datneg >= "2011.01.01" and \
    especi.str.contains("ON |PN ")'
    , inplace=True
)
df_sel.reset_index(drop=True, inplace=True)
df_sel

Unnamed: 0,datneg,codneg,codisi,nomres,especi,codbdi,tpmerc,dismes,datven,prazot,...,premed,preult,preofc,preofv,preexe,totneg,quatot,voltot,evento,ajuste
0,2016-10-28,AALR3,BRAALRACNOR6,ALLIAR,ON NM,2,10,100,NaT,0,...,19,18.9,18.9,19,0,4460,6.43e+06,1.22e+08,,0.986
1,2016-10-31,AALR3,BRAALRACNOR6,ALLIAR,ON NM,2,10,100,NaT,0,...,17.9,17.8,17.7,17.8,0,4238,2.56e+06,4.59e+07,,0.986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
577535,2022-06-08,YDUQ3,BRYDUQACNOR3,YDUQS PART,ON NM,2,10,103,NaT,0,...,15.1,14.9,14.9,14.9,0,6301,1.53e+06,2.31e+07,,1
577536,2022-06-09,YDUQ3,BRYDUQACNOR3,YDUQS PART,ON NM,2,10,103,NaT,0,...,15.2,15,15,15,0,8285,1.82e+06,2.76e+07,,1


In [4]:
# Manter somente as colunas que serão usadas para fazer o corte nas datas
cols = ['datneg', 'codneg', 'nomres', 'especi', 'premed', 'totneg']
df_sel = df_sel[cols].copy()
# Inserir o cód. dos emissores -> 4 primeiros caracteres do código de negociação
df_sel['codemi'] = df_sel.codneg.str[0:4]
# Inserir o dia do ano e o ano para a operação de corte das ações
df_sel['day_year'] = df_sel.datneg.dt.day_of_year
df_sel['year'] = df_sel.datneg.dt.year
print('Number of companies available for backtesting', df_sel.codemi.nunique())
df_sel

Number of companies available for backtesting 500


Unnamed: 0,datneg,codneg,nomres,especi,premed,totneg,codemi,day_year,year
0,2016-10-28,AALR3,ALLIAR,ON NM,19,4460,AALR,302,2016
1,2016-10-31,AALR3,ALLIAR,ON NM,17.9,4238,AALR,305,2016
...,...,...,...,...,...,...,...,...,...
577535,2022-06-08,YDUQ3,YDUQS PART,ON NM,15.1,6301,YDUQ,159,2022
577536,2022-06-09,YDUQ3,YDUQS PART,ON NM,15.2,8285,YDUQ,160,2022


In [5]:
# Definir a data de corte e remover negociações posteriores ao corte
df_sel.query('day_year >= 100', inplace=True)
df_sel.reset_index(drop=True, inplace=True)
df_sel

Unnamed: 0,datneg,codneg,nomres,especi,premed,totneg,codemi,day_year,year
0,2016-10-28,AALR3,ALLIAR,ON NM,19,4460,AALR,302,2016
1,2016-10-31,AALR3,ALLIAR,ON NM,17.9,4238,AALR,305,2016
...,...,...,...,...,...,...,...,...,...
413604,2022-06-08,YDUQ3,YDUQS PART,ON NM,15.1,6301,YDUQ,159,2022
413605,2022-06-09,YDUQ3,YDUQS PART,ON NM,15.2,8285,YDUQ,160,2022


In [6]:
df_corte = df_sel.groupby(by=['year'])['day_year'].min().reset_index()
df_corte

Unnamed: 0,year,day_year
0,2011,101
1,2012,100
...,...,...
10,2021,102
11,2022,101


In [7]:
# Filtrar o datafreme de ações com dados somente nos dias de corte
# Usar 'year' e 'day_year' como chaves para a operação de união entre os dois dataframes
df_sel = df_sel.merge(
    right=df_corte, how='inner', on=['year', 'day_year']
)
# A coluna 'day_year' não será mais usada
df_sel.drop(columns=['day_year'], inplace=True)
print('Number of companies available for backtesting:', df_sel.codemi.nunique())
df_sel

Number of companies available for backtesting: 377


Unnamed: 0,datneg,codneg,nomres,especi,premed,totneg,codemi,year
0,2017-04-10,AALR3,ALLIAR,ON NM,15.1,315,AALR,2017
1,2017-04-10,ABCB4,ABC BRASIL,PN N2,13.6,1005,ABCB,2017
...,...,...,...,...,...,...,...,...
2465,2016-04-11,VVAR3,VIAVAREJO,ON N2,2.38,1,VVAR,2016
2466,2016-04-11,WEGE3,WEG,ON NM,4.74,9383,WEGE,2016


In [8]:
# Excluir empresas financeiras e prestadoras de serviços públicos (utilities)
excluded_companies = pd.read_csv('../data/external/excluded_companies.csv')
excluded_companies = excluded_companies['company_code'].to_list()
df_sel.query('codemi != @excluded_companies', inplace=True)
print('Number of companies available for backtesting', df_sel.codemi.nunique())
df_sel

Number of companies available for backtesting 311


Unnamed: 0,datneg,codneg,nomres,especi,premed,totneg,codemi,year
0,2017-04-10,AALR3,ALLIAR,ON NM,15.1,315,AALR,2017
2,2017-04-10,AGRO3,BRASILAGRO,ON NM,8.98,199,AGRO,2017
...,...,...,...,...,...,...,...,...
2465,2016-04-11,VVAR3,VIAVAREJO,ON N2,2.38,1,VVAR,2016
2466,2016-04-11,WEGE3,WEG,ON NM,4.74,9383,WEGE,2016


In [9]:
# Carregar dataframe com o código de emissão das empresas
AQ_FOLDER = Path('/mnt/aq_disk/data/AQ/')
df_cod = pd.read_pickle(AQ_FOLDER / 'codemi.pkl')
df_cod

Unnamed: 0,codcvm,cnpj,densoc,situac,codemi
0,60,18451005000104,ACOPALMA CIA IND...,CANCELADA,ZWVZ
1,94,92693019000189,PANATLANTICA SA,ATIVO,PATI
...,...,...,...,...,...
1764,26824,43335774000186,TRAVESSIA SECURI...,ATIVO,TMER
1765,26832,38482780000126,ANEMUS WIND HOLD...,ATIVO,ANEM


In [10]:
# A chave da união será o cód. do emissor (codemi)
# Somente o cód. CVM (codcvm) será inserido na união -> Remover colunas que não serão 
# usadas na operação de merge
df_cod = df_cod[['codcvm', 'codemi']].copy()
df_cod

Unnamed: 0,codcvm,codemi
0,60,ZWVZ
1,94,PATI
...,...,...
1764,26824,TMER
1765,26832,ANEM


In [11]:
# Criar um set com a lista de ativos antes da operação de união dos dataframes
s0 = set(df_sel.codemi.unique())
# Obter os código de emissão dos ativos unindo os dataframes 
df_sel = df_sel.merge(right=df_cod, how='inner', on='codemi')
df_sel.reset_index(drop=True, inplace=True)
print('Number of companies available for backtesting:', df_sel.codemi.nunique())
df_sel

Number of companies available for backtesting: 277


Unnamed: 0,datneg,codneg,nomres,especi,premed,totneg,codemi,year,codcvm
0,2017-04-10,AALR3,ALLIAR,ON NM,15.1,315,AALR,2017,24058
1,2018-04-10,AALR3,ALLIAR,ON NM,15,175,AALR,2018,24058
...,...,...,...,...,...,...,...,...,...
1797,2015-04-10,RUMO3,RUMO LOG,ON NM,16.5,14416,RUMO,2015,23450
1798,2016-04-11,RUMO3,RUMO LOG,ON NM,3.31,26340,RUMO,2016,23450


In [12]:
# Verificar a diferença entre os dois sets
s1 = set(df_sel.codemi.unique())
print('Núm. de empresas cujo código não foi localizado', len(s0-s1))
# 34 empresas não tiveram seu código localizado.
# Analisando os dados, tratam-se de empresas cujo código de listagem foi alterado:
# BVMF->B3SA, VVAR->VIIA, etc
(s0 - s1)

Núm. de empresas cujo código não foi localizado 34


{'ABRE',
 'ALLL',
 'BBRK',
 'BPNM',
 'BRDT',
 'BRIN',
 'BTOW',
 'BVMF',
 'CCPR',
 'CNTO',
 'CTAX',
 'DROG',
 'DTEX',
 'ECOD',
 'ENMA',
 'ESTC',
 'FJTA',
 'HRTP',
 'IDNT',
 'INPR',
 'KROT',
 'LIQO',
 'LLXL',
 'MPXE',
 'OHLB',
 'PARC',
 'PRTX',
 'QGEP',
 'RNAR',
 'SNSL',
 'SSBR',
 'TBLE',
 'VAGR',
 'VVAR'}

In [13]:
# Carregar dataframe com os dados financeiros das empresas
df_fin = pd.read_csv(
    '../data/1_companies_financials.csv',
    parse_dates=['doc_env', 'per_ini', 'per_fim']
)
# Renomear coluna com o código CVM para coincidir com as outras bases
df_fin.rename(columns={'cia_id': 'codcvm'}, inplace=True)
# As colunas 'per_ini' e 'cia_nome' não serão usadas
df_fin.drop(columns=['per_ini', 'cia_nome'], inplace=True)
# year = ano em que a informação será usada -> ano seguinte ao fim do período
df_fin['year'] = df_fin.per_fim.dt.year + 1
df_fin

Unnamed: 0,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,year
0,94,2011-03-31 10:16:48,2010-12-31,8.86e+06,-1.9e+07,2.48e+07,0.2,2011
1,94,2011-04-01 17:31:56,2010-12-31,8.86e+06,-1.9e+07,2.11e+07,0.171,2011
...,...,...,...,...,...,...,...,...
4056,80195,2021-03-12 18:33:08,2020-12-31,6.6e+07,1.04e+08,9.22e+07,0.191,2021
4057,90212,2018-05-30 15:43:03,2017-12-31,2.16e+08,-7.08e+07,2.41e+08,0.306,2018


In [14]:
# Incluir os dados contábeis em 'df_sel' 
df_sel = df_sel.merge(right=df_fin, how='inner', on=['year', 'codcvm'])
# A coluna 'codcvm não será mais usada
df_sel.drop(columns=['codcvm'], inplace=True)
df_sel

Unnamed: 0,datneg,codneg,nomres,especi,premed,totneg,codemi,year,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic
0,2017-04-10,AALR3,ALLIAR,ON NM,15.1,315,AALR,2017,2017-03-22 23:42:13,2016-12-31,1.15e+08,3.09e+08,1.01e+08,0.0662
1,2018-04-10,AALR3,ALLIAR,ON NM,15,175,AALR,2018,2018-03-28 20:20:31,2017-12-31,1.18e+08,5.08e+08,7.03e+07,0.0396
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1949,2015-04-10,RUMO3,RUMO LOG,ON NM,16.5,14416,RUMO,2015,2015-03-03 20:02:32,2014-12-31,1.03e+09,6.99e+08,2.07e+08,0.102
1950,2016-04-11,RUMO3,RUMO LOG,ON NM,3.31,26340,RUMO,2016,2016-02-25 20:38:25,2015-12-31,2.99e+08,8e+09,1.05e+09,0.0887


In [15]:
# Calcular os indicadores que dependem do preço do ação
df_sel['market_cap'] = df_sel['shares_outstanding'] * df_sel['premed']
df_sel['enterprise_value'] = df_sel['market_cap'] - df_sel['net_debt']
df_sel['earnings_yield'] = df_sel['ebit'] / df_sel['enterprise_value']
# A coluna 'premed' não será mais usada
df_sel.drop(columns=['premed'], inplace=True)
df_sel

Unnamed: 0,datneg,codneg,nomres,especi,totneg,codemi,year,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2017-04-10,AALR3,ALLIAR,ON NM,315,AALR,2017,2017-03-22 23:42:13,2016-12-31,1.15e+08,3.09e+08,1.01e+08,0.0662,1.74e+09,1.43e+09,0.0703
1,2018-04-10,AALR3,ALLIAR,ON NM,175,AALR,2018,2018-03-28 20:20:31,2017-12-31,1.18e+08,5.08e+08,7.03e+07,0.0396,1.77e+09,1.26e+09,0.0557
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1949,2015-04-10,RUMO3,RUMO LOG,ON NM,14416,RUMO,2015,2015-03-03 20:02:32,2014-12-31,1.03e+09,6.99e+08,2.07e+08,0.102,1.69e+10,1.62e+10,0.0127
1950,2016-04-11,RUMO3,RUMO LOG,ON NM,26340,RUMO,2016,2016-02-25 20:38:25,2015-12-31,2.99e+08,8e+09,1.05e+09,0.0887,9.9e+08,-7.01e+09,-0.15


In [16]:
# Remover revisões de DFPs publicadas no mesmo dia ou posteriores ao corte
# No livro, o corte é de uma semana
df_sel.query('doc_env.dt.date < datneg')

Unnamed: 0,datneg,codneg,nomres,especi,totneg,codemi,year,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2017-04-10,AALR3,ALLIAR,ON NM,315,AALR,2017,2017-03-22 23:42:13,2016-12-31,1.15e+08,3.09e+08,1.01e+08,0.0662,1.74e+09,1.43e+09,0.0703
1,2018-04-10,AALR3,ALLIAR,ON NM,175,AALR,2018,2018-03-28 20:20:31,2017-12-31,1.18e+08,5.08e+08,7.03e+07,0.0396,1.77e+09,1.26e+09,0.0557
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1949,2015-04-10,RUMO3,RUMO LOG,ON NM,14416,RUMO,2015,2015-03-03 20:02:32,2014-12-31,1.03e+09,6.99e+08,2.07e+08,0.102,1.69e+10,1.62e+10,0.0127
1950,2016-04-11,RUMO3,RUMO LOG,ON NM,26340,RUMO,2016,2016-02-25 20:38:25,2015-12-31,2.99e+08,8e+09,1.05e+09,0.0887,9.9e+08,-7.01e+09,-0.15


In [17]:
# Manter somente a DFP mais recente ao corte para cada ativo
df_sel.sort_values('doc_env', inplace=True)
df_sel.drop_duplicates(subset=['codneg', 'year'], keep='last', inplace=True)
df_sel

Unnamed: 0,datneg,codneg,nomres,especi,totneg,codemi,year,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
1349,2011-04-11,TOTS3,TOTVS,ON NM,244,TOTS,2011,2011-01-31 19:05:59,2010-12-31,3.15e+07,1.79e+08,2.12e+08,0.261,2.75e+08,9.53e+07,2.22
785,2011-04-11,LREN3,LOJAS RENNER,ON NM,3604,LREN,2011,2011-02-16 19:53:52,2010-12-31,1.22e+08,-2.72e+07,4.04e+08,0.407,8.03e+08,8.3e+08,0.487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1644,2022-04-11,ALLD3,ALLIED,ON NM,273,ALLD,2022,2022-04-27 12:11:23,2021-12-31,9.32e+07,-3.38e+08,4.08e+08,0.347,1.33e+09,1.66e+09,0.245
1785,2022-04-11,RECV3,PETRORECSA,ON NM,2648,RECV,2022,2022-06-02 17:08:52,2021-12-31,2.49e+08,-6.15e+07,2.85e+08,0.157,5.7e+09,5.77e+09,0.0494


In [18]:
# Remover o ativo menos líquido da empresa no ano pelo critério do núm. de negociações
df_sel.sort_values(by=['year', 'codemi', 'totneg'], inplace=True)
df_sel.drop_duplicates(
    subset=['codemi', 'year'], keep='last', inplace=True, ignore_index=True
)
# A coluna 'totneg' não será mais necessária
df_sel.drop(columns='totneg', inplace=True)
df_sel

Unnamed: 0,datneg,codneg,nomres,especi,codemi,year,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2011-04-11,AEDU3,ANHANGUERA,ON NM,AEDU,2011,2011-05-26 11:54:58,2010-12-31,1.46e+08,-5.71e+08,1.57e+08,0.11,1.85e+09,2.42e+09,0.065
1,2011-04-11,ALPA4,ALPARGATAS,PN N1,ALPA,2011,2011-03-30 16:55:23,2010-12-31,3.53e+08,-3.59e+08,3.24e+08,0.328,1.87e+09,2.23e+09,0.146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1275,2022-04-11,WLMM4,WLM IND COM,PN EJ,WLMM,2022,2022-03-22 22:18:18,2021-12-31,3.64e+07,-1.52e+08,1.36e+08,0.349,1.25e+09,1.4e+09,0.0973
1276,2022-04-11,YDUQ3,YDUQS PART,ON NM,YDUQ,2022,2022-03-15 18:09:20,2021-12-31,3.09e+08,3.69e+09,5.47e+08,0.0788,6.05e+09,2.35e+09,0.232


In [19]:
print('Number of companies available for backtesting', df_sel.codemi.nunique())

Number of companies available for backtesting 243


In [20]:
# O Livro fala em empresas com pelos menos USD 50 milhões de valor de mercado
# Remover empresas com menos de R$ 250 milhões de valor de mercado
df_sel.query('market_cap > 250_000_000', inplace=True)
print('Number of companies available for backtesting', df_sel.codemi.nunique())
df_sel

Number of companies available for backtesting 236


Unnamed: 0,datneg,codneg,nomres,especi,codemi,year,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2011-04-11,AEDU3,ANHANGUERA,ON NM,AEDU,2011,2011-05-26 11:54:58,2010-12-31,1.46e+08,-5.71e+08,1.57e+08,0.11,1.85e+09,2.42e+09,0.065
1,2011-04-11,ALPA4,ALPARGATAS,PN N1,ALPA,2011,2011-03-30 16:55:23,2010-12-31,3.53e+08,-3.59e+08,3.24e+08,0.328,1.87e+09,2.23e+09,0.146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1275,2022-04-11,WLMM4,WLM IND COM,PN EJ,WLMM,2022,2022-03-22 22:18:18,2021-12-31,3.64e+07,-1.52e+08,1.36e+08,0.349,1.25e+09,1.4e+09,0.0973
1276,2022-04-11,YDUQ3,YDUQS PART,ON NM,YDUQ,2022,2022-03-15 18:09:20,2021-12-31,3.09e+08,3.69e+09,5.47e+08,0.0788,6.05e+09,2.35e+09,0.232


In [21]:
df_sel['rank_roic'] = (df_sel
    .groupby(by=['year'])['roic']
    .rank(method='dense', ascending=False)
)
df_sel['rank_ey'] = (df_sel
    .groupby(by=['year'])['earnings_yield']
    .rank(method='dense', ascending=False)
)
df_sel['ranks_sum'] = df_sel['rank_roic'] + df_sel['rank_ey']
df_sel['rank_final'] = (df_sel
    .groupby(by=['year'])['ranks_sum']
    .rank(method='first', ascending=True)
)
df_sel

Unnamed: 0,datneg,codneg,nomres,especi,codemi,year,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_roic,rank_ey,ranks_sum,rank_final
0,2011-04-11,AEDU3,ANHANGUERA,ON NM,AEDU,2011,2011-05-26 11:54:58,2010-12-31,1.46e+08,-5.71e+08,1.57e+08,0.11,1.85e+09,2.42e+09,0.065,71,67,138,74
1,2011-04-11,ALPA4,ALPARGATAS,PN N1,ALPA,2011,2011-03-30 16:55:23,2010-12-31,3.53e+08,-3.59e+08,3.24e+08,0.328,1.87e+09,2.23e+09,0.146,11,45,56,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1275,2022-04-11,WLMM4,WLM IND COM,PN EJ,WLMM,2022,2022-03-22 22:18:18,2021-12-31,3.64e+07,-1.52e+08,1.36e+08,0.349,1.25e+09,1.4e+09,0.0973,23,77,100,39
1276,2022-04-11,YDUQ3,YDUQS PART,ON NM,YDUQ,2022,2022-03-15 18:09:20,2021-12-31,3.09e+08,3.69e+09,5.47e+08,0.0788,6.05e+09,2.35e+09,0.232,113,48,161,82


In [22]:
df_sel.sort_values(by=['year', 'rank_final'], inplace=True)
# Renomear a coluna 'datneg'
df_sel.rename(columns={'datneg': 'cutoff_date'}, inplace=True)
# A coluna 'year' já está implícita na coluna 'cutoff_date' e não será mais usada
df_sel.drop(columns=['year'], inplace=True)
# Remover colunas de cálculo intemediário
df_sel.drop(columns=['rank_roic', 'rank_ey', 'ranks_sum'], inplace=True)
df_sel

Unnamed: 0,cutoff_date,codneg,nomres,especi,codemi,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
57,2011-04-11,LREN3,LOJAS RENNER,ON NM,LREN,2011-02-16 19:53:52,2010-12-31,1.22e+08,-2.72e+07,4.04e+08,0.407,8.03e+08,8.3e+08,0.487,1
93,2011-04-11,TOTS3,TOTVS,ON NM,TOTS,2011-01-31 19:05:59,2010-12-31,3.15e+07,1.79e+08,2.12e+08,0.261,2.75e+08,9.53e+07,2.22,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1235,2022-04-11,RDNI3,RNI,ON NM,RDNI,2022-03-09 19:56:48,2021-12-31,4.38e+07,4.71e+08,1.98e+07,0.0179,3.51e+08,-1.21e+08,-0.164,156
1144,2022-04-11,COGN3,COGNA ON,ON NM,COGN,2022-03-24 21:24:09,2021-12-31,1.88e+09,5.87e+09,7.84e+07,0.00399,5.05e+09,-8.19e+08,-0.0957,157


In [23]:
# Select only the first 30 companies in the magic rank for each year
df_sel.query('rank_final <= 30', inplace=True)
df_sel.reset_index(drop=True, inplace=True)
print('Number of selected companies for backtesting', df_sel.codemi.nunique())
df_sel

Number of selected companies for backtesting 116


Unnamed: 0,cutoff_date,codneg,nomres,especi,codemi,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
0,2011-04-11,LREN3,LOJAS RENNER,ON NM,LREN,2011-02-16 19:53:52,2010-12-31,1.22e+08,-2.72e+07,4.04e+08,0.407,8.03e+08,8.3e+08,0.487,1
1,2011-04-11,TOTS3,TOTVS,ON NM,TOTS,2011-01-31 19:05:59,2010-12-31,3.15e+07,1.79e+08,2.12e+08,0.261,2.75e+08,9.53e+07,2.22,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,2022-04-11,PDTC3,PADTEC,ON NM,PDTC,2022-03-15 18:02:05,2021-12-31,7.84e+07,2.03e+07,5.53e+07,0.372,4.02e+08,3.81e+08,0.145,29
359,2022-04-11,POSI3,POSITIVO TEC,ON NM,POSI,2022-03-30 18:17:00,2021-12-31,1.42e+08,5.42e+08,3.03e+08,0.176,1.16e+09,6.17e+08,0.492,30


In [24]:
df_sel.to_csv('../data/2_companies_selected.csv', index=False)

In [30]:
# Check data
df_sel.query('cutoff_date > "2022-01-01"').head(4)

Unnamed: 0,cutoff_date,codneg,nomres,especi,codemi,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
330,2022-04-11,BRKM3,BRASKEM,ON N1,BRKM,2022-03-16 19:43:41,2021-12-31,797000000.0,22900000000.0,26000000000.0,0.896,34200000000.0,11400000000.0,2.29,1
331,2022-04-11,SYNE3,SYN PROP TEC,ON NM,SYNE,2022-02-25 00:12:00,2021-12-31,153000000.0,860000000.0,1630000000.0,0.45,862000000.0,2120000.0,771.0,2
332,2022-04-11,GOAU4,GERDAU MET,PN N1,GOAU,2022-02-23 07:39:32,2021-12-31,1090000000.0,6480000000.0,21000000000.0,0.419,12400000000.0,5910000000.0,3.55,3
333,2022-04-11,BRAP4,BRADESPAR,PN N1,BRAP,2022-03-29 06:35:35,2021-12-31,393000000.0,-295000000.0,8020000000.0,1.11,12200000000.0,12400000000.0,0.644,4


In [28]:
df_sel.query('codneg == "PRIO3"')

Unnamed: 0,cutoff_date,codneg,nomres,especi,codemi,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
308,2021-04-12,PRIO3,PETRORIO,ON NM,PRIO,2021-03-02 00:08:01,2020-12-31,145000000.0,1700000000.0,943000000.0,0.194,2790000000.0,1090000000.0,0.868,9
