In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
# Mostrar floats com duas casas decimas
pd.set_option('display.float_format',  lambda x: '%.2f' % x)
pd.options.display.max_colwidth = 20
pd.options.display.max_columns = 20
pd.options.display.max_rows = 6

In [2]:
# Load dataset not adjusted for dividends
DATA_FOLDER = Path("/mnt/aq_disk/data/HistoricalQuotations/interim")
DATASET1 = DATA_FOLDER / "dataset_95-21.feather"
DATASET2 = DATA_FOLDER / "dataset_22.feather"
cols = ['datneg', 'codneg', 'nomres', 'especi', 'codbdi', 'tpmerc', 'preult', 'premed', 'totneg', 'voltot']
df_magic = (pd
    .concat([pd.read_feather(DATASET1), pd.read_feather(DATASET2)], ignore_index=True)
    [cols]
    .query('tpmerc == 10')
)
df_magic

Unnamed: 0,datneg,codneg,nomres,especi,codbdi,tpmerc,preult,premed,totneg,voltot
0,2020-02-07,A1AP34,ADVANCE AUTO,DRN,2,10,143.12,143.12,2,6139848.00
1,2020-02-10,A1AP34,ADVANCE AUTO,DRN,2,10,142.27,142.27,1,512172.00
2,2020-02-13,A1AP34,ADVANCE AUTO,DRN,2,10,147.37,147.37,1,235792.00
...,...,...,...,...,...,...,...,...,...,...
11055742,2022-07-07,R2NG34,RINGCENTRAL,DRN,2,10,12.54,12.54,1,75.24
11055743,2022-07-07,RADL3,RAIADROGASIL,ON EJ NM,2,10,20.00,19.89,12391,121884011.00
11055744,2022-07-07,RAIL3,RUMO S.A.,ON NM,2,10,16.12,16.24,16254,177053334.00


In [3]:
"""
Filter:
    1. Quotations after 2011
    2. Standard batch (codbdi == 2) -> remove companies in bankruptcy, judicial
    recovery, etc. from input to selection (not from output!)
    3. ON, PN or PNA Shares
"""
df_magic.query('\
    codbdi == 2 and \
    datneg >= "2011.01.01" and \
    especi.str.contains("ON |PN |PNA")'
    , inplace=True
)
df_magic.reset_index(drop=True, inplace=True)
df_magic

Unnamed: 0,datneg,codneg,nomres,especi,codbdi,tpmerc,preult,premed,totneg,voltot
0,2016-10-28,AALR3,ALLIAR,ON NM,2,10,19.20,19.28,4460,122334647.00
1,2016-10-31,AALR3,ALLIAR,ON NM,2,10,18.06,18.17,4238,45857231.00
2,2016-11-01,AALR3,ALLIAR,ON NM,2,10,17.90,17.74,2072,17676981.00
...,...,...,...,...,...,...,...,...,...,...
613575,2022-07-07,QUAL3,QUALICORP,ON NM,2,10,12.47,12.54,7508,21304784.00
613576,2022-07-07,RADL3,RAIADROGASIL,ON EJ NM,2,10,20.00,19.89,12391,121884011.00
613577,2022-07-07,RAIL3,RUMO S.A.,ON NM,2,10,16.12,16.24,16254,177053334.00


In [4]:
# Keep only the columns that will be used to filter the dates
cols = ['datneg', 'codneg', 'nomres', 'premed', 'totneg']
df_magic = df_magic.loc[:, cols]
# Insert the issuers code -> first 4 characters of the trading code
df_magic['codemi'] = df_magic['codneg'].str[0:4]
# Insert the day of the year and the year for the stock filter operation
df_magic['day_year'] = df_magic['datneg'].dt.day_of_year
df_magic['year'] = df_magic['datneg'].dt.year
# Sort the dataframe by asset and date
df_magic.sort_values(by=['codneg', 'datneg'], inplace=True)
print('Number of companies available for backtesting', df_magic.codemi.nunique())
df_magic

Number of companies available for backtesting 511


Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,day_year,year
0,2016-10-28,AALR3,ALLIAR,19.28,4460,AALR,302,2016
1,2016-10-31,AALR3,ALLIAR,18.17,4238,AALR,305,2016
2,2016-11-01,AALR3,ALLIAR,17.74,2072,AALR,306,2016
...,...,...,...,...,...,...,...,...
611563,2022-07-18,YDUQ3,YDUQS PART,15.11,15921,YDUQ,199,2022
612406,2022-07-19,YDUQ3,YDUQS PART,14.00,16580,YDUQ,200,2022
613136,2022-07-20,YDUQ3,YDUQS PART,14.07,15143,YDUQ,201,2022


In [5]:
# Create a column with the 30-day moving average of the number of trades for each asset
df_magic['totneg_sma30'] = df_magic.groupby('codneg')['totneg'].transform(lambda x: x.rolling(30, 1).mean())
df_magic

Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,day_year,year,totneg_sma30
0,2016-10-28,AALR3,ALLIAR,19.28,4460,AALR,302,2016,4460.00
1,2016-10-31,AALR3,ALLIAR,18.17,4238,AALR,305,2016,4349.00
2,2016-11-01,AALR3,ALLIAR,17.74,2072,AALR,306,2016,3590.00
...,...,...,...,...,...,...,...,...,...
611563,2022-07-18,YDUQ3,YDUQS PART,15.11,15921,YDUQ,199,2022,11343.03
612406,2022-07-19,YDUQ3,YDUQS PART,14.00,16580,YDUQ,200,2022,11718.30
613136,2022-07-20,YDUQ3,YDUQS PART,14.07,15143,YDUQ,201,2022,12000.97


In [6]:
# Set the cut-off day and remove post-cut trades
df_magic.query('day_year >= 100', inplace=True)
df_magic.reset_index(drop=True, inplace=True)
df_magic

Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,day_year,year,totneg_sma30
0,2016-10-28,AALR3,ALLIAR,19.28,4460,AALR,302,2016,4460.00
1,2016-10-31,AALR3,ALLIAR,18.17,4238,AALR,305,2016,4349.00
2,2016-11-01,AALR3,ALLIAR,17.74,2072,AALR,306,2016,3590.00
...,...,...,...,...,...,...,...,...,...
441371,2022-07-18,YDUQ3,YDUQS PART,15.11,15921,YDUQ,199,2022,11343.03
441372,2022-07-19,YDUQ3,YDUQS PART,14.00,16580,YDUQ,200,2022,11718.30
441373,2022-07-20,YDUQ3,YDUQS PART,14.07,15143,YDUQ,201,2022,12000.97


In [7]:
# Calculate the day closest to 100 in each year
df_balancing = df_magic.groupby(by=['year'])['day_year'].min().reset_index()
df_balancing

Unnamed: 0,year,day_year
0,2011,101
1,2012,100
2,2013,100
...,...,...
9,2020,100
10,2021,102
11,2022,101


In [8]:
# Filter stock dataframe with data on cut days only
# Use 'year' and 'day_year' as keys for the union operation between the two dataframes
df_magic = df_magic.merge(
    right=df_balancing, how='inner', on=['year', 'day_year']
)
# The 'day_year' column will no longer be used
df_magic.drop(columns=['day_year'], inplace=True)
print('Number of companies available for backtesting:', df_magic.codemi.nunique())
df_magic

Number of companies available for backtesting: 395


Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,year,totneg_sma30
0,2017-04-10,AALR3,ALLIAR,15.36,315,AALR,2017,456.73
1,2017-04-10,ABCB4,ABC BRASIL,18.36,1005,ABCB,2017,2294.30
2,2017-04-10,AGRO3,BRASILAGRO,12.29,199,AGRO,2017,330.10
...,...,...,...,...,...,...,...,...
2587,2016-04-11,VLID3,VALID,31.31,2720,VLID,2016,2778.93
2588,2016-04-11,VVAR3,VIAVAREJO,2.38,1,VVAR,2016,8.40
2589,2016-04-11,WEGE3,WEG,13.55,9383,WEGE,2016,10665.47


In [9]:
# Load the list of excluded companies as per step 1
excluded_companies = pd.read_csv('../data/external/excluded_companies.csv')
excluded_companies = excluded_companies['company_code'].to_list()
df_magic.query('codemi != @excluded_companies', inplace=True)
print('Number of companies available for backtesting', df_magic.codemi.nunique())
df_magic

Number of companies available for backtesting 323


Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,year,totneg_sma30
0,2017-04-10,AALR3,ALLIAR,15.36,315,AALR,2017,456.73
2,2017-04-10,AGRO3,BRASILAGRO,12.29,199,AGRO,2017,330.10
3,2017-04-10,ALPA3,ALPARGATAS,10.59,3,ALPA,2017,4.97
...,...,...,...,...,...,...,...,...
2587,2016-04-11,VLID3,VALID,31.31,2720,VLID,2016,2778.93
2588,2016-04-11,VVAR3,VIAVAREJO,2.38,1,VVAR,2016,8.40
2589,2016-04-11,WEGE3,WEG,13.55,9383,WEGE,2016,10665.47


In [10]:
# Load dataframe with CVM code of listed companies
df_cod = pd.read_pickle('/mnt/aq_disk/data/AQ/cod_emissor.pkl')
df_cod

Unnamed: 0,codcvm,cnpj,densoc,situac,codemi
0,60,18451005000104,ACOPALMA CIA IND...,CANCELADA,ZWVZ
1,94,92693019000189,PANATLANTICA SA,ATIVO,PATI
2,108,60664810000174,AÇOS VILLARES SA,CANCELADA,AVIL
...,...,...,...,...,...
1766,26824,43335774000186,TRAVESSIA SECURI...,ATIVO,TMER
1767,26832,38482780000126,ANEMUS WIND HOLD...,ATIVO,ANEM
1768,26840,44841035000129,SAP SECURITIZADO...,ATIVO,SAPS


In [11]:
# The union key will be the issuer code (codemi)
# Only CVM (codcvm) will be inserted into the join
# Remove columns that will not be used in the merge operation
df_cod = df_cod[['codcvm', 'codemi']].copy()
df_cod

Unnamed: 0,codcvm,codemi
0,60,ZWVZ
1,94,PATI
2,108,AVIL
...,...,...
1766,26824,TMER
1767,26832,ANEM
1768,26840,SAPS


In [12]:
# Create a set with the list of assets before the dataframe union operation
s0 = set(df_magic.codemi.unique())
# Get the asset issue codes by joining the dataframes
df_magic = df_magic.merge(right=df_cod, how='inner', on='codemi')
df_magic.reset_index(drop=True, inplace=True)
print('Number of companies available for backtesting:', df_magic.codemi.nunique())
df_magic

Number of companies available for backtesting: 287


Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,year,totneg_sma30,codcvm
0,2017-04-10,AALR3,ALLIAR,15.36,315,AALR,2017,456.73,24058
1,2018-04-10,AALR3,ALLIAR,15.17,175,AALR,2018,557.40,24058
2,2019-04-10,AALR3,ALLIAR,14.64,156,AALR,2019,489.83,24058
...,...,...,...,...,...,...,...,...,...
1869,2014-04-10,CGRA4,GRAZZIOTIN,17.35,6,CGRA,2014,15.97,4537
1870,2015-04-10,RUMO3,RUMO LOG,1.65,14416,RUMO,2015,10434.43,23450
1871,2016-04-11,RUMO3,RUMO LOG,3.31,26340,RUMO,2016,6106.43,23450


In [13]:
# Check the difference between the two sets
s1 = set(df_magic.codemi.unique())
print('Núm. de empresas cujo código não foi localizado', len(s0-s1))
# 34 companies did not have their code localized. Analyzing the data, these are
# companies whose listing code has changed : BVMF->B3SA, VVAR->VIIA, etc
print(s0 - s1)

Núm. de empresas cujo código não foi localizado 36
{'IDNT', 'QGEP', 'RNAR', 'ECOD', 'CTAX', 'BRIN', 'LLXL', 'CNTO', 'CCPR', 'BTOW', 'BBRK', 'DTEX', 'TBLE', 'PRTX', 'MPXE', 'HRTP', 'DROG', 'ALLL', 'PARC', 'CELP', 'BPNM', 'VVAR', 'ENMA', 'SNSL', 'TIBR', 'BRDT', 'INPR', 'LIQO', 'ESTC', 'KROT', 'BVMF', 'VAGR', 'OHLB', 'FJTA', 'SSBR', 'ABRE'}


In [14]:
# Load dataframe with the companies' financial data
df_fin = (pd
    .read_csv(
        '../data/magic_financials.csv',
        parse_dates=['doc_env', 'per_ini', 'per_fim']
        )
)
# Rename column with CVM code to match other bases
df_fin.rename(columns={'cia_id': 'codcvm'}, inplace=True)
# The columns 'per_ini' and 'cia_nome' will not be used
df_fin.drop(columns=['per_ini', 'cia_nome'], inplace=True)
# year = year in which the information will be used -> year following the end of the period
df_fin['year'] = df_fin.per_fim.dt.year + 1
df_fin

Unnamed: 0,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,year
0,19992,2011-01-31 19:05:59,2010-12-31,31.46,179.42,211.67,0.19,2011
1,7510,2011-02-08 20:00:11,2010-12-31,74.76,748.49,76.90,0.04,2011
2,7510,2011-02-10 09:51:21,2010-12-31,74.76,748.49,76.90,0.04,2011
...,...,...,...,...,...,...,...,...
2816,20516,2022-06-20 21:19:43,2022-03-31,354.01,5922.43,2412.70,0.19,2023
2817,26522,2022-06-21 19:15:55,2022-03-31,458.28,674.07,795.16,0.36,2023
2818,25496,2022-06-22 20:02:33,2022-03-31,294.70,857.45,751.94,0.27,2023


In [15]:
# Include accounting data in 'df_magic'
df_magic = df_magic.merge(right=df_fin, how='inner', on=['year', 'codcvm'])
df_magic

Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic
0,2017-04-10,AALR3,ALLIAR,15.36,315,AALR,2017,456.73,24058,2017-03-22 23:42:13,2016-12-31,114.90,308.50,100.61,0.06
1,2018-04-10,AALR3,ALLIAR,15.17,175,AALR,2018,557.40,24058,2018-03-28 20:20:31,2017-12-31,118.29,507.65,70.34,0.03
2,2019-04-10,AALR3,ALLIAR,14.64,156,AALR,2019,489.83,24058,2019-03-19 19:58:58,2018-12-31,118.29,532.35,137.29,0.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1726,2014-04-10,CGRA4,GRAZZIOTIN,17.35,6,CGRA,2014,15.97,4537,2014-02-27 08:09:24,2013-12-31,21.82,-87.38,53.33,0.14
1727,2015-04-10,RUMO3,RUMO LOG,1.65,14416,RUMO,2015,10434.43,23450,2015-03-03 20:02:32,2014-12-31,1026.49,699.23,206.69,0.09
1728,2016-04-11,RUMO3,RUMO LOG,3.31,26340,RUMO,2016,6106.43,23450,2016-02-25 20:38:25,2015-12-31,299.02,8003.92,1051.48,0.34


In [16]:
accounting_columns = ['shares_outstanding', 'net_debt', 'ebit']
df_magic[accounting_columns] = df_magic[accounting_columns]
df_magic

Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic
0,2017-04-10,AALR3,ALLIAR,15.36,315,AALR,2017,456.73,24058,2017-03-22 23:42:13,2016-12-31,114.90,308.50,100.61,0.06
1,2018-04-10,AALR3,ALLIAR,15.17,175,AALR,2018,557.40,24058,2018-03-28 20:20:31,2017-12-31,118.29,507.65,70.34,0.03
2,2019-04-10,AALR3,ALLIAR,14.64,156,AALR,2019,489.83,24058,2019-03-19 19:58:58,2018-12-31,118.29,532.35,137.29,0.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1726,2014-04-10,CGRA4,GRAZZIOTIN,17.35,6,CGRA,2014,15.97,4537,2014-02-27 08:09:24,2013-12-31,21.82,-87.38,53.33,0.14
1727,2015-04-10,RUMO3,RUMO LOG,1.65,14416,RUMO,2015,10434.43,23450,2015-03-03 20:02:32,2014-12-31,1026.49,699.23,206.69,0.09
1728,2016-04-11,RUMO3,RUMO LOG,3.31,26340,RUMO,2016,6106.43,23450,2016-02-25 20:38:25,2015-12-31,299.02,8003.92,1051.48,0.34


In [17]:
# Calculate the indicators that depend on the share price
df_magic['market_cap'] = df_magic['shares_outstanding'] * df_magic['premed']
df_magic['enterprise_value'] = df_magic['market_cap'] + df_magic['net_debt']
df_magic['earnings_yield'] = df_magic['ebit'] / df_magic['enterprise_value']
# The 'premed' column will no longer be used
df_magic.drop(columns=['premed'], inplace=True)
df_magic

Unnamed: 0,datneg,codneg,nomres,totneg,codemi,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2017-04-10,AALR3,ALLIAR,315,AALR,2017,456.73,24058,2017-03-22 23:42:13,2016-12-31,114.90,308.50,100.61,0.06,1764.85,2073.35,0.05
1,2018-04-10,AALR3,ALLIAR,175,AALR,2018,557.40,24058,2018-03-28 20:20:31,2017-12-31,118.29,507.65,70.34,0.03,1794.50,2302.15,0.03
2,2019-04-10,AALR3,ALLIAR,156,AALR,2019,489.83,24058,2019-03-19 19:58:58,2018-12-31,118.29,532.35,137.29,0.05,1731.81,2264.15,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1726,2014-04-10,CGRA4,GRAZZIOTIN,6,CGRA,2014,15.97,4537,2014-02-27 08:09:24,2013-12-31,21.82,-87.38,53.33,0.14,378.51,291.13,0.18
1727,2015-04-10,RUMO3,RUMO LOG,14416,RUMO,2015,10434.43,23450,2015-03-03 20:02:32,2014-12-31,1026.49,699.23,206.69,0.09,1693.71,2392.94,0.09
1728,2016-04-11,RUMO3,RUMO LOG,26340,RUMO,2016,6106.43,23450,2016-02-25 20:38:25,2015-12-31,299.02,8003.92,1051.48,0.34,989.74,8993.66,0.12


In [18]:
# There are some companies, such as COGN3, with low market value and excess net debt
# resulting in near zero EV and, consequently, distorted EY (EBIT/EV tending to infinity).
# Although the book does not make this clear, we will be removing these companies from the selection.
df_magic.query('enterprise_value > 100', inplace=True)
print('Number of companies available for backtesting', df_magic.codneg.nunique())

Number of companies available for backtesting 234


In [19]:
# Remove revisions of DFPs published on or after the cut. In the book, the cut is one week
df_magic.query('doc_env.dt.date < datneg', inplace=True)
df_magic

Unnamed: 0,datneg,codneg,nomres,totneg,codemi,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2017-04-10,AALR3,ALLIAR,315,AALR,2017,456.73,24058,2017-03-22 23:42:13,2016-12-31,114.90,308.50,100.61,0.06,1764.85,2073.35,0.05
1,2018-04-10,AALR3,ALLIAR,175,AALR,2018,557.40,24058,2018-03-28 20:20:31,2017-12-31,118.29,507.65,70.34,0.03,1794.50,2302.15,0.03
2,2019-04-10,AALR3,ALLIAR,156,AALR,2019,489.83,24058,2019-03-19 19:58:58,2018-12-31,118.29,532.35,137.29,0.05,1731.81,2264.15,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1726,2014-04-10,CGRA4,GRAZZIOTIN,6,CGRA,2014,15.97,4537,2014-02-27 08:09:24,2013-12-31,21.82,-87.38,53.33,0.14,378.51,291.13,0.18
1727,2015-04-10,RUMO3,RUMO LOG,14416,RUMO,2015,10434.43,23450,2015-03-03 20:02:32,2014-12-31,1026.49,699.23,206.69,0.09,1693.71,2392.94,0.09
1728,2016-04-11,RUMO3,RUMO LOG,26340,RUMO,2016,6106.43,23450,2016-02-25 20:38:25,2015-12-31,299.02,8003.92,1051.48,0.34,989.74,8993.66,0.12


In [20]:
# Keep only the latest DFP for each asset
df_magic.sort_values('doc_env', inplace=True)
df_magic.drop_duplicates(subset=['codneg', 'year'], keep='last', inplace=True)
df_magic

Unnamed: 0,datneg,codneg,nomres,totneg,codemi,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
1246,2011-04-11,TOTS3,TOTVS,244,TOTS,2011,485.33,19992,2011-01-31 19:05:59,2010-12-31,31.46,179.42,211.67,0.19,994.10,1173.53,0.18
705,2011-04-11,LREN3,LOJAS RENNER,3604,LREN,2011,3465.70,8133,2011-02-16 19:53:52,2010-12-31,122.35,-27.16,404.47,0.31,6682.70,6655.54,0.06
67,2011-04-11,AMAR3,LOJAS MARISA,183,AMAR,2011,407.27,22055,2011-02-18 15:38:17,2010-12-31,184.55,37.21,285.95,0.31,4981.03,5018.24,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609,2022-04-11,KRSA3,KORA SAUDE,201,KRSA,2022,802.60,25879,2022-03-30 23:29:03,2021-12-31,767.17,1290.38,132.81,0.13,2631.41,3921.78,0.03
1507,2022-04-11,CEDO4,CEDRO,5,CEDO,2022,16.27,3077,2022-03-31 11:19:14,2021-12-31,10.00,206.73,44.58,0.09,50.20,256.93,0.17
1505,2022-04-11,CEDO3,CEDRO,1,CEDO,2022,7.63,3077,2022-03-31 11:19:14,2021-12-31,10.00,206.73,44.58,0.09,70.00,276.73,0.16


In [21]:
# Remove the least liquid asset from the company (column 'totneg_sma30')
df_magic.sort_values(by=['year', 'codemi', 'totneg_sma30'], inplace=True)
df_magic.drop_duplicates(
    subset=['codemi', 'year'], keep='last', inplace=True, ignore_index=True
)
# Column 'codemi' can now be removed
df_magic.drop(columns='codemi', inplace=True)
print('Number of companies available for backtesting', df_magic.codneg.nunique())
df_magic

Number of companies available for backtesting 213


Unnamed: 0,datneg,codneg,nomres,totneg,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2011-04-11,AEDU3,ANHANGUERA,978,2011,1031.77,18961,2011-03-30 00:09:51,2010-12-31,145690.26,-571.50,157.01,0.08,5537686.82,5537115.32,0.00
1,2011-04-11,ALPA4,ALPARGATAS,158,2011,210.80,10456,2011-03-30 16:55:23,2010-12-31,353.46,-358.70,324.23,0.21,4011.73,3653.02,0.09
2,2011-04-11,ALSC3,ALIANSCE,102,2011,274.13,21300,2011-03-29 11:28:24,2010-12-31,139.47,-141.72,99.44,0.10,1941.38,1799.66,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1106,2022-04-11,WEGE3,WEG,25960,2022,28835.07,5410,2022-02-16 07:05:37,2021-12-31,4197.32,-1428.02,4158.34,0.30,138427.55,136999.53,0.03
1107,2022-04-11,WLMM4,WLM IND COM,14,2022,11.37,11070,2022-03-22 22:18:18,2021-12-31,36.41,-152.00,136.13,0.27,1258.13,1106.12,0.12
1108,2022-04-11,YDUQ3,YDUQS PART,10252,2022,14513.70,21016,2022-03-15 18:09:20,2021-12-31,309.09,3692.84,546.59,0.06,6092.14,9784.98,0.06


In [22]:
# Check a company
df_magic.query('codneg.str.startswith("BRAP")')

Unnamed: 0,datneg,codneg,nomres,totneg,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
10,2011-04-11,BRAP4,BRADESPAR,3861,2011,3451.13,18724,2011-03-25 16:48:14,2010-12-31,349.55,2471.05,2453.4,0.2,15083.0,17554.04,0.14


In [23]:
# The Book talks about companies with at least USD 50 million in market value
# Remove companies with less than BRL 250 million in market value
df_magic.query('market_cap > 250', inplace=True)
# Remove stocks with very low liquidity
df_magic.query('totneg_sma30 > 100', inplace=True)
# The 'totneg' and 'totneg_sma30' columns are no longer needed
df_magic.drop(columns=['totneg', 'totneg_sma30'], inplace=True)
print('Number of companies available for backtesting', df_magic.codneg.nunique())
df_magic

Number of companies available for backtesting 195


Unnamed: 0,datneg,codneg,nomres,year,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2011-04-11,AEDU3,ANHANGUERA,2011,18961,2011-03-30 00:09:51,2010-12-31,145690.26,-571.50,157.01,0.08,5537686.82,5537115.32,0.00
1,2011-04-11,ALPA4,ALPARGATAS,2011,10456,2011-03-30 16:55:23,2010-12-31,353.46,-358.70,324.23,0.21,4011.73,3653.02,0.09
2,2011-04-11,ALSC3,ALIANSCE,2011,21300,2011-03-29 11:28:24,2010-12-31,139.47,-141.72,99.44,0.10,1941.38,1799.66,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1105,2022-04-11,VVEO3,VIVEO,2022,25682,2022-03-30 18:25:34,2021-12-31,286.12,-76.53,492.30,0.25,4506.44,4429.92,0.11
1106,2022-04-11,WEGE3,WEG,2022,5410,2022-02-16 07:05:37,2021-12-31,4197.32,-1428.02,4158.34,0.30,138427.55,136999.53,0.03
1108,2022-04-11,YDUQ3,YDUQS PART,2022,21016,2022-03-15 18:09:20,2021-12-31,309.09,3692.84,546.59,0.06,6092.14,9784.98,0.06


In [24]:
# Create the ranks
df_magic['rank_roic'] = (df_magic
    .groupby(by=['year'])['roic']
    .rank(method='dense', ascending=False)
)
df_magic['rank_ey'] = (df_magic
    .groupby(by=['year'])['earnings_yield']
    .rank(method='dense', ascending=False)
)
df_magic['ranks_sum'] = df_magic['rank_roic'] + df_magic['rank_ey']
df_magic['rank_final'] = (df_magic
    .groupby(by=['year'])['ranks_sum']
    .rank(method='first', ascending=True)
)
cols_integer = ['rank_roic', 'rank_ey', 'ranks_sum', 'rank_final']
df_magic[cols_integer] = df_magic[cols_integer].astype(int)
df_magic

Unnamed: 0,datneg,codneg,nomres,year,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_roic,rank_ey,ranks_sum,rank_final
0,2011-04-11,AEDU3,ANHANGUERA,2011,18961,2011-03-30 00:09:51,2010-12-31,145690.26,-571.50,157.01,0.08,5537686.82,5537115.32,0.00,55,71,126,65
1,2011-04-11,ALPA4,ALPARGATAS,2011,10456,2011-03-30 16:55:23,2010-12-31,353.46,-358.70,324.23,0.21,4011.73,3653.02,0.09,12,34,46,18
2,2011-04-11,ALSC3,ALIANSCE,2011,21300,2011-03-29 11:28:24,2010-12-31,139.47,-141.72,99.44,0.10,1941.38,1799.66,0.06,50,55,105,61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1105,2022-04-11,VVEO3,VIVEO,2022,25682,2022-03-30 18:25:34,2021-12-31,286.12,-76.53,492.30,0.25,4506.44,4429.92,0.11,35,55,90,40
1106,2022-04-11,WEGE3,WEG,2022,5410,2022-02-16 07:05:37,2021-12-31,4197.32,-1428.02,4158.34,0.30,138427.55,136999.53,0.03,23,114,137,66
1108,2022-04-11,YDUQ3,YDUQS PART,2022,21016,2022-03-15 18:09:20,2021-12-31,309.09,3692.84,546.59,0.06,6092.14,9784.98,0.06,107,91,198,106


In [25]:
df_magic.query('year == 2012').sort_values('rank_final')

Unnamed: 0,datneg,codneg,nomres,year,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_roic,rank_ey,ranks_sum,rank_final
175,2012-04-09,VALE5,VALE,2012,4170,2012-02-16 11:24:28,2011-12-31,5365.31,39166.50,53087.42,0.24,217616.77,256783.27,0.21,9,2,11,1
112,2012-04-09,ETER3,ETERNIT,2012,5762,2012-03-12 17:47:08,2011-12-31,89.50,-20.48,117.30,0.22,813.55,793.08,0.15,11,5,16,2
117,2012-04-09,FHER3,FER HERINGER,2012,20621,2012-03-14 08:59:20,2011-12-31,48.47,611.98,306.05,0.20,640.30,1252.28,0.24,16,1,17,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,2012-04-09,AEDU3,ANHANGUERA,2012,18961,2012-03-31 23:15:31,2011-12-31,145.69,292.39,86.10,0.04,3752.98,4045.37,0.02,65,71,136,69
130,2012-04-09,IMCH3,IMC HOLDINGS,2012,22438,2012-03-14 21:14:16,2011-12-31,83.68,104.34,33.84,0.03,1394.12,1498.46,0.02,69,70,139,70
118,2012-04-09,FIBR3,FIBRIA,2012,12793,2012-02-01 20:40:29,2011-12-31,467.59,9264.58,377.66,0.01,7004.53,16269.10,0.02,71,69,140,71


In [26]:
df_magic.sort_values(by=['year', 'rank_final'], inplace=True)
df_magic.rename(columns={'datneg': 'rebalance_on'}, inplace=True)
# The 'year' column is already implied in the 'rebalance_on' column and can be removed
df_magic.drop(columns=['year'], inplace=True)
# Remove intermediate calculation columns
df_magic.drop(columns=['rank_roic', 'rank_ey', 'ranks_sum'], inplace=True)
df_magic

Unnamed: 0,rebalance_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
6,2011-04-11,AUTM3,AUTOMETAL,22381,2011-02-28 14:52:34,2010-12-31,94.42,252.11,247.23,1.12,1606.17,1858.28,0.13,1
23,2011-04-11,ETER3,ETERNIT,5762,2011-03-11 11:18:46,2010-12-31,89.50,-56.72,123.66,0.30,987.18,930.47,0.13,2
82,2011-04-11,TOTS3,TOTVS,19992,2011-01-31 19:05:59,2010-12-31,31.46,179.42,211.67,0.19,994.10,1173.53,0.18,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,2022-04-11,DASA3,DASA,19623,2022-03-29 12:56:36,2021-12-31,560.51,4032.79,88.52,0.01,12953.41,16986.20,0.01,134
998,2022-04-11,ELMD3,ELETROMIDIA,25569,2022-03-29 19:03:12,2021-12-31,139.14,28.30,9.11,0.01,2177.62,2205.92,0.00,135
972,2022-04-11,AMAR3,LOJAS MARISA,22055,2022-03-16 19:26:20,2021-12-31,261.67,1201.01,4.80,0.00,761.45,1962.46,0.00,136


In [27]:
df_magic.sort_values('rank_final')
df_magic

Unnamed: 0,rebalance_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
6,2011-04-11,AUTM3,AUTOMETAL,22381,2011-02-28 14:52:34,2010-12-31,94.42,252.11,247.23,1.12,1606.17,1858.28,0.13,1
23,2011-04-11,ETER3,ETERNIT,5762,2011-03-11 11:18:46,2010-12-31,89.50,-56.72,123.66,0.30,987.18,930.47,0.13,2
82,2011-04-11,TOTS3,TOTVS,19992,2011-01-31 19:05:59,2010-12-31,31.46,179.42,211.67,0.19,994.10,1173.53,0.18,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,2022-04-11,DASA3,DASA,19623,2022-03-29 12:56:36,2021-12-31,560.51,4032.79,88.52,0.01,12953.41,16986.20,0.01,134
998,2022-04-11,ELMD3,ELETROMIDIA,25569,2022-03-29 19:03:12,2021-12-31,139.14,28.30,9.11,0.01,2177.62,2205.92,0.00,135
972,2022-04-11,AMAR3,LOJAS MARISA,22055,2022-03-16 19:26:20,2021-12-31,261.67,1201.01,4.80,0.00,761.45,1962.46,0.00,136


In [28]:
# Check if Petrobras (state owned) is the ranking
df_magic.query('codneg == "PETR4"')

Unnamed: 0,rebalance_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final


In [29]:
# Test Braskem rank
df_magic.query('codneg == "BRKM5"')

Unnamed: 0,rebalance_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
12,2011-04-11,BRKM5,BRASKEM,4820,2011-03-17 09:14:46,2010-12-31,801.66,9867.90,3214.96,0.18,18085.56,27953.46,0.12,14
101,2012-04-09,BRKM5,BRASKEM,4820,2012-03-26 20:08:37,2011-12-31,801.66,12006.80,1929.90,0.06,11560.01,23566.81,0.08,49
188,2013-04-10,BRKM5,BRASKEM,4820,2013-02-07 10:13:32,2012-12-31,797.27,14051.87,1538.60,0.05,12453.28,26505.15,0.06,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598,2018-04-10,BRKM5,BRASKEM,4820,2018-03-29 05:11:07,2017-12-31,797.26,17569.80,9359.06,0.36,38204.58,55774.38,0.17,1
678,2019-04-10,BRKM5,BRASKEM,4820,2019-03-13 19:01:57,2018-12-31,797.22,17259.68,8303.94,0.24,38282.43,55542.12,0.15,3
982,2022-04-11,BRKM5,BRASKEM,4820,2022-03-16 19:43:41,2021-12-31,797.21,22861.69,26043.55,0.84,36137.43,58999.13,0.44,1


In [30]:
# Select only the first 30 companies in the magic rank for each year
df_magic.query('rank_final <= 30', inplace=True)
df_magic.sort_values(['rebalance_on', 'rank_final'], inplace=True, ignore_index=True)
print('Number of selected companies for backtesting', df_magic.codneg.nunique())
df_magic

Number of selected companies for backtesting 114


Unnamed: 0,rebalance_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
0,2011-04-11,AUTM3,AUTOMETAL,22381,2011-02-28 14:52:34,2010-12-31,94.42,252.11,247.23,1.12,1606.17,1858.28,0.13,1
1,2011-04-11,ETER3,ETERNIT,5762,2011-03-11 11:18:46,2010-12-31,89.50,-56.72,123.66,0.30,987.18,930.47,0.13,2
2,2011-04-11,TOTS3,TOTVS,19992,2011-01-31 19:05:59,2010-12-31,31.46,179.42,211.67,0.19,994.10,1173.53,0.18,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,2022-04-11,ODPV3,ODONTOPREV,20125,2022-02-23 18:10:22,2021-12-31,531.29,-331.56,507.95,0.71,5791.11,5459.55,0.09,28
358,2022-04-11,BEEF3,MINERVA,20931,2022-02-23 21:43:02,2021-12-31,606.70,6102.70,2035.42,0.22,8251.13,14353.83,0.14,29
359,2022-04-11,JHSF3,JHSF PART,20605,2022-02-24 19:37:23,2021-12-31,686.22,1099.50,1113.51,0.17,4412.42,5511.92,0.20,30


In [31]:
# Save file
df_magic.to_csv('../data/magic_stocks.csv', index=False)

In [32]:
# Test file
pd.read_csv('../data/magic_stocks.csv')

Unnamed: 0,rebalance_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
0,2011-04-11,AUTM3,AUTOMETAL,22381,2011-02-28 14:52:34,2010-12-31,94.42,252.11,247.23,1.12,1606.17,1858.28,0.13,1
1,2011-04-11,ETER3,ETERNIT,5762,2011-03-11 11:18:46,2010-12-31,89.50,-56.72,123.66,0.30,987.18,930.47,0.13,2
2,2011-04-11,TOTS3,TOTVS,19992,2011-01-31 19:05:59,2010-12-31,31.46,179.42,211.67,0.19,994.10,1173.53,0.18,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,2022-04-11,ODPV3,ODONTOPREV,20125,2022-02-23 18:10:22,2021-12-31,531.29,-331.56,507.95,0.71,5791.11,5459.55,0.09,28
358,2022-04-11,BEEF3,MINERVA,20931,2022-02-23 21:43:02,2021-12-31,606.70,6102.70,2035.42,0.22,8251.13,14353.83,0.14,29
359,2022-04-11,JHSF3,JHSF PART,20605,2022-02-24 19:37:23,2021-12-31,686.22,1099.50,1113.51,0.17,4412.42,5511.92,0.20,30


In [33]:
# Some of the stocks will be selected in multiple periods
print(df_magic.codneg.value_counts().head(5).to_markdown())

|       |   codneg |
|:------|---------:|
| GRND3 |        9 |
| BEEF3 |        9 |
| ODPV3 |        8 |
| RAPT4 |        8 |
| TGMA3 |        8 |


In [34]:
# Check 2022 data
df_22 = df_magic.query('rebalance_on > "2022-01-01"')[['codneg', 'nomres', 'roic', 'earnings_yield']].reset_index(drop=True)
df_22.index += 1
df_22[['roic', 'earnings_yield']] = df_22[['roic', 'earnings_yield']].round(2)
print(df_22.to_markdown(mode='github'))

|    | codneg   | nomres       |   roic |   earnings_yield |
|---:|:---------|:-------------|-------:|-----------------:|
|  1 | BRKM5    | BRASKEM      |   0.84 |             0.44 |
|  2 | ENAT3    | ENAUTA PART  |   0.76 |             0.6  |
|  3 | USIM5    | USIMINAS     |   0.42 |             0.74 |
|  4 | CMIN3    | CSNMINERACAO |   0.7  |             0.37 |
|  5 | TASA4    | TAURUS ARMAS |   0.79 |             0.29 |
|  6 | GOAU4    | GERDAU MET   |   0.34 |             1.09 |
|  7 | MRFG3    | MARFRIG      |   0.43 |             0.33 |
|  8 | VALE3    | VALE         |   0.5  |             0.29 |
|  9 | DEXP3    | DEXXOS PAR   |   0.41 |             0.26 |
| 10 | GGBR4    | GERDAU       |   0.34 |             0.36 |
| 11 | LEVE3    | METAL LEVE   |   0.37 |             0.25 |
| 12 | CURY3    | CURY S/A     |   0.51 |             0.2  |
| 13 | SYNE3    | SYN PROP TEC |   0.27 |             0.91 |
| 14 | LAVV3    | LAVVI        |   0.27 |             0.33 |
| 15 | ALLD3    | ALLIED

In [35]:
df_magic.query('codneg == "PRIO3"')

Unnamed: 0,rebalance_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
224,2018-04-10,PRIO3,PETRORIO,22187,2018-03-16 00:18:21,2017-12-31,13.19,-476.29,42.21,0.09,690.99,214.71,0.2,15
272,2020-04-09,PRIO3,PETRORIO,22187,2020-02-22 00:56:46,2019-12-31,143.19,1511.96,924.34,1.25,3800.15,5312.12,0.17,3
