In [1]:
import os

import numpy as np
import pandas as pd

# Render floats with four significant digits (printf-style '%.4g') and keep
# the displayed frames compact: at most 20 columns, 20 characters per cell
# and 4 rows.
pd.set_option('display.float_format', lambda value: '%.4g' % value)
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 4)

In [2]:
# Read the selected-stock list, parsing cutoff_date as a datetime and casting
# the final rank to an integer in the same chained expression.
df_stocks = pd.read_csv(
    "../data/2_magic_stocks.csv",
    parse_dates=["cutoff_date"],
).astype({"rank_final": int})
df_stocks

Unnamed: 0,cutoff_date,codneg,nomres,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
0,2011-04-11,LREN3,LOJAS RENNER,2011-02-16 19:53:52,2010-12-31,1.223e+08,-2.716e+07,4.045e+08,0.4069,8.029e+08,8.301e+08,0.4873,1
1,2011-04-11,TOTS3,TOTVS,2011-01-31 19:05:59,2010-12-31,3.146e+07,1.794e+08,2.117e+08,0.261,2.747e+08,9.529e+07,2.221,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,2022-04-11,PDTC3,PADTEC,2022-03-15 18:02:05,2021-12-31,7.845e+07,2.034e+07,5.534e+07,0.3722,4.017e+08,3.813e+08,0.1451,29
359,2022-04-11,POSI3,POSITIVO TEC,2022-03-30 18:17:00,2021-12-31,1.418e+08,5.416e+08,3.031e+08,0.1759,1.158e+09,6.166e+08,0.4916,30


In [3]:
# Count how many rows each ticker has: a ticker can be selected in more than
# one period, so counts above 1 are expected.
df_stocks['codneg'].value_counts()

NATU3    9
LREN3    8
        ..
VIVA3    1
PDTC3    1
Name: codneg, Length: 117, dtype: int64

In [4]:
# Keep only the two columns needed from here on: the selection (cutoff) date
# and the ticker.
cols = ['cutoff_date', 'codneg']
df_stocks = df_stocks[cols]
df_stocks

Unnamed: 0,cutoff_date,codneg
0,2011-04-11,LREN3
1,2011-04-11,TOTS3
...,...,...
358,2022-04-11,PDTC3
359,2022-04-11,POSI3


In [5]:
# Load the complete B3 adjusted price data.
# Source noted by the original author: s3://aq-dl/HistoricalQuotations/base_adj.feather
# The location defaults to the original mount point, but can be overridden with
# the B3_BASE_ADJ_PATH environment variable so the notebook also runs on
# machines where the data lives elsewhere.
file_path = os.environ.get(
    "B3_BASE_ADJ_PATH",
    "/mnt/aq_disk/data/HistoricalQuotations/processed/base_adj.feather",
)
df_b3 = pd.read_feather(file_path)
df_b3

Unnamed: 0,datneg,codneg,codisi,nomres,especi,codbdi,tpmerc,dismes,datven,prazot,...,premed,preult,preofc,preofv,preexe,totneg,quatot,voltot,evento,ajuste
0,2022-02-03,5GTK11,BR5GTKCTF000,INVESTO 5GTK,CI,14,10,100,NaT,0,...,95.7,94.86,94.86,97.7,0,85,2.085e+04,1.995e+06,,1
1,2022-02-04,5GTK11,BR5GTKCTF000,INVESTO 5GTK,CI,14,10,100,NaT,0,...,95.77,95.84,95.37,95.84,0,50,1107,1.06e+05,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10938222,2013-12-18,ZNTE6L,BRZNTEACNPB8,FERR ZANETTE,PNB,52,17,104,NaT,0,...,0.82,0.82,0,0,0,1,7.442e+05,6.102e+05,,1
10938223,2012-12-13,ZNTE7L,BRZNTEACNPC6,FERR ZANETTE,PNC*,52,17,111,NaT,0,...,0.82,0.82,0,0,0,1,4.76e+08,3.903e+05,,1


In [6]:
# Keep only stock quotes from 2011-01-01 onwards (2011 is the first year with
# available accounting data) and drop other assets (stock options, ETFs, etc).
# NOTE(review): codbdi == 2 is presumably B3's round-lot market code - confirm
# against the historical quotations layout.
first_trading_day = pd.Timestamp("2011-01-01")
# Regex alternation: "ON " or "PN " must appear in the share-type field.
# na=False treats a missing especi as "not a match" instead of breaking the mask.
is_wanted_quote = (
    df_b3['codbdi'].eq(2)
    & df_b3['datneg'].ge(first_trading_day)
    & df_b3['especi'].str.contains("ON |PN ", regex=True, na=False)
)
# Columns needed for backtesting. The daily average price (premed) will be
# used for entering and exiting positions.
cols = ['datneg', 'codneg', 'nomres', 'preult', 'premed']
# Boolean indexing already yields a new frame, so the full 10M+ row table is
# never duplicated just to be filtered.
df_prices = df_b3.loc[is_wanted_quote, cols].reset_index(drop=True)
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed
0,2016-10-28,AALR3,ALLIAR,18.93,19.01
1,2016-10-31,AALR3,ALLIAR,17.81,17.92
...,...,...,...,...,...
577535,2022-06-08,YDUQ3,YDUQS PART,14.92,15.08
577536,2022-06-09,YDUQ3,YDUQS PART,14.99,15.16


In [7]:
# Attach the selection periods to the price rows. No join key is given, so
# pandas joins on the columns the two frames share. A ticker has many price
# rows and may have several cutoff dates, so each price row is repeated once
# per period in which its ticker was selected (a many-to-many join).
df_prices = pd.merge(df_prices, df_stocks, how='inner')
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed,cutoff_date
0,2014-01-15,ABEV3,AMBEV S/A,13.08,13.12,2014-04-10
1,2014-01-16,ABEV3,AMBEV S/A,12.91,12.97,2014-04-10
...,...,...,...,...,...,...
849319,2022-06-08,YDUQ3,YDUQS PART,14.92,15.08,2020-04-09
849320,2022-06-09,YDUQ3,YDUQS PART,14.99,15.16,2020-04-09


In [8]:
# Unique cutoff dates found in the merged prices, in chronological order
values = df_prices['cutoff_date'].drop_duplicates().sort_values().tolist()
# Close the last period with a date one year after the latest cutoff
values.append(values[-1] + pd.DateOffset(years=1))
# Positional lookup table: integer position (starting at 0) -> cutoff date
cutoff_dict = dict(enumerate(values))
keys = list(cutoff_dict)
cutoff_dict

{0: Timestamp('2011-04-11 00:00:00'),
 1: Timestamp('2012-04-09 00:00:00'),
 2: Timestamp('2013-04-10 00:00:00'),
 3: Timestamp('2014-04-10 00:00:00'),
 4: Timestamp('2015-04-10 00:00:00'),
 5: Timestamp('2016-04-11 00:00:00'),
 6: Timestamp('2017-04-10 00:00:00'),
 7: Timestamp('2018-04-10 00:00:00'),
 8: Timestamp('2019-04-10 00:00:00'),
 9: Timestamp('2020-04-09 00:00:00'),
 10: Timestamp('2021-04-12 00:00:00'),
 11: Timestamp('2022-04-11 00:00:00'),
 12: Timestamp('2023-04-11 00:00:00')}

In [9]:
# Dense-rank the cutoff dates (earliest date gets 1). cutoff_dict is keyed
# from 0, so looking a rank up in it returns the cutoff date that follows
# the ranked one.
cutoff_rank = df_prices['cutoff_date'].rank(method='dense')
df_prices['cutoff_key'] = cutoff_rank.astype(int)
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed,cutoff_date,cutoff_key
0,2014-01-15,ABEV3,AMBEV S/A,13.08,13.12,2014-04-10,4
1,2014-01-16,ABEV3,AMBEV S/A,12.91,12.97,2014-04-10,4
...,...,...,...,...,...,...,...
849319,2022-06-08,YDUQ3,YDUQS PART,14.92,15.08,2020-04-09,10
849320,2022-06-09,YDUQ3,YDUQS PART,14.99,15.16,2020-04-09,10


In [10]:
# Look each rank up in cutoff_dict to get the cutoff date that closes the
# period, then discard the helper rank column.
following_cutoff = df_prices['cutoff_key'].map(cutoff_dict)
df_prices = (
    df_prices
    .assign(next_cutoff=following_cutoff)
    .drop(columns='cutoff_key')
)
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed,cutoff_date,next_cutoff
0,2014-01-15,ABEV3,AMBEV S/A,13.08,13.12,2014-04-10,2015-04-10
1,2014-01-16,ABEV3,AMBEV S/A,12.91,12.97,2014-04-10,2015-04-10
...,...,...,...,...,...,...,...
849319,2022-06-08,YDUQ3,YDUQS PART,14.92,15.08,2020-04-09,2021-04-12
849320,2022-06-09,YDUQ3,YDUQS PART,14.99,15.16,2020-04-09,2021-04-12


In [11]:
# Keep only the quotes that fall inside their own selection period. Both
# bounds are inclusive, so the cutoff day itself and the next cutoff day are
# kept. The result is ordered by ticker and trading date with a fresh index.
df_prices = (
    df_prices
    .query('cutoff_date <= datneg <= next_cutoff')
    .sort_values(['codneg', 'datneg'], ignore_index=True)
)
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed,cutoff_date,next_cutoff
0,2014-04-10,ABEV3,AMBEV S/A,13.16,13.1,2014-04-10,2015-04-10
1,2014-04-11,ABEV3,AMBEV S/A,13.49,13.34,2014-04-10,2015-04-10
...,...,...,...,...,...,...,...
81570,2021-04-09,YDUQ3,YDUQS PART,30.57,30.53,2020-04-09,2021-04-12
81571,2021-04-12,YDUQ3,YDUQS PART,30.58,30.59,2020-04-09,2021-04-12


In [12]:
# Save the per-period price table for the next step, without the row index.
df_prices.to_csv(path_or_buf='../data/3_prices.csv', index=False)