In [2]:
import yfinance as yf
import pathlib
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from backtesting_v2_9 import *

In [3]:
DATA_DIR = pathlib.Path.cwd().parent / 'data'

XP_DATA_DIR = DATA_DIR / 'xp'
PROCESSED_XP_DATA_DIR = XP_DATA_DIR / 'processed'

PRICES_DIR = DATA_DIR / 'prices'
RAW_PRICES_DIR = PRICES_DIR / 'raw'

RAW_PRICES_DIR.mkdir(parents=True, exist_ok=True)

with open(PROCESSED_XP_DATA_DIR / 'data.pkl', 'rb') as f:
    data = pickle.load(f)

# Data Extraction

In [4]:
SIM_DATE_FROM = "2021-12-01"
SIM_DATE_TO = "2024-05-31"
CDI = 11.59

In [5]:
tickers = set()

for df in data.values():
    # Loop for indices
    for ticker in df.index:
        tickers.add(ticker)
        
stocks_amount = len(tickers)
print(f"Stocks amount: {stocks_amount}")

Stocks amount: 33


In [7]:
for ticker in tickers:

    data = yf.download(ticker, start=SIM_DATE_FROM, end=SIM_DATE_TO)
    df = data.dropna()

    # Save to csv in prices folder
    df.to_csv(RAW_PRICES_DIR / f'{ticker}.csv')

[*********************100%%**********************]  1 of 1 completed


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%******

# Data Processing

In [8]:
PROCESSED_PRICES_DIR = PRICES_DIR / 'processed'

In [9]:
# Code by Raul Ikeda

# Limpeza dos dados: problema de sincronia
tickers = list(tickers)

df = pd.read_csv(RAW_PRICES_DIR / '{}.csv'.format(tickers[0]), index_col='Date')
amount_of_columns = len(df.columns)

#df.set_index('formatted_date')

# Junta todos para sincronizar as datas
for ticker in tickers[1:]:
    table = pd.read_csv(RAW_PRICES_DIR / '{}.csv'.format(ticker), index_col='Date')
    # Junção dos dois dataframes, usando a data como chave comum
    df = df.join(table, how='inner',rsuffix=f'_{ticker}')
    
# Separa tudo para salvar os dados sincronizados
df_0 = df.iloc[:,0:amount_of_columns]
df_0.to_csv(PROCESSED_PRICES_DIR / '{}.csv'.format(tickers[0]))

i = amount_of_columns
for ticker in tickers[1:]:
    df_i = df.iloc[:,i:i+amount_of_columns]
    df_i.rename(columns = {f'Open_{ticker}':'Open', f'High_{ticker}':'High', f'Low_{ticker}':'Low',
                           f'Close_{ticker}':'Close', f'Adj Close_{ticker}':'Adj Close', 
                           f'Volume_{ticker}':'Volume'}, inplace = True)
    
    df_i.to_csv(PROCESSED_PRICES_DIR / '{}.csv'.format(ticker))
    
    i+= amount_of_columns


In [10]:
df = pd.DataFrame()
for ticker in tickers:
    df[ticker] = pd.read_csv(PROCESSED_PRICES_DIR / '{}.csv'.format(ticker))['Adj Close'].iloc[:]

df = df.dropna()
df

Unnamed: 0,RAIZ4.SA,TOTS3.SA,CPLE6.SA,ITUB4.SA,SMFT3.SA,GGBR4.SA,ELET3.SA,PETZ3.SA,PETR4.SA,WEGE3.SA,...,SOMA3.SA,RENT3.SA,GMAT3.SA,VALE3.SA,SMTO3.SA,JBSS3.SA,SBSP3.SA,ARZZ3.SA,VIVA3.SA,RADL3.SA
0,4.923124,28.930948,5.128630,19.539875,16.659586,16.114634,31.568790,18.324335,11.870057,31.075983,...,12.677909,47.320843,5.661659,57.687241,31.882263,30.082554,31.784008,64.599976,23.172520,20.379776
1,5.102473,29.377386,5.262932,20.278564,17.071175,17.032127,33.255669,17.956856,12.775752,31.295971,...,12.413173,50.428009,5.721360,60.365013,32.644024,31.082943,34.348442,67.965347,22.695717,20.772228
2,5.398399,29.600601,5.212569,20.208214,17.874754,17.149120,34.113567,18.999704,12.955945,31.353357,...,13.217185,51.498138,5.860662,59.034340,32.383907,29.577929,34.928715,69.732635,23.401382,21.323544
3,5.228017,29.717064,5.153812,20.445650,18.315744,17.549372,34.393112,18.791134,13.014510,31.573351,...,13.364262,51.413662,6.069616,62.237808,33.238556,29.870083,35.387318,71.175529,24.135654,21.295507
4,5.308724,30.454651,5.237750,20.190624,18.119749,17.826466,33.872585,19.585688,13.226236,33.055897,...,13.638802,50.587589,6.258670,62.697800,33.350037,30.463230,35.555786,70.237274,23.840038,21.753378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,2.830000,29.680000,9.400000,32.141708,22.330000,18.450001,36.139999,3.960000,36.810001,39.029999,...,6.030000,45.700001,7.620000,65.050003,27.340000,29.480000,75.360001,50.950001,22.150000,26.660000
618,2.830000,29.059999,9.300000,31.831884,21.860001,18.719999,36.200001,3.830000,36.610001,38.340000,...,6.070000,45.400002,7.510000,65.080002,26.910000,29.360001,75.610001,50.959999,22.209999,26.059999
619,2.900000,28.820000,9.300000,31.721947,21.600000,18.510000,36.200001,3.800000,37.009998,38.349998,...,6.030000,44.939999,7.560000,65.300003,27.049999,28.959999,75.070000,50.970001,22.490000,25.920000
620,2.850000,28.500000,9.310000,31.552042,22.000000,18.469999,35.860001,3.800000,37.799999,38.230000,...,6.060000,44.560001,7.500000,63.889999,27.500000,28.950001,74.629997,50.990002,22.480000,25.830000


In [11]:
COMPILED_PRICES_DIR = PRICES_DIR / 'compiled'
COMPILED_PRICES_DIR.mkdir(parents=True, exist_ok=True)

with open (COMPILED_PRICES_DIR / 'compiled.pkl', 'wb') as f:
    pickle.dump(df, f)