### Extração dos dados da API do Yahoo Finance

In [46]:
import yfinance as yf
import time
import pandas as pd

In [47]:
cryptos = ['BTC-USD', 'ETH-USD', 'SOL-USD']

try:
    for crypto in cryptos:
        data = yf.download(crypto, period='max', interval='1d', auto_adjust=True)
        data.to_csv(f'../data/raw/{crypto}.csv')
        if crypto != cryptos[-1]:
            time.sleep(10)

except Exception as err:
    print(f"Não foi possivel baixar os dados: {err}")

[*********************100%***********************]  1 of 1 completed

Não foi possivel baixar os dados: Cannot save file into a non-existent directory: '..\data\raw'





### Carregamento, Ajuste e Exportação dos Dados

#### Bitcoin

In [48]:
data = pd.read_csv('../../data/raw/SOL-USD.csv')
data = data.iloc[2:,:]
data.head(5)

Unnamed: 0,Price,Close,High,Low,Open,Volume
2,2020-04-10,0.9510539770126344,1.3134870529174805,0.6941869854927063,0.8320050239562988,87364276
3,2020-04-11,0.7768189907073975,1.0490729808807373,0.7650200128555298,0.9510539770126344,43862444
4,2020-04-12,0.8825070261955261,0.9566699862480164,0.7624260187149048,0.7854480147361755,38736897
5,2020-04-13,0.7778319716453552,0.8916029930114746,0.7739760279655457,0.8907600045204163,18211285
6,2020-04-14,0.6619250178337097,0.7964720129966736,0.6281690001487732,0.7778319716453552,16747614


In [49]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2026 entries, 2 to 2027
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Price   2026 non-null   object
 1   Close   2026 non-null   object
 2   High    2026 non-null   object
 3   Low     2026 non-null   object
 4   Open    2026 non-null   object
 5   Volume  2026 non-null   object
dtypes: object(6)
memory usage: 95.1+ KB


In [50]:
cols = ['Close', 'High', 'Low', 'Open', 'Volume']
data[cols] = data[cols].apply(pd.to_numeric, errors='coerce')
data.dtypes

Price      object
Close     float64
High      float64
Low       float64
Open      float64
Volume      int64
dtype: object

Apliquei a função pd.to_numeric em cada uma dessas colunas que converte os valores pra número (float ou int)<br>
Se algum valor não puder ser convertido (por exemplo, "ABC", "?", "-"), o parâmetro errors='coerce' manda ele transformar em NaN (valor nulo)

In [51]:
data = data.rename(columns={'Price': 'Date'})
data['Date'] = pd.to_datetime(data['Date'])
data.dtypes

Date      datetime64[ns]
Close            float64
High             float64
Low              float64
Open             float64
Volume             int64
dtype: object

In [52]:
data = data.set_index('Date')
data = data.sort_index()
data.head(5)

Unnamed: 0_level_0,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-10,0.951054,1.313487,0.694187,0.832005,87364276
2020-04-11,0.776819,1.049073,0.76502,0.951054,43862444
2020-04-12,0.882507,0.95667,0.762426,0.785448,38736897
2020-04-13,0.777832,0.891603,0.773976,0.89076,18211285
2020-04-14,0.661925,0.796472,0.628169,0.777832,16747614


Agora já ajustei o nome da coluna, transformei para data, coloquei a data como indice do meu dataset e ordenei por garantia, porque vi que a ordem era importante para uma LSTM

In [53]:
data.isna().sum()

Close     0
High      0
Low       0
Open      0
Volume    0
dtype: int64

In [54]:
data[data.duplicated() == True]

Unnamed: 0_level_0,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [55]:
data.shape

(2026, 5)

In [56]:
data.to_csv('../../data/processed/SOL-USD_raw.csv', index=True)

In [57]:
limite = data['Volume'].quantile(0.999)
data = data[data['Volume'] <= limite]
data.head(5)

Unnamed: 0_level_0,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-10,0.951054,1.313487,0.694187,0.832005,87364276
2020-04-11,0.776819,1.049073,0.76502,0.951054,43862444
2020-04-12,0.882507,0.95667,0.762426,0.785448,38736897
2020-04-13,0.777832,0.891603,0.773976,0.89076,18211285
2020-04-14,0.661925,0.796472,0.628169,0.777832,16747614


In [58]:
data.shape

(2023, 5)

In [59]:
data.to_csv('../../data/processed/SOL-USD_all.csv', index=True)

In [60]:
data_lag = data.copy()

n_steps = 30
for time_step in range(1, n_steps+1):
    data_lag['closeLag_'+str(time_step)] = data_lag['Close'].shift(-time_step)

data_lag = data_lag.dropna()
data_lag.isna().sum()


Close          0
High           0
Low            0
Open           0
Volume         0
closeLag_1     0
closeLag_2     0
closeLag_3     0
closeLag_4     0
closeLag_5     0
closeLag_6     0
closeLag_7     0
closeLag_8     0
closeLag_9     0
closeLag_10    0
closeLag_11    0
closeLag_12    0
closeLag_13    0
closeLag_14    0
closeLag_15    0
closeLag_16    0
closeLag_17    0
closeLag_18    0
closeLag_19    0
closeLag_20    0
closeLag_21    0
closeLag_22    0
closeLag_23    0
closeLag_24    0
closeLag_25    0
closeLag_26    0
closeLag_27    0
closeLag_28    0
closeLag_29    0
closeLag_30    0
dtype: int64

In [61]:
data_lag.head(5)

Unnamed: 0_level_0,Close,High,Low,Open,Volume,closeLag_1,closeLag_2,closeLag_3,closeLag_4,closeLag_5,...,closeLag_21,closeLag_22,closeLag_23,closeLag_24,closeLag_25,closeLag_26,closeLag_27,closeLag_28,closeLag_29,closeLag_30
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-10,0.951054,1.313487,0.694187,0.832005,87364276,0.776819,0.882507,0.777832,0.661925,0.646651,...,0.728794,0.715876,0.679535,0.667491,0.641837,0.622584,0.612978,0.604922,0.604527,0.537034
2020-04-11,0.776819,1.049073,0.76502,0.951054,43862444,0.882507,0.777832,0.661925,0.646651,0.690816,...,0.715876,0.679535,0.667491,0.641837,0.622584,0.612978,0.604922,0.604527,0.537034,0.515273
2020-04-12,0.882507,0.95667,0.762426,0.785448,38736897,0.777832,0.661925,0.646651,0.690816,0.660728,...,0.679535,0.667491,0.641837,0.622584,0.612978,0.604922,0.604527,0.537034,0.515273,0.538576
2020-04-13,0.777832,0.891603,0.773976,0.89076,18211285,0.661925,0.646651,0.690816,0.660728,0.681096,...,0.667491,0.641837,0.622584,0.612978,0.604922,0.604527,0.537034,0.515273,0.538576,0.550532
2020-04-14,0.661925,0.796472,0.628169,0.777832,16747614,0.646651,0.690816,0.660728,0.681096,0.606969,...,0.641837,0.622584,0.612978,0.604922,0.604527,0.537034,0.515273,0.538576,0.550532,0.569118


In [62]:
data.head(5)

Unnamed: 0_level_0,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-10,0.951054,1.313487,0.694187,0.832005,87364276
2020-04-11,0.776819,1.049073,0.76502,0.951054,43862444
2020-04-12,0.882507,0.95667,0.762426,0.785448,38736897
2020-04-13,0.777832,0.891603,0.773976,0.89076,18211285
2020-04-14,0.661925,0.796472,0.628169,0.777832,16747614


In [63]:
data_lag.shape

(1993, 35)

In [64]:
data_lag.to_csv('../../data/processed/SOL-USD_lag_all.csv', index=True)

In [65]:
data = data.loc['2021-01-01':]
data_lag = data_lag.loc['2021-01-01':]

In [66]:
data.shape

(1757, 5)

In [67]:
data_lag.shape

(1727, 35)

In [68]:
data.to_csv('../../data/processed/SOL-USD_cropped.csv', index=True)
data_lag.to_csv('../../data/processed/SOL-USD_lag_cropped.csv', index=True)

In [69]:
data_crypto_all = pd.read_csv('../../data/processed/SOL-USD_crypto_all.csv', index_col='Date', parse_dates=True)
data_crypto_cropped = data_crypto_all.loc['2021-01-01':]
data_crypto_cropped.to_csv('../../data/processed/SOL-USD_crypto_cropped.csv', index=True)
