### Extração dos dados da API do Yahoo Finance

In [22]:
import yfinance as yf
import time
import pandas as pd

In [23]:
cryptos = ['BTC-USD', 'ETH-USD', 'SOL-USD']

try:
    for crypto in cryptos:
        data = yf.download(crypto, period='max', interval='1d', auto_adjust=True)
        data.to_csv(f'../data/raw/{crypto}.csv')
        if crypto != cryptos[-1]:
            time.sleep(10)

except Exception as err:
    print(f"Não foi possivel baixar os dados: {err}")

[*********************100%***********************]  1 of 1 completed

Não foi possivel baixar os dados: Cannot save file into a non-existent directory: '..\data\raw'





### Carregamento, Ajuste e Exportação dos Dados

#### Bitcoin

In [24]:
data = pd.read_csv('../../data/raw/BTC-USD.csv')
data = data.iloc[2:,:]
data.head(5)

Unnamed: 0,Price,Close,High,Low,Open,Volume
2,2014-09-17,457.3340148925781,468.1740112304688,452.4219970703125,465.864013671875,21056800
3,2014-09-18,424.4400024414063,456.8599853515625,413.10400390625,456.8599853515625,34483200
4,2014-09-19,394.7959899902344,427.8349914550781,384.5320129394531,424.1029968261719,37919700
5,2014-09-20,408.9039916992188,423.2959899902344,389.8829956054688,394.6730041503906,36863600
6,2014-09-21,398.8210144042969,412.4259948730469,393.1809997558594,408.0849914550781,26580100


In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4058 entries, 2 to 4059
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Price   4058 non-null   object
 1   Close   4058 non-null   object
 2   High    4058 non-null   object
 3   Low     4058 non-null   object
 4   Open    4058 non-null   object
 5   Volume  4058 non-null   object
dtypes: object(6)
memory usage: 190.3+ KB


In [26]:
cols = ['Close', 'High', 'Low', 'Open', 'Volume']
data[cols] = data[cols].apply(pd.to_numeric, errors='coerce')
data.dtypes

Price      object
Close     float64
High      float64
Low       float64
Open      float64
Volume      int64
dtype: object

Apliquei a função pd.to_numeric em cada uma dessas colunas que converte os valores pra número (float ou int)<br>
Se algum valor não puder ser convertido (por exemplo, "ABC", "?", "-"), o parâmetro errors='coerce' manda ele transformar em NaN (valor nulo)

In [27]:
data = data.rename(columns={'Price': 'Date'})
data['Date'] = pd.to_datetime(data['Date'])
data.dtypes

Date      datetime64[ns]
Close            float64
High             float64
Low              float64
Open             float64
Volume             int64
dtype: object

In [28]:
data = data.set_index('Date')
data = data.sort_index()
data.head(5)

Unnamed: 0_level_0,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-09-17,457.334015,468.174011,452.421997,465.864014,21056800
2014-09-18,424.440002,456.859985,413.104004,456.859985,34483200
2014-09-19,394.79599,427.834991,384.532013,424.102997,37919700
2014-09-20,408.903992,423.29599,389.882996,394.673004,36863600
2014-09-21,398.821014,412.425995,393.181,408.084991,26580100


Agora já ajustei o nome da coluna, transformei para data, coloquei a data como indice do meu dataset e ordenei por garantia, porque vi que a ordem era importante para uma LSTM

In [29]:
data.isna().sum()

Close     0
High      0
Low       0
Open      0
Volume    0
dtype: int64

In [30]:
data[data.duplicated() == True]

Unnamed: 0_level_0,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [31]:
data.shape

(4058, 5)

In [32]:
data.to_csv('../../data/processed/BTC-USD_raw.csv', index=True)

In [33]:
limite = data['Volume'].quantile(0.999)
data = data[data['Volume'] <= limite]
data.head(5)

Unnamed: 0_level_0,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-09-17,457.334015,468.174011,452.421997,465.864014,21056800
2014-09-18,424.440002,456.859985,413.104004,456.859985,34483200
2014-09-19,394.79599,427.834991,384.532013,424.102997,37919700
2014-09-20,408.903992,423.29599,389.882996,394.673004,36863600
2014-09-21,398.821014,412.425995,393.181,408.084991,26580100


In [34]:
data.shape

(4053, 5)

In [35]:
data.to_csv('../../data/processed/BTC-USD_all.csv', index=True)

In [36]:
data_lag = data.copy()

n_steps = 30
for time_step in range(1, n_steps+1):
    data_lag['closeLag_'+str(time_step)] = data_lag['Close'].shift(-time_step)

data_lag = data_lag.dropna()
data_lag = data_lag.drop(columns=['High', 'Low', 'Open', 'Volume'])
data_lag.isna().sum()


Close          0
closeLag_1     0
closeLag_2     0
closeLag_3     0
closeLag_4     0
closeLag_5     0
closeLag_6     0
closeLag_7     0
closeLag_8     0
closeLag_9     0
closeLag_10    0
closeLag_11    0
closeLag_12    0
closeLag_13    0
closeLag_14    0
closeLag_15    0
closeLag_16    0
closeLag_17    0
closeLag_18    0
closeLag_19    0
closeLag_20    0
closeLag_21    0
closeLag_22    0
closeLag_23    0
closeLag_24    0
closeLag_25    0
closeLag_26    0
closeLag_27    0
closeLag_28    0
closeLag_29    0
closeLag_30    0
dtype: int64

In [37]:
data_lag.head(5)

Unnamed: 0_level_0,Close,closeLag_1,closeLag_2,closeLag_3,closeLag_4,closeLag_5,closeLag_6,closeLag_7,closeLag_8,closeLag_9,...,closeLag_21,closeLag_22,closeLag_23,closeLag_24,closeLag_25,closeLag_26,closeLag_27,closeLag_28,closeLag_29,closeLag_30
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-09-17,457.334015,424.440002,394.79599,408.903992,398.821014,402.152008,435.790985,423.204987,411.574005,404.424988,...,352.940002,365.026001,361.562012,362.299011,378.549011,390.414001,400.869995,394.77301,382.556,383.757996
2014-09-18,424.440002,394.79599,408.903992,398.821014,402.152008,435.790985,423.204987,411.574005,404.424988,399.519989,...,365.026001,361.562012,362.299011,378.549011,390.414001,400.869995,394.77301,382.556,383.757996,391.441986
2014-09-19,394.79599,408.903992,398.821014,402.152008,435.790985,423.204987,411.574005,404.424988,399.519989,377.181,...,361.562012,362.299011,378.549011,390.414001,400.869995,394.77301,382.556,383.757996,391.441986,389.54599
2014-09-20,408.903992,398.821014,402.152008,435.790985,423.204987,411.574005,404.424988,399.519989,377.181,375.46701,...,362.299011,378.549011,390.414001,400.869995,394.77301,382.556,383.757996,391.441986,389.54599,382.845001
2014-09-21,398.821014,402.152008,435.790985,423.204987,411.574005,404.424988,399.519989,377.181,375.46701,386.944,...,378.549011,390.414001,400.869995,394.77301,382.556,383.757996,391.441986,389.54599,382.845001,386.475006


In [38]:
data.head(5)

Unnamed: 0_level_0,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-09-17,457.334015,468.174011,452.421997,465.864014,21056800
2014-09-18,424.440002,456.859985,413.104004,456.859985,34483200
2014-09-19,394.79599,427.834991,384.532013,424.102997,37919700
2014-09-20,408.903992,423.29599,389.882996,394.673004,36863600
2014-09-21,398.821014,412.425995,393.181,408.084991,26580100


In [39]:
data_lag.shape

(4023, 31)

In [41]:
data_lag.to_csv('../../data/processed/BTC-USD_lag_all.csv', index=True)

In [42]:
data = data.loc['2017-01-01':]
data_lag = data_lag.loc['2017-01-01':]

In [43]:
data.shape

(3216, 5)

In [44]:
data_lag.shape

(3186, 31)

In [46]:
data.to_csv('../../data/processed/BTC-USD_cropped.csv', index=True)
data_lag.to_csv('../../data/processed/BTC-USD_lag_cropped.csv', index=True)

In [48]:
data_crypto_all = pd.read_csv('../../data/processed/BTC-USD_crypto_all.csv', index_col='Date', parse_dates=True)
data_crypto_cropped = data_crypto_all.loc['2017-01-01':]
data_crypto_cropped.to_csv('../../data/processed/BTC-USD_crypto_cropped.csv', index=True)
