### Extração dos dados da API do Yahoo Finance

In [1]:
import yfinance as yf
import time
import pandas as pd

In [2]:
cryptos = ['BTC-USD', 'ETH-USD', 'SOL-USD']

try:
    for crypto in cryptos:
        data = yf.download(crypto, period='max', interval='1d', auto_adjust=True)
        data.to_csv(f'../data/raw/{crypto}.csv')
        if crypto != cryptos[-1]:
            time.sleep(10)

except Exception as err:
    print(f"Não foi possivel baixar os dados: {err}")

[*********************100%***********************]  1 of 1 completed

Não foi possivel baixar os dados: Cannot save file into a non-existent directory: '..\data\raw'





### Carregamento, Ajuste e Exportação dos Dados

#### Bitcoin

In [3]:
data = pd.read_csv('../../data/raw/ETH-USD.csv')
data = data.iloc[2:,:]
data.head(5)

Unnamed: 0,Price,Close,High,Low,Open,Volume
2,2017-11-09,320.8840026855469,329.4519958496094,307.0559997558594,308.6449890136719,893249984
3,2017-11-10,299.25299072265625,324.7179870605469,294.5419921875,320.6709899902344,885985984
4,2017-11-11,314.6809997558594,319.4530029296875,298.1919860839844,298.58599853515625,842300992
5,2017-11-12,307.9079895019531,319.15301513671875,298.51300048828125,314.69000244140625,1613479936
6,2017-11-13,316.71600341796875,328.4150085449219,307.0249938964844,307.0249938964844,1041889984


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2909 entries, 2 to 2910
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Price   2909 non-null   object
 1   Close   2909 non-null   object
 2   High    2909 non-null   object
 3   Low     2909 non-null   object
 4   Open    2909 non-null   object
 5   Volume  2909 non-null   object
dtypes: object(6)
memory usage: 136.5+ KB


In [5]:
cols = ['Close', 'High', 'Low', 'Open', 'Volume']
data[cols] = data[cols].apply(pd.to_numeric, errors='coerce')
data.dtypes

Price      object
Close     float64
High      float64
Low       float64
Open      float64
Volume      int64
dtype: object

Apliquei a função pd.to_numeric em cada uma dessas colunas que converte os valores pra número (float ou int)<br>
Se algum valor não puder ser convertido (por exemplo, "ABC", "?", "-"), o parâmetro errors='coerce' manda ele transformar em NaN (valor nulo)

In [6]:
data = data.rename(columns={'Price': 'Date'})
data['Date'] = pd.to_datetime(data['Date'])
data.dtypes

Date      datetime64[ns]
Close            float64
High             float64
Low              float64
Open             float64
Volume             int64
dtype: object

In [7]:
data = data.set_index('Date')
data = data.sort_index()
data.head(5)

Unnamed: 0_level_0,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-11-09,320.884003,329.451996,307.056,308.644989,893249984
2017-11-10,299.252991,324.717987,294.541992,320.67099,885985984
2017-11-11,314.681,319.453003,298.191986,298.585999,842300992
2017-11-12,307.90799,319.153015,298.513,314.690002,1613479936
2017-11-13,316.716003,328.415009,307.024994,307.024994,1041889984


Agora já ajustei o nome da coluna, transformei para data, coloquei a data como indice do meu dataset e ordenei por garantia, porque vi que a ordem era importante para uma LSTM

In [8]:
data.isna().sum()

Close     0
High      0
Low       0
Open      0
Volume    0
dtype: int64

In [9]:
data[data.duplicated() == True]

Unnamed: 0_level_0,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [10]:
data.shape

(2909, 5)

In [11]:
data.to_csv('../../data/processed/ETH-USD_raw.csv', index=True)

In [12]:
limite = data['Volume'].quantile(0.999)
data = data[data['Volume'] <= limite]
data.head(5)

Unnamed: 0_level_0,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-11-09,320.884003,329.451996,307.056,308.644989,893249984
2017-11-10,299.252991,324.717987,294.541992,320.67099,885985984
2017-11-11,314.681,319.453003,298.191986,298.585999,842300992
2017-11-12,307.90799,319.153015,298.513,314.690002,1613479936
2017-11-13,316.716003,328.415009,307.024994,307.024994,1041889984


In [13]:
data.shape

(2906, 5)

In [14]:
data.to_csv('../../data/processed/ETH-USD_all.csv', index=True)

In [15]:
data_lag = data.copy()

n_steps = 30
for time_step in range(1, n_steps+1):
    data_lag['closeLag_'+str(time_step)] = data_lag['Close'].shift(-time_step)

data_lag = data_lag.dropna()
data_lag.isna().sum()


Close          0
High           0
Low            0
Open           0
Volume         0
closeLag_1     0
closeLag_2     0
closeLag_3     0
closeLag_4     0
closeLag_5     0
closeLag_6     0
closeLag_7     0
closeLag_8     0
closeLag_9     0
closeLag_10    0
closeLag_11    0
closeLag_12    0
closeLag_13    0
closeLag_14    0
closeLag_15    0
closeLag_16    0
closeLag_17    0
closeLag_18    0
closeLag_19    0
closeLag_20    0
closeLag_21    0
closeLag_22    0
closeLag_23    0
closeLag_24    0
closeLag_25    0
closeLag_26    0
closeLag_27    0
closeLag_28    0
closeLag_29    0
closeLag_30    0
dtype: int64

In [16]:
data_lag.head(5)

Unnamed: 0_level_0,Close,High,Low,Open,Volume,closeLag_1,closeLag_2,closeLag_3,closeLag_4,closeLag_5,...,closeLag_21,closeLag_22,closeLag_23,closeLag_24,closeLag_25,closeLag_26,closeLag_27,closeLag_28,closeLag_29,closeLag_30
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-11-09,320.884003,329.451996,307.056,308.644989,893249984,299.252991,314.681,307.90799,316.716003,337.631012,...,447.114014,466.540009,463.449005,465.852997,470.20401,463.281006,428.588013,434.40799,456.031006,473.502014
2017-11-10,299.252991,324.717987,294.541992,320.67099,885985984,314.681,307.90799,316.716003,337.631012,333.356995,...,466.540009,463.449005,465.852997,470.20401,463.281006,428.588013,434.40799,456.031006,473.502014,441.721008
2017-11-11,314.681,319.453003,298.191986,298.585999,842300992,307.90799,316.716003,337.631012,333.356995,330.924011,...,463.449005,465.852997,470.20401,463.281006,428.588013,434.40799,456.031006,473.502014,441.721008,515.135986
2017-11-12,307.90799,319.153015,298.513,314.690002,1613479936,316.716003,337.631012,333.356995,330.924011,332.394012,...,465.852997,470.20401,463.281006,428.588013,434.40799,456.031006,473.502014,441.721008,515.135986,651.43103
2017-11-13,316.716003,328.415009,307.024994,307.024994,1041889984,337.631012,333.356995,330.924011,332.394012,347.612,...,470.20401,463.281006,428.588013,434.40799,456.031006,473.502014,441.721008,515.135986,651.43103,702.767029


In [17]:
data.head(5)

Unnamed: 0_level_0,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-11-09,320.884003,329.451996,307.056,308.644989,893249984
2017-11-10,299.252991,324.717987,294.541992,320.67099,885985984
2017-11-11,314.681,319.453003,298.191986,298.585999,842300992
2017-11-12,307.90799,319.153015,298.513,314.690002,1613479936
2017-11-13,316.716003,328.415009,307.024994,307.024994,1041889984


In [18]:
data_lag.shape

(2876, 35)

In [19]:
data_lag.to_csv('../../data/processed/ETH-USD_lag_all.csv', index=True)

In [20]:
data = data.loc['2020-06-01':]
data_lag = data_lag.loc['2020-06-01':]

In [21]:
data.shape

(1971, 5)

In [22]:
data_lag.shape

(1941, 35)

In [23]:
data.to_csv('../../data/processed/ETH-USD_cropped.csv', index=True)
data_lag.to_csv('../../data/processed/ETH-USD_lag_cropped.csv', index=True)

In [24]:
data_crypto_all = pd.read_csv('../../data/processed/ETH-USD_crypto_all.csv', index_col='Date', parse_dates=True)
data_crypto_cropped = data_crypto_all.loc['2020-06-01':]
data_crypto_cropped.to_csv('../../data/processed/ETH-USD_crypto_cropped.csv', index=True)
