#### Importação das bibliotecas

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer

from scikeras.wrappers import KerasRegressor

from keras.models import Sequential
from keras.layers import Input, LSTM, Dropout, Dense
from keras.callbacks import EarlyStopping


#### Carregamento e ajuste dos dados

In [None]:
# Read the raw CSV and drop its first two rows in one chained expression.
# NOTE(review): presumably those two rows are extra header lines from the export — confirm against the file.
data = pd.read_csv('./data/BTC-USD.csv').iloc[2:, :]
data.head(10)

In [None]:
# Print each column's dtype and non-null count to check how the CSV was parsed.
data.info()

Aqui notamos que todas as colunas estão com o tipo incorreto; vamos ajustar cada uma delas.

In [None]:
# Columns that must be numeric; any value that cannot be parsed becomes NaN.
cols = ['Close', 'High', 'Low', 'Open', 'Volume']
data[cols] = data[cols].apply(lambda column: pd.to_numeric(column, errors='coerce'))
data.dtypes

Apliquei a função `pd.to_numeric` em cada uma dessas colunas; ela converte os valores para número (float ou int).<br>
Se algum valor não puder ser convertido (por exemplo, "ABC", "?", "-"), o parâmetro `errors='coerce'` faz com que ele seja transformado em NaN (valor nulo).

Agora só falta converter a coluna de data para o tipo datetime.

In [None]:
# Rename the 'Price' column to 'Date' and parse it as datetime in a single chain.
data = (
    data
    .rename(columns={'Price': 'Date'})
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)
data.dtypes

In [None]:
# Use the date as the index and sort it so the rows are in chronological order.
data = data.set_index('Date').sort_index()
data.head(15)


Agora já ajustei o nome da coluna, converti-a para o tipo data, defini a data como índice do dataset e ordenei por garantia, pois a ordem temporal é importante para uma LSTM.

In [None]:
# Count missing values per column (the 'coerce' conversion above turns unparseable values into NaN).
data.isna().sum()

In [None]:
# Show the rows whose column values repeat an earlier row; the boolean mask is used directly.
data[data.duplicated()]

#### Explorando um pouco os dados

In [None]:
# Summary statistics, transposed so that each attribute is a row.
data.describe().T

In [None]:
# Median of each column, shown as a one-column table named 'median'.
data.median().to_frame(name='median')

In [None]:
# Variance of each column, shown as a one-column table named 'var'.
data.var().to_frame(name='var')

In [None]:
# plt.figure() (lowercase) opens a new current figure managed by pyplot.
# The previous plt.Figure() only instantiated a detached Figure object that was
# discarded immediately and never used by the heatmap.
plt.figure()
# Pairwise correlation between all attributes, annotated with two decimals.
sns.heatmap(data.corr(), annot=True, cmap="YlGn", fmt=".2f")
plt.show()

In [None]:
# Attribute -> line colour, in the order the lines are drawn.
line_colors = {
    'Close': 'green',
    'Open': 'blue',
    'High': 'orange',
    'Low': 'red',
    'Volume': 'purple',
}

plt.figure(figsize=(12,6))
for column_name, line_color in line_colors.items():
    plt.plot(data.index, data[column_name], label=column_name, color=line_color)

plt.title('Atributos ao longo do tempo')
plt.xlabel('Data')
plt.ylabel('Valores')
plt.legend()
plt.show()

Aqui não conseguimos visualizar bem porque o valor do volume é muito superior aos demais; para melhorar a visualização, vou normalizar os dados.

In [None]:
# Rescale every column to [0, 1] so all attributes can share one axis in the next plot.
scaler = MinMaxScaler(feature_range=(0,1)).fit(data)
scaled_data = scaler.transform(data)
# Wrap the array back into a DataFrame with the original column labels and date index.
data_scaled = pd.DataFrame(scaled_data, index=data.index, columns=data.columns)

In [None]:
# Attribute -> line colour, in the order the lines are drawn.
line_colors = {
    'Close': 'green',
    'Open': 'blue',
    'High': 'orange',
    'Low': 'red',
    'Volume': 'purple',
}

plt.figure(figsize=(20,8))

for column_name, line_color in line_colors.items():
    plt.plot(data_scaled.index, data_scaled[column_name], label=column_name, color=line_color)

plt.title('Atributos ao longo do tempo')
plt.xlabel('Data')
plt.ylabel('Valores')
plt.legend()
plt.show()

Podemos notar que Close, Open, High e Low possuem praticamente a mesma tendência, como indicado pela matriz de correlação; já o Volume tem um comportamento diferente, com picos e quedas bruscas.

#### Aplicação de Modelo para predição

##### OBJETIVOS

- [ ] Comparar com outros métodos

##### Univariado

In [None]:
# def create_sequences(data=[], seq_len=7):
#     X, y = [], []
#     for i in range(len(data) - seq_len):
#         X.append(data[i:i+seq_len])
#         y.append(data[i+seq_len])
#     return np.array(X), np.array(y)


In [None]:
# serie = data['Close']

# train_size = int(0.7 * len(serie))

# train_data = serie[:train_size].values.reshape(-1, 1)
# test_data = serie[train_size:].values.reshape(-1, 1)

# # print(test_data)

# scaler = MinMaxScaler(feature_range=(0,1))
# data_train_scaled = scaler.fit_transform(train_data)
# data_test_scaled = scaler.transform(test_data)


In [None]:
# SEQ_LEN = 14

# X_train_scaled, y_train_scaled = create_sequences(data_train_scaled, SEQ_LEN)
# X_test_scaled, y_test_scaled = create_sequences(data_test_scaled, SEQ_LEN)

In [None]:

# model = Sequential([
#     LSTM(100, input_shape=(SEQ_LEN, 1), return_sequences=True),
#     Dropout(0.2),
#     LSTM(50, return_sequences=False),
#     Dropout(0.2),
#     Dense(1)
# ])

# es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# history = model.fit(
#     X_train_scaled, y_train_scaled,
#     epochs=100,
#     batch_size=32,
#     validation_split=0.1,
#     callbacks=[es],
#     shuffle=False
# )

In [None]:
# plt.figure(figsize=(12,4))
# plt.plot(history.history['loss'], label="Training Loss", color='blue') 
# plt.plot(history.history['val_loss'], label="Validation Loss", color='purple')
# plt.legend()
# plt.show()

In [None]:
# y_pred_scaled = model.predict(X_test_scaled)
# y_pred = scaler.inverse_transform(y_pred_scaled)
# y_test = scaler.inverse_transform(y_test_scaled)

# print(y_test)


##### Multivariado

In [None]:
def create_sequences(X, y, seq_len=14):
    """Build sliding windows over X, each paired with the y value that follows it.

    Args:
        X: Indexable sequence of feature rows; sliced as X[i:i+seq_len].
        y: Indexable sequence of targets aligned row-by-row with X.
        seq_len: Number of consecutive rows in each window.

    Returns:
        Tuple (windows, targets) of NumPy arrays where windows[k] is
        X[k:k+seq_len] and targets[k] is y[k+seq_len]. Both arrays are
        empty when len(X) <= seq_len.
    """
    n_windows = len(X) - seq_len
    windows = [X[start:start + seq_len] for start in range(n_windows)]
    targets = [y[start + seq_len] for start in range(n_windows)]
    return np.array(windows), np.array(targets)


# Predictors and target; y keeps the 2-D (n_rows, 1) shape the scaler expects.
X = data[['High', 'Low', 'Open', 'Volume']]
y = data[['Close']]

scaler_X = MinMaxScaler(feature_range=(0,1))
scaler_y = MinMaxScaler(feature_range=(0,1))

# Fit the scalers on the chronologically first 80% of rows only, so the min/max of the
# hold-out period never leaks into training. For every window length used downstream the
# training windows cover at least these rows (train_size = int(0.8 * len(X_seq))).
# Values in the later rows may therefore fall outside [0, 1] after transform.
scaler_fit_rows = int(0.8 * len(X))
scaler_X.fit(X.iloc[:scaler_fit_rows])
scaler_y.fit(y.iloc[:scaler_fit_rows])

# Apply the train-fitted scaling to the full series; the windows are built from these arrays.
X_scaled = scaler_X.transform(X)
y_scaled = scaler_y.transform(y)


In [None]:
def create_lstm_model(seq_len=14, n_features=4, neurons=100, drop=0.2, optimizer='Adam'):
  """Build and compile a two-layer stacked LSTM that outputs a single value.

  Args:
      seq_len: Number of time steps in each input window.
      n_features: Number of features per time step.
      neurons: Units in the first LSTM layer; the second layer uses neurons // 2.
      drop: Dropout rate applied after each LSTM layer.
      optimizer: Optimizer passed to compile().

  Returns:
      The compiled Sequential model (loss 'mse', metric 'mae').
  """
  layers = [
      Input(shape=(seq_len, n_features)),
      # The first LSTM returns the full sequence so the second LSTM can consume it.
      LSTM(neurons, return_sequences=True),
      Dropout(drop),
      LSTM(neurons // 2, return_sequences=False),
      Dropout(drop),
      Dense(1),
  ]
  network = Sequential(layers)
  network.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
  return network

# Window lengths and numbers of time-series CV folds to try.
seq_len_options = [7, 14, 30]
split_options = [3, 5, 10]
# One entry per (seq_len, n_splits) combination, holding the best estimator found for it.
models = []

# The hyper-parameter grid is identical for every combination, so it is built once.
# 'model__*' keys are routed to create_lstm_model; the others go to the wrapper's fit settings.
param_grid = {
  'model__neurons': [32, 64, 96],
  'model__drop': [0.05, 0.1, 0.2],
  'batch_size': [16, 32],
  'epochs': [30, 50],
  'model__optimizer': ['Adam'],
}

for seq_len in seq_len_options:

  print(f"===== JANELA DE {seq_len} =====")

  X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_len)

  # Chronological hold-out: the first 80% of the windows train, the last 20% test.
  train_size = int(0.8 * len(X_seq))

  X_train_scaled, X_test_scaled = X_seq[:train_size], X_seq[train_size:]
  y_train_scaled, y_test_scaled = y_seq[:train_size], y_seq[train_size:]

  for split in split_options:
    print(f"_____ SPLIT DE {split} _____")

    early_stopping = EarlyStopping(monitor='val_mae', patience=10, restore_best_weights=True)

    # callbacks / validation_split are given to the wrapper itself instead of being passed
    # as **kwargs to fit(): SciKeras documents kwargs routing through fit() as unsupported
    # by the scikit-learn API and scheduled for removal.
    model = KerasRegressor(
      model=create_lstm_model,
      verbose=0,
      seq_len=seq_len,
      n_features=X_train_scaled.shape[2],
      callbacks=[early_stopping],
      validation_split=0.2,
    )

    # Expanding-window folds that never train on data later than the validation fold.
    tscv = TimeSeriesSplit(n_splits=split)

    # NOTE(review): n_jobs=-1 trains several Keras models in parallel worker processes —
    # confirm the machine has enough memory for that.
    grid = GridSearchCV(
      estimator=model,
      param_grid=param_grid,
      cv=tscv,
      refit=True,
      scoring="neg_root_mean_squared_error",
      n_jobs=-1,
      error_score='raise',
    )

    grid_result = grid.fit(X_train_scaled, y_train_scaled)

    # best_score_ is a negated RMSE (scikit-learn maximises scores), so flip the sign back.
    models.append({
      "model": grid_result.best_estimator_,
      "rmse": -grid_result.best_score_,
      "n_splits": split,
      "seq_len": seq_len,
      "params": grid_result.best_params_
    })

    print(f"Menor RMSE: {-grid_result.best_score_:.3f} usando: {grid_result.best_params_}")

In [None]:
# Keep the search result with the lowest cross-validated RMSE and expose its refitted estimator.
best_model_entry = min(models, key=lambda entry: entry["rmse"])
best_model = best_model_entry["model"]
best_model

In [None]:
# Evaluate the estimator selected as best, not just the last grid search that happened to run.
# Its hold-out windows are rebuilt with that estimator's own sequence length, because the
# input shape depends on seq_len.
best_seq_len = best_model_entry["seq_len"]
X_seq_best, y_seq_best = create_sequences(X_scaled, y_scaled, best_seq_len)
test_start = int(0.8 * len(X_seq_best))  # same chronological 80/20 split used during training
X_test_scaled, y_test_scaled = X_seq_best[test_start:], y_seq_best[test_start:]

y_pred_scaled = best_model.predict(X_test_scaled)
# inverse_transform needs a 2-D column; reshape in case predict returns a flat vector.
y_pred = scaler_y.inverse_transform(np.asarray(y_pred_scaled).reshape(-1, 1))
y_test = scaler_y.inverse_transform(y_test_scaled)


In [None]:
# Métricas
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")

mae_percent = (mae / len(y_test)) * 100
rmse_percent = (rmse / len(y_test)) * 100
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
print(f"MAE(%): {mae_percent:.2f}%")
print(f"RMSE(%): {rmse_percent:.2f}%")
print(f"MAPE(%): {mape:.2f}%")

Método univariado ["Close"]

MAE: 1823.2193996762269
RMSE: 2849.412183517845
MAE(%): 3.34%
RMSE(%): 5.23%
MAPE(%): 2.91%

In [None]:
# Coefficient of determination between the true and predicted values.
r2 = r2_score(y_test, y_pred)
print(f"R²: {r2:.2f}")


In [None]:
# NaN everywhere except the trailing stretch covered by the predictions, so the red line
# is drawn only over the final len(y_pred) dates.
y_pred_plot = np.full(len(data['Close']), np.nan)
y_pred_plot[-len(y_pred):] = y_pred.ravel()

true_close = data['Close'].to_numpy()

plt.figure(figsize=(20,8))
plt.plot(data.index, y_pred_plot, color="red", marker=",", label='Predicted Close')
plt.plot(data.index, true_close, color="black", marker=",", label='Total True Close')
plt.title('Close: total real vs predito')
plt.legend()
plt.show()

In [None]:
# Number of days from the start of the test period to zoom into.
dias_analise=30

# y_pred / y_test correspond to the last len(y_test) rows of `data`, so the x-axis dates
# must come from that tail, not from the first rows of the whole dataset.
test_dates = data.index[-len(y_test):]

plt.figure(figsize=(20,8))
plt.plot(test_dates[:dias_analise], y_pred[:dias_analise], color="red", marker=",", label='Predicted Close')
plt.plot(test_dates[:dias_analise], y_test[:dias_analise], color="black", marker=",", label='True Close')
plt.title('Close: real vs predito')
plt.legend()
plt.grid(True, linestyle="--")
plt.show()