In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.callbacks import ModelCheckpoint


path = 'data'

In [2]:
def load_file(file_path):
    
    return pd.read_csv(file_path)

def feature_engineering(df_raw, estrategy='simple'):
    
    if estrategy == 'simple':
        df = df_raw[['timestamp', 'close']]
        df = df.set_index('timestamp', drop=True)
        
    return df

def format_data(data, labels, janela_de_tempo, janela_de_predicao):
    hist = []
    target = []

    for i in range(len(data)-(janela_de_tempo + janela_de_predicao)):
        x = data[i:i+janela_de_tempo]
        y = labels[i+janela_de_tempo:i+janela_de_tempo+janela_de_predicao]

        hist.append(x)
        target.append(y)
    #convertendo de lista para array
    hist = np.array(hist)
    target = np.array(target)
    
    return hist, target

def split_train_test(hist, target, train_part=0.7):
    
    #Forma correta (na prática) de normalizar
    valued=int(hist.shape[0]*train_part)

    X_train = hist[:valued,:]
    X_test = hist[valued:,:]

    y_train = target[:valued,:]
    y_test = target[valued:,:]
    
    return X_train, X_test, y_train, y_test

def normalize_data(X_train, X_test, y_train, y_test, janela_de_tempo):
    ##Normalizando...

    sc = MinMaxScaler()
    sc.fit(X_train)
    X_train = sc.transform(X_train)
    X_test = sc.transform(X_test)

    X_train = X_train.reshape((len(X_train), janela_de_tempo, 1))
    X_test = X_test.reshape((len(X_test), janela_de_tempo, 1))

    sc.fit(y_train)
    y_train = sc.transform(y_train)
    y_test = sc.transform(y_test)
    return X_train, X_test, y_train, y_test, sc


def create_model(janela_de_tempo, estrategy='simple'):
    
    model = None
    
    if estrategy=='simple' :
        model = tf.keras.Sequential()

        #encoder
        model.add(LSTM(25, input_shape=(janela_de_tempo,1)))
        model.add(Dropout(0.10))

        #Gate do decoder
        model.add(RepeatVector(y_train.shape[1]))

        #decoder
        model.add(LSTM(25, return_sequences=True))
        model.add(TimeDistributed(Dense(10)))
        model.add(Dense(1))

        print(model.summary())
    return model

In [3]:
#list_files = os.listdir(path)
list_files = ['BCHUSDT-5m-data.csv', "ETCUSDT-5m-data.csv"]
print(f'Existem {len(list_files)} arquivos')
for file in list_files:
    print(file)

Existem 27 arquivos
ADAUSDT-5m-data.csv
ALGOUSDT-5m-data.csv
ATOMUSDT-5m-data.csv
AVAXUSDT-5m-data.csv
AXSUSDT-5m-data.csv
BCHUSDT-5m-data.csv
BNBUSDT-5m-data.csv
DOGEUSDT-5m-data.csv
DOTUSDT-5m-data.csv
ETCUSDT-5m-data.csv
FILUSDT-5m-data.csv
FTMUSDT-5m-data.csv
HBARUSDT-5m-data.csv
LINKUSDT-5m-data.csv
LTCUSDT-5m-data.csv
LUNAUSDT-5m-data.csv
MATICUSDT-5m-data.csv
NEARUSDT-5m-data.csv
SHIBUSDT-5m-data.csv
SOLUSDT-5m-data.csv
THETAUSDT-5m-data.csv
TRXUSDT-5m-data.csv
UNIUSDT-5m-data.csv
VETUSDT-5m-data.csv
XLMUSDT-5m-data.csv
XRPUSDT-5m-data.csv
XTZUSDT-5m-data.csv


In [4]:

file = 'BCHUSDT-5m-data.csv'

# Parametros do experimento
estrategy = 'simple'
optimizer='adam' 
loss='binary_crossentropy'
janela_de_tempo = 60   # Quantidade de slots utilizados pra predicao
janela_de_predicao = 10 # Quanditade de slots pra frente que serao preditos 
epochs = 2
batch_size = 32

file_path = os.path.join(path,file)
df_raw = load_file(file_path)
#del df_raw
df = feature_engineering(
    df_raw, 
    estrategy=estrategy
)


df.loc[:, 'rotulo'] = df['close']
data = df.iloc[:, 0]
labels = df['rotulo']

hist, target = format_data(data, labels, janela_de_tempo, janela_de_predicao)
X_train, X_test, y_train, y_test = split_train_test(hist, target, train_part=0.7)

X_train, X_test, y_train, y_test, sc = normalize_data(X_train, X_test, y_train, y_test, janela_de_tempo)

model = create_model(janela_de_tempo, estrategy='simple')

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 25)                2700      
                                                                 
 dropout (Dropout)           (None, 25)                0         
                                                                 
 repeat_vector (RepeatVector  (None, 10, 25)           0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 10, 25)            5100      
                                                                 
 time_distributed (TimeDistr  (None, 10, 10)           260       
 ibuted)                                                         
                                                                 
 dense_1 (Dense)             (None, 10, 1)             1

In [5]:
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=epochs, validation_data=(X_test, y_test),  batch_size=batch_size)

Epoch 1/2
Epoch 2/2


In [6]:
history.history

{'loss': [0.42545509338378906, 0.4232957661151886],
 'accuracy': [1.3444656360661611e-05, 1.3444656360661611e-05],
 'val_loss': [0.5340894460678101, 0.5186417698860168],
 'val_accuracy': [0.0, 0.0]}

In [None]:
history

In [None]:
X_train.shape

In [None]:
pred = model.predict(X_test)
pred

# Visualização

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline
sns.set(style="darkgrid", font_scale=1.5)

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [None]:
epocas_a_pular = 0

loss = history.history['loss'][epocas_a_pular:]
val_loss=history.history['val_loss'][epocas_a_pular:]
epoch_count = range(1, len(loss) + 1)
plt.figure(figsize=(12,8))
plt.plot(epoch_count, loss, 'r--')
plt.plot(epoch_count, val_loss,  )
plt.legend(['Training Loss', 'Validation Loss'])

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show();

In [None]:
dias_pra_frente = 5
pred = model.predict(X_test)

plt.figure(figsize=(12,8))
plt.plot(y_test[:,dias_pra_frente], color='blue', label='Real')
plt.plot([i[dias_pra_frente] for i in pred], color='red', label='Prediction')
plt.title('Preço')
plt.legend()
plt.show()

O modelo parece estar bem ajustado. Vamos verificar o valor real (sem normalização):

In [None]:
y_train.shape

In [None]:
y_test.shape , pred[:, :, 0].shape

In [None]:
type(y_test[0]), type(pred[0])

In [None]:
pred_transformed = sc.inverse_transform(pred[:, :, 0])
y_test_transformed = sc.inverse_transform(y_test)

In [None]:
plt.figure(figsize=(12,8))
plt.plot(y_test_transformed, color='blue', label='Real')
plt.plot(pred_transformed, color='red', label='Prediction')
plt.title('Preço real')
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(12,6))
df_aux = df
sns.lineplot(x=df_aux.reset_index().index, y="close", data=df_aux).set_title("Variações de Preço");