In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import itertools
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Flatten, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

2024-05-07 21:18:19.045180: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-07 21:18:19.067448: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-07 21:18:19.067471: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-07 21:18:19.068091: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-07 21:18:19.072237: I tensorflow/core/platform/cpu_feature_guar

### 1. Carga de datos

In [2]:
def load_data(filepath):
    """Read the CSV at ``filepath`` into a DataFrame.

    The ``Date`` column is parsed as datetimes so that ``.dt`` accessors
    can be used on it later.
    """
    date_columns = ['Date']
    frame = pd.read_csv(filepath, parse_dates=date_columns)
    return frame

### 2. División y tratamiento de datos

In [3]:
def create_sequences(data, target, n_steps):
    """Build sliding windows of ``n_steps`` rows and their next-step targets.

    Window ``i`` holds rows ``i .. i + n_steps - 1`` of ``data`` and is paired
    with ``target[i + n_steps]``, i.e. the value right after the window.

    Args:
        data: 2-D array-like of features, one row per time step.
        target: 1-D array-like aligned row-by-row with ``data``.
        n_steps: number of consecutive rows in each window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays holding ``len(data) - n_steps``
        windows and targets. Both are empty when ``len(data) <= n_steps``.
    """
    # Convert up front so indexing is always positional: a pandas Series with
    # a non-default index would otherwise be looked up by label.
    data = np.asarray(data)
    target = np.asarray(target)

    X, y = [], []
    for start in range(len(data) - n_steps):
        end = start + n_steps
        X.append(data[start:end])
        y.append(target[end])
    return np.array(X), np.array(y)

In [4]:
def prepare_data(df, target_column, n_steps):
    """Turn the raw frame into scaled, windowed train/test arrays.

    Steps:
      1. Drop the target and ``LMCADY_std_5d_log`` from the features.
      2. Add ``Month``, ``Day`` and ``Weekday`` derived from ``Date``, then
         drop ``Date`` itself.
      3. Build sliding windows of ``n_steps`` rows with ``create_sequences``.
      4. Split chronologically (no shuffling), keeping the last 15% of the
         windows as the test set.
      5. Standardise the features with statistics computed on the training
         rows only.

    Args:
        df: DataFrame containing ``Date``, ``target_column`` and
            ``LMCADY_std_5d_log`` plus the feature columns.
        target_column: name of the column to predict.
        n_steps: window length, in rows.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays.
    """
    features = df.drop(columns=[target_column, 'LMCADY_std_5d_log'])

    # Calendar features: month, day of month and day of week.
    features['Month'] = df['Date'].dt.month
    features['Day'] = df['Date'].dt.day
    features['Weekday'] = df['Date'].dt.weekday

    features = features.drop(columns=['Date'])

    feature_values = features.to_numpy(dtype=float)
    target = df[target_column].to_numpy()

    # Window the *unscaled* rows first so the split is known before any
    # statistics are computed.
    X, y = create_sequences(feature_values, target, n_steps)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=False)

    # The last training window ends at row len(X_train) + n_steps - 2, so the
    # scaler only sees rows that appear in training inputs. Fitting on the
    # full frame would leak test-period mean/variance into training.
    n_train_rows = len(X_train) + n_steps - 1
    scaler = StandardScaler()
    scaler.fit(feature_values[:n_train_rows])

    n_features = feature_values.shape[1]
    # StandardScaler works on 2-D input: flatten the windows, scale, restore.
    X_train = scaler.transform(X_train.reshape(-1, n_features)).reshape(X_train.shape)
    X_test = scaler.transform(X_test.reshape(-1, n_features)).reshape(X_test.shape)

    return X_train, X_test, y_train, y_test

### 3. Arquitectura y compilación

In [5]:
def rmse(y_true, y_pred):
    """Root-mean-squared error between ``y_true`` and ``y_pred`` as a tensor.

    Passed to ``model.compile`` as a metric, which is why Keras reports it
    under the names ``rmse`` / ``val_rmse``.
    """
    residuals = y_true - y_pred
    mean_squared = tf.reduce_mean(tf.square(residuals))
    return tf.sqrt(mean_squared)

In [6]:
def build_model(input_shape, md_prm: dict):
    """Create and compile the Conv1D regression network.

    Layer stack: Conv1D -> MaxPooling1D -> Dropout -> Flatten -> Dense -> Dense(1).

    Args:
        input_shape: shape of one sample, as passed to the first layer.
        md_prm: hyper-parameters read by key: ``filters``, ``kernel_size``,
            ``activation``, ``pool_size``, ``dropout``, ``dense_units`` and
            ``dense_activation``.

    Returns:
        The model compiled with Adam, MSE loss and the ``rmse`` metric.
    """
    convolution = Conv1D(
        filters=md_prm['filters'],
        kernel_size=md_prm['kernel_size'],
        activation=md_prm['activation'],
        input_shape=input_shape,
    )
    pooling = MaxPooling1D(pool_size=md_prm['pool_size'])
    regularisation = Dropout(md_prm['dropout'])
    flatten = Flatten()
    hidden = Dense(md_prm['dense_units'], activation=md_prm['dense_activation'])
    output = Dense(1)  # single regression output

    model = Sequential([convolution, pooling, regularisation, flatten, hidden, output])
    model.compile(optimizer=Adam(), loss='mse', metrics=[rmse])

    return model

### 4. Entrenamiento del modelo

In [7]:
def train_model(model, X_train, y_train, epochs, batch_size, verbose=1):
    """Fit ``model`` with early stopping on the validation RMSE.

    Args:
        model: compiled Keras model whose metrics include one named ``rmse``
            (the callback monitors ``val_rmse``).
        X_train: training inputs.
        y_train: training targets.
        epochs: maximum number of epochs.
        batch_size: mini-batch size.
        verbose: verbosity forwarded to ``model.fit``. Defaults to 1 so the
            argument can be omitted.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    # Early stopping: stop after 20 epochs without improvement and roll back
    # to the best weights. `mode='min'` is explicit because RMSE is an error
    # and the direction should not depend on Keras guessing it from the
    # custom metric's name.
    early_stopping = EarlyStopping(
        monitor             ='val_rmse',
        mode                ='min',
        patience            =20,
        verbose             =1,
        restore_best_weights=True
    )

    # Train with the early-stopping callback; 10% of the training data is
    # held out for validation.
    history = model.fit(
        X_train, y_train,
        epochs          =epochs,
        batch_size      =batch_size,
        verbose         =verbose,
        validation_split=0.1,
        callbacks       =[early_stopping]
    )

    return history

In [8]:
# y_train.std()

In [9]:
# history = train_model(model, X_train, y_train, epochs=100, batch_size=32,)

### 5. Evaluación de resultados

In [10]:
def evaluate_model(model, X_test, y_test):
    """Run ``model.evaluate`` on the test data and print what it returns.

    Nothing is returned; the value is only printed.
    """
    test_metrics = model.evaluate(X_test, y_test)
    report = f'Test Loss: {test_metrics}'
    print(report)

In [11]:
# evaluate_model(model, X_test, y_test)

In [12]:
def plot_history(history):
    """Plot training and validation curves for the loss and every metric.

    One figure is drawn per training series in ``history.history``, with the
    loss first. Each ``val_<name>`` series is drawn on the same figure as
    ``<name>`` and is skipped when it is absent.

    Args:
        history: object exposing a ``history`` dict that maps series names to
            per-epoch values (as returned by ``model.fit``).
    """
    curves = history.history
    figsize = (8, 3)

    # Validation series are never plotted alone: filtering on the 'val_'
    # prefix (rather than a fixed list of names) keeps this correct for any
    # metric, not only 'rmse'.
    metric_names = [
        name for name in curves
        if name != 'loss' and not name.startswith('val_')
    ]
    if 'loss' in curves:
        metric_names.insert(0, 'loss')

    for name in metric_names:
        _plot_metric_curves(curves, name, figsize)


def _plot_metric_curves(curves, name, figsize):
    """Draw one figure with the training (and, if present, validation) series of ``name``."""
    plt.figure(figsize=figsize)
    plt.plot(curves[name], label=f'Training {name}')

    validation_name = f'val_{name}'
    if validation_name in curves:
        plt.plot(curves[validation_name], label=f'Validation {name}')

    plt.title(f'Model {name.capitalize()}')
    plt.ylabel(name.capitalize())
    plt.xlabel('Epoch')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
def plot_predictions(y_true, y_pred, title="Prediction vs Actual Data"):
    """Overlay predicted values on the actual ones in a single line chart.

    Args:
        y_true: actual values, plotted as a solid line with circle markers.
        y_pred: predicted values, plotted as a dashed line with cross markers.
        title: figure title.
    """
    fig, ax = plt.subplots(figsize=(10, 5))

    ax.plot(y_true, label='Actual Values', marker='o', linestyle='-')
    ax.plot(y_pred, label='Predicted Values', marker='x', linestyle='--')

    ax.set_title(title)
    ax.set_xlabel('Index')
    ax.set_ylabel('Value')
    ax.legend()

    plt.show()

In [14]:
# df = load_data("./input/copper_log_returns_5d_final.csv")
# n_steps = 25
# target_column = 'LMCADY_acu_5d_log'
# X_train, X_test, y_train, y_test = prepare_data(df, target_column, n_steps)
# y_test.std()
# model = build_model(X_train.shape[1:])
# history = train_model(model, X_train, y_train, epochs=100, batch_size=16)
# evaluate_model(model, X_test, y_test)

In [20]:
# n_steps_options     = [25, 50, 100, 150, 200]
# batch_size_options  = [32, 64, 128, 256]
# Column the model is trained to predict.
target_column = 'LMCADY_acu_5d_log'

# Data / training axes of the grid search.
n_steps_options    = [27, 31, 32, 33]
batch_size_options = [16, 32]
epochs_options     = [100]

# Architecture axes of the grid search (read by build_model via its params dict).
filters_options          = [128]
kernel_size_options      = [2, 3, 4]
pool_size_options        = [1]
dense_units_options      = [50, 100]
dense_activation_options = ['relu']
activation_options       = ['relu']
dropout_options          = [0.1]

# Every architecture combination as a tuple, in the order:
# (filters, kernel_size, pool_size, dense_units, dense_activation, activation, dropout)
model_params_combinations = [
    *itertools.product(
        filters_options,
        kernel_size_options,
        pool_size_options,
        dense_units_options,
        dense_activation_options,
        activation_options,
        dropout_options,
    )
]

In [22]:
# Size of the full grid: one training run per element of the same Cartesian
# product that run_model_iterations walks.
n_iter = sum(
    1
    for _ in itertools.product(
        n_steps_options,
        batch_size_options,
        epochs_options,
        model_params_combinations,
    )
)
print(f'Total iterations: {n_iter}')

Total iterations: 48


In [16]:
def run_model_iterations(top_k=5):
    """Grid-search the Conv1D model and keep the best-ranked runs.

    One model is trained for every combination of ``n_steps_options``,
    ``batch_size_options``, ``epochs_options`` and
    ``model_params_combinations`` (module-level settings). Each run is scored
    on the test split by ``difference = RMSE - std(y_test)`` and the runs are
    ranked by that value in ascending order, so lower (more negative) is
    better.

    A combination that raises is reported and skipped, so one bad
    configuration does not abort the whole search.

    Args:
        top_k: how many of the best-ranked runs to keep. Defaults to 5.

    Returns:
        List of at most ``top_k`` dicts, best first, with keys ``rmse``,
        ``std_dev``, ``difference``, ``params``, ``history``, ``batch_size``,
        ``n_steps``, ``predictions`` and ``actuals``.

    Raises:
        Whatever ``load_data`` raises if the input CSV cannot be read; that
        failure affects every combination, so it is not swallowed.
    """
    # Order matches the tuples stored in model_params_combinations.
    param_names = (
        'filters', 'kernel_size', 'pool_size', 'dense_units',
        'dense_activation', 'activation', 'dropout',
    )

    # The CSV does not depend on the hyper-parameters: read it once.
    # prepare_data works on copies, so the frame can be reused across runs.
    df = load_data("./input/copper_log_returns_5d_final.csv")

    grid = itertools.product(
        n_steps_options, batch_size_options, epochs_options, model_params_combinations
    )

    top_results = []
    for i, (n_steps, batch_size, epochs, combination) in enumerate(grid, start=1):
        try:
            model_params = dict(zip(param_names, combination))
            filters = model_params['filters']

            print(f"\n{i}) Testing with n_steps={n_steps}, batch_size={batch_size}, epochs={epochs}, filters={filters}")

            # Data preparation
            X_train, X_test, y_train, y_test = prepare_data(df, target_column, n_steps)

            # Model construction and training
            model   = build_model(X_train.shape[1:], model_params)
            history = train_model(model, X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)

            # Evaluation on the held-out test split
            y_pred  = model.predict(X_test)
            rmse_ob = sqrt(mean_squared_error(y_test, y_pred))
            std_dev = y_test.std()

            # Signed gap between the model's error and the spread of the
            # test targets; this is the ranking score.
            difference = rmse_ob - std_dev

            top_results.append({
                'rmse': rmse_ob,
                'std_dev': std_dev,
                'difference': difference,  # the sort key below reads this
                'params': model_params,
                'history': history,
                'batch_size': batch_size,
                'n_steps': n_steps,
                'predictions': y_pred,
                'actuals': y_test
            })
            # Re-rank after each run and keep only the best top_k.
            top_results = sorted(top_results, key=lambda run: run['difference'])[:top_k]

            print(f"\n{i}) RMSE: {rmse_ob:.6f}, Std Dev: {std_dev:.6f}, Difference: {difference:.6f}")
            print(f"Tested with n_steps={n_steps}, batch_size={batch_size}, epochs={epochs}, filters={filters}")
            print(f"Model params: {model_params}")

        except Exception as e:
            # Best effort: report and continue with the next combination.
            print(f"An error occurred: {e}. Skipping this combination.")

    return top_results

In [17]:
# Runs the whole grid search (one model is trained per combination).
# NOTE(review): the name says "top 3", but run_model_iterations slices its
# ranking with [:5] — confirm the intended count.
top_3_results = run_model_iterations()


1) Testing with n_steps=27, batch_size=16, epochs=100, filters=128


2024-05-07 21:18:20.226981: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-07 21:18:20.246216: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-07 21:18:20.246347: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Restoring model weights from the end of the best epoch: 9.
Epoch 29: early stopping

1) RMSE: 0.030816, Std Dev: 0.030614, Difference: 0.000202
Tested with n_steps=27, batch_size=16, epochs=100, filters=128
Model params: {'filters': 128, 'kernel_size': 2, 'pool_size': 1, 'dense_units': 50, 'dense_activation': 'relu', 'activation': 'relu', 'dropout': 0.1}

2) Testing with n_steps=27, batch_size=16, epochs=100, filters=128
Restoring model weights from the end of the best epoch: 3.
Epoch 23: early stopping

2) RMSE: 0.030849, Std Dev: 0.030614, Difference: 0.000235
Tested with n_steps=27, batch_size=16, epochs=100, filters=128
Model params: {'filters': 128, 'kernel_size': 2, 'pool_size': 1, 'dense_units': 100, 'dense_activation': 'relu', 'activation': 'relu', 'dropout': 0.1}

3) Testing with n_steps=27, batch_size=16, epochs=100, filters=128
Restoring model weights from the end of the best epoch: 10.
Epoch 30: early stopping

3) RMSE: 0.030769, Std Dev: 0.030614, Difference: 0.000155
Test

KeyboardInterrupt: 

In [19]:
# Summarise each retained run: headline numbers, training curves, and
# predictions against the actual test values.
for i, result in enumerate(top_3_results, 1):
    # Not named `rmse`: that would overwrite the module-level `rmse` metric
    # function that build_model passes to `model.compile`.
    rmse_value      = result['rmse']
    model_params    = result['params']
    history         = result['history']
    batch_size      = result['batch_size']
    n_steps         = result['n_steps']
    # run_model_iterations stores the test targets under 'actuals'.
    y_test          = result['actuals']
    y_pred          = result['predictions']

    print(f"\nTop {i} Model")
    print(f"Best RMSE: {rmse_value:.6f} vs. a std of {y_test.std():.6f}")
    print(f"Model parameters: {model_params}")
    print(f"n_steps: {n_steps}, batch_size: {batch_size}")

    plot_history(history)

    plot_predictions(y_test, y_pred, title=f"Top {i} Model Predictions vs Actual")

NameError: name 'top_3_results' is not defined