In [1]:
%load_ext autoreload
%autoreload 2

In [28]:
import polars as pl
import pandas as pd
from utils import (remove_columns_with_nulls_above_threshold,
                    daily_price_dateset_manipulation, 
                    apy_dateset_manipulation, 
                    tvl_dateset_manipulation,
                    dimensionality_reduction)
import xgboost as xgb
import numpy as np
import onnxruntime as rt
import onnx
from tqdm import tqdm
from arch import arch_model
from giza_actions.task import task
from giza_actions.action import Action, action
from giza_actions.model import GizaModel
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import RFE
from lightgbm import LGBMRegressor
import torch
from torch import nn
from hummingbird.ml import convert

import certifi
import os
os.environ['SSL_CERT_FILE'] = certifi.where()

In [3]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np




In [72]:
@task(name=f'Join and postprocessing')
def loading_and_processing():
    """
    Loads and processes the main, APY, and TVL datasets, joining them on the date column and performing postprocessing.

    Returns:
    A DataFrame ready for further analysis or model training, containing combined and processed features from all datasets.
    """
    
    df_main = daily_price_dateset_manipulation()
    apy_df = apy_dateset_manipulation()
    tvl_df = tvl_dateset_manipulation()

    df_main = df_main.merge(tvl_df.to_pandas(), on = "date", how = "inner")
    df_main = df_main.merge(apy_df.to_pandas(), on = "date", how = "inner")

    df_main = remove_columns_with_nulls_above_threshold(df_main, 0.05)
    return df_main

@task(name=f'prepare dataset')
def prepare_dataset(df, test_n = 60):
    X = df.drop(['WETH_future_vol', 'date'], axis=1)
    y = df['WETH_future_vol']

    X_train = X.iloc[:-test_n]
    X_test = X.iloc[-test_n:]
    y_train = y.iloc[:-test_n]
    y_test = y.iloc[-test_n:]

    X_train = dimensionality_reduction(X_train, y_train)
    X_test = X_test[X_train.columns]
    return X_train, X_test, y_train, y_test

@task(name=f'train model')
def train_model(X, y):

    params = {
    'learning_rate': 0.005,
    'n_estimators': 1000,
    'early_stopping_rounds': 50, 
    'verbose': -1
    }

    tscv = TimeSeriesSplit(n_splits=5)

    optimal_rounds = []

    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = LGBMRegressor(**params)
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)])
        optimal_rounds.append(model.best_iteration_)

    optimal_rounds_avg = sum(optimal_rounds) / len(optimal_rounds)

    model_full = LGBMRegressor(learning_rate=0.005, 
                            n_estimators=int(optimal_rounds_avg * 1.1),
                            max_depth = 6,
                            min_data_in_leaf = 20,
                            num_leaves = 15,
                            feature_fraction = 0.6,
                            bagging_fraction = 0.6,
                            lambda_l1 = 0.05,
                            objective='regression', 
                            verbose = -1)
    model_full.fit(X, y)
    return model_full

@task(name=f'Test model')
def test_model(X_test, y_test, y_train, model):
    """
    Tests the trained model using the test dataset and prints classification metrics.

    Parameters:
    - X_test: Feature DataFrame for testing.
    - y_test: Actual target values for the test dataset.
    - model: The trained neural network model.
    """
    # Realizar predicciones en el conjunto de test
    y_pred_test = model.predict(X_test)

    original_y_test = np.exp(y_test) - 1
    original_y_preds = np.exp(y_pred_test) - 1
    # Calcular métricas en el conjunto de test
    mse_test = mean_squared_error(original_y_test, original_y_preds)
    mae_test = mean_absolute_error(original_y_test, original_y_preds)
    r2_test = r2_score(original_y_test, original_y_preds)

    mse_benchmark = mean_squared_error(original_y_test, np.full(len(original_y_test), np.mean(np.exp(y_train) - 1)))
    mae_benchmark = mean_absolute_error(original_y_test, np.full(len(original_y_test), np.mean(np.exp(y_train) - 1)))
    r2_benchmark = r2_score(original_y_test, np.full(len(original_y_test), np.mean(np.exp(y_train) - 1)))

    print("test_metrics: " + str(mse_test), str(mae_test), str(r2_test))
    print("test_metrics: " + str(mse_benchmark), str(mae_benchmark), str(r2_benchmark))

@task(name="Convert To ONNX")
def convert_to_onnx(model, sample_input, onnx_file_path):
    """
    Converts a PyTorch model to the ONNX format and saves it to a specified file path.

    Parameters:
    - model: The PyTorch model to be converted.
    - sample_len: The length of the input sample, specifying the input size.
    - onnx_file_path: The file path where the ONNX model will be saved.

    This function takes a trained PyTorch model and a sample input size, exports the model to the ONNX format,
    and saves it to the provided file path. It specifies model input/output names and handles dynamic batch sizes
    for flexibility in model deployment.
    """

    onnx_gbt = convert(model, 'onnx', sample_input)
    try:
        out = onnx_gbt.predict(sample_input)
    except:
        print(f"Error converting to onnx")
    onnx.save_model(onnx_gbt.model, onnx_file_path)
    print(f"Model has been converted to ONNX and saved to {onnx_file_path}")
    
@action(name=f'Execution', log_prints=True )
def execution():
    """
    Main execution action that processes data, trains a model, tests the model, and converts it to ONNX format.

    This action performs the following steps:
    - Loads and processes the main dataset.
    - Prepares datasets for training and testing.
    - Saves a subset of the test dataset for example predictions.
    - Trains a neural network model using the prepared training dataset.
    - Tests the trained model using the test dataset and prints classification metrics.
    - Converts the trained model to the ONNX format for deployment.

    The ONNX model is saved to a predefined file path, and the action demonstrates an end-to-end workflow from data
    preprocessing to model deployment in the ONNX format.
    """
    df = loading_and_processing()
    X_train, X_test, y_train, y_test = prepare_dataset(df)
    X_test[int(len(X_test) * 0.6):].to_csv("./example_token_vol.csv", header = False)
    model = train_model(X_train,y_train)
    test_model(X_test, y_test, y_train, model)
    
    onnx_file_path = "lgbm-token-vol.onnx"
    convert_to_onnx(model, X_test[:1].to_numpy(), onnx_file_path)

if __name__ == "__main__":
    action_deploy = Action(entrypoint=execution, name="lgbm-token-vol-action")
    action_deploy.serve(name="lgbm-token-vol-deployment")


coroutine 'Action.serve' was never awaited


In [73]:
execution()

The default fill_method='pad' in Series.pct_change is deprecated and will be removed in a future version. Either fill in any non-leading NA values prior to calling pct_change or specify 'fill_method=None' to not fill NA values.
The default fill_method='pad' in Series.pct_change is deprecated and will be removed in a future version. Either fill in any non-leading NA values prior to calling pct_change or specify 'fill_method=None' to not fill NA values.
The default fill_method='pad' in Series.pct_change is deprecated and will be removed in a future version. Either fill in any non-leading NA values prior to calling pct_change or specify 'fill_method=None' to not fill NA values.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000944 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24435
[LightGBM] [Info] Number of data points in the train set: 651, number of used features: 113
[LightGBM] [Info] Start training from score 0.031529
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000779 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24435
[LightGBM] [Info] Number of data points in the train set: 651, number of used features: 113
[LightGBM] [Info] Start training from score 0.031529
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000847 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24435
[LightGBM] [Info] Number of data points in the train set: 651, number of used features: 113
[LightGBM] [Info] Start tra

In [5]:
df = loading_and_processing()

The default fill_method='pad' in Series.pct_change is deprecated and will be removed in a future version. Either fill in any non-leading NA values prior to calling pct_change or specify 'fill_method=None' to not fill NA values.
The default fill_method='pad' in Series.pct_change is deprecated and will be removed in a future version. Either fill in any non-leading NA values prior to calling pct_change or specify 'fill_method=None' to not fill NA values.
The default fill_method='pad' in Series.pct_change is deprecated and will be removed in a future version. Either fill in any non-leading NA values prior to calling pct_change or specify 'fill_method=None' to not fill NA values.


In [16]:
X_train, X_test, y_train, y_test = prepare_dataset(df)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000837 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24435
[LightGBM] [Info] Number of data points in the train set: 651, number of used features: 113
[LightGBM] [Info] Start training from score 0.031529
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000846 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24435
[LightGBM] [Info] Number of data points in the train set: 651, number of used features: 113
[LightGBM] [Info] Start training from score 0.031529
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000869 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24435
[LightGBM] [Info] Number of data points in the train set: 651, number of used features: 113
[LightGBM] [Info] Start tra

In [20]:

model = train_model(X_train,y_train)

In [54]:

#x = torch.rand(1, *shape, requires_grad=False)



In [None]:
@task(name=f'Join and postprocessing')
def loading_and_processing():
    """
    Loads and processes the main, APY, and TVL datasets, joining them on the date column and performing postprocessing.

    Returns:
    A DataFrame ready for further analysis or model training, containing combined and processed features from all datasets.
    """
    
    df_main = daily_price_dateset_manipulation()
    apy_df = apy_dateset_manipulation()
    tvl_df = tvl_dateset_manipulation()

    df_main = df_main.merge(tvl_df.to_pandas(), on = "date", how = "inner")
    df_main = df_main.merge(apy_df.to_pandas(), on = "date", how = "inner")

    df_main = remove_columns_with_nulls_above_threshold(df_main, 0.05)
    return df_main

@task(name=f'prepare dataset')
def prepare_dataset(df, test_n = 60):
    X = df.drop(['WETH_future_vol', 'date'], axis=1)
    y = df['WETH_future_vol']

    X_train = X.iloc[:-test_n]
    X_test = X.iloc[-test_n:]
    y_train = y.iloc[:-test_n]
    y_test = y.iloc[-test_n:]

    X_train = dimensionality_reduction(X)
    X_test = X_test[X_train.columns]
    return X_train, X_test, y_train, y_test

@task(name=f'train model')
def train_model(X, y):

    params = {
    'learning_rate': 0.005,
    'n_estimators': 1000,
    'early_stopping_rounds': 50, 
    'verbose': -1
    }

    tscv = TimeSeriesSplit(n_splits=5)

    optimal_rounds = []

    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = LGBMRegressor(**params)
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)])
        optimal_rounds.append(model.best_iteration_)

    optimal_rounds_avg = sum(optimal_rounds) / len(optimal_rounds)

    model_full = LGBMRegressor(learning_rate=0.005, 
                            n_estimators=int(optimal_rounds_avg * 1.1),
                            max_depth = 6,
                            min_data_in_leaf = 20,
                            num_leaves = 15,
                            feature_fraction = 0.6,
                            bagging_fraction = 0.6,
                            lambda_l1 = 0.1,
                            objective='regression', 
                            verbose = -1)
    model_full.fit(X, y)
    return model

@task(name=f'Test model')
def test_model(X_test, y_test, y_train, model):
    """
    Tests the trained model using the test dataset and prints classification metrics.

    Parameters:
    - X_test: Feature DataFrame for testing.
    - y_test: Actual target values for the test dataset.
    - model: The trained neural network model.
    """
    # Realizar predicciones en el conjunto de test
    y_pred_test = model.predict(X_test)

    original_y_test = np.exp(y_test) - 1
    original_y_preds = np.exp(y_pred_test) - 1
    # Calcular métricas en el conjunto de test
    mse_test = mean_squared_error(original_y_test, original_y_preds)
    mae_test = mean_absolute_error(original_y_test, original_y_preds)
    r2_test = r2_score(original_y_test, original_y_preds)

    mse_benchmark = mean_squared_error(original_y_test, np.full(len(original_y_test), np.mean(np.exp(y_train) - 1)))
    mae_benchmark = mean_absolute_error(original_y_test, np.full(len(original_y_test), np.mean(np.exp(y_train) - 1)))
    r2_benchmark = r2_score(original_y_test, np.full(len(original_y_test), np.mean(np.exp(y_train) - 1)))

    print("test_metrics: " + str(mse_test), str(mae_test), str(r2_test))
    print("test_metrics: " + str(mse_benchmark), str(mae_benchmark), str(r2_benchmark))

    
    
@task(name="Convert To ONNX")
def convert_to_onnx(model, sample_input, onnx_file_path):
    """
    Converts a PyTorch model to the ONNX format and saves it to a specified file path.

    Parameters:
    - model: The PyTorch model to be converted.
    - sample_len: The length of the input sample, specifying the input size.
    - onnx_file_path: The file path where the ONNX model will be saved.

    This function takes a trained PyTorch model and a sample input size, exports the model to the ONNX format,
    and saves it to the provided file path. It specifies model input/output names and handles dynamic batch sizes
    for flexibility in model deployment.
    """

    torch.onnx.export(
        model,  # Model being exported
        sample_input,  # Model input (or a tuple for multiple inputs)
        onnx_file_path,  # Where to save the model
        export_params=True,  # Store the trained parameter weights inside the model file
        opset_version=11,  # ONNX version to export the model to
        do_constant_folding=True,  # Whether to execute constant folding for optimization
        input_names=["input"],  # Model's input names
        output_names=["output"],  # Model's output names
        dynamic_axes={
            "input": {0: "batch_size"},  # Variable length axes
            "output": {0: "batch_size"},
        },
    )
    print(f"Model has been converted to ONNX and saved to {onnx_file_path}")
    
@action(name=f'Execution', log_prints=True )
def execution():
    """
    Main execution action that processes data, trains a model, tests the model, and converts it to ONNX format.

    This action performs the following steps:
    - Loads and processes the main dataset.
    - Prepares datasets for training and testing.
    - Saves a subset of the test dataset for example predictions.
    - Trains a neural network model using the prepared training dataset.
    - Tests the trained model using the test dataset and prints classification metrics.
    - Converts the trained model to the ONNX format for deployment.

    The ONNX model is saved to a predefined file path, and the action demonstrates an end-to-end workflow from data
    preprocessing to model deployment in the ONNX format.
    """
    df = load_and_df_processing()
    X_train, X_test, y_train, y_test = prepare_datasets(df)
    X_test[int(len(X_test) * 0.6):].write_csv("./example_token_trend.csv")
    model = prepare_and_train(X_train,y_train)
    test_model(X_test, y_test, y_train, model)
    
    onnx_file_path = "pytorch-token-trend_action_model.onnx"
    convert_to_onnx(model, X_test.shape[1], onnx_file_path)

if __name__ == "__main__":
    action_deploy = Action(entrypoint=execution, name="pytorch-token-trend-action")
    action_deploy.serve(name="pytorch-token-trend-deployment")


In [None]:
cutoff_index = int(len(price_df) * 0.85)
df_train = price_df[:cutoff_index]
df_test = price_df[cutoff_index:]

from arch import arch_model
import numpy as np

returns = df_train["returns"].to_numpy()
predictions = []
actuals = df_test["returns"].to_numpy()
n_test = len(df_test)

for i in tqdm(range(n_test), desc="Fitting GARCH Models"):
    # Adjust the slicing to ensure there's enough data for fitting
    # The '-15' ensures we stop fitting 15 days before the forecast point
    train_data = returns[:-(n_test + 15) + i]  # Adjust this line as necessary

    if len(train_data) > 0:  # Ensure there's data to fit the model
        garch_model = arch_model(train_data, vol='Garch', p=1, q=1, mean='Zero')
        result = garch_model.fit(disp='off')
        pred = result.forecast(horizon=1).mean.iloc[-1]  # Forecast the next point
        predictions.append(pred)
    else:
        print(f"Skipping model at iteration {i} due to insufficient data.")
        continue

# Calculate RMSE
rmse = np.sqrt(np.mean((np.array(predictions) - actuals) ** 2))
print(f"RMSE: {rmse}")