In [None]:
# Install the `python3.10-venv` system package through apt (shell escape);
# presumably needed so that the `python -m venv` call in the next cell works — TODO confirm.
!apt-get install -y python3.10-venv

In [None]:
# Create a virtual environment in the directory `my_darts_env`.
!python -m venv my_darts_env
# NOTE(review): every `!` line runs in its own short-lived subshell, so this
# activation does not carry over to later cells or to the running kernel. The
# `pip install` in the next cell therefore most likely installs into the kernel's
# environment rather than into `my_darts_env` — confirm whether the venv is needed.
!source my_darts_env/bin/activate

In [None]:
!pip install darts catboost torch torchvision --no-cache-dir

import catboost
from darts.datasets import WeatherDataset
from darts.models import CatBoostModel

import pandas as pd
import numpy as np
from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler
import matplotlib.pyplot as plt
from google.colab import drive
import os
import torch
import darts
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.callbacks import EarlyStopping
from torchmetrics import MeanAbsolutePercentageError
from sklearn.metrics import mean_absolute_error, mean_squared_error
import time
import datetime
from darts.models import RandomForest
from darts.models import NBEATSModel
from darts.models import NHiTSModel
from darts.models import XGBModel


# Print the installed catboost version (useful for reproducing the run).
print(catboost.__version__)

In [None]:
# Mount Google Drive at /content/gdrive; the CSV path defined further down lives under this mount point.
drive.mount('/content/gdrive')

In [None]:
# Show the current working directory as the cell output.
os.getcwd()

In [None]:
# Absolute location of the input CSV on the mounted Google Drive; passed to read_data() below.
path = "/content/gdrive/MyDrive/Artigo TFT/Dados/tucurui.csv"

def read_data(path):
  """Load the CSV at ``path`` and build two daily darts ``TimeSeries``.

  The file is read with ';' as field separator and ',' as decimal mark, rows
  containing missing values are dropped, and only a trailing window of rows is
  kept. The ``Data`` column is parsed as ``dd/mm/YYYY`` dates.

  Args:
    path: Location of the CSV file. It must contain the columns ``Data``,
      ``UPH610010000`` and ``VazaoNatural``.

  Returns:
    A tuple ``(prec_ts, vazao_ts)``: the series built from ``UPH610010000``
    and the series built from ``VazaoNatural``, both with daily frequency.
  """
  # reads data
  data = pd.read_csv(path, delimiter=';', decimal=',').dropna()

  # specify study range (past 3 years)
  # .copy() makes the window an independent frame, so the column assignment
  # below does not write into a slice of the full table (SettingWithCopyWarning).
  # NOTE(review): the upper bound -1 also drops the most recent row — confirm this is intended.
  data = data.iloc[-365*3:-1,:].copy()

  # some formatting
  current_date_format = "%d/%m/%Y"

  data['Data'] = pd.to_datetime(data['Data'], format=current_date_format)

  # create time series
  # NOTE(review): the two series use different fill_missing_dates settings
  # (False here, True for the flow series below) — confirm this asymmetry is wanted.
  prec = data[['Data', 'UPH610010000']].copy()
  prec_ts = TimeSeries.from_dataframe(prec, time_col="Data", value_cols=['UPH610010000'],fill_missing_dates=False, freq='D')

  vazao= data[['Data', 'VazaoNatural']].copy()
  vazao_ts = TimeSeries.from_dataframe(vazao, time_col="Data", value_cols=['VazaoNatural'],fill_missing_dates=True, freq='D')

  return prec_ts, vazao_ts

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Each error is divided by the matching ``y_true`` value, so the metric is
    not symmetric in its arguments and zeros in ``y_true`` are not guarded.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_relative_error = np.mean(np.abs(relative_error))
    return mean_relative_error * 100

def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    """Return the symmetric MAPE, expressed in percent.

    Twice the absolute error is divided by the sum of the absolute values of
    both inputs, element by element, and the mean of those ratios is scaled
    by 100.
    """
    doubled_abs_error = 2 * np.abs(y_pred - y_true)
    magnitude_sum = np.abs(y_true) + np.abs(y_pred)
    return np.mean(doubled_abs_error / magnitude_sum) * 100

def nash_sutcliffe_efficiency(y_true, y_pred):
    """Return one minus the ratio of residual to total sum of squares.

    The residual sum of squares compares ``y_true`` with ``y_pred``; the total
    sum of squares compares ``y_true`` with its own mean.
    """
    residual_ss = np.sum((y_true - y_pred) ** 2)
    total_ss = np.sum((y_true - np.mean(y_true)) ** 2)
    ratio = residual_ss / total_ss
    return 1 - ratio

def mean_absolute_error(y_true, y_pred):
    """Return the mean of the absolute element-wise differences.

    Note: this definition has the same name as the ``mean_absolute_error``
    imported from ``sklearn.metrics`` at the top of the notebook, so from this
    point on the name refers to this numpy-based version.
    """
    abs_error = np.abs(y_true - y_pred)
    return np.mean(abs_error)

def root_mean_squared_error(y_true, y_pred):
    """Return the square root of sklearn's ``mean_squared_error`` for the inputs."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def split_data(train_test, prec_ts, vazao_ts):
  """Split the flow and rainfall series at the same point.

  Args:
    train_test: Split point, forwarded unchanged to ``split_before``.
    prec_ts: Rainfall series.
    vazao_ts: Flow (target) series.

  Returns:
    ``(flow_train, flow_test, prec_train, prec_test)``.
  """
  # the flow (target) series is split first, then the rainfall series
  train_flow, test_flow = vazao_ts.split_before(train_test)
  train_prec, test_prec = prec_ts.split_before(train_test)

  return train_flow, test_flow, train_prec, test_prec

def data_scaling(train_flow=None, full_flow=None, test_flow=None,
                 train_prec=None, full_prec=None, test_prec=None):
    """Scale the flow and rainfall series with scalers fitted on the training parts.

    One ``Scaler`` is fitted on the flow training series and then applied to
    the full and test flow series; a second ``Scaler`` does the same for the
    rainfall series.

    Args:
        train_flow: Flow training series. Defaults to the notebook variable ``flow_train``.
        full_flow: Full flow series. Defaults to the notebook variable ``vazao_ts``.
        test_flow: Flow test series. Defaults to the notebook variable ``flow_test``.
        train_prec: Rainfall training series. Defaults to the notebook variable ``prec_train``.
        full_prec: Full rainfall series. Defaults to the notebook variable ``prec_ts``.
        test_prec: Rainfall test series. Defaults to the notebook variable ``prec_test``.

    Returns:
        ``(transformer_flow, transformer_prec, trans_flow_train, trans_flow,
        trans_flow_test, trans_prec_train, trans_prec, trans_prec_test)``.
    """
    # Fall back to the notebook-level variables so the original zero-argument
    # call keeps working; passing the series explicitly avoids the hidden
    # dependency on cell execution order.
    if train_flow is None:
        train_flow = flow_train
    if full_flow is None:
        full_flow = vazao_ts
    if test_flow is None:
        test_flow = flow_test
    if train_prec is None:
        train_prec = prec_train
    if full_prec is None:
        full_prec = prec_ts
    if test_prec is None:
        test_prec = prec_test

    # flow: fit on the training part only, then reuse the fitted scaler
    transformer_flow = Scaler()
    trans_flow_train = transformer_flow.fit_transform(train_flow)
    trans_flow = transformer_flow.transform(full_flow)
    trans_flow_test = transformer_flow.transform(test_flow)

    # rainfall: same procedure with an independent scaler
    transformer_prec = Scaler()
    trans_prec_train = transformer_prec.fit_transform(train_prec)
    trans_prec = transformer_prec.transform(full_prec)
    trans_prec_test = transformer_prec.transform(test_prec)

    return transformer_flow, transformer_prec, trans_flow_train,trans_flow,trans_flow_test,trans_prec_train,trans_prec,trans_prec_test

def build_samples_df(flow_test, start_date=None, window_size=14):
    """Cut ``flow_test`` into consecutive fixed-length windows.

    Starting at ``start_date`` and advancing one day at a time, every window
    of ``window_size`` daily timestamps that is fully present in the series
    index becomes one sample.

    Args:
        flow_test: Series to sample from; must provide ``pd_dataframe()``.
        start_date: First day of the first window. Defaults to 2022-12-31
            (presumably chosen to line up with the first backtest forecast —
            TODO confirm against the data).
        window_size: Number of daily steps per window. Defaults to 14.

    Returns:
        ``(df_samples, samples)``: a DataFrame with one row per window (index
        is the window start, columns are the step positions) and the list of
        windows as ``TimeSeries``. ``df_samples`` is an empty DataFrame when
        no window fits.
    """
    if start_date is None:
        start_date = datetime.datetime(2022, 12, 31)

    # Convert TimeSeries to a pandas DataFrame
    timeseries_df = flow_test.pd_dataframe()

    # Generate the samples; windows with any date missing from the index are skipped
    num_samples = len(timeseries_df) - window_size + 1
    samples = []
    for offset in range(num_samples):
        sample_start = start_date + datetime.timedelta(days=offset)
        sample_dates = pd.date_range(start=sample_start, periods=window_size)
        if set(sample_dates).issubset(timeseries_df.index):
            samples.append(TimeSeries.from_dataframe(timeseries_df.loc[sample_dates]))

    # One row per window, indexed by its start date. Rows are collected first
    # and concatenated once instead of growing a DataFrame inside the loop.
    rows = []
    for sample in samples:
        row = pd.DataFrame(sample.values()).T
        row.index = pd.Index([sample.start_time()] * len(row), name='Date')
        rows.append(row)

    df_samples = pd.concat(rows) if rows else pd.DataFrame()
    return df_samples, samples

def build_backtest_df(backtest, transformer_flow):
    """Arrange backtest forecasts into a DataFrame with one row per forecast.

    Each forecast is passed through ``transformer_flow.inverse_transform``
    before its values are laid out as a row.

    Args:
        backtest: Iterable of forecast series providing ``values()`` and
            ``start_time()``.
        transformer_flow: Fitted transformer providing ``inverse_transform``.

    Returns:
        DataFrame indexed by forecast start time (index name ``Date``) whose
        columns are the forecast step positions. An empty DataFrame is
        returned when ``backtest`` is empty.
    """
    # Collect the rows first and concatenate once; growing a DataFrame with
    # pd.concat inside the loop copies all previous rows on every iteration.
    rows = []
    for forecast in backtest:
        valores = transformer_flow.inverse_transform(forecast).values()

        row = pd.DataFrame(valores).T
        row.index = pd.Index([forecast.start_time()] * len(row), name='Date')
        rows.append(row)

    if not rows:
        return pd.DataFrame()
    return pd.concat(rows)



# Load the rainfall and flow series from the CSV.
prec_ts, vazao_ts = read_data(path)
# Split both series at 0.8 (the value is forwarded to split_before) into train and test parts.
flow_train, flow_test, prec_train, prec_test = split_data(0.8,prec_ts, vazao_ts)
# data_scaling is called without arguments, so it works on the notebook-level
# series created by the two lines above; it must run after them.
transformer_flow,transformer_prec, trans_flow_train,trans_flow,trans_flow_test,trans_prec_train,trans_prec,trans_prec_test = data_scaling()

# Windows of unscaled test-set flow; the evaluation loop compares forecasts against df_samples.
df_samples, samples=build_samples_df(flow_test)


In [None]:

# Candidate forecasters. All of them look back 30 steps (lags / input_chunk_length)
# and emit 14 steps per forecast (output_chunk_length).
# NOTE(review): no random seed is set for any model here, so results may differ
# between runs — consider fixing a seed.
models = [

    RandomForest(lags=30, output_chunk_length=14, n_estimators=200, criterion="absolute_error"),
    NBEATSModel(input_chunk_length=30, output_chunk_length=14, n_epochs=15, activation='LeakyReLU'),
    NHiTSModel(input_chunk_length=30, output_chunk_length=14, n_epochs=15, activation='LeakyReLU'),
    XGBModel(lags=30,output_chunk_length=14),
    CatBoostModel(lags=30, output_chunk_length=14)
]


# Initialize an empty dictionary to store metrics for each model
# (filled by the evaluation loop, keyed by model class name)
metrics_dict = {}

In [None]:

# Number of steps per forecast; also the number of per-horizon metric entries.
forecast_horizon = 14

# Iterate over each model: fit on the scaled training flow, backtest on the
# scaled test flow, and score every forecast step against the observed windows.
for model in models:
    # Fit the model
    model.fit(trans_flow_train)

    # Perform backtesting (no retraining; one full-horizon forecast per start date)
    backtest = model.historical_forecasts(series=trans_flow_test,
                                          forecast_horizon=forecast_horizon,
                                          stride=1,
                                          retrain=False,
                                          overlap_end=False,
                                          last_points_only=False,
                                          verbose=False)

    # Build backtest dataframe (forecasts are inverse-transformed to original units)
    df_backtest = build_backtest_df(backtest, transformer_flow)

    # Initialize a list to store metrics for the current model
    metricas = []

    # Compute metrics for each forecast horizon
    for i in range(forecast_horizon):
        # The metric functions take (y_true, y_pred): observations come from
        # df_samples and forecasts from df_backtest. The order matters for MAPE,
        # which divides by y_true; the other metrics here are symmetric.
        observed = df_samples.iloc[:, i]
        predicted = df_backtest.iloc[:, i]

        mape = mean_absolute_percentage_error(observed, predicted)
        smape = symmetric_mean_absolute_percentage_error(observed, predicted)
        mae = mean_absolute_error(observed, predicted)
        rmse = root_mean_squared_error(observed, predicted)
        metricas.append({"mape": mape, "smape": smape, "mae": mae, "rmse": rmse})

    # Save metrics for the current model configuration in the dictionary
    model_name = type(model).__name__
    metrics_dict[model_name] = metricas

# At this point, metrics_dict contains metrics for each model


In [None]:
import pandas as pd

# One DataFrame per model: a row per forecast step, tagged with the model name.
df_list = []
for model_name, metrics in metrics_dict.items():
    df_model = pd.DataFrame(metrics).assign(model=model_name)
    df_list.append(df_model)

# Stack the per-model tables into a single long table.
df_metrics = pd.concat(df_list, ignore_index=True)

# Average every metric over the forecast steps, separately for each model.
df_avg_metrics = (
    df_metrics
    .groupby('model')
    .mean()
    .reset_index()
)

# Show the summary table as the cell output.
df_avg_metrics


Unnamed: 0,model,mape,smape,mae,rmse
0,CatBoostModel,15.504689,14.84489,1889.471823,2557.57322
1,NBEATSModel,16.07258,17.97123,2020.062962,2598.716751
2,NHiTSModel,10.847856,11.259498,1277.594818,1631.471154
3,RandomForest,11.363671,11.574293,1409.596098,1963.63935
4,XGBModel,19.537356,16.854947,2012.887415,2752.612661
