In [None]:
import os

# Project root directory: the notebook changes into it and loads the project
# `lib` from there. Set the HYDRO_FORECASTING_WDIR environment variable to run
# on another machine; when it is unset the original location is used.
wdir = os.environ.get(
    "HYDRO_FORECASTING_WDIR",
    "/home/daniele/documents/github/ftt01/phd/projects/hydrological_forecasting/",
)

In [None]:
# IMPORTs
import os
import sys

# Switch to the project root and put <project root>/lib at the front of
# sys.path, so the project library is importable here the same way as from
# the plain .py scripts.
os.chdir(wdir)
sys.path.insert(0, os.path.join(os.path.abspath(os.getcwd()), 'lib'))
# NOTE(review): the star import is presumably what provides `pd`, `np`, `dt`,
# `ZoneInfo` and `mkNestedDir` used by the later cells - confirm against lib,
# and prefer explicit imports so the origin of each name is visible.
from lib import *

In [None]:
import glob
from dateutil import tz
from shutil import rmtree

In [None]:
def append_data(current_data, additional_data):
    """Merge two datetime-keyed frames, letting the additional rows win.

    Both inputs must yield a 'datetime' column after ``reset_index()``
    (typically: their index is named 'datetime'). Rows of ``current_data``
    whose datetime also appears in ``additional_data`` are discarded and
    replaced by the rows of ``additional_data``.

    Returns a new frame indexed by 'datetime', sorted by it, with rows whose
    datetime is missing removed. The inputs are not modified.
    """
    existing = current_data.reset_index()
    incoming = additional_data.reset_index()

    # rows of the existing data that are superseded by an incoming timestamp
    superseded = existing['datetime'].isin(incoming['datetime'])
    merged = pd.concat([existing[~superseded], incoming], ignore_index=True)

    merged = (
        merged
        .dropna(subset=['datetime'])
        .sort_values(by=['datetime'])
        .set_index('datetime')
    )

    return merged[merged.index.notnull()]

In [None]:
def dirtier(current_obs_data, previous_day_data, yesterday_fct_data):
    """Return ``current_obs_data + yesterday_fct_data - previous_day_data``.

    The operands only need to support ``+`` and ``-`` with each other
    (scalars, arrays, pandas objects, ...); nothing is modified in place.
    """
    # same left-to-right evaluation order as the single-expression form
    obs_plus_forecast = current_obs_data + yesterday_fct_data
    return obs_plus_forecast - previous_day_data

In [None]:
# SETUP
# Basins to process; the variant that also includes 'passirio' is disabled.
# basins = ['passirio', 'plan']
basins = ['plan']
# Observed variables; each one is read from its own sub-directory.
variables = ['temperature', 'precipitation', 'streamflow']

# Hour of the day added to each daily date to obtain the initialisation time.
init_forecasting_hour = 10
# Hours of data kept after the initialisation time.
lead_hours = 38
# Hours of data kept before the initialisation time (7 days).
lag_hours = 24*7

# Names of the output sub-directories the daily files are written to.
output_types = ['mean','median','first_quantile','third_quantile']

# Processing period, parsed with '%Y%m%dT%H:%M:%S' and interpreted in
# `timezone_str`; the daily dates start at `start_date_str` and stop one day
# before `end_date_str`.
start_date_str = '20210615T00:00:00'
end_date_str = '20211016T00:00:00'
timezone_str = 'Europe/Rome'
# NOTE(review): `timezone` is not referenced in the visible cells, which build
# their own ZoneInfo from `timezone_str` - confirm whether it is still needed.
timezone = ZoneInfo(timezone_str)

# Basin bounding boxes, kept for reference only (not used in the visible cells).
# ## Passirio basin
# lat = ( 46.68, 46.945 )
# lon = ( 11.015, 11.38 )

# # Plan basin
# lat = (46.7145853, 46.8251415)
# lon = (11.0198472, 11.117037)

In [None]:
# Parse both period bounds the same way: fixed text format, then attach the
# configured timezone so the datetimes are timezone-aware.
start_date, end_date = (
    dt.datetime.strptime(stamp, '%Y%m%dT%H:%M:%S').replace(tzinfo=ZoneInfo(timezone_str))
    for stamp in (start_date_str, end_date_str)
)

# One entry per day, starting at start_date; end_date itself is not included.
dates = [
    start_date + dt.timedelta(days=day_offset)
    for day_offset in range((end_date - start_date).days)
]

In [None]:
# Build the observed-data inputs for the machine learning models.
# For every day in `dates` one CSV is written. It holds the observations from
# `lag_hours` before the initialisation time (`init_forecasting_hour` hours
# after the start of the day) up to `lead_hours` after it, both ends included.
#
# NOTE(review): the observations do not depend on `output_type`; the same
# series is written once per output type, presumably to mirror the directory
# layout of the forecast data - confirm.

OBS_DATA_PATH_TEMPLATE = "/media/windows/projects/hydro_forecasting/machine_learning/data/observed/{basin}/{variable}/"
OBS_OUTPUT_PATH_TEMPLATE = "/media/windows/projects/hydro_forecasting/machine_learning/data/observed/{basin}/{variable}/daily/obs/{output_type}/"
OBS_INDEX_FORMAT = '%Y-%m-%d %H:%M:%S'


def _load_station_series(station_file, tz_name):
    """Read one station file into a timezone-aware, datetime-indexed frame.

    The first 4 lines of the file are skipped; the first column is parsed as
    the datetime index and the second one is stored in a column named
    'values'. The naive timestamps are localized to ``tz_name``.
    """
    station_data = pd.read_csv(
        station_file, parse_dates=[0], header=None, skiprows=4,
        index_col=0, names=['values'])
    station_data.index.name = 'datetime'

    # Explicit all-False mask: every ambiguous wall-clock time (autumn DST
    # change) is localized as non-DST instead of raising.
    not_dst = np.zeros(station_data.shape[0], dtype=bool)
    return station_data.tz_localize(tz_name, ambiguous=not_dst)


# Each station file is parsed once and reused for every output type.
_station_cache = {}

for output_type in output_types:

    print(output_type)

    for variable in variables:

        print(variable)

        for basin in basins:

            print(basin)

            obs_data_path = OBS_DATA_PATH_TEMPLATE.format(
                basin=basin, variable=variable)
            output_path = OBS_OUTPUT_PATH_TEMPLATE.format(
                basin=basin, variable=variable, output_type=output_type)
            mkNestedDir(output_path)

            # glob returns files in arbitrary order: sort so that the station
            # picked below is the same on every run and every machine.
            stations = sorted(glob.glob(obs_data_path + '*.txt'))
            print(obs_data_path)

            if not stations:
                raise FileNotFoundError(
                    "No station file (*.txt) found in " + obs_data_path)

            # Only the first station file of the directory is used.
            station = stations[0]
            if station not in _station_cache:
                _station_cache[station] = _load_station_series(
                    station, timezone_str)
            station_data = _station_cache[station]

            for date in dates:

                start_datetime = date + \
                    dt.timedelta(hours=int(init_forecasting_hour))
                start_datetime_wlag = start_datetime - \
                    dt.timedelta(hours=lag_hours)
                end_datetime = date + \
                    dt.timedelta(
                        hours=int(init_forecasting_hour) + lead_hours)

                # Copy the window so that re-labelling its index can never
                # touch the cached station frame.
                current_obs_data = station_data.loc[
                    start_datetime_wlag:end_datetime].copy()

                # Store local wall-clock timestamps as plain text (no offset).
                current_obs_data.index = current_obs_data.index.strftime(
                    OBS_INDEX_FORMAT)
                current_obs_data.index.name = "datetime"

                current_obs_data.to_csv(
                    output_path + start_datetime.strftime('%Y%m%d') + '.csv',
                    sep=';')