In [None]:
# Absolute path of the project root. The import cell below chdir's into it and
# puts <wdir>/lib at the front of sys.path.
# NOTE(review): machine-specific path — adjust before running on another machine.
wdir = "/home/daniele/documents/github/ftt01/phd/projects/hydrological_forecasting/"

In [None]:
# IMPORTs
import sys
import os
import glob
import subprocess
import pandas as pd
from dask import dataframe as dd
from dateutil import tz

from shutil import rmtree

# Make the project's `lib` importable from notebooks and plain py scripts alike:
# switch to the project root (wdir) and put <wdir>/lib at the front of sys.path.
os.chdir(wdir)
sys.path.insert(0, os.path.join(os.path.abspath(os.getcwd()), 'lib'))
# NOTE(review): later cells use `dt`, `np`, `ZoneInfo` and `mkNestedDir`, none of
# which is imported explicitly in this cell — presumably they are provided by this
# star import; confirm against lib.
from lib import *

In [None]:
def append_data(current_data, additional_data):
    """Overlay ``additional_data`` on ``current_data``, keyed by 'datetime'.

    Both frames must expose a 'datetime' column once their index is reset
    (i.e. they are indexed by an index named 'datetime').

    Rows of ``current_data`` whose datetime also occurs in ``additional_data``
    are discarded, then every row of ``additional_data`` is appended after the
    surviving ones. Rows with a missing datetime are removed. The result is
    indexed by 'datetime' and is NOT re-sorted chronologically.
    """
    base = current_data.reset_index()
    extra = additional_data.reset_index()

    # keep only the base rows that the additional data does not override
    not_overridden = ~base['datetime'].isin(extra['datetime'])
    merged = pd.concat([base[not_overridden], extra], ignore_index=True)

    merged = merged.dropna(subset=['datetime'])
    merged = merged.set_index('datetime')

    return merged[merged.index.notnull()]

In [None]:
def dirtier( variable, current_obs_data, previous_day_data, yesterday_fct_data):
    """Perturb observed data so it can stand in for a missing forecast.

    Parameters
    ----------
    variable : str
        'temperature' and 'precipitation' are perturbed; any other value
        leaves the data untouched.
    current_obs_data : pandas.DataFrame
        Frame with a 'values' column to perturb. It is NOT modified: the
        function works on a copy, so callers may safely pass a slice or keep
        using the original frame afterwards.
    previous_day_data, yesterday_fct_data : pandas.DataFrame
        Frames with a 'values' column, only read for 'temperature'. Their
        difference (previous_day_data - yesterday_fct_data) is subtracted
        position by position, so they must be broadcast-compatible with
        ``current_obs_data['values']``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the perturbed 'values' column.
    """
    dirty_data = current_obs_data.copy()

    if variable == 'temperature':
        # difference between the previous-day data and yesterday's forecast data,
        # taken as raw arrays so it is applied by position, not by timestamp
        yesterday_error = previous_day_data['values'].values - yesterday_fct_data['values'].values
        dirty_data['values'] = dirty_data['values'] - yesterday_error
    elif variable == 'precipitation':
        # fixed scaling factor applied to the observed precipitation
        dirty_data['values'] = dirty_data['values'] / 1.5

    return dirty_data

In [None]:
# SETUP
# Basins to process; each name is substituted into the observed/forecast/output paths.
# basins = ['passirio', 'plan']
basins = ['plan']
# Variables processed for every basin; the name selects both the data path and
# the hourly aggregation (sum for precipitation, mean otherwise).
variables = ['temperature', 'precipitation', 'streamflow']

# init_ref = '03'
# Hours added to each date of the run to obtain the start of the forecast window.
init_forecasting_hour = 10
# Length of the forecast window, in hours, counted from the window start.
lead_hours = 38
# One week expressed in hours. Within this notebook it is only referenced by the
# commented-out cell at the end.
lag_hours = 24*7

# Forecast statistics to process; each name is substituted into the forecast and output paths.
output_types = ['mean','median','first_quantile','third_quantile']

# Period to process, parsed with the format '%Y%m%dT%H:%M:%S'. The list of dates
# built in the next cell stops one day before end_date_str.
start_date_str = '20210615T00:00:00'
end_date_str = '20211016T00:00:00'
# Time zone used for the start/end dates and to localize every CSV that is read.
timezone_str = 'Europe/Rome'
# NOTE(review): `timezone` is not referenced by the active cells of this notebook
# (they rebuild ZoneInfo(timezone_str) where needed).
timezone = ZoneInfo(timezone_str)

In [None]:
# Parse both period bounds the same way: wall-clock string -> datetime tagged
# with the configured time zone.
start_date, end_date = (
    dt.datetime.strptime(stamp, '%Y%m%dT%H:%M:%S').replace(tzinfo=ZoneInfo(timezone_str))
    for stamp in (start_date_str, end_date_str)
)

# One entry per day, from start_date up to (but excluding) end_date.
dates = [start_date + dt.timedelta(days=offset)
         for offset in range((end_date - start_date).days)]

In [None]:
# For every output type / variable / basin / date, write one CSV in the output
# folder containing:
#   * the forecast restricted to the window [start_datetime, end_datetime], when
#     a forecast file exists for that date;
#   * otherwise the hourly observed data of that date passed through dirtier()
#     (using yesterday's observations and yesterday's already written output file);
#   * on the very first date without forecast, the hourly observed data as is.


def _read_local_csv(csv_path, tz_name):
    """Read a ';'-separated CSV into a frame indexed by its 'datetime' column,
    localized to ``tz_name``.

    The first column is parsed as dates and the column named 'datetime' becomes
    the index. Every row is localized with ``ambiguous=False``, i.e. wall-clock
    times that occur twice at a DST change are treated as non-DST times.
    """
    frame = pd.read_csv(csv_path, parse_dates=[0], header=0, sep=';')
    frame.set_index('datetime', inplace=True)
    non_dst_flags = np.array([False] * frame.shape[0])
    return frame.tz_localize(tz_name, ambiguous=non_dst_flags)


def _resample_hourly(frame, variable):
    """Aggregate ``frame`` to hourly steps: sum for precipitation, mean for
    temperature and streamflow. Any other variable is reported on stdout and
    the frame is returned unchanged."""
    if variable == 'precipitation':
        return frame.resample("H").sum()
    if variable in ('temperature', 'streamflow'):
        return frame.resample("H").mean()
    print("NOT A VALID VARIABLE")
    return frame


# Path templates are loop-invariant, so they are defined once.
obs_path_template = "/media/windows/projects/hydro_forecasting/machine_learning/data/observed/{basin}/{variable}/daily/"
fct_path_template = "/media/windows/projects/hydro_forecasting/machine_learning/data/forecast/icon-d2-eps_45h/postprocess_test/{basin}/{variable}/{output_type}/"
output_path_template = "/media/windows/projects/hydro_forecasting/machine_learning/data/forecast/icon-d2-eps_45h/postprocess_test/{basin}/{variable}/deterministic/fullfilled/{output_type}/"

for output_type in output_types:

    for variable in variables:

        print(variable)

        for basin in basins:

            print(basin)

            path_fields = dict(basin=basin, variable=variable, output_type=output_type)
            obs_data_path = obs_path_template.format(**path_fields)
            fct_data_path = fct_path_template.format(**path_fields)
            output_path = output_path_template.format(**path_fields)
            mkNestedDir(output_path)

            # `stations` is not used further in this cell
            stations = glob.glob(obs_data_path + '*.csv')
            forecasts = glob.glob(fct_data_path + '*.csv')

            # file names without the '.csv' suffix, i.e. the dates (YYYYMMDD)
            # for which a forecast file is available
            forecast_dates = {os.path.basename(d)[:-4] for d in forecasts}

            for date in dates:

                date_as_str = date.strftime('%Y%m%d')

                # hourly observed data stored in the file of the current date
                station_data = _resample_hourly(
                    _read_local_csv(obs_data_path + date_as_str + ".csv", timezone_str),
                    variable)

                # forecast window of the current date
                start_datetime = date + \
                    dt.timedelta(hours=int(init_forecasting_hour))
                end_datetime = start_datetime + dt.timedelta(hours=lead_hours)

                if date_as_str in forecast_dates:

                    # a real forecast exists: keep only its forecast window
                    fct_data = _read_local_csv(
                        fct_data_path + date_as_str + ".csv", timezone_str)
                    fct_data = fct_data[start_datetime:end_datetime]

                    current_data = fct_data

                elif date != start_date:

                    # no forecast: build a surrogate from the observations and
                    # from what was written for the previous day
                    previous_day_start = start_datetime - \
                        dt.timedelta(days=1)
                    previous_day_end = previous_day_start + \
                        dt.timedelta(hours=int(lead_hours))

                    yesterday_as_str = (date - dt.timedelta(days=1)).strftime('%Y%m%d')

                    yesterday_station_data = _resample_hourly(
                        _read_local_csv(obs_data_path + yesterday_as_str + ".csv", timezone_str),
                        variable)

                    # yesterday's file in the OUTPUT folder, i.e. the result of
                    # the previous iteration of this loop
                    yesterday_fct_data = _read_local_csv(
                        output_path + yesterday_as_str + ".csv", timezone_str)
                    yesterday_fct_data = yesterday_fct_data[previous_day_start:previous_day_end]

                    # NOTE(review): for 'temperature' dirtier() combines the three
                    # frames by position; only yesterday_fct_data is restricted to
                    # the forecast window here, so the lengths presumably rely on
                    # the layout of the daily files — confirm.
                    dirty_data = dirtier(
                        variable, station_data, yesterday_station_data, yesterday_fct_data)

                    current_data = append_data(station_data, dirty_data)

                else:
                    # first date of the run: nothing from the day before to rely on
                    current_data = station_data

                # store timestamps as plain wall-clock strings (no UTC offset)
                current_data.index = [stamp.strftime('%Y-%m-%d %H:%M:%S') for stamp in current_data.index]
                current_data.index.name = "datetime"

                current_data.to_csv(
                    output_path + start_datetime.strftime('%Y%m%d') + '.csv', sep=';')


In [None]:
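# # DISABLED (kept for reference): creates the input for the machine learning models.
# # For each day, it builds a time series with the 7-day lag before the init hour [9AM] and the forecast for the following 38 hours (NOTE: init_forecasting_hour is set to 10 in the SETUP cell — confirm which hour is intended).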

# for output_type in output_types:

#     # print(output_type)

#     for variable in variables:

#         print(variable)

#         for basin in basins:

#             print(basin)

#             obs_data_path = "/media/windows/projects/hydrological_forecasting/machine_learning/data/observed/{basin}/{variable}/daily/obs/{output_type}/"
#             fct_data_path = "/media/windows/projects/hydrological_forecasting/machine_learning/data/forecast/icon-d2-eps_45h/postprocess/{basin}/{variable}/{output_type}/"
#             output_path = "/media/windows/projects/hydrological_forecasting/machine_learning/data/observed/{basin}/{variable}/daily/filled/{output_type}/"

#             obs_data_path = obs_data_path.format(
#                 basin=basin, variable=variable, output_type=output_type)
#             fct_data_path = fct_data_path.format(
#                 basin=basin, variable=variable, output_type=output_type)
#             output_path = output_path.format(
#                 basin=basin, variable=variable, output_type=output_type)
#             mkNestedDir(output_path)

#             stations = glob.glob(obs_data_path + '*.csv')
#             forecasts = glob.glob(fct_data_path + '*.csv')

#             forecast_dates = [os.path.basename(d)[:-4] for d in forecasts]

#             for date in dates:

#                 # print(date)

#                 date_as_str = dt.datetime.strftime(date, '%Y%m%d')
#                 # print(date_as_str)

#                 station_data_path = obs_data_path + date_as_str + ".csv"

#                 station_data = pd.read_csv(station_data_path, parse_dates=['datetime'], header=0, sep=';')
#                 station_data.set_index('datetime', inplace=True)
#                 if variable == 'precipitation':
#                     station_data = station_data.resample("H").sum()
#                 elif variable == 'temperature':
#                     station_data = station_data.resample("H").mean()
#                 else:
#                     print('WRONG variable')

#                 start_datetime = date + dt.timedelta(hours=int(init_forecasting_hour))
#                 # print( start_datetime )
#                 start_datetime_wlag = start_datetime - dt.timedelta(hours=lag_hours)
#                 # print( start_datetime_wlag )
#                 end_datetime = start_datetime + dt.timedelta(hours=lead_hours)
#                 # print( end_datetime )

#                 # current_obs_data = station_data[start_datetime_wlag:end_datetime]

#                 # current_obs_data.plot()
                
#                 # print(current_obs_data)
#                 # print(dt.datetime.strftime(start_datetime_wlag, format='%Y-%m-%d %H:%M:%s'))
#                 # print(dt.datetime.strftime(end_datetime, format='%Y-%m-%d %H:%M:%s'))

#                 if date_as_str in forecast_dates:

#                         current_fct_data = fct_data_path + date_as_str + ".csv"
#                         # print( current_fct_data )

#                         fct_data = pd.read_csv(
#                             current_fct_data, parse_dates=['datetime'], header=0, sep=';')
#                         fct_data.set_index('datetime', inplace=True)

#                         fct_data = fct_data[start_datetime:end_datetime]

#                         fct_data = apply_bias_correction( variable, fct_data, fct_data_path, obs_data_path, dates, date )

#                         current_data = append_data(station_data, fct_data)

#                 else:

#                     if date_as_str != dt.datetime.strftime(start_date, '%Y%m%d'):
                        
#                         previous_day_start = start_datetime - \
#                             dt.timedelta(days=1)
#                         # print( previous_day_start )
#                         previous_day_end = previous_day_start + \
#                             dt.timedelta(hours=int(lead_hours))
#                         # print( previous_day_end )
#                         previous_day_data = station_data[previous_day_start:previous_day_end]
#                         # print( previous_day_data )

#                         yesterday_as_str = dt.datetime.strftime(
#                             date - dt.timedelta(days=1), '%Y%m%d')

#                         yesterday_fct_data_path = output_path + yesterday_as_str + ".csv"

#                         # print(yesterday_fct_data_path)

#                         yesterday_fct_data = pd.read_csv(
#                             yesterday_fct_data_path, parse_dates=['datetime'], header=0, sep=';')
#                         yesterday_fct_data.set_index(
#                             'datetime', inplace=True)

#                         yesterday_fct_data = yesterday_fct_data[previous_day_start:previous_day_end]

#                         dirty_data = dirtier(station_data[start_datetime:end_datetime], previous_day_data, yesterday_fct_data)

#                         dirty_data = apply_bias_correction( variable, dirty_data, fct_data_path, obs_data_path, dates, date )

#                         current_data = append_data(station_data, dirty_data)

#                     else:
#                         current_data = station_data

#                 current_data.plot()

#                 current_data.to_csv(
#                     output_path + dt.datetime.strftime(start_datetime, format='%Y%m%d') + '.csv', sep=';')