In [1]:
import sys

lib_dir = "/home/daniele/documents/github/ftt01/phd/share/lib"
sys.path.insert( 0, lib_dir )

from lib import *
import subprocess
import psycopg2
# import datetime as dt

In [2]:
class local_args():
    """Mutable bag of run parameters, filled in through ``add_*`` setters.

    A freshly created instance carries no attributes at all; each setter
    stores its argument under the attribute of the same name.
    """

    def __init__(self) -> None:
        # Nothing to initialise: attributes appear only when a setter is called.
        pass

    # --- time window -----------------------------------------------------

    def add_start_date(self, start_date) -> None:
        """Store the start of the requested period."""
        self.start_date = start_date

    def add_end_date(self, end_date) -> None:
        """Store the end of the requested period."""
        self.end_date = end_date

    # --- location --------------------------------------------------------

    def add_latitude(self, latitude) -> None:
        """Store the latitude of the point of interest."""
        self.latitude = latitude

    def add_longitude(self, longitude) -> None:
        """Store the longitude of the point of interest."""
        self.longitude = longitude

    def add_neighbors(self, nn) -> None:
        """Store the number of neighbors under ``self.neighbors``."""
        self.neighbors = nn

    # --- variable and files ----------------------------------------------

    def add_variable(self, variable) -> None:
        """Store the name of the variable to process."""
        self.variable = variable

    def add_meta_grid(self, meta_grid) -> None:
        """Store the path of the grid metadata file."""
        self.meta_grid = meta_grid

    def add_output_path(self, output_path) -> None:
        """Store the output directory and hand it to ``mkNestedDir``."""
        self.output_path = output_path
        # The attribute is set first, so it is available even if the
        # directory helper fails.
        mkNestedDir(output_path)

In [3]:
# Command-line parsing is disabled while this runs as a notebook; the block
# below is kept only as a reference of the script-mode arguments.
# try:
# input_parser = argparse.ArgumentParser()
# input_parser.add_argument('polygon', type=str)
# input_parser.add_argument('start_date', type=str)
# input_parser.add_argument('end_date', type=str)
# input_parser.add_argument('variable', type=str)
# input_parser.add_argument('output_path', type=str)
# input_parser.add_argument('meta_grid', type=str)
# args = input_parser.parse_args()
# except:
args = local_args()

# Point of interest and number of neighbors requested from the extractor.
args.add_latitude(46.42)
args.add_longitude(10.95)
args.add_neighbors(1)

# Requested period, as ISO-like strings (parsed to datetimes in a later cell).
args.add_start_date("2010-01-01T00:00:00")
args.add_end_date("2019-12-31T23:59:00")

args.add_variable("2t")

# Output location; add_output_path also passes the directory to mkNestedDir.
args.add_output_path("/media/windows/projects/bias_correction/applications/era5land/data/pre_processed/")
args.add_meta_grid("/media/windows/projects/bias_correction/applications/era5land/data/pre_processed/grid.csv")

In [4]:
class Data():
    """Hold predictor ("features") and target ("outputs") series and cut them
    into lag/lead windows.

    Both frames grow column-wise through :meth:`add_feature` and
    :meth:`add_output`. ``start_date`` / ``end_date`` bound the usable period:
    they start from a deliberately wide default (1950-01-01 .. now) and are
    narrowed by :meth:`update_dates`.

    NOTE(review): the frames are assumed to be indexed by timestamps, since
    :meth:`batch_data` slices rows by datetime -- confirm against the extractors.
    """

    def __init__(self) -> None:
        self.features = pd.DataFrame()
        self.outputs = pd.DataFrame()
        # Wide default window; update_dates() can only shrink it.
        self.start_date = pd.to_datetime(
            dt.datetime.strptime(
                "19500101T00:00", "%Y%m%dT%H:%M"))
        self.end_date = pd.to_datetime( dt.datetime.now() )

    @staticmethod
    def _prepare_column(df, name, ffill):
        """Rename the first column of ``df`` to ``name`` and optionally fill gaps.

        ``ffill`` is a ``(enabled, fill_value)`` pair. When ``enabled`` is
        ``True`` the frame is forward-filled; values that are still missing
        afterwards (e.g. leading gaps) are set to ``fill_value`` unless it is
        ``None``.
        """
        fill_gaps, fill_value = ffill
        df = df.rename(columns={df.columns[0]: name})
        if fill_gaps is True:
            # DataFrame.ffill() replaces the deprecated fillna(method="ffill");
            # fillna(value) replaces replace(np.NaN, value), which relied on the
            # np.NaN alias removed in NumPy 2.0.
            df = df.ffill()
            if fill_value is not None:
                df = df.fillna(fill_value)
        return df

    def add_output(self, df, name, ffill=(False,None)):
        """Append the first column of ``df`` to ``self.outputs`` as ``name``.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame whose first column is the series to add.
        name : str
            Column name to use inside ``self.outputs``.
        ffill : tuple
            ``(enabled, fill_value)``; see :meth:`_prepare_column`.
        """
        df = self._prepare_column(df, name, ffill)
        self.outputs = pd.concat([self.outputs, df], axis=1)

    def add_feature(self, df, name, ffill=(False,None)):
        """Append the first column of ``df`` to ``self.features`` as ``name``.

        Same parameters as :meth:`add_output`.
        """
        df = self._prepare_column(df, name, ffill)
        self.features = pd.concat([self.features, df], axis=1)

    def update_dates(self):
        """Narrow ``start_date`` / ``end_date`` to the span of ``self.outputs``.

        The window is only ever shrunk, never widened. Nothing happens while
        ``self.outputs`` has no rows.

        NOTE(review): only the outputs index is considered; the features index
        is not checked here.
        """
        index = self.outputs.index
        if len(index) == 0:
            return

        # min()/max() instead of index[0]/index[-1]: identical for a sorted
        # index, and still correct if the index is not sorted.
        start_date = index.min()
        end_date = index.max()

        if start_date > self.start_date:
            self.start_date = start_date

        if end_date < self.end_date:
            self.end_date = end_date

    def batch_data(self, start_date, prediction_hour, lag_hours, lead_hours):
        """Return the ``(lag_block, lead_block)`` pair for one prediction day.

        The prediction time is ``prediction_hour`` o'clock on the calendar day
        of ``start_date``.

        * ``lag_block``: rows of ``self.features`` from
          ``prediction - 1h - lag_hours`` to ``prediction - 1h``, inclusive.
        * ``lead_block``: rows of ``self.outputs`` from ``prediction`` to
          ``prediction + lead_hours``, inclusive.

        Raises
        ------
        ValueError
            If the lag window would start before ``self.start_date``.
        """
        c_start_date = dt.datetime(start_date.year, start_date.month, start_date.day, prediction_hour)

        ## lag hours
        c_end_date = c_start_date - dt.timedelta(hours=1)
        c_lag_date = c_end_date - dt.timedelta( hours = lag_hours )
        if self.start_date > c_lag_date:
            # The lag window ends one hour before the prediction time, so the
            # earliest valid prediction time is lag_hours + 1 after the start.
            first_available = self.start_date + dt.timedelta( hours = lag_hours + 1 )
            raise ValueError(f"Start date not allowed: {start_date}, first available {first_available}")
        lag_block = self.features.loc[c_lag_date:c_end_date]

        ## lead hours
        c_end_forecast = c_start_date + dt.timedelta(hours=lead_hours)
        lead_block = self.outputs.loc[c_start_date:c_end_forecast]

        return lag_block, lead_block

In [5]:
# Parse the configured period boundaries (strings) into datetime objects.
start_date, end_date = (
    dt.datetime.strptime( stamp, '%Y-%m-%dT%H:%M:%S' )
    for stamp in (args.start_date, args.end_date)
)

In [6]:
# Variable names requested from the ERA5-Land extractor; each becomes one
# feature column (per neighbor) of the Data container.
variables = ["2t", "tp"]

In [7]:
test = Data()

# The streamflow series does not depend on the loop variable, so it is
# extracted once instead of once per variable.
df_streamflow = extract_pab(start_date, end_date, 'Q', args.latitude, args.longitude)

for v in variables:

    # "tp" gaps are forward-filled and any value still missing is set to 0;
    # every other variable is taken as extracted.
    ffill = (True, 0) if v == "tp" else (False, None)

    df = extract_era5land(start_date, end_date, v, args.latitude, args.longitude, args.neighbors)

    for i in range(args.neighbors):
        # With a single neighbor the column keeps the plain variable name;
        # with several, a suffix keeps the column names unique.
        feature_name = v if args.neighbors == 1 else f"{v}_{i}"
        test.add_feature(df[[df.columns[i]]], feature_name, ffill=ffill)

# Always use the first streamflow column: the previous code indexed it with
# the leaked loop variable `i`, which only worked for neighbors == 1.
test.add_output(df_streamflow[[df_streamflow.columns[0]]], "streamflow", ffill=(False,0))
test.update_dates()

In [8]:
# Window sizes, in hours: how far back the feature block reaches and how far
# ahead the target block reaches from the prediction time.
lag_hours = 24
lead_hours = 24

# Hour of the day at which a prediction is issued, and the step (in hours)
# between two consecutive prediction days.
prediction_hour = 10
prediction_interval = 24

In [9]:
# Accumulators for the accepted windows: one nested list per prediction day.
features_blocks, outputs_blocks = [], []

In [10]:
# From here on `end_date` is the end of the period held by the Data container
# (as narrowed by update_dates), replacing the value parsed from `args`.
end_date = test.end_date

In [11]:
# Display the end of the usable period.
end_date

Timestamp('2019-12-31 23:00:00', freq='H')

In [12]:
# Loop flag for the batching cell below (that loop exits through `break`).
end_of_dates = False
# First candidate day: `lag_hours` after the requested start.
# NOTE(review): presumably so that a lag window can exist before it -- confirm.
c_start_date = start_date + dt.timedelta(hours=lag_hours)

In [13]:
# Display the first candidate day.
c_start_date

datetime.datetime(2010, 1, 2, 0, 0)

In [14]:
# Display the configured lag window length (hours).
lag_hours

24

In [15]:
# Walk through the period one prediction day at a time and keep every day
# whose lag/lead windows are complete and free of missing values.
while not end_of_dates:

    c_end_date = c_start_date + dt.timedelta(hours=lead_hours)
    if c_end_date > end_date:
        break

    lag_block, lead_block = test.batch_data(c_start_date, prediction_hour, lag_hours, lead_hours)

    # Both windows are inclusive slices, hence the "+ 1". The lead block is
    # checked against lead_hours (it was compared with lag_hours before, which
    # only worked because the two settings happen to be equal).
    has_expected_length = (
        lag_block.shape[0] == lag_hours + 1
        and lead_block.shape[0] == lead_hours + 1
    )
    has_gaps = lag_block.isna().any().any() or lead_block.isna().any().any()

    if has_expected_length and not has_gaps:

        features_blocks.append(lag_block.values.tolist())
        outputs_blocks.append(lead_block.values.tolist())

    else:
        # Report the rejected day with the sizes of its two windows.
        print(f"Current start date: {c_start_date}")

        print(f"Block lag length: {lag_block.shape[0]}")
        print(f"Block lead length: {lead_block.shape[0]}")

    c_start_date = c_start_date + dt.timedelta(hours=prediction_interval)

Current start date: 2010-03-27 00:00:00
Block lag length: 25
Block lead length: 25
Current start date: 2011-03-26 00:00:00
Block lag length: 25
Block lead length: 25
Current start date: 2017-02-14 00:00:00
Block lag length: 25
Block lead length: 25
Current start date: 2017-02-15 00:00:00
Block lag length: 25
Block lead length: 25
Current start date: 2017-02-22 00:00:00
Block lag length: 25
Block lead length: 25
Current start date: 2017-02-23 00:00:00
Block lag length: 25
Block lead length: 25
Current start date: 2017-02-24 00:00:00
Block lag length: 25
Block lead length: 25
Current start date: 2017-02-25 00:00:00
Block lag length: 25
Block lead length: 25
Current start date: 2017-04-17 00:00:00
Block lag length: 25
Block lead length: 25
Current start date: 2017-07-05 00:00:00
Block lag length: 25
Block lead length: 25
Current start date: 2017-07-06 00:00:00
Block lag length: 25
Block lead length: 25
Current start date: 2018-07-03 00:00:00
Block lag length: 25
Block lead length: 25
Curr

In [16]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch.autograd import Function

class DataBuilder(Dataset):
    """Dataset pairing feature arrays with target arrays, stored as tensors.

    ``x`` and ``y`` are NumPy arrays; both are converted to tensors on
    ``device`` when the dataset is built. Items are ``(x[i], y[i])`` pairs and
    the length is the size of the first dimension of ``x``.
    """

    def __init__(self, x, y, device):
        # The device must be stored first: numpyToTensor reads it.
        self.device = device
        self.x = self.numpyToTensor(x)
        self.y = self.numpyToTensor(y)
        self.len = self.x.size(0)

    def numpyToTensor(self,x):
        """Wrap a NumPy array as a tensor and move it to ``self.device``."""
        tensor = torch.from_numpy(x)
        return tensor.to(self.device)

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        sample = self.x[index]
        target = self.y[index]
        return sample, target

In [17]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Turn the accumulated nested lists into float32 arrays.
features_blocks = np.array(features_blocks).astype(np.float32)
outputs_blocks = np.array(outputs_blocks).astype(np.float32)

dataset = DataBuilder( features_blocks, outputs_blocks, device )

# Inspect the tensor shapes of the features and of the targets.
dataset.x.shape
dataset.y.shape

torch.Size([3635, 25, 2])

torch.Size([3635, 25, 1])

In [None]:
## for each member of the ensemble [neighbors]
### create a dataset
### batch data

In [None]:
# NOTE(review): this rebinds `start_date`, so running the cell twice shifts it
# twice; the filename cell further down reads the shifted value.
start_date = start_date + dt.timedelta( hours = lag_hours )
c_start_date = dt.datetime(start_date.year, start_date.month, start_date.day, prediction_hour)
c_end_forecast = c_start_date + dt.timedelta(hours=lead_hours)

c_lag_date = c_start_date - dt.timedelta(hours=lag_hours)

# Data exposes `features` and `outputs` only (there is no `dataset`
# attribute): the lag window is read from the features and the lead window
# from the outputs, as Data.batch_data does.
lag_block = test.features.loc[c_lag_date:c_start_date]
c_start_forecast = c_start_date+dt.timedelta(hours=1)
lead_block = test.outputs.loc[c_start_forecast:c_end_forecast]

In [None]:
# Display the start of the lag window.
c_lag_date

In [None]:
# Display the current prediction time.
c_start_date

In [None]:
# Display the end of the lead (forecast) window.
c_end_forecast

In [None]:
# Show the rows between the two timestamps.
# NOTE(review): `full_df` is not defined in any cell visible in this notebook
# -- confirm where it comes from before running.
full_df["2010-01-01 00:00":"2010-01-03 00:00"]

In [None]:
# Plot `full_df` on a 10x10 figure.
# NOTE(review): `full_df` is not defined in any cell visible in this notebook
# -- confirm where it comes from before running.
full_df.plot(figsize=(10,10))

In [None]:
# <output_path><variable>_<start>_<end>.csv, timestamps as YYYYmmddTHHMMSS.
# The output path is concatenated as-is, so it has to end with a separator.
output_filename = '_'.join([
    args.output_path + args.variable,
    start_date.strftime('%Y%m%dT%H%M%S'),
    end_date.strftime('%Y%m%dT%H%M%S'),
]) + '.csv'

In [None]:
# Display the path the CSV will be written to.
output_filename

In [None]:
# Write `full_df` to `output_filename` as CSV.
# NOTE(review): `full_df` is not defined in any cell visible in this notebook
# -- confirm where it comes from before running.
full_df.to_csv( output_filename )