In [2]:
import sys

# Directory added to the import search path; presumably where the `lib`
# package imported by the next cell lives — TODO confirm.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs unchanged on a machine with the same directory layout.
lib_dir = "/home/daniele/documents/github/ftt01/phd/share/lib"
# Insert at position 0 so this directory is searched before every other entry.
sys.path.insert( 0, lib_dir )

In [2]:
from lib import *

In [3]:
import logging
from pathlib import Path

In [4]:
def export_data( df_datetime_values, datetime_format_output, output_path_filename ):
    """Write a frame to CSV with formatted timestamps and values rounded to 2 decimals.

    Parameters
    ----------
    df_datetime_values : pandas.DataFrame
        Frame that exposes a ``datetime`` column once its index is reset
        (typically a frame indexed by ``datetime``) and a ``values`` column.
        The frame passed in is left untouched.
    datetime_format_output : str
        ``strftime`` pattern applied to every ``datetime`` entry.
    output_path_filename : str
        Destination CSV file. Its parent directory is handed to
        ``mkNestedDir`` before writing.

    Raises
    ------
    KeyError
        If the ``datetime`` or ``values`` column is missing.
    """
    # reset_index() without inplace returns a new frame, so the caller's
    # object keeps its index (the previous in-place reset modified the argument).
    df = df_datetime_values.reset_index()

    # Plain column assignment replaces the column together with its dtype.
    # Writing through df.loc[:, [col]] instead sets values into the existing
    # datetime column, which on pandas >= 2.0 may try to keep the datetime dtype.
    df['datetime'] = df['datetime'].apply(
        lambda x: dt.datetime.strftime( x, datetime_format_output ) )
    df['values'] = df['values'].apply( lambda y: round( y, 2 ) )

    df = df.set_index('datetime')

    # A bare file name has no directory component; skip directory creation then.
    output_dir = os.path.dirname(output_path_filename)
    if output_dir:
        mkNestedDir( output_dir )
    df.to_csv( output_path_filename )

In [5]:
# Take the configuration path from the command line when one is supplied,
# otherwise fall back to the default location below.
try:
    input_parser = argparse.ArgumentParser()
    input_parser.add_argument('configuration_file', type=str)
    args = input_parser.parse_args()
    configuration_file = args.configuration_file
except (SystemExit, Exception):
    # argparse reports missing/unknown arguments by exiting (SystemExit), which
    # is not an Exception subclass and therefore has to be listed explicitly.
    # Unlike a bare `except:`, this no longer swallows KeyboardInterrupt.
    configuration_file = "../../etc/conf/extract/config.json"

In [6]:
# Parse the main JSON configuration. The file stays open only while it is
# being read; the `with` statement closes it, so no explicit close() is needed.
with open(configuration_file) as config_file:
    configuration = json.load(config_file)

project_name = configuration["project_name"]

# Provider and model settings
provider_name = configuration["provider_name"]
model_name = configuration["model"]["name"]
n_ensemble = configuration["model"]["ensemble"]
lead_hours = configuration["model"]["lead_hours"]
releases = configuration["model"]["releases"]

# Input, output and log locations. mkNestedDir presumably creates the
# directory (and parents) when missing — TODO confirm against lib.
input_path = configuration["input_path"]
output_path = configuration["output_path"]
mkNestedDir(output_path)
log_path = Path( configuration["log_path"] )
mkNestedDir(log_path)

# Extraction period and timestamp formats
start_date = configuration["start_date"]
end_date = configuration["end_date"]
datetime_format_input = configuration["datetime_format_input"]

datetime_format_output = configuration["datetime_format_output"]
current_tz = configuration["timezone"]

# Translate the configured level name into a logging constant; any value
# other than "info" or "debug" falls back to ERROR.
logging_level = {
    "info": logging.INFO,
    "debug": logging.DEBUG,
}.get( configuration["logging_level"], logging.ERROR )

script_version = configuration["script_version"]

# The region-of-interest description lives in a second JSON file whose path
# is given by the main configuration.
roi_config_file = configuration["roi_config"]
with open(roi_config_file) as roi_config_f:
    roi_configuration = json.load(roi_config_f)

roi_key = roi_configuration["main"]["key"]
roi_name = roi_configuration["main"]["name"]
basins = roi_configuration["basins"]

In [7]:
# Send log records to "<log_path>/<provider_name>_extract.log", appending to
# an existing file, at the level selected in the configuration.
logging.basicConfig(
    level = logging_level,
    filemode = 'a',
    format = '%(asctime)s - %(message)s',
    filename = "/".join( ( str(log_path), provider_name + "_extract.log" ) ),
)

In [8]:
# Timestamp taken before processing starts; it is reported as the start time
# in the notification e-mail sent by the last cell.
# NOTE(review): `dt` is presumably the `datetime` module re-exported by
# `from lib import *`; now() without a tz argument returns a naive value.
computation_start = dt.datetime.now()

In [9]:
# Record the main run settings at the top of the log, one line per setting.
for _label, _value in (
    ( "Project name: ", project_name ),
    ( "Provider name: ", provider_name ),
    ( "Input path: ", input_path ),
    ( "Output path: ", output_path ),
    ( "Log filename path: ", str(log_path) ),
):
    logging.info( _label + _value )

In [10]:
# Resolve the configured time zone once so every timestamp built in this cell
# carries the same tzinfo object.
current_tzinfo = tz.gettz(current_tz)

if lead_hours == -1:
    # lead_hours == -1 selects a single window spanning whole days, from
    # 00:00:00 on start_date to 23:59:59 on end_date.
    start_datetime = dt.datetime.strptime(
        start_date + 'T00:00:00', '%Y%m%dT%H:%M:%S' ).replace(tzinfo=current_tzinfo)
    end_datetime = dt.datetime.strptime(
        end_date + 'T23:59:59', '%Y%m%dT%H:%M:%S' ).replace(tzinfo=current_tzinfo)
    # One 'YYYYMMDD' label per day in the window, both ends included.
    days_to_compute = [
        ( start_datetime + dt.timedelta(days=i) ).strftime('%Y%m%d')
        for i in range( (end_datetime - start_datetime).days + 1 )
    ]

    logging.info( "Extract from: " + str(start_datetime) )
    logging.info( "Extract to: " + str(end_datetime) )
else:
    # start_date / end_date are rebound from 'YYYYMMDD' strings to aware
    # datetimes (the extraction loop relies on that). The isinstance guard
    # keeps the cell idempotent: re-running it no longer tries to parse an
    # already converted datetime.
    if isinstance( start_date, str ):
        start_date = dt.datetime.strptime( start_date, '%Y%m%d' ).replace(tzinfo=current_tzinfo)
    if isinstance( end_date, str ):
        end_date = dt.datetime.strptime( end_date, '%Y%m%d' ).replace(tzinfo=current_tzinfo)

    logging.info( "Extract from: " + str(start_date) )
    logging.info( "Extract to: " + str(end_date) )

In [None]:
# path_basins = glob.glob( input_path + "*/" )

In [None]:
def _hourly_window( data, window_start, window_end ):
    """Return `data` laid out on a complete hourly grid from window_start to window_end.

    The grid is the left side of a left join, so hours that are absent from
    `data` are kept as rows with missing values.
    """
    # A Timedelta frequency is used instead of the 'H' alias, which recent
    # pandas releases deprecate.
    grid = pd.DataFrame( index=pd.date_range(
        start=window_start, end=window_end, freq=pd.Timedelta(hours=1) )
    )
    grid.index.name = 'datetime'
    return pd.merge(
        grid, data[window_start:window_end],
        how="left", left_index=True, right_index=True )


# Resolve the time zone once instead of once per CSV row.
station_tzinfo = tz.gettz(current_tz)

for subbasin in basins:

    subbasin_key = subbasin['key']
    subbasin_name = subbasin['name']
    logging.info( "Processing subbasin: " + subbasin_key + " | " + subbasin_name )

    for stations in subbasin['ground_stations']:

        variable = stations['variable']
        logging.info( "Variable: " + variable )
        station_id = stations['station_id']
        logging.info( "Station ID: " + str(station_id) )

        # Station data is read from "<input_path><variable>/<station_id>.csv".
        try:
            c_data = pd.read_csv( input_path + variable + "/" + str(station_id) + ".csv" )
        except Exception:
            # `Exception` rather than a bare except, so an interrupt still
            # stops the run; the station is skipped as before.
            logging.error( "Station ID not valid: " + str(station_id) )
            logging.debug( "Reading the station file failed", exc_info=True )
            continue

        # Attach the configured time zone to the parsed timestamps (wall-clock
        # values are kept as they are) and index the frame by them.
        c_data['datetime'] = [
            t.replace(tzinfo=station_tzinfo)
            for t in pd.to_datetime( c_data['datetime'], format=datetime_format_input )
        ]
        c_data.set_index( 'datetime', inplace=True )

        if lead_hours == -1:
            # Single export covering the whole configured period.
            to_export_data = _hourly_window( c_data, start_datetime, end_datetime )

            output_path_filename = output_path + roi_key + "/" + \
                subbasin_key + "/" + variable + "/" + \
                    start_datetime.strftime("%Y%m%d") + end_datetime.strftime("%Y%m%d") + ".csv"

            export_data( to_export_data, datetime_format_output, output_path_filename )

        else:
            # One export per release and per day: each window starts at
            # start_date + (rel + 1) hours, is lead_hours long, and the start
            # advances by one day until it passes end_date.
            for rel in releases:
                curr_start_datetime = start_date + dt.timedelta( hours=rel+1 )
                while curr_start_datetime.date() <= end_date.date():

                    curr_end_datetime = curr_start_datetime + dt.timedelta( hours=lead_hours-1 )

                    logging.debug( "Current start datetime: " + str(curr_start_datetime) )
                    logging.debug( "Current end datetime: " + str(curr_end_datetime) )

                    to_export_data = _hourly_window(
                        c_data, curr_start_datetime, curr_end_datetime )

                    output_path_filename = output_path + roi_key + "/" + \
                        subbasin_key + "/" + "R{release}".format(release=str(rel).zfill(3)) + \
                            "/" + variable + "/" + \
                                curr_start_datetime.strftime("%Y%m%d") + ".csv"
                    export_data( to_export_data, datetime_format_output, output_path_filename )
                    del to_export_data

                    curr_start_datetime = curr_start_datetime + dt.timedelta( days=1 )

In [None]:
# Notify that the extraction finished, reporting start/finish times and the
# configuration that was used.
# The timestamps are converted to aware local time before formatting:
# astimezone() treats a naive value as system-local time, so %z renders a real
# UTC offset. The previous pattern appended a literal "Z" (which denotes UTC)
# to naive local times, for which %z is empty.
_stamp_format = "%Y-%m-%dT%H:%M:%S%z"
send_email(
    subject="Meteo Alto Adige extracted: " + project_name,
    body="Started at " + computation_start.astimezone().strftime(_stamp_format) +
        "\nFinish at " + dt.datetime.now().astimezone().strftime(_stamp_format) +
        "\nJSON config: " + json.dumps(configuration, indent=2, default=str)
)