### READ DATA AND CONVERT INTO CSV
1. Set up the working directory (wdir) where the GRIB2 data is collected.
2. Each subdirectory of the wdir is opened and the GRIB files inside it are processed:
    - each GRIB file is converted to a grid, based on the DWD weights

In [None]:
import sys

# Put the shared library directory first on the module search path so that it
# takes precedence when importing.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
lib_dir = "/home/daniele/documents/github/ftt01/phd/share/lib"
sys.path.insert( 0, lib_dir )

In [None]:
from lib import *

In [None]:
# wdir = "/home/daniele/documents/github/ftt01/phd/projects/hydrological_forecasting/"

In [None]:
# IMPORTs
import glob
import os
import shutil
import subprocess
import sys

import pandas as pd
from dask import dataframe as dd

In [None]:
def regrid_dir(path_dir, regridded=False):
    """Run the DWD ``regrid:icon-d2-eps`` container on ``path_dir``.

    The directory is mounted into the container at ``/local`` and passed as
    both ``INPUT_FILE`` and ``OUTPUT_FILE`` to ``/convert.sh``.

    Args:
        path_dir: Directory to mount into the container.
        regridded: When truthy, nothing is done and the working directory is
            left untouched.

    Returns:
        None.

    Raises:
        OSError: If ``path_dir`` cannot be entered or ``docker`` cannot be
            started.
        subprocess.CalledProcessError: If the container exits with a
            non-zero status.

    Note:
        The process working directory is changed to ``path_dir`` and is not
        restored afterwards.
    """
    if regridded:
        return

    os.chdir(path_dir)

    # An argument list (no shell) keeps a mount path containing spaces or
    # shell metacharacters intact.
    subprocess.run(
        [
            'docker', 'run', '--rm',
            '--volume', os.getcwd() + ':/local',
            '--env', 'INPUT_FILE=/local',
            '--env', 'OUTPUT_FILE=/local',
            'deutscherwetterdienst/regrid:icon-d2-eps',
            '/convert.sh',
        ],
        check=True,
    )

In [None]:
def transform(file_path, destination):
    """Copy a GRIB file into ``destination`` and rewrite it with ``grib_copy``.

    The file is copied into ``destination``, the ``python-eccodes`` container
    runs ``grib_copy -B stepRange`` on the copy to produce
    ``temporal_<filename>`` next to it, and the copy is then removed.

    Args:
        file_path: Path of the source GRIB file; the source is left in place.
        destination: Existing directory that receives the output. A trailing
            slash is accepted but no longer required.

    Returns:
        None.

    Raises:
        OSError: If the copy, the directory change or the clean-up fails, or
            if ``docker`` cannot be started.
        subprocess.CalledProcessError: If the container exits with a
            non-zero status.

    Note:
        The process working directory is changed to ``destination`` and is
        not restored afterwards.
    """
    filename = os.path.basename(file_path)

    shutil.copy(file_path, os.path.join(destination, filename))

    # The container sees the current directory as /local, so work from the
    # directory that now holds the copy.
    os.chdir(destination)

    # An argument list (no shell) keeps unusual file names intact.
    transform_command = [
        'docker', 'run', '--rm',
        '--volume', os.getcwd() + ':/local',
        'deutscherwetterdienst/python-eccodes',
        'grib_copy', '-B', 'stepRange', filename, 'temporal_' + filename,
    ]
    print(' '.join(transform_command))
    subprocess.run(transform_command, check=True)

    # Only the temporal_ output is kept; drop the intermediate copy.
    os.remove(filename)

In [None]:
# SETUP
# Input and output locations. Earlier locations are kept for reference:
# data_path = '/media/local_ssd/projects//tmp/hydrological_forecasting/machine_learning/data/forecast/icon-d2-eps_45h/input/todo/'
# output_path = '/media/local_ssd/projects//tmp/hydrological_forecasting/machine_learning/data/forecast/icon-d2-eps_45h/output_passirio/'
data_path = '/media/local_ssd/projects/alperia/db/data/tmp/'
output_path = '/media/local_ssd/projects/alperia/db/data/tmp/output/'

# Placeholder CSV stored in the output directory.
dummy_output = os.path.join(output_path, 'dummy_output.csv')

# Variables to process and the lead-hour window (first hour, last hour).
variables = ['tot_prec', 't_2m']
init_ref = '03'
init_lead_time, lead_hours = 1, 45
ensemble_number = 20

# Set to True when the directories have already been regridded.
regridded = True

# Active region of interest: Passirio basin, as (lower, upper) bounds.
basin = 'passirio'
lat = (46.68, 46.945)
lon = (11.015, 11.38)

# Alternative region of interest: Plan basin.
# basin = 'plan'
# lat = ( 46.7145853, 46.8251415 )
# lon = ( 11.0198472, 11.117037 )

In [None]:
# Input directories directly under data_path, in sorted order so that runs are
# reproducible. output_path is skipped: in the SETUP cell it sits under
# data_path, so the glob would otherwise treat it as an input directory.
dirs = sorted(
    d for d in glob.glob(data_path + '*/')
    if os.path.normpath(d) != os.path.normpath(output_path)
)

In [None]:
# # Include standard modules
# import argparse

# # Initiate the parser
# parser = argparse.ArgumentParser()

# # Add long and short argument
# parser.add_argument("--path", "-p", help="set input path")
# parser.add_argument("--variable", "-var", help="set variable to process")
# parser.add_argument("--leadtime", "-l", help="set starting lead time")

# # Read arguments from the command line
# args = parser.parse_args()

# # Check for --path
# if args.path:
#     print("Set path to %s" % args.path)
#     dirs = [args.path]
# # Check for --variable
# if args.variable:
#     print("Set variable to %s" % args.variable)
#     variables = [args.variable]
# # Check for --leadtime
# if args.leadtime:
#     print("Set leadtime to %s" % args.leadtime)
#     init_lead_time = int(args.leadtime)

In [None]:
# Zero-padded, three-digit labels for lead hours 1..lead_hours.
lead_time_array = [str(hour).zfill(3) for hour in range(1, lead_hours + 1)]

# For every input directory, variable and lead hour:
#   1. locate the GRIB file and, if needed, transform and regrid it;
#   2. dump it to output.csv with grib_get_data;
#   3. keep the rows inside the lat/lon window;
#   4. split those rows into ensemble_number equal chunks and write one
#      output.csv per chunk.
for el in dirs:

    # glob returns each directory with a trailing '/', so its name is the
    # second-to-last path component. That name is the date token used in the
    # GRIB file patterns below.
    current_date = el.split('/')[-2]

    print("Current date: " + current_date)

    for variable in variables:

        print("Variable: " + variable)

        for lead_hour in range(init_lead_time, lead_hours + 1):

            lead_mins = str(60 * lead_hour)

            n = str(lead_hour).zfill(3)
            print("Lead hour: " + n)

            original_file_to_read = el + 'icon-d2*' + current_date + \
                init_ref + '_' + n + '*' + variable + '*.grib2'
            print("Original file to regrid: " + original_file_to_read)

            # create inner dirs structure
            # (mkNestedDir comes from the shared lib; presumably it creates
            # the directory with its parents - TODO confirm)
            current_file_path = el + n + '/' + basin + '/' + variable + '/'
            mkNestedDir(current_file_path)
            os.chdir(current_file_path)

            original_current_file = glob.glob(original_file_to_read)

            if not original_current_file:

                # No GRIB file for this step: use the placeholder CSV. It has
                # to be named output.csv because that is the file read below.
                shutil.copy(dummy_output, current_file_path + 'output.csv')

            else:

                # look for an already regridded file in current_file_path
                file_to_read = current_file_path + '*regridded*' + current_date + \
                    init_ref + '_' + n + '*' + variable + '*.grib2'
                print("Check INNER file regridded: " + file_to_read)

                current_file = glob.glob(file_to_read)

                if not current_file:

                    transform(original_current_file[0], current_file_path)
                    regrid_dir(current_file_path)
                    current_file = glob.glob(file_to_read)

                    if not current_file:
                        raise FileNotFoundError(
                            "No regridded file matching " + file_to_read)

                    current_file = current_file[0]

                else:
                    current_file = current_file[0]
                    print("Current file: " + current_file)

                os.chdir(current_file_path)
                # extract data to output.csv; the container sees the current
                # directory as /local and its stdout is captured in the file
                with open('output.csv', 'wb') as extracted:
                    subprocess.run(
                        [
                            'docker', 'run', '--rm',
                            '--volume', os.getcwd() + ':/local',
                            'deutscherwetterdienst/python-eccodes',
                            'grib_get_data',
                            '-p', 'date,time,stepRange,shortName',
                            os.path.basename(current_file),
                        ],
                        stdout=extracted, check=True)

            # read exported data and cut to the ROI
            # comment="L": presumably drops the repeated header rows emitted by
            # grib_get_data - TODO confirm against a real output.csv
            data_df = dd.read_csv(current_file_path + 'output.csv', sep=r'\s+', header=None, skiprows=1,
                                  names=['lat', 'lon', 'values', 'date', 'time', 'step_range', 'name'], comment="L")

            data_df = data_df.astype({'lat': float, 'lon': float, 'values': float,
                                      'date': str, 'time': str, 'step_range': str, 'name': str})

            in_roi = (
                (data_df['lat'] >= lat[0]) & (data_df['lat'] <= lat[1])
                & (data_df['lon'] >= lon[0]) & (data_df['lon'] <= lon[1])
            )
            data_df = data_df[in_roi]

            # NOTE(review): neither label below appears in the `variables`
            # list of the SETUP cell ('tot_prec', 't_2m'), so with that
            # configuration no step_range filter is applied. Left unchanged
            # because enabling it would alter the exported data - confirm the
            # intended labels and the step_range format before changing.
            if variable == "temperature":
                data_df = data_df[data_df['step_range'] == lead_mins]
            elif variable == "precipitation":
                data_df = data_df[data_df['step_range'] == '0-' + lead_mins]

            data_df = data_df.compute()

            # Rows per ensemble member; rows left over after the integer
            # division are not exported.
            # Assumes the rows of each member are contiguous - TODO confirm.
            interruptor = len(data_df) // ensemble_number

            metadata = True
            for ens in range(ensemble_number):

                print("Ensemble #:" + str(ens+1))

                # create ensemble directory
                m = str(ens+1).zfill(3)
                ens_file_path = current_file_path + m + '/'
                mkNestedDir(ens_file_path)

                # Positional slice for this member; IDs restart at 1 in
                # every member.
                first_row = interruptor * ens
                member_rows = data_df.iloc[first_row:first_row + interruptor]

                current_data = pd.DataFrame({
                    'ID': range(1, interruptor + 1),
                    'lat': member_rows['lat'].to_numpy(),
                    'lon': member_rows['lon'].to_numpy(),
                    'values': member_rows['values'].to_numpy(),
                })

                current_data.to_csv(ens_file_path + 'output.csv')