In [1]:
# (C) Copyright 1996- ECMWF.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

In [2]:
# basic libraries for data analysis
import json
import numpy as np 
import pandas as pd
import geopandas as gpd
import xarray as xr

from itertools import product

# specialized libraries
import xagg as xa # spatial aggregation of data taken into consideration overlap of grid cells to shapefile

import multiprocessing # parallel processing
import tqdm # timing

from contextlib import contextmanager
import sys, os
@contextmanager
def suppress_stdout():
    """Temporarily send everything written to ``sys.stdout`` to ``os.devnull``.

    The ``sys.stdout`` object that was active on entry is put back when the
    ``with`` block exits, including when the block raises.
    """
    with open(os.devnull, "w") as sink:
        # remember the current stream and swap in the null sink in one step
        saved_stdout, sys.stdout = sys.stdout, sink
        try:
            yield
        finally:
            sys.stdout = saved_stdout

### Input variables

In [3]:
# Location prefixes for the input GRIB files and for the output NetCDF files.
# They are joined to the file names by plain string concatenation further down
# (e.g. input_dir_precip+'cf/Precipitation_cf_'+date+'.grb'), so a non-empty
# directory has to end with a path separator.
input_dir_precip = ''  # precipitation reforecasts, read from 'cf/' and 'pf/' below this prefix
input_dir_wvf = ''  # WVF reforecasts, read from 'CF/' and 'PF/' below this prefix
input_dir_rh = ''  # temperature & specific humidity reforecasts, read from 'CF/' and 'PF/'
output_data_file = ''  # prefix of the written files: '<prefix><area>_<variable>.nc'

In [4]:
# Read the Calabria warning-area polygons straight from the zipped shapefile
# and discard the 'id' column, which is not used further on.
warn_areas_Calabria = (
    gpd.read_file('zip://../Shapefiles/Calabria/WarningAreas.zip')
    .drop(columns=['id'])
)

### Create final format of shapefiles and data needed per analysed area

In [5]:
def prepros_shpf(shapefile_input):
    """Bring a warning-area shapefile into the layout used by the rest of the notebook.

    The polygons are reprojected to EPSG:4326, the two columns are renamed to
    'name_area' and 'geometry', and one extra row named 'Full' (all polygons
    dissolved into a single geometry) is appended.

    Parameters
    ----------
    shapefile_input : geopandas.GeoDataFrame
        Must hold exactly two columns, the area name followed by the geometry
        (the column labels are overwritten positionally).

    Returns
    -------
    list
        ``[polygons, bbox]`` where ``polygons`` is the extended GeoDataFrame and
        ``bbox`` is ``[min_lon, min_lat, max_lon, max_lat]`` rounded outwards to
        whole degrees.
    """
    areas = shapefile_input.to_crs("EPSG:4326")  # lat/lon coordinate system
    areas.columns = ['name_area', 'geometry']  # consistent column names

    # a constant helper column puts every polygon in one group, so that
    # dissolving by it merges all warning areas into a single geometry
    areas['Area'] = len(areas)
    merged = areas.dissolve(by='Area')
    merged['name_area'] = 'Full'
    combined = pd.concat([areas, merged]).drop(columns='Area')

    # bounding box for the gridded data: lower corner rounded down and upper
    # corner rounded up to whole degrees, which leaves a margin around the areas
    west, south, east, north = combined.total_bounds
    bbox = [np.floor(west), np.floor(south), np.ceil(east), np.ceil(north)]

    return [combined, bbox]

In [6]:
# Map every analysed domain name to its [polygons, bounding box] pair,
# as returned by prepros_shpf
domains_used = {'Calabria': prepros_shpf(warn_areas_Calabria)}

# the raw shapefile is no longer needed once it has been pre-processed
del warn_areas_Calabria

### Auxiliary functions for spatial aggregation & extremes identification

In [7]:
def generate_data_subset(data_full, dom_name_used):
    """Cut ``data_full`` down to the bounding box stored for one domain.

    Parameters
    ----------
    data_full : object with an xarray-style ``.sel``
        Gridded data with 'longitude' and 'latitude' coordinates.
    dom_name_used : str
        Key into the module-level ``domains_used`` dictionary.

    Returns
    -------
    The selection of ``data_full`` inside the domain's bounding box.
    """
    lon_min, lat_min, lon_max, lat_max = domains_used[dom_name_used][1]

    lon_window = slice(lon_min, lon_max)
    # latitude is sliced from the maximum to the minimum
    # (presumably because the grid stores latitude in descending order - confirm)
    lat_window = slice(lat_max, lat_min)

    return data_full.sel(longitude=lon_window, latitude=lat_window)

In [8]:
def spatial_aggreg(input_data):
    """Aggregate one gridded field over the polygons of one domain.

    Parameters
    ----------
    input_data : sequence of length 2
        ``(data, domain_name)``: the named gridded field and a key of the
        module-level ``domains_used`` dictionary.

    Returns
    -------
    The aggregated field, looked up by the input's ``name`` and with the
    'pix_idx' coordinate renamed to 'WarnArea'.
    """
    field, domain_key = input_data

    # polygons of the domain and the part of the grid covering them
    polygons = domains_used[domain_key][0]
    field_subset = generate_data_subset(field, domain_key)

    # overlap of pixels & polygons, then the aggregation based on that overlap
    overlap_weights = xa.pixel_overlaps(field_subset, polygons)
    area_values = xa.aggregate(field_subset, overlap_weights)

    # back to a data array (needs the input to carry a name) with a readable coordinate
    return area_values.to_dataset()[field.name].rename({'pix_idx': 'WarnArea'})

In [9]:
def spatial_aggregation_forecasts(dataset_used):
    """Run ``spatial_aggreg`` on ``dataset_used`` for every domain in ``domains_used``.

    Anything printed to stdout during the aggregation is suppressed.

    Returns
    -------
    list
        One aggregated result per domain, in the iteration order of ``domains_used``.
    """
    per_domain = []
    with suppress_stdout():
        for area_key in domains_used:
            per_domain.append(spatial_aggreg([dataset_used, area_key]))

    return per_domain

### ECMWF Reforecasts

Use only dates that belong to the same model cycle, so that the forecast data are consistent. Details about the changes between cycles are available at https://www.ecmwf.int/en/forecasts/documentation-and-support/changes-ecmwf-model

In [10]:
# Use dates for Cycle 46r1 11 June 2019 - 30 June 2020
start_date, end_date = '20190611', '20200630'
all_days = pd.date_range(start_date, end_date)

# keep Mondays (0) and Thursdays (3), formatted the way the file names expect them
in_dates = all_days[all_days.weekday.isin([0, 3])].strftime('%Y%m%d')

del start_date, end_date, all_days

In [11]:
def frcst_precip_init_date(init_date_used):
    """Load the precipitation reforecast of one initialisation date and aggregate it per area.

    The control member (cf) and the perturbed members (pf) are kept as
    individual members; no ensemble mean is derived (original author's note:
    the mean is very biased for precipitation).

    Parameters
    ----------
    init_date_used : str
        Initialisation date exactly as it appears in the GRIB file names.

    Returns
    -------
    list
        Result of ``spatial_aggregation_forecasts``: one aggregated array per
        entry of ``domains_used``.
    """
    cf_path = input_dir_precip+'cf/Precipitation_cf_'+init_date_used+'.grb'
    pf_path = input_dir_precip+'pf/Precipitation_pf_'+init_date_used+'.grb'

    # the control run is tagged as member 0 so it can be stacked with the pf members
    control = xr.open_dataarray(cf_path, engine='cfgrib').assign_coords({'number': 0})
    perturbed = xr.open_dataarray(pf_path, engine='cfgrib')
    members = xr.concat([control, perturbed], dim='number')

    # precipitation is cumulative: the first step is kept as it is (no +00 lead
    # time is available) and every later step becomes the step-to-step difference
    first_step = members.isel(step=0)
    increments = members.diff('step')
    per_step = xr.concat([first_step, increments], dim='step')

    # move every step label one day back, so a value is labelled with the day
    # it accumulates over rather than with the end of that accumulation
    shifted_steps = per_step.step.values-np.timedelta64(1, 'D')
    per_step = per_step.assign_coords({'step': shifted_steps})

    # values <= 0 (it happens due to interpolations) are masked and then set to 0
    per_step = per_step.where(per_step>0).fillna(0).reset_coords(drop=True)

    return spatial_aggregation_forecasts(per_step)

In [12]:
# Process every initialisation date in parallel. The pool is used as a context
# manager so that the worker processes are always released, also when a worker
# raises while the results are being collected (previously the pool was then
# left running because close()/join() were never reached).
with multiprocessing.Pool() as pool:
    # imap yields the results in the order of in_dates
    prec_data = list(tqdm.tqdm(pool.imap(frcst_precip_init_date, in_dates), 
                               total=len(in_dates), position=0, leave=True))
    pool.close(); pool.join() # normal path: let the workers finish gracefully

# regroup the per-date results by domain and write one NetCDF file per domain
# NOTE(review): whether the new 'time' axis carries date labels depends on the
# per-date arrays still holding a 'time' coordinate - confirm, since the
# coordinates are reset before the aggregation.
precip_final = {}
for i_ind, i_area in enumerate(domains_used):
    i_precip = [i_pr[i_ind] for i_pr in prec_data]
    precip_final[i_area] = xr.concat(i_precip, dim='time')
    precip_final[i_area].to_netcdf(f'{output_data_file}{i_area}_precip.nc')
    
del(pool, prec_data, i_ind, i_area, i_precip)

100%|█████████████████████████████████████████| 110/110 [09:19<00:00,  5.09s/it]


In [13]:
def frcst_wvf_init_date(init_date_used):
    """Load the WVF reforecast of one initialisation date and reduce it to daily values.

    The control member (cf) and the perturbed members (pf) are kept as
    individual members; no ensemble mean is derived.

    Parameters
    ----------
    init_date_used : str
        Initialisation date exactly as it appears in the GRIB file names.

    Returns
    -------
    Dataset with one variable per input variable, holding the weighted mean of
    a centred 5-step window, kept only where the step falls on half a day and
    relabelled 12 hours earlier.
    """
    cf_path = f'{input_dir_wvf}CF/CF_{init_date_used}.grb'
    pf_path = f'{input_dir_wvf}PF/PF_{init_date_used}.grb'

    # the control run is tagged as member 0 so it can be stacked with the pf members
    control = xr.open_dataset(cf_path, engine='cfgrib').assign_coords({'number': 0})
    perturbed = xr.open_dataset(pf_path, engine='cfgrib')
    members = xr.concat([control, perturbed], dim='number').reset_coords(drop=True)

    # stack the variables along a 'direction' dimension; per the original
    # author's note the dot product used below does not work on a dataset
    stacked = members.to_array().rename({'variable': 'direction'})

    # centred 5-step window with half weight on both ends
    # (original note: weighted average of 6-hourly WVF)
    window_weights = xr.DataArray(np.array([0.5]+[1]*3+[0.5]), dims=['window'])
    windows = stacked.rolling(step=5, center=True).construct('window')
    daily = windows.dot(window_weights)/window_weights.sum()

    # keep only 12 UTC valid times, i.e. steps ending on half a day
    step_in_days = daily.step.values/np.timedelta64(1, 'D')
    daily = daily.isel(step=(step_in_days%1==0.5))

    # step as 0UTC for each day
    daily = daily.assign_coords({'step': (daily.step-np.timedelta64(12, 'h'))})

    return daily.to_dataset('direction') # convert back to dataset

In [14]:
def frcst_wvf_all_init_date(init_date_used):
    """Build the per-area WVF components for one initialisation date.

    For every domain the result holds six fields along 'wvf_direction':
    'NorthW'/'SouthW' (area aggregate of 'viwvn' and its negative),
    'EastW'/'WestW' (area aggregate of 'viwve' and its negative),
    'Total_Grid' (area aggregate of the per-grid-cell magnitude) and
    'Total_Area' (magnitude of the two area-aggregated components).

    Parameters
    ----------
    init_date_used : str
        Initialisation date, passed on to ``frcst_wvf_init_date``.

    Returns
    -------
    list
        One concatenated array per entry of ``domains_used`` (an empty list if
        there are no domains; the former trailing ``del`` of loop-only locals
        raised NameError in that case and had no effect otherwise).
    """
    wvf_directions = frcst_wvf_init_date(init_date_used)

    # both components get the same name so that their combination keeps it;
    # spatial_aggreg looks the aggregated field up by the input's name
    wvf_northwards = wvf_directions['viwvn'].rename('wvf')
    wvf_eastwards = wvf_directions['viwve'].rename('wvf')
    wvf_total = np.sqrt(wvf_northwards**2 + wvf_eastwards**2)

    north_by_domain = spatial_aggregation_forecasts(wvf_northwards)
    east_by_domain = spatial_aggregation_forecasts(wvf_eastwards)
    total_by_domain = spatial_aggregation_forecasts(wvf_total)

    wvf_name = pd.Index(['NorthW', 'SouthW', 'EastW', 'WestW', 'Total_Grid', 'Total_Area'], name='wvf_direction')

    aggr_data = []
    for i_dom_northw, i_dom_eastw, i_dom_total_grid in zip(north_by_domain, east_by_domain, total_by_domain):
        i_dom_total_area = np.sqrt(i_dom_northw**2 + i_dom_eastw**2)
        # order must match the labels in wvf_name
        i_dom_final = [i_dom_northw, -i_dom_northw, i_dom_eastw, -i_dom_eastw,
                       i_dom_total_grid, i_dom_total_area]
        aggr_data.append(xr.concat(i_dom_final, dim=wvf_name))

    return aggr_data

In [1]:
# Process every initialisation date in parallel. The pool is used as a context
# manager so that the worker processes are always released, also when a worker
# raises while the results are being collected (previously the pool was then
# left running because close()/join() were never reached).
with multiprocessing.Pool() as pool:
    # imap yields the results in the order of in_dates
    wvf_data = list(tqdm.tqdm(pool.imap(frcst_wvf_all_init_date, in_dates), 
                              total=len(in_dates), position=0, leave=True))
    pool.close(); pool.join() # normal path: let the workers finish gracefully

# regroup the per-date results by domain and write one NetCDF file per domain
wvf_final = {}
for i_ind, i_area in enumerate(domains_used):
    i_wvf = [i_pr[i_ind] for i_pr in wvf_data]
    wvf_final[i_area] = xr.concat(i_wvf, dim='time')
    wvf_final[i_area].to_netcdf(f'{output_data_file}{i_area}_wvf.nc')
    
del(pool, wvf_data, i_ind, i_area, i_wvf)

In [16]:
def calc_relhum(data_used):
    """Approximate relative humidity (in %) from temperature and specific humidity.

    Implements RH = 0.263 * p * q / exp(17.67 * (T - 273.15) / (T - 29.65)),
    the usual approximation built on Bolton's (1980) saturation vapour
    pressure, with p in Pa and T in K.

    Parameters
    ----------
    data_used : mapping-like
        Provides 't' (temperature) and 'q' (specific humidity). 't' must expose
        the pressure level as ``isobaricInhPa``; it is multiplied by 100 here
        (assumes hPa for the level and K for 't' - TODO confirm against the input files).

    Returns
    -------
    The relative humidity with 'isobaricInhPa' renamed to 'pressure_level'.
    """
    temp = data_used['t']
    sphm = data_used['q']

    pressure_pa = temp.isobaricInhPa*100 # hPa -> Pa

    # Reference temperature is 273.15 K, consistent with the 29.65 (= 273.15 - 243.5)
    # in the denominator. The former value 274.16 shifted the saturation curve by
    # about 1 K and inflated the resulting relative humidity by several percent.
    saturation_term = np.exp(17.67*(temp-273.15)/(temp-29.65))

    relhum = .263*pressure_pa*sphm/saturation_term

    return relhum.rename({'isobaricInhPa': 'pressure_level'})

In [17]:
def frcst_relhum_init_date(init_date_used):
    """Load one initialisation date, derive daily relative humidity and aggregate it per area.

    The control member (cf) and the perturbed members (pf) are kept as
    individual members; no ensemble mean is derived. Relative humidity is
    computed with ``calc_relhum`` and reduced to daily values separately for
    the 850 and 700 levels before both are stacked along 'pressure_level'.

    Parameters
    ----------
    init_date_used : str
        Initialisation date exactly as it appears in the GRIB file names.

    Returns
    -------
    list
        Result of ``spatial_aggregation_forecasts``: one aggregated array per
        entry of ``domains_used``.
    """
    cf_path = f'{input_dir_rh}CF/CF_{init_date_used}.grb'
    pf_path = f'{input_dir_rh}PF/PF_{init_date_used}.grb'

    # the control run is tagged as member 0 so it can be stacked with the pf members
    control = xr.open_dataset(cf_path, engine='cfgrib').assign_coords({'number': 0})
    perturbed = xr.open_dataset(pf_path, engine='cfgrib')
    members = xr.concat([control, perturbed], dim='number')

    # rel hum based on available temp and spe hum
    relhum = calc_relhum(members.reset_coords(drop=True))

    def _midday_weighted_mean(level_field, window_weights):
        # weighted mean over a centred window as long as the weight list,
        # kept only at 12 UTC valid times (steps ending on half a day)
        weights = xr.DataArray(np.array(window_weights), dims=['window'])
        windows = level_field.rolling(step=len(window_weights), center=True).construct('window')
        averaged = windows.dot(weights)/weights.sum()
        step_in_days = averaged.step.values/np.timedelta64(1, 'D')
        return averaged.isel(step=(step_in_days%1==0.5))

    # 850 has all steps available; 700 includes only steps 0, 12, 24, ... so the
    # steps without data are dropped and a shorter window is used
    rh850 = relhum.sel(pressure_level=850).copy(deep=True)
    rh700 = relhum.sel(pressure_level=700).dropna('step').copy(deep=True)

    daily850 = _midday_weighted_mean(rh850, [0.5]+[1]*3+[0.5])
    daily700 = _midday_weighted_mean(rh700, [0.5]+[1]+[0.5])

    daily = xr.concat([daily700, daily850], dim='pressure_level')
    # step as 0UTC for each day
    daily = daily.assign_coords({'step': (daily.step-np.timedelta64(12, 'h'))})
    daily.name = 'relhum'

    return spatial_aggregation_forecasts(daily)

In [2]:
# Process every initialisation date in parallel with 5 workers. The pool is
# used as a context manager so that the worker processes are always released,
# also when a worker raises while the results are being collected (previously
# the pool was then left running because close()/join() were never reached).
with multiprocessing.Pool(processes=5) as pool:
    # imap yields the results in the order of in_dates
    rlhm_data = list(tqdm.tqdm(pool.imap(frcst_relhum_init_date, in_dates), 
                               total=len(in_dates), position=0, leave=True))
    pool.close(); pool.join() # normal path: let the workers finish gracefully

# regroup the per-date results by domain and write one NetCDF file per domain
relhum_final = {}
for i_ind, i_area in enumerate(domains_used):
    i_relhum = [i_pr[i_ind] for i_pr in rlhm_data]
    relhum_final[i_area] = xr.concat(i_relhum, dim='time')
    relhum_final[i_area].to_netcdf(f'{output_data_file}{i_area}_relhum.nc')
    
del(pool, rlhm_data, i_ind, i_area, i_relhum)