In [None]:
# (C) Copyright 1996- ECMWF.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

In [1]:
import numpy as np
import pandas as pd
import xarray as xr

from itertools import product

import multiprocessing
import tqdm

from pathlib import Path

In [2]:
# forecasts_loc: prefix of the 'Data/<variable>/<cf|pf>/' folders holding the forecast GRIB files
# out_dir: prefix of the netCDF inputs (composites, climatology) and of the saved allocations
# (empty strings mean that all paths are resolved relative to the working directory)
forecasts_loc, out_dir = '', ''

In [3]:
# Composites of the Mediterranean patterns (one field per cluster and variable), kept as float32
composites = xr.open_dataset(out_dir+'Med_LocalizedPatterns_Composites.nc')
composites = composites.astype('float32')

# Dataset read from the 'EuclMean' file; used later to scale the distances between forecasts and composites
eucl_mean = xr.open_dataset(out_dir+'Med_LocalizedPatterns_EuclMean.nc')

# Cosine of latitude, used as weight so that grid cells count according to their area
weights_lat = np.cos(np.deg2rad(composites.latitude)).astype('float32')

# Auxiliary information derived from the composites: variable names and grid coordinates
atm_var_used = list(composites.data_vars)
lats_used, lons_used = composites.latitude.values, composites.longitude.values

composites

ERA5 climatologies of atmospheric variables

In [4]:
# ERA5 climatology, restricted to the atmospheric variables that have composites
era5_clim = xr.open_dataset(out_dir+'ERA5_clim.nc')
era5_clim = era5_clim[atm_var_used]

# Number of days added before/after each calendar day when the ERA5 climatology was defined;
# stored with the file and reused below so that the forecast climatology uses the same window
days_shift = int(era5_clim.days_shift.values)

# `drop_vars` replaces the deprecated `Dataset.drop` for removing a variable/coordinate by name
era5_clim = era5_clim.drop_vars('days_shift').astype('float32')

Use the data for Cycle 46r1 (start at 2019-06-11, finish at 2020-06-30).

In [5]:
# Use dates for Cycle 46r1 11 June 2019 - 30 June 2020
start_date = '20190611'
end_date = '20200630'

In [6]:
# every calendar day of the selected period
init_dates = pd.date_range(start_date, end_date)

# forecasts are only used for Mondays (weekday 0) and Thursdays (weekday 3)
init_dates = init_dates[init_dates.weekday.isin([0, 3])]

# same YYYYMMDD text format as used in the names of the forecast files
init_dates = init_dates.strftime('%Y%m%d')

del start_date, end_date

Get all available steps (lead times) of the analyzed reforecast data.

In [7]:
# The control forecast of the first variable and first initialization date is opened
# only to read which steps (lead times) exist
aux_path = forecasts_loc+'Data/'+atm_var_used[0]+'/cf/'
aux_path = aux_path+atm_var_used[0]+'_cf_'+init_dates[0]+'.grb'

# context manager closes the GRIB file as soon as the step values have been copied
with xr.open_dataarray(aux_path, engine='cfgrib') as aux_file:
    all_available_steps = aux_file.step.values

# last one is not available, as no 00 UTC data of the next day exist
all_available_steps = all_available_steps[:-1]

# make subsets of `len_subs` consecutive steps, due to data volume limitations
len_subs = 3
all_available_steps = [all_available_steps[i:i+len_subs] for i in range(0, len(all_available_steps), len_subs)]

del(aux_path, aux_file, len_subs)

In [8]:
def _load_members(param_used, init_date_used, member_type):
    
    ''' Read one forecast GRIB file ('cf' or 'pf'), cropped to the domain of the composites, as float32 '''
    
    files_loc = forecasts_loc+'Data/'+param_used+'/'+member_type+'/'
    file_name = files_loc+param_used+'_'+member_type+'_'+init_date_used+'.grb'
    
    # the context manager closes the GRIB file once the needed values are in memory
    with xr.open_dataarray(file_name, engine='cfgrib') as grib_data:
        # crop before converting, so that only the domain of interest is held as float32
        # (latitude slice goes from max to min, i.e. assumes latitudes stored north to south)
        members = grib_data.sel(latitude=slice(max(lats_used), min(lats_used)), 
                                longitude=slice(min(lons_used), max(lons_used)))
        members = members.astype('float32').load() # float32 for memory efficiency
    
    return members


def frcst_data(input_data):
    
    ''' 
    Get the daily-mean forecast fields of one variable for one initialization date.
    
    Input data is a sequence of 3 [a, b, c] with: a. initialization date (string as used in the file names), 
    b. parameter (atmospheric variable) used, c. steps to keep (subset of lead times, due to the data volume).
    
    Returns a float32 DataArray named after the parameter, holding the control member (number=0), the members 
    of the 'pf' file and the mean of all members (number=-1), averaged over each rolling window given in 
    eucl_mean's 'rolling' coordinate, for the requested steps only.
    '''
    
    init_date_used = input_data[0] # initialization date of forecast
    param_used = input_data[1] # atmospheric variable of interest
    steps_used = input_data[2] # steps used (needs subsetting due to large volume of data!)
    
    # get the data of the control member (cf), flagged as member 0, and of the ensemble members (pf)
    control_forecast = _load_members(param_used, init_date_used, 'cf').assign_coords({'number': 0})
    ensemble_forecast = _load_members(param_used, init_date_used, 'pf')
    
    all_members = xr.concat([control_forecast, ensemble_forecast], dim='number') # combine cf and pf data
    
    all_mean = all_members.mean(dim=['number']) # mean of all members (ensemble + control)
    all_mean = all_mean.assign_coords({'number': -1}) # assign the mean as "-1" on the number coordinate
    
    final = xr.concat([all_members, all_mean], dim='number').sortby('number') # combine all data with frcst mean
    
    final = final.rolling(step=2).mean().dropna('step') # average start & end of day to get mean daily field
    final = final.assign_coords({'step': final.step.values-np.timedelta64(1, 'D')}) # step is the min lag
    
    final.name = param_used
    
    # temporal averaging for every window length; min_periods=1 keeps the first steps of each window
    final = [final.rolling(step=i_rol, min_periods=1).mean() for i_rol in eucl_mean.coords['rolling'].values]
    final = xr.concat(final, dim=eucl_mean.coords['rolling'])
    final = final.sel(step=steps_used)
    
    return final.astype('float32')

In [9]:
def all_frst(dates, variable, steps):
    
    ''' 
    Read the forecasts of one variable for all initialization dates in parallel.
    
    dates: initialization dates, variable: atmospheric variable, steps: subset of steps kept for every date.
    Returns the outputs of frcst_data for all dates (same order as `dates`) concatenated along 'time'.
    '''
    
    combs = list(product(dates, [variable], [steps])) # one task [date, variable, steps] per initialization date
    
    # context manager releases the worker processes also when reading one of the files fails
    with multiprocessing.Pool() as pool:
        data_all = list(tqdm.tqdm(pool.imap(frcst_data, combs), total=len(combs), position=0, leave=True))
    
    return xr.concat(data_all, dim='time')

In [10]:
def calc_clim_frcst(daymonth_used):
    
    ''' 
    Mean and standard deviation of the forecasts around one calendar day ('%m%d' string).
    
    Reads the globals frcst_used, all_frcst_dates_atm_var and all_frcst_dates_atm_var_ext, which are set by
    frcst_anomalies before this function is called, as well as days_shift.
    Returns the 'Mean' and 'Std' (over 'time' and 'number') stacked along a new 'indicator' dimension, with 
    the calendar day assigned as 'time' coordinate.
    '''
    
    # dates of the extended period that fall exactly on the requested day & month
    is_central = all_frcst_dates_atm_var_ext.strftime('%m%d') == daymonth_used
    
    # buffer days before/after each central date give a more robust climatology for mean and std
    half_window = np.timedelta64(days_shift, 'D')
    window_dates = np.array([i_day 
                             for i_central in all_frcst_dates_atm_var_ext[is_central] 
                             for i_day in pd.date_range(i_central-half_window, i_central+half_window)])
    
    # only dates with an available forecast can be used
    has_forecast = pd.to_datetime(window_dates).isin(all_frcst_dates_atm_var)
    window_dates = window_dates[has_forecast]
    
    # statistics over all the selected dates and all members
    pooled = frcst_used.sel(time=window_dates)
    statistics = [pooled.mean(['time', 'number']), pooled.std(['time', 'number'])]
    
    final_dataarray = xr.concat(statistics, dim=pd.Index(['Mean', 'Std'], name='indicator'))
    
    return final_dataarray.assign_coords({'time': daymonth_used})

In [11]:
def frcst_anomalies(steps_analyzed):
    
    ''' 
    Get the forecast anomalies of all atmospheric variables for a subset of steps (lead times).
    
    steps_analyzed: steps to process (passed as they are to all_frst).
    Returns the anomalies ('Anomalies') and standardized anomalies ('Anomalies_Std') along 'variable_type',
    computed against the model (lead-time dependent) climatology and against the ERA5 climatology, which are
    stacked along 'clim_type' ('Model', 'ERA5'). All non-index coordinates are dropped from the result.
    
    NOTE(review): the workers of the pool read the globals defined here, so this presumably relies on
    processes being forked (the globals must exist before the pool is created) - confirm on other platforms.
    '''
    
    # get forecast data
    forecasted_atm_var = [all_frst(init_dates, i_var, steps_analyzed) for i_var in atm_var_used]
    forecasted_atm_var = xr.merge(forecasted_atm_var).reset_coords(drop=True)
    
    # calculate model(lead-time)-dependent climatology [pre-processing]
    global frcst_used, all_frcst_dates_atm_var, all_frcst_dates_atm_var_ext
    members = forecasted_atm_var.number.values # get flag of the members
    frcst_used = forecasted_atm_var.sel(number=members[members>=0]) # don't use ensemble mean for climatology

    all_frcst_dates_atm_var = pd.to_datetime(frcst_used.time.values)
    all_frcst_dates_atm_var_ext = pd.date_range(all_frcst_dates_atm_var[0] - pd.DateOffset(years=1), 
                                                all_frcst_dates_atm_var[-1] + pd.DateOffset(years=1))

    unique_daymonth = all_frcst_dates_atm_var.strftime('%m%d')
    unique_daymonth = sorted(set(unique_daymonth))  
    
    # calculate model-dependent climatology per calendar day [main-analysis]
    try:
        # pool is created only now, after the globals read by calc_clim_frcst are defined;
        # the context manager releases the workers also if one of the tasks fails
        with multiprocessing.Pool() as pool:
            frcst_clim = list(tqdm.tqdm(pool.imap(calc_clim_frcst, unique_daymonth), 
                                        total=len(unique_daymonth), position=0))
    finally:
        # temporary globals are removed in every case, so that no large data stay behind
        del(frcst_used, all_frcst_dates_atm_var, all_frcst_dates_atm_var_ext)
    frcst_clim = xr.concat(frcst_clim, dim='time').sortby('time')
    
    # get forecasts anomalies by removing model(lead-time)-dependent climatology
    dates_actual = forecasted_atm_var.time.values # get values of initiation time of the forecast
    dates_grouped = pd.to_datetime(dates_actual) 
    forecasted_atm_var = forecasted_atm_var.assign_coords({'time': dates_grouped.strftime('%m%d')}) # new fmrt
    frcst_anom = forecasted_atm_var.groupby('time') - frcst_clim.sel(indicator='Mean') # get anomalies
    frcst_anom_stand = frcst_anom.groupby('time')/frcst_clim.sel(indicator='Std') # get standardized anomalies
    var_type_dim_name = pd.Index(['Anomalies', 'Anomalies_Std'], name='variable_type')
    frcst_anom = xr.concat([frcst_anom, frcst_anom_stand], dim=var_type_dim_name)
    frcst_anom = frcst_anom.assign_coords({'time': dates_actual}) # change back to initiation date
    frcst_anom = frcst_anom.assign_coords({'valid_time': frcst_anom.time + frcst_anom.step})
    forecasted_atm_var = forecasted_atm_var.assign_coords({'time': dates_actual})
    
    # get forecasts anomalies by removing ERA5 climatology, regardless of the lead time
    frcst_anom_ERA5 = []
    for i_step in forecasted_atm_var.step.values:
        
        subset = forecasted_atm_var.sel(step=i_step)
        actual_init_dates = subset.time.values
        
        # change coord to valid time of the forecast (assign_coords, as for all other coordinate changes)
        subset = subset.assign_coords({'time': actual_init_dates+i_step})
        dates_actual = subset.time.values # get values of actual valid_time of the forecast
        dates_grouped = pd.to_datetime(dates_actual) 

        subset = subset.assign_coords({'time': dates_grouped.strftime('%m%d')}) # change the time to Month-Day
        subset_anom = subset.groupby('time') - era5_clim.sel(indicator='Mean') # anomalies from ERA5 clim
        subset_std = subset_anom.groupby('time')/era5_clim.sel(indicator='Std') # stand. anom. from ERA5 clim
        
        subset_final = xr.concat([subset_anom, subset_std], dim=var_type_dim_name)
        subset_final = subset_final.assign_coords({'time': dates_actual-i_step}) # change back to initiation date
        frcst_anom_ERA5.append(subset_final)
    
    frcst_anom_ERA5 = xr.concat(frcst_anom_ERA5, dim='step')
    
    # combine both types of anomalies
    frcst_anom_all = xr.concat([frcst_anom, frcst_anom_ERA5], dim=pd.Index(['Model', 'ERA5'], name='clim_type')) 
    
    del(frcst_anom, frcst_anom_stand, frcst_anom_ERA5) # for memory
    
    return frcst_anom_all.reset_coords(drop=True)

## Allocate forecasts to clusters

In [12]:
def clust_alloc(i_step):
    
    ''' 
    Allocate the forecasts of one step (given by its position `i_step`) to the closest cluster composite.
    
    Reads the globals frcst_anom, composites, weights_lat and eucl_mean. The distance is the latitude-weighted 
    RMSE between forecast anomalies and each composite, divided by eucl_mean and averaged over the variables.
    Returns the result of argmin along 'cluster', with the actual step value assigned as 'step' coordinate.
    NOTE(review): argmin gives the position along 'cluster', not the label - the two coincide only if the
    cluster labels are 0, 1, 2, ... in this order; confirm.
    '''
    
    step_anom = frcst_anom.isel(step=i_step)
    cluster_ids = composites.cluster.values
    
    def weighted_rmse(i_cluster):
        # squared differences from the composite per cell, then weighted mean over the domain and square root
        sq_difs = (step_anom - composites.sel(cluster=i_cluster))**2
        return np.sqrt(sq_difs.weighted(weights_lat).mean(['latitude', 'longitude']))
    
    # clusters are processed one after the other, because memory is not enough to perform all difs at once!
    rmse_all = [weighted_rmse(i_cluster) for i_cluster in cluster_ids]
    rmse_all = xr.concat(rmse_all, dim=pd.Index(cluster_ids, name='cluster'))
    
    # scale by eucl_mean, then average the scaled differences of all the atmospheric variables
    rmse_all = (rmse_all/eucl_mean).to_array().rename({'variable': 'atm_variable'})
    rmse_all = rmse_all.mean('atm_variable')
    
    cluster_allocation = rmse_all.argmin('cluster') # cluster with the smallest difference
    
    return cluster_allocation.assign_coords({'step': frcst_anom.step.values[i_step]})

### Iterate through subsets of lead times, because processing all lead times at once exceeds the available memory

In [14]:
# Cluster allocations, collected per subset of lead times and combined at the end
allocations = []

for i_ind, i_steps in enumerate(all_available_steps):
    print(f'Generating forecast anomalies for subset #{i_ind+1} of lead times started.')
    frcst_anom_full_domain = frcst_anomalies(i_steps) # get forecast anomalies
    frcst_anom = frcst_anom_full_domain.sel(latitude=composites.latitude, longitude=composites.longitude)
    
    steps_all = np.arange(len(frcst_anom.step.values)) # positions of the steps of this subset
    
    # the pool is created after frcst_anom is defined, since clust_alloc reads it as a global;
    # the context manager releases the worker processes also if the allocation fails
    with multiprocessing.Pool() as pool:
        alloc = list(pool.imap(clust_alloc, steps_all))
    alloc = xr.concat(alloc, dim='step')
    allocations.append(alloc)

    del(frcst_anom_full_domain, frcst_anom, steps_all, pool, alloc) # for memory
    print(f' Allocating forecasts for subset #{i_ind+1} of lead times completed.\n')

allocations = xr.concat(allocations, dim='step')
allocations.to_netcdf(out_dir+'Med_LocalizedPatterns_ForecastAllocations.nc')

del(i_ind, i_steps)

In [17]:
# Display the cluster allocations combined over all the subsets of lead times
allocations