In [1]:
# (C) Copyright 1996- ECMWF.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

In [2]:
import numpy as np
import pandas as pd
import xarray as xr

from itertools import product

import multiprocessing
import tqdm

from pathlib import Path

In [3]:
# root location of the project: the 'Data/' and 'ProcessedData/' folders used below are appended
# to this string, so the empty string makes all paths relative to the current working directory
dir_loc = ''

In [4]:
# main directory for saving data; kept as a plain string because file names are appended to it later on
out_dir = dir_loc + 'ProcessedData/'

# create the directory (and any missing parent), doing nothing if it already exists
Path(out_dir).mkdir(exist_ok=True, parents=True)

Use the forecast data of Cycle 46r1, i.e. the forecasts initialized from 2019-06-11 to 2020-06-30.

In [5]:
# First and last day (as 'YYYYMMDD' strings) of the period used for Cycle 46r1:
# 11 June 2019 - 30 June 2020
start_date = '20190611'
end_date = '20200630'

In [6]:
# all calendar days of the period (both ends included)
initialization_dates = pd.date_range(start_date, end_date)

# keep Mondays (0) and Thursdays (3) only, and store the dates as 'YYYYMMDD' strings
initialization_dates = initialization_dates[
    (initialization_dates.weekday == 0) | (initialization_dates.weekday == 3)
].strftime('%Y%m%d')

del start_date, end_date

In [7]:
# read the composites of the patterns: they provide the grid (latitude/longitude) and the
# names of the atmospheric variables that are used for processing the forecasts
Composites = xr.open_dataset(dir_loc+'ProcessedData/PatternComposites_ERA5.nc')
Var_used = list(Composites) # iterating over a Dataset gives the names of its data variables

In [8]:
def _load_member_fields(param_used, init_date_used, member_type):

    ''' Read one GRIB file of a forecast and subset it to the grid of the composites.

    param_used: name of the atmospheric variable (also the name of its data folder)
    init_date_used: initialization date, formatted as in the file names
    member_type: tag of the folder/file to read, 'cf' (control) or 'pf' (ensemble members)

    Returns the float32 DataArray, fully loaded in memory.
    '''

    file_name = (dir_loc+'Data/'+param_used+'/'+member_type+'/'
                 +param_used+'_'+member_type+'_'+init_date_used+'.grb')

    # context manager, so the opened dataset is closed as soon as the values are in memory
    with xr.open_dataarray(file_name, engine='cfgrib') as grib_data:
        fields = grib_data.astype('float32') # float32 for memory efficiency
        fields = fields.sel(latitude=Composites.latitude, longitude=Composites.longitude)
        fields = fields.load() # make sure that nothing is left lazy before closing the file

    return fields


def frcst_data(input_data):

    ''' Input data is a list of 2 [a., b.] with: a. initialization date, b. parameter used

    Get the forecast data (control and ensemble members) for the selected initialization date and
    parameter, add the mean of all members as an extra member flagged with number=-1, and average
    every two consecutive steps for getting the mean daily fields.

    Returns a float32 DataArray named after the parameter, with the members sorted by number.
    '''

    init_date_used = input_data[0] # initialization date of forecast
    param_used = input_data[1] # atmospheric variable of interest

    # control member (cf), flagged as member 0
    control_forecast = _load_member_fields(param_used, init_date_used, 'cf')
    control_forecast = control_forecast.assign_coords({'number': 0})

    # ensemble members (pf)
    ensemble_forecast = _load_member_fields(param_used, init_date_used, 'pf')

    all_members = xr.concat([control_forecast, ensemble_forecast], dim='number') # combine cf and pf data

    all_mean = all_members.mean(dim=['number']) # mean of all members (ensemble + control)
    all_mean = all_mean.assign_coords({'number':-1}) # assign the mean as "-1" on the number coordinate

    final = xr.concat([all_members, all_mean], dim='number').sortby('number') # combine members and their mean

    # average start and end of day for getting mean daily field; the first step has no previous
    # step to be averaged with, so it comes out as NaN and is dropped
    final = final.rolling(step=2).mean().dropna('step')
    final = final.assign_coords({'step': final.step.values-np.timedelta64(1, 'D')}) # step is the min possible lag

    final.name = param_used

    return final.astype('float32')

In [9]:
def all_frst(dates, variable):

    ''' Get the forecast data of one variable for all initialization dates (processed in parallel).

    dates: iterable with the initialization dates, as expected by frcst_data
    variable: name of the atmospheric variable of interest

    Returns the output of frcst_data for all dates, concatenated along the "time" dimension.
    '''

    combs = [(i_date, variable) for i_date in dates] # one (date, variable) task per initialization date

    # the context manager releases the worker processes also when a task raises an error;
    # all results are collected inside the block, so nothing is lost when the pool is terminated
    with multiprocessing.Pool() as pool:
        Data = list(tqdm.tqdm(pool.imap(frcst_data, combs), total=len(combs), position=0, leave=True))

    return xr.concat(Data, dim='time')

In [10]:
# get the forecasts of each variable used, merge them in a single dataset (one data variable per
# atmospheric variable), and drop all coordinates that are not dimensions of the data
Frcst = xr.merge(
    [all_frst(dates=initialization_dates, variable=i_var) for i_var in Var_used]
).reset_coords(drop=True)

In [11]:
# calculate model(lead-time)-dependent climatology: mean over the actual members and over all
# initialization dates that share the same Month-Day, derived separately for each step
# NOTE(review): with Monday/Thursday starts from 2019-06-11 to 2020-06-30 every Month-Day seems to
# occur only once, so each group would hold a single initialization date - confirm this is intended
members = Frcst.number.values # get flag of the members
Frcst_clim = (
    Frcst
    .sel(number=members[members>=0]) # the ensemble mean (flag -1) is not used for the climatology
    .assign_coords(time=pd.to_datetime(Frcst.time.values).strftime('%m%d')) # time as Month-Day
    .groupby('time').mean()
    .mean(['number'])
)
del members

In [12]:
# get forecasts anomalies by removing model(lead-time)-dependent climatology
dates_actual = Frcst.time.values # actual initialization dates of the forecasts
dates_grouped = pd.to_datetime(dates_actual) # same dates as pandas datetimes, for formatting them

Frcst = Frcst.assign_coords(time=dates_grouped.strftime('%m%d')) # change the time to Month-Day

# subtract from each forecast the model climatology of its Month-Day, and then
# change the time back to the actual initialization dates
Anom = (Frcst.groupby('time') - Frcst_clim).assign_coords(time=dates_actual)
Anom = Anom.assign_coords(valid_time=Anom.time + Anom.step) # valid time = initialization date + step

del Frcst, Frcst_clim

In [13]:
 # read mean Euclidean differences of ERA5, used for bringing the distances of all variables to the same magnitude
ERA5_MeanDifs = xr.open_dataset(dir_loc+'ProcessedData/DifMeanERA5_climatological.nc')

In [14]:
def cluster_allocation(i_step):

    ''' Allocate the forecasts of one lead time to the composite with the smallest distance.

    i_step: position of the lead time along the "step" dimension of Anom

    For each composite the distance is the latitude-weighted root mean square difference between
    the forecast anomalies and the composite, divided by ERA5_MeanDifs and averaged over all the
    atmospheric variables.

    Returns a DataFrame with the allocated cluster ("Cluster" column), the valid time, and the
    step given in days, next to the rest of the coordinates of the forecasts.
    '''

    Weights = np.cos(np.deg2rad(Composites.latitude)) # weights based on the cosine of the latitude
    Anom_step = Anom.isel(step=i_step) # anomalies of the lead time used (same for all clusters)

    Difs_all = []
    for i in Composites.cluster.values: # use loop, because memory is not enough to perform all difs at once!
        Difs = (Anom_step - Composites.sel(cluster=i))**2 # squared difference from composite per cell
        Difs = Difs.weighted(Weights).mean(['latitude', 'longitude']) # weighted mean of the differences for all cells
        Difs_all.append(np.sqrt(Difs)) # square root of error (as in RMSE metric)

    Difs_all = xr.concat(Difs_all, dim=pd.Index(Composites.cluster.values, name='cluster')) # concat results
    Difs_all = Difs_all/ERA5_MeanDifs # bring the distances of all variables at same magnitudes

    # mean of differences for all variables
    Difs_all = Difs_all.to_array().rename({'variable': 'atm_variable'}).mean('atm_variable')

    # NOTE(review): argmin gives the position along "cluster", not the cluster label; this is the
    # same thing only if the labels of the composites are 0, 1, 2, ... - confirm
    ClusterAllocation = Difs_all.argmin('cluster')

    # convert to DF (giving the column name), and reset so all multiindex data are separate columns
    ClusterAllocation = ClusterAllocation.to_dataframe(name='Cluster').reset_index()
    ClusterAllocation['valid_time'] = ClusterAllocation['time']+ClusterAllocation['step'] # correct valid time

    ClusterAllocation['step'] = ClusterAllocation['step'].dt.days # convert to numeric (days), vectorized

    return ClusterAllocation

In [15]:
Steps_all = np.arange(len(Anom.step.values)) # position of each lead time (step)

# the context manager releases the worker processes also when an allocation raises an error;
# all results are collected inside the block, so nothing is lost when the pool is terminated
with multiprocessing.Pool() as pool:
    Alloc = list(tqdm.tqdm(pool.imap(cluster_allocation, Steps_all), total=len(Steps_all), position=0, leave=True))

Alloc = pd.concat(Alloc) # concatenate results for all lead times (steps)
Alloc = Alloc.sort_values(by=['time', 'step', 'number']) # sort data
Alloc.index = range(0, len(Alloc)) # new indexes

Alloc.to_csv(out_dir+'ForecastsClusterAllocations.csv')