In [1]:
# (C) Copyright 1996- ECMWF.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

In [2]:
import xarray as xr
import xskillscore as xs
import pandas as pd
import numpy as np

from sklearn.metrics.cluster import contingency_matrix

import multiprocessing
import tqdm

from pathlib import Path

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Root folder that holds the `Data` and `ProcessedData` directories. It is prepended, by plain string
# concatenation, to every file path used in this notebook
dir_loc = ''

In [4]:
# atmospheric variables to process; each name is inserted in the name of the GRIB file that is read
Vars_used = ['SLP', 'Z500']
# domain of interest, ordered as [latitude start, longitude start, latitude end, longitude end]: items 0 and 2
# bound the latitude slice and items 1 and 3 bound the longitude slice when the data are cropped
Area_used = [50, -11, 26, 41]

In [5]:
# naming convention of the clusters, listed in the final (preferred) order of interest
New_names = [
    'Atlantic Low', 'Biscay Low', 'Iberian Low',
    'Sicilian Low', 'Balkan Low', 'Black Sea Low',
    'Mediterranean High', 'Minor Low', 'Minor High',
]

# Dictionary for reordering the clusters: the key is the original cluster label (the position in the list below)
# and the value is the label of that cluster in the preferred order
New_order = dict(enumerate([4, 8, 1, 6, 7, 2, 0, 3, 5]))

In [6]:
# K-means cluster label of each date. The path is built with pathlib, so it is valid whether or not dir_loc ends
# with a path separator (plain concatenation would give e.g. '<dir_loc>Data/...' for a dir_loc without it)
Clusters = pd.read_csv(Path(dir_loc) / 'Data' / 'ERA5' / 'Clusters_Med_SLP~Z500.csv', index_col=0)
Clusters = Clusters['Label'] # keep only the column with the labels
Clusters.index = pd.to_datetime(Clusters.index, format='%d/%m/%Y') # dates in the file are given as day/month/year
Clusters = Clusters.map(New_order) # rename the clusters based on the preferred order

In [7]:
def anomalies(variable):
    """
    Read the daily ERA5 fields of one variable and express them as anomalies from a smoothed climatology.

    The climatology is derived only from the dates in the index of `Clusters`: these fields are smoothed with a
    5-day centred rolling mean and then averaged per calendar day (Month-Day). The anomalies are computed for
    all the dates available in the file.

    :param variable: name of the variable; it completes the GRIB file name and becomes the name of both outputs
    :return: tuple (anomalies, actual daily values), both cropped to `Area_used` and on the original timesteps
    """

    def month_day(timesteps):
        # Month-Day tag ('%m%d') of each timestep
        return pd.to_datetime(timesteps).strftime('%m%d')

    lat_range = slice(Area_used[0], Area_used[2])
    lon_range = slice(Area_used[1], Area_used[3])

    # read the data, drop the coordinates that are not used, convert to float 32 and keep only the area of interest
    fields = xr.open_dataarray(dir_loc+'/Data/ERA5/D1_Mean_'+variable+'.grb', engine='cfgrib')
    fields = fields.reset_coords(drop=True).astype('float32')
    fields = fields.sel(latitude=lat_range, longitude=lon_range)

    timesteps = fields.time.values # original timesteps, restored before returning
    reference = fields.sel(time=Clusters.index) # only the dates that were used for deriving the patterns

    # 5-day smoothed climatology. Rolling is applied directly because the daily data refer to consecutive days. If
    # days are not consecutive, xr.resample should be applied first, so that missing days are generated with NaN
    smoothed = reference.rolling(time=5, center=True, min_periods=1).mean()
    smoothed = smoothed.assign_coords({'time': month_day(reference.time.values)})
    climatology = smoothed.groupby('time').mean() # one field per Month-Day

    # anomalies: tag each day with its Month-Day so that the matching climatological field is subtracted
    fields = fields.assign_coords({'time': month_day(timesteps)})
    deviations = fields.groupby('time') - climatology
    deviations = deviations.assign_coords({'time': timesteps}) # back to the original timestep information
    deviations.name = variable

    fields = fields.assign_coords({'time': timesteps}) # back to the original timestep information
    fields.name = variable

    return deviations, fields

In [8]:
# Process the variables in parallel. The pool is used as a context manager so that the worker processes are
# always shut down when the block is left, also when reading one of the files raises an error. All results are
# collected in the list before the block ends, so no pending task is lost
with multiprocessing.Pool() as pool:
    PatternsAll = list(tqdm.tqdm(pool.imap(anomalies, Vars_used), total=len(Vars_used), position=0, leave=True))

Patterns = xr.merge([i[0] for i in PatternsAll]) # anomalies of all variables in one dataset
DailyFull = xr.merge([i[1] for i in PatternsAll]) # actual daily values of all variables in one dataset

del(pool, PatternsAll)

In [9]:
# composite (mean field) of each cluster, based on given xarray data and one tag for each daily field
def patterns_composites(data, tags):
    """
    Average the daily fields that share the same tag.

    :param data: xarray object with a 'time' dimension
    :param tags: one tag (cluster label) per timestep of `data`
    :return: the averaged fields, with the 'time' dimension replaced by a 'cluster' dimension holding the tags
    """

    tagged = data.assign_coords({'time': tags}) # replace the timesteps with the tag of each daily field
    per_tag_mean = tagged.groupby('time').mean() # mean of all the fields with the same tag

    return per_tag_mean.rename({'time': 'cluster'})

In [10]:
# get initial composites only based on K-means clustering (as in Mastrantonas et al, 2021): mean anomaly field of
# each (reordered) cluster label, using only the dates that have a K-means label
Composites_Kmeans = patterns_composites(Patterns.sel(time=Clusters.index), Clusters.values)

In [11]:
# get new composites based on minimum Euclidean distance (RMSE) of each daily field to all cluster composites
def composites_RMSE(data, composites_input, Difs_normalization='actual'):
    """
    Allocate each daily field to the composite with the lowest normalized RMSE and derive the new composites.

    For each variable, the latitude-weighted RMSE between every daily field and every composite is calculated and
    divided by a normalization factor, so that the values of the different variables are comparable. The
    normalized values are then averaged over the variables and each day gets the label of the closest composite.

    :param data: xarray Dataset (one data variable per atmospheric variable) with dimensions 'time', 'latitude'
                 and 'longitude'
    :param composites_input: composites with a 'cluster' dimension, on the same grid and with the same variables
    :param Difs_normalization: any string (default 'actual') for normalizing with the mean RMSE of the given
                               data; otherwise the object to divide the RMSE by (e.g. a previously returned mean)
    :return: tuple (label of the closest cluster per timestep, composites based on these labels, mean RMSE of
             the given data per variable). The mean RMSE is always the one of `data`, also when a different
             normalization is passed
    """

    Difs = (data - composites_input)**2 # squared error per cell from each composite
    Weights = np.cos(np.deg2rad(Difs.latitude)) # weights due to areal differences of each grid
    Difs = Difs.weighted(Weights).mean(['latitude', 'longitude']) # weighted mean of the differences for all cells
    Difs = np.sqrt(Difs) # square root of error (as in RMSE metric)

    Difs_mean = Difs.mean(['time', 'cluster']) # get the average Euclidean distance

    # normalise difs from the used variables so values can be comparable
    if isinstance(Difs_normalization, str):
        Difs = Difs/Difs_mean
    else:
        Difs = Difs/Difs_normalization

    Difs = Difs.to_array().rename({'variable': 'atm_variable'})
    Difs = Difs.mean('atm_variable') # mean of differences for all variables

    # allocate each field to the cluster of lowest final Euclidean distance. argmin gives the POSITION along the
    # 'cluster' dimension, which is the cluster label only when all labels 0..N-1 are present in the composites.
    # The position is therefore translated to the actual label, so that the allocation stays correct even if
    # a cluster is missing (e.g. it ended up without members in a previous iteration)
    NewLabel = Difs['cluster'].values[Difs.argmin('cluster').values]
    NewComposite = patterns_composites(data, NewLabel) # calculate new composites

    return (NewLabel, NewComposite, Difs_mean)

In [12]:
# upper limit for the counter of the convergence loop that re-allocates the days and updates the composites
iterations_max = 100
# the loop keeps iterating while the mismatch (in %) between two consecutive allocations is above this value
mismatching_max = 0

In [13]:
# Iteratively re-allocate the days of the reference period to the closest composite and update the composites,
# until the allocations of two consecutive iterations differ by no more than mismatching_max (in %) or the
# iteration counter exceeds iterations_max
i_iter = 1
print('Starting iterations for convergence of composites based on RMSE of daily patterns.')
Patterns_Reference = Patterns.sel(time=Clusters.index) # same subset in all iterations, so it is selected only once
IterationResults = composites_RMSE(Patterns_Reference, Composites_Kmeans)
Mismatch = (IterationResults[0] != Clusters.values).sum()/len(Clusters.values)*100
print(f'Initial mismatch between standalone K-means and K-means followed by RMSE is {np.round(Mismatch, 2)}%.')
while Mismatch>mismatching_max and i_iter<=iterations_max:
    old_tags = IterationResults[0]
    IterationResults = composites_RMSE(Patterns_Reference, IterationResults[1])
    Mismatch = (IterationResults[0] != old_tags).sum()/len(old_tags)*100
    i_iter = i_iter+1
del Patterns_Reference

# report based on the same criterion that stops the loop, so that a non-zero mismatching_max is not reported as
# a failure when the final mismatch is within the accepted limit
if Mismatch==0:
    print(f'Analysis converged in {i_iter} iterations. There is a full agreement and clusters are stabilized.')
elif Mismatch<=mismatching_max:
    print(f'Analysis converged in {i_iter} iterations. Final mismatch is {np.round(Mismatch, 2)}%, '
          f'which is within the accepted limit of {mismatching_max}%.')
else:
    print(f'Analysis not converged after {i_iter} iterations. Final mismatch is {np.round(Mismatch, 2)}%.')

In [14]:
# make sure that the output folder exists before any result is written (no error if it is already there)
Path(dir_loc+'/ProcessedData/').mkdir(parents=True, exist_ok=True)

In [15]:
# save the results of the last iteration for using at next steps: the composites of the clusters (item 1) and
# the mean RMSE per variable of the reference period (item 2), which serves as normalization factor
IterationResults[1].to_netcdf(dir_loc+'/ProcessedData/PatternComposites_ERA5.nc')
IterationResults[2].to_netcdf(dir_loc+'/ProcessedData/DifMeanERA5_climatological.nc')

In [16]:
# save final cluster allocations for all dates, including the ones not used for deriving the clusters. The RMSE is
# normalized with the mean RMSE of the reference period (IterationResults[2]) and not with the one of all dates
Final_Allocations = composites_RMSE(Patterns, IterationResults[1], IterationResults[2])[0] # get labels of all data
Labels = pd.Series(Final_Allocations, index=Patterns.time.values, name='Label')
Labels.to_csv(dir_loc+'/ProcessedData/PatternAllocations_ERA5.csv')

### Derive the allocations to patterns based only on ERA5 0 UTC data

In [17]:
def anomalies_0UTC(variable):
    """
    Derive daily fields of one variable from the ERA5 0 UTC data and express them as anomalies.

    A daily field is the mean of two consecutive timesteps of the file and is assigned to the first of the two
    days. The climatology is derived only from the dates in the index of `Clusters`: these fields are smoothed
    with a 5-day centred rolling mean and then averaged per calendar day (Month-Day).

    :param variable: name of the variable; it completes the GRIB file name and becomes the name of both outputs
    :return: tuple (anomalies, daily values), both cropped to `Area_used` and on the (shifted) daily timesteps
    """

    # read the data, drop the coordinates that are not used, convert to float 32 and keep only the area of interest
    source = dir_loc+'/Data/ERA5/D1_Mean_'+variable+'_00UTC.grb'
    snapshots = xr.open_dataarray(source, engine='cfgrib').reset_coords(drop=True).astype('float32')
    snapshots = snapshots.sel(latitude=slice(Area_used[0], Area_used[2]),
                              longitude=slice(Area_used[1], Area_used[3]))

    # daily field as the average of the 0 UTC of the same and the next day: the rolling mean of a timestep and the
    # previous one is moved one day back. Timesteps with missing values (e.g. the first one, which has no
    # complete window) are dropped
    daily = snapshots.rolling(time=2).mean().dropna('time')
    daily = daily.assign_coords({'time': daily.time.values-np.timedelta64(1, 'D')})

    timesteps = daily.time.values # actual timesteps, restored before returning
    all_tags = pd.to_datetime(timesteps).strftime('%m%d') # Month-Day of each timestep

    reference = daily.sel(time=Clusters.index) # only the dates that were used for deriving the patterns
    reference_tags = pd.to_datetime(reference.time.values).strftime('%m%d')

    # 5-day smoothed climatology. Rolling is applied directly because the daily data refer to consecutive days. If
    # days are not consecutive, xr.resample should be applied first, so that missing days are generated with NaN
    smoothed = reference.rolling(time=5, center=True, min_periods=1).mean()
    climatology = smoothed.assign_coords({'time': reference_tags}).groupby('time').mean()

    # anomalies: tag each day with its Month-Day so that the matching climatological field is subtracted
    daily = daily.assign_coords({'time': all_tags})
    deviations = daily.groupby('time') - climatology
    deviations = deviations.assign_coords({'time': timesteps}) # back to the original timestep information
    deviations.name = variable

    daily = daily.assign_coords({'time': timesteps}) # back to the original timestep information
    daily.name = variable

    return deviations, daily

In [18]:
# Process the variables in parallel. The pool is used as a context manager so that the worker processes are
# always shut down when the block is left, also when reading one of the files raises an error. All results are
# collected in the list before the block ends, so no pending task is lost
with multiprocessing.Pool() as pool:
    Patterns_0UTCAll = list(tqdm.tqdm(pool.imap(anomalies_0UTC, Vars_used), total=len(Vars_used), position=0,
                                      leave=True))

Patterns_0UTC = xr.merge([i[0] for i in Patterns_0UTCAll]) # anomalies of all variables in one dataset
Daily_0UTC = xr.merge([i[1] for i in Patterns_0UTCAll]) # daily values of all variables in one dataset

del(pool, Patterns_0UTCAll)

In [19]:
# Spatial correlation, per timestep and per variable, between the daily fields derived from the 0 UTC data and the
# daily fields read from the D1_Mean files. Only the summary statistics of the correlations are kept
Weights = np.cos(np.deg2rad(Daily_0UTC.latitude)) # weights due to areal differences of each grid
Weights_2d = Weights.expand_dims({'longitude': Daily_0UTC.longitude.values}) # weights on both lat-lon

Corr_Pat = xs.pearson_r(Daily_0UTC.to_array(), DailyFull.to_array(), # spatial correlation of ERA5 data based ...
                        dim=['latitude', 'longitude'], weights=Weights_2d) # ... on full hourly and only 0 UTC data
# table with one row per timestep and one column per variable
Corr_Pat = Corr_Pat.to_dataframe('Corr').pivot_table(index='time', columns='variable', values='Corr')
Corr_Pat['Mean'] = Corr_Pat.mean(axis=1) # mean correlation of all the variables for each timestep
Corr_Pat = Corr_Pat.describe() # summary statistics per column; the values per timestep are no longer available
Corr_Pat.to_csv(dir_loc+'/ProcessedData/Correlation_Full_0UTC.csv')
Corr_Pat

In [20]:
# allocate days to cluster, using the ERA5 0UTC data only for the reference period (1979-2019)
# NOTE(review): no normalization is passed, so composites_RMSE normalizes with the mean RMSE of the 0 UTC data
# themselves, whereas the allocation of the full ERA5 series (Final_Allocations) passes IterationResults[2] -
# confirm that this difference is intended
Alcs0UTC_OUTC = composites_RMSE(Patterns_0UTC.sel(time=Clusters.index), IterationResults[1])[0]
Labels_0UTC = pd.Series(Alcs0UTC_OUTC, index=Clusters.index, name='Label')
Labels_0UTC.to_csv(dir_loc+'/ProcessedData/PatternAllocations_ERA5_0UTC.csv')

In [21]:
# allocations of the reference period side by side: K-means based labels and labels derived from the 0 UTC data
Alcs = pd.DataFrame(index=Clusters.index, data={'Actual': Clusters.values, '0_UTC': Alcs0UTC_OUTC})
print('Percentage of mismatch:')
# share (in %) of the days for which each column differs from the 'Actual' labels (0 for 'Actual' itself)
Alcs.ne(Alcs['Actual'], axis=0).sum() / len(Alcs) * 100

In [22]:
# Month-Day tags of all the days of a leap year (2004), so that 29 February is also included
Sorted_Dates = np.array(pd.date_range(start='20040101', end='20041231').strftime('%m%d'))
# positions (as 1-element arrays) of the first and the last day of the summer half-year within Sorted_Dates
StartSummerHalf, EndSummerHalf = (np.where(Sorted_Dates == limit)[0] for limit in ('0416', '1015'))

In [23]:
def temp_flagging(valid_dates, temp_subset):
    """
    Tag each date with the temporal subset that it belongs to.

    :param valid_dates: dates to tag; anything that pd.to_datetime converts to a DatetimeIndex
    :param temp_subset: type of subsetting, one of
                        'All'      -> the tag 'All' for every date (returned as list)
                        'HalfYear' -> 'SummerHalf' for the days between StartSummerHalf and EndSummerHalf (both
                                      included, as positions in Sorted_Dates), else 'WinterHalf' (numpy array)
                        'Season'   -> 'Winter' (Dec-Feb), 'Spring', 'Summer' or 'Autumn' (pandas Index)
                        'Month'    -> month number as string, without leading zero (pandas Index)
                        'DayMonth' -> Month-Day as 'mmdd' string (array)
    :return: the tags, one per date, in the container indicated above
    :raises ValueError: if temp_subset is none of the supported options
    """

    valid_dates = pd.to_datetime(valid_dates)

    if temp_subset == 'All':
        temporal_flag = ['All']*len(valid_dates)
    elif temp_subset == 'HalfYear':
        # position of each date within the (leap) year, for comparing with the limits of the summer half
        day_position = {i: i_c for i_c, i in enumerate(Sorted_Dates)}
        temporal_flag_aux = pd.Series(valid_dates.strftime('%m%d')).map(day_position).values
        temporal_flag = np.repeat(['WinterHalf'], len(temporal_flag_aux))
        temporal_flag[(temporal_flag_aux>=StartSummerHalf) & (temporal_flag_aux<=EndSummerHalf)] = 'SummerHalf'
    elif temp_subset == 'Season':
        temporal_flag = (valid_dates.month%12 + 3)//3 # 1 for Dec-Feb, 2 for Mar-May, 3 for Jun-Aug, 4 for Sep-Nov
        temporal_flag = temporal_flag.map({1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Autumn'})
    elif temp_subset == 'Month':
        temporal_flag = valid_dates.month.astype(str)
    elif temp_subset == 'DayMonth':
        temporal_flag = pd.Series(valid_dates.strftime('%m%d')).values
    else:
        # without this branch an unsupported option fails later with an unrelated UnboundLocalError
        raise ValueError(f"Unknown temp_subset '{temp_subset}'. Supported options are 'All', 'HalfYear', "
                         "'Season', 'Month' and 'DayMonth'.")

    return temporal_flag

In [24]:
# tag each date of the allocations as 'WinterHalf' or 'SummerHalf' (the column is added to Alcs in place)
Alcs['HalfYear'] = temp_flagging(Alcs.index, 'HalfYear')

In [25]:
def bias_calc(temp_subset):
    """
    Frequency of each cluster in the 0 UTC allocations, as percentage of its frequency in the actual allocations.

    :param temp_subset: list with the 'HalfYear' tags of `Alcs` to consider (e.g. ['WinterHalf'])
    :return: pandas Series named '0_UTC', indexed by the sorted cluster labels; 100 means the same frequency
    """
    selected = Alcs.query('HalfYear == @temp_subset') # rows of the requested half-year(s)
    # number of days per cluster label, one column per type of allocation
    counts = selected[['Actual', '0_UTC']].apply(pd.Series.value_counts).sort_index()
    ratio = counts['0_UTC'] / counts['Actual'] * 100
    return ratio.rename('0_UTC')

In [26]:
# frequency of each cluster in the 0 UTC allocations as percentage of its frequency in the actual allocations, for
# the winter half (WH), the summer half (SH) and the full year (Full)
Biases = pd.DataFrame({'WH': bias_calc(['WinterHalf']), 'SH': bias_calc(['SummerHalf']), 
                       'Full': bias_calc(['WinterHalf', 'SummerHalf'])})
# NOTE(review): assumes that all the cluster labels are present, so that the sorted labels line up with the
# names in New_names - confirm
Biases.index = New_names
Biases.to_csv(dir_loc+'/ProcessedData/Biases_Full_0UTC.csv')
Biases