## Generate function to standardize data based on different criteria

In [1]:
import sys
sys.path.append('/'.join(sys.path[0].split('/')[:-1]))

import xarray as xr
import matplotlib.pyplot as plt
import numpy as np
import time
import os
import healpy as hp

In [2]:
# Directory layout: everything lives under the HEALPix data root.
datadir = "../data/healpix/"
input_dir = f"{datadir}5.625deg_nearest/"
pred_save_path = f"{datadir}predictions/"

# (first year, last year) of each data split, kept as strings.
train_years = "1979", "2012"
val_years = "2013", "2016"
test_years = "2017", "2018"

# Problem dimensions.
nodes = 12 * 16 ** 2      # 3072; presumably the HEALPix pixel count for nside=16 -- TODO confirm
max_lead_time = 24 * 5    # presumably hours (5 days) -- TODO confirm
lead_time = 6
out_features = 2
nb_timesteps = 2

# Reference observations stored alongside the predictions.
obs = xr.open_mfdataset(f"{pred_save_path}observations_nearest.nc", combine="by_coords")

In [3]:
# Open every NetCDF file of each variable as a single dataset and give the
# data variables the short names used in the rest of the notebook.
z500 = xr.open_mfdataset(f'{input_dir}geopotential_500/*.nc', combine='by_coords').rename({'z':'z500'})
t850 = xr.open_mfdataset(f'{input_dir}temperature_850/*.nc', combine='by_coords').rename({'t':'t850'})
rad = xr.open_mfdataset(f'{input_dir}toa_incident_solar_radiation/*.nc', combine='by_coords')

# Drop the first 7 time steps of z500 and t850.
# NOTE(review): rad is not trimmed the same way -- confirm this is intentional.
z500 = z500.isel(time=slice(7, None))
t850 = t850.isel(time=slice(7, None))

# Static fields, extended with the cosine and sine of the longitude
# (lon is converted from degrees to radians first).
constants = xr.open_dataset(f'{input_dir}constants/constants_5.625deg.nc').rename({'orography' :'orog'})
constants = constants.assign(cos_lon=lambda x: np.cos(np.deg2rad(x.lon)))
constants = constants.assign(sin_lon=lambda x: np.sin(np.deg2rad(x.lon)))

# Broadcast the constants against a dummy array that carries z500's time
# axis, so that they gain a matching 'time' dimension.
# Dataset.sizes is the mapping from dimension name to length; the
# mapping-style access Dataset.dims['time'] is deprecated in xarray.
temp = xr.DataArray(np.zeros(z500.sizes['time']), coords=[('time', z500.time.values)])
constants, _ = xr.broadcast(constants, temp)

In [4]:
# Bare expression: the notebook renders the dataset's repr as this cell's output.
z500

Unnamed: 0,Array,Chunk
Bytes,24.58 kB,24.58 kB
Shape,"(3072,)","(3072,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 24.58 kB 24.58 kB Shape (3072,) (3072,) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",3072  1,

Unnamed: 0,Array,Chunk
Bytes,24.58 kB,24.58 kB
Shape,"(3072,)","(3072,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,24.58 kB,24.58 kB
Shape,"(3072,)","(3072,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 24.58 kB 24.58 kB Shape (3072,) (3072,) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",3072  1,

Unnamed: 0,Array,Chunk
Bytes,24.58 kB,24.58 kB
Shape,"(3072,)","(3072,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,8.62 GB,8.62 GB
Shape,"(350633, 3072)","(350633, 3072)"
Count,3 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 8.62 GB 8.62 GB Shape (350633, 3072) (350633, 3072) Count 3 Tasks 1 Chunks Type float64 numpy.ndarray",3072  350633,

Unnamed: 0,Array,Chunk
Bytes,8.62 GB,8.62 GB
Shape,"(350633, 3072)","(350633, 3072)"
Count,3 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [8]:
def get_standarization_params(ds, mean='mean', std='std', time='point'):
    """
    Compute the location and scale parameters used to standardize data.

    Parameters
    ----------
    ds : xarray Dataset
        Dataset containing the input data. It is reduced over the
        dimensions named 'time' and 'node'.
    mean : str
        Location statistic: 'mean' or 'median' (case-insensitive).
    std : str
        Scale statistic: 'std' for the standard deviation or 'iqr' for the
        interquartile range (case-insensitive).
    time : str
        Aggregation over time (case-insensitive):
        'point' uses the complete dataset, 'week' a centered 7-day moving
        window and 'month' a centered 30-day moving window.

    Returns
    -------
    ds_mean, ds_std
        The location and scale parameters.

        * time='point': both values are computed eagerly (``.compute()``)
          and reduced over 'time' and 'node'. With std='iqr' the scale is a
          plain numpy scalar taken from the FIRST data variable only.
        * time='week'/'month': rolling statistics along 'time', averaged
          over 'node'; no ``.compute()`` is called here. With std='iqr' the
          scale is a DataArray that keeps a leading 'variable' dimension
          (from ``to_array()``).

    Raises
    ------
    ValueError
        If ``mean``, ``std`` or ``time`` is not one of the accepted values.
    """
    # Normalize once so validation and branching agree: previously 'Mean' or
    # 'Point' passed the case-insensitive check but then silently fell into
    # the wrong branch of the case-sensitive comparisons below.
    mean, std, time = mean.lower(), std.lower(), time.lower()

    if mean not in ('mean', 'median'):
        raise ValueError(
            "mean parameter not valid. It must be:"
            "\n-'mean' or"
            "\n-'median'"
        )
    if std not in ('std', 'iqr'):
        raise ValueError(
            "std parameter not valid. It must be:"
            "\n-'std' for Standard Deviation or"
            "\n-'iqr' for Interquartile Range"
        )
    if time not in ('point', 'week', 'month'):
        raise ValueError(
            "time parameter not valid. It must be:"
            "\n-'point' for statistic using complete dataset"
            "\n-'week' for statistic using weekly moving window values"
            "\n-'month' for statistic using monthly (30d) moving window values"
        )

    if time == 'point':
        if mean == 'mean':
            ds_mean = ds.mean(('time','node')).compute()
        else:
            # Median along time for every node, then averaged over nodes.
            ds_mean = ds.median('time').mean('node').compute()

        if std == 'std':
            ds_std = ds.std(('time','node')).compute()
        else:
            # to_array() stacks the data variables along a leading axis;
            # [0] keeps the first variable's (q25, q75) pair, so only the
            # first data variable contributes to the IQR.
            q1, q3 = ds.quantile([0.25, 0.75], dim='time').compute().mean('node').to_array().values[0]
            ds_std = q3 - q1
        return ds_mean, ds_std

    # Moving-window statistics. The window is expressed as 24 samples per
    # day -- assumes hourly time steps, TODO confirm against the data.
    window = 24 * {'week': 7, 'month': 30}[time]
    rolling = ds.rolling(time=window, center=True)

    if mean == 'mean':
        ds_mean = rolling.mean().mean('node')
    else:
        ds_mean = rolling.median().mean('node')

    if std == 'std':
        ds_std = rolling.std().mean('node')
    else:
        # Materialize each window along a temporary 'tmp' dimension so the
        # quartiles can be taken per window, then average over nodes.
        quantiles = (
            rolling.construct('tmp')
            .quantile([0.25, 0.75], dim='tmp')
            .mean('node')
            .to_array()
        )
        # Axis 1 is the quantile axis after to_array(): q75 - q25.
        ds_std = quantiles[:,1] - quantiles[:,0]

    return ds_mean, ds_std

In [6]:
z500_mean, z500_std = get_standarization_params(z500, 'mean', 'std', 'point')

In [9]:
z500_mean_week, z500_std_week = get_standarization_params(z500, 'mean', 'std', 'week')

In [10]:
z500_mean_month, z500_std_month = get_standarization_params(z500, 'mean', 'std', 'month')

In [11]:
z500_median, z500_iqr = get_standarization_params(z500, 'median', 'iqr', 'point')

In [12]:
z500_median_week, z500_iqr_week = get_standarization_params(z500, 'median', 'iqr', 'week')

In [13]:
# Standardize z500 with the whole-dataset mean and standard deviation.
z500_mean_std = (z500 - z500_mean)/z500_std

In [14]:
# Standardize z500 with the weekly moving-window mean and standard deviation.
z500_mean_std_week = (z500 - z500_mean_week)/z500_std_week

In [15]:
# Bare expression: the notebook renders the result's repr as this cell's output.
z500_mean_std_week

Unnamed: 0,Array,Chunk
Bytes,24.58 kB,24.58 kB
Shape,"(3072,)","(3072,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 24.58 kB 24.58 kB Shape (3072,) (3072,) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",3072  1,

Unnamed: 0,Array,Chunk
Bytes,24.58 kB,24.58 kB
Shape,"(3072,)","(3072,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,24.58 kB,24.58 kB
Shape,"(3072,)","(3072,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 24.58 kB 24.58 kB Shape (3072,) (3072,) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",3072  1,

Unnamed: 0,Array,Chunk
Bytes,24.58 kB,24.58 kB
Shape,"(3072,)","(3072,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,8.62 GB,8.62 GB
Shape,"(350633, 3072)","(350632, 3072)"
Count,105 Tasks,2 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 8.62 GB 8.62 GB Shape (350633, 3072) (350632, 3072) Count 105 Tasks 2 Chunks Type float64 numpy.ndarray",3072  350633,

Unnamed: 0,Array,Chunk
Bytes,8.62 GB,8.62 GB
Shape,"(350633, 3072)","(350632, 3072)"
Count,105 Tasks,2 Chunks
Type,float64,numpy.ndarray


In [16]:
# Standardize z500 with the weekly moving-window median and interquartile range.
# NOTE: despite "std" in the variable name, the scale used here is z500_iqr_week.
z500_median_std_week = (z500 - z500_median_week)/z500_iqr_week