# Machine learning: correlated multivariate profiles

Goal: to improve the predictions of a simple machine-learning (ML) model that predicts radiation flux.

Background: the number of samples (named profiles/columns in the data) is scarce. Can we generate a larger set of random profiles which still captures the correlation between different quantities?

In [None]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import numpy as np
from scipy.constants import Stefan_Boltzmann
import xarray as xr

import matplotlib.pyplot as plt

import synthia as syn
from synthia.util import to_stacked_array, to_unstacked_dataset

## Physical model

In [None]:
def compute_cloud_optical_depth(ds: xr.Dataset) -> xr.DataArray:
    """Compute the per-layer cloud optical depth from liquid and ice content.

    Args:
        ds: Dataset providing ``pressure_hl`` (on the ``half_level``
            dimension) together with ``q_liquid``, ``re_liquid``, ``q_ice``
            and ``re_ice``.

    Returns:
        ``(q_liquid / (rho_liquid * re_liquid) + q_ice / (rho_ice * re_ice))
        * delta_pressure / g`` for every layer.

    NOTE(review): the common geometric-optics approximation carries an extra
    3/2 prefactor which is not applied here -- confirm this is intended.
    """
    gravity = 9.81  # m/s²
    density_liquid = 1000  # kg/m³
    density_ice = 917  # kg/m³

    # Pressure difference between consecutive half levels, re-labelled onto
    # the `level` dimension so it lines up with the layer quantities.
    layer_thickness = (
        ds['pressure_hl']
        .diff('half_level')
        .rename('delta_pressure')
        .rename({'half_level': 'level'})
    )

    liquid_term = ds['q_liquid'] / (density_liquid * ds['re_liquid'])
    ice_term = ds['q_ice'] / (density_ice * ds['re_ice'])
    return (liquid_term + ice_term) * layer_thickness / gravity

In [None]:
def compute_layer_optical_depth(delta_pressure_fl: xr.DataArray, ext_coeff_fl: xr.DataArray):
    """Compute the layer optical depth from a profile of extinction coefficients.

    Args:
        delta_pressure_fl: Pressure thickness of each layer.
        ext_coeff_fl: Extinction coefficient of each layer.

    Returns:
        The element-wise product ``ext_coeff_fl * delta_pressure_fl``.
    """
    return ext_coeff_fl * delta_pressure_fl

In [None]:
def compute_emissivity(delta_pressure_fl: xr.DataArray, ext_coeff_fl: xr.DataArray) -> xr.DataArray:
    """Compute the layer emissivity from pressure thickness and extinction.

    The layer optical depth is scaled by a diffusivity factor of
    ``1 / cos(53 degrees)`` and converted with ``1 - exp(-factor * tau)``.

    Args:
        delta_pressure_fl: Pressure thickness of each layer.
        ext_coeff_fl: Extinction coefficient of each layer.

    Returns:
        Emissivity of each layer.
    """
    tau = compute_layer_optical_depth(delta_pressure_fl, ext_coeff_fl)
    diffusivity_factor = 1 / np.cos(np.radians(53))
    transmissivity = np.exp(-diffusivity_factor * tau)
    return 1 - transmissivity

In [None]:
def compute_plank_func(temperature: xr.DataArray) -> xr.DataArray:
    """Return the black-body emission ``sigma * T**4`` for a temperature.

    Despite the name, this is the Stefan-Boltzmann law, using
    ``scipy.constants.Stefan_Boltzmann`` for ``sigma``.

    Args:
        temperature: Temperature value(s).

    Returns:
        ``Stefan_Boltzmann * temperature**4``, element-wise.
    """
    fourth_power = temperature**4
    return Stefan_Boltzmann * fourth_power

In [None]:
def compute_lw_up_boa(skin_temperature: xr.DataArray,
                      lw_emissivity: xr.DataArray) -> xr.DataArray:
    """Compute the upwelling longwave flux at the bottom of the atmosphere (BOA).

    Args:
        skin_temperature: Skin temperature used for the black-body emission.
        lw_emissivity: Longwave emissivity scaling that emission.

    Returns:
        DataArray named ``flux_lw_up_boa`` holding
        ``lw_emissivity * compute_plank_func(skin_temperature)``, with
        ``long_name`` and ``units`` attributes set.
    """
    surface_emission = compute_plank_func(skin_temperature)
    flux = (lw_emissivity * surface_emission).rename('flux_lw_up_boa')
    flux.attrs = dict(long_name='Upward logwave radiation at BOA', units='W/m2')
    return flux

In [None]:
def compute_lw_up_profile(temperature_fl: xr.DataArray,
                          delta_pressure_fl: xr.DataArray,
                          ext_coeff_fl: xr.DataArray,
                          flux_at_boa: xr.DataArray) -> xr.DataArray:
    """Compute the upwelling longwave flux profile from BOA up to TOA.

    Starting from the given flux at the bottom of the atmosphere (BOA), each
    layer transmits the fraction ``1 - emissivity`` of the flux entering from
    below and adds its own emission ``emissivity * sigma * T**4``.

    Args:
        temperature_fl: Layer temperature with dims ``column`` and ``level``.
        delta_pressure_fl: Layer pressure thickness on the same dims.
        ext_coeff_fl: Layer extinction coefficient on the same dims.
        flux_at_boa: Upwelling flux at BOA, one value per column.

    Returns:
        DataArray named ``flux_up_hl`` with dims ``('column', 'half_level')``
        and ``n_level + 1`` half levels. Index 0 is the top of the atmosphere
        (TOA) and the last index is BOA. No coordinates are attached.
    """
    # Array to store computed fluxes (n_half_level = n_level + 1)
    n_column = len(temperature_fl.column)
    n_level = len(temperature_fl.level)
    da_flux = xr.DataArray(
        np.zeros((n_column, n_level + 1)),
        dims=('column', 'half_level'),
        name='flux_up_hl',
        attrs={'long_name': 'Upward logwave radiation',
               'units': 'W/m2'},
    )

    # Boundary condition: the last half level is BOA
    da_flux[:, -1] = flux_at_boa

    # Emissivity and emission do not depend on the flux, so precompute them.
    # The loop below indexes positionally, hence the dimension order is made
    # explicit instead of relying on how the inputs happen to be laid out.
    emissivity = compute_emissivity(delta_pressure_fl, ext_coeff_fl).transpose('column', 'level')
    plank_function = compute_plank_func(temperature_fl).transpose('column', 'level')

    # Walk upwards from BOA (last index) to TOA (index zero): layer i-1 sits
    # between half levels i-1 (above) and i (below).
    for i in range(n_level, 0, -1):
        da_flux[:, i-1] = da_flux[:, i] * (1 - emissivity[:, i-1]) + plank_function[:, i-1] * emissivity[:, i-1]
    return da_flux

In [None]:
def compute_lw_up(temperature_fl: xr.DataArray,
                  delta_pressure_fl: xr.DataArray,
                  ext_coeff_fl: xr.DataArray,
                  skin_temperature: xr.DataArray,
                  lw_emissivity: xr.DataArray) -> xr.DataArray:
    """Compute the full upwelling longwave flux profile from BOA to TOA.

    Convenience wrapper: derives the flux at BOA from the skin temperature
    and longwave emissivity, then passes it with the layer quantities to
    ``compute_lw_up_profile``.

    Args:
        temperature_fl: Layer temperature.
        delta_pressure_fl: Layer pressure thickness.
        ext_coeff_fl: Layer extinction coefficient.
        skin_temperature: Skin temperature, used for the BOA flux.
        lw_emissivity: Longwave emissivity, used for the BOA flux.

    Returns:
        The flux profile returned by ``compute_lw_up_profile``.
    """
    return compute_lw_up_profile(
        temperature_fl=temperature_fl,
        delta_pressure_fl=delta_pressure_fl,
        ext_coeff_fl=ext_coeff_fl,
        flux_at_boa=compute_lw_up_boa(skin_temperature, lw_emissivity),
    )

## Compute upward longwave radiation from temperature and optical depth

Here we use the functions defined earlier to compute and plot the upward longwave radiation from the temperature profiles and the optical depth. The x-axis indicates the vertical (pressure) level, where 0 is the top of the atmosphere (TOA) and 137 is the bottom of the atmosphere (BOA).

In [None]:
def compute_ext_coeff(pressure_fl, opt_depth):
    """Compute the extinction coefficient of the atmosphere.

    Evaluates ``(opt_depth / H) * exp(-H / pressure_fl)`` with the fixed
    scale ``H = 300000``.

    Args:
        pressure_fl: Atmospheric pressure (scalar or array).
        opt_depth: Atmospheric optical depth.

    Returns:
        Extinction coefficient with the shape of ``pressure_fl``.
    """
    scale = 300000
    amplitude = opt_depth / scale
    decay = np.exp(-scale / pressure_fl)
    return amplitude * decay

In [None]:
# The data file is looked up two directory levels above the current working
# directory, under `data/`.
# NOTE(review): this relies on the notebook being launched from its own
# directory -- confirm.
THIS_DIR = Path.cwd()
ds_input = xr.open_dataset(THIS_DIR.parents[1] / 'data' / 'nwp_saf_profiles_in.nc')

# Layer pressure thickness: difference between consecutive half levels,
# re-labelled onto the `level` dimension.
ds_input['delta_pressure_fl'] = ds_input['pressure_hl'].diff('half_level').rename(half_level='level')

# Fixed optical depth from which the extinction coefficient profile is built.
opt_depth = 30
ds_input['ext_coeff_fl'] = compute_ext_coeff(ds_input['pressure_fl'], opt_depth)

# Variables consumed by compute_lw_up, listed in its argument order.
input_relevant = [
    'temperature_fl', # for the Planck function
    'delta_pressure_fl', # for the layer optical depth (emissivity)
    'ext_coeff_fl', # for the layer optical depth (emissivity)
    'skin_temperature', # for flux at BOA
    'lw_emissivity' # for flux at BOA
]

# Keep only the model inputs; the bare name displays the dataset in the notebook.
ds_true_in = ds_input[input_relevant]
ds_true_in

In [None]:
# Run the physical model on the real input profiles to obtain the targets.
ds_true_out = compute_lw_up(
    temperature_fl=ds_true_in['temperature_fl'],
    delta_pressure_fl=ds_true_in['delta_pressure_fl'],
    ext_coeff_fl=ds_true_in['ext_coeff_fl'],
    skin_temperature=ds_true_in['skin_temperature'],
    lw_emissivity=ds_true_in['lw_emissivity'],
)

# Flux profile averaged over all columns.
ds_true_out.mean('column').plot();

In [None]:
def plot_profile(ds_in, ds_out, n_profiles):
    """Plot randomly chosen input/output profile pairs side by side.

    For each sampled column a figure with two panels is shown: the
    ``temperature_fl`` profile of ``ds_in`` (red, left) and the matching
    profile of ``ds_out`` (black, right).

    Args:
        ds_in: Dataset with a ``temperature_fl`` variable on a ``column`` dim.
        ds_out: Array with a ``column`` dim, ordered like ``ds_in``.
        n_profiles: Number of columns to draw (with replacement).
    """
    # isel() selects by integer position, so draw positions rather than
    # coordinate labels: labels are only valid positions when the column
    # coordinate happens to be 0..n-1.
    positions = np.random.choice(ds_in.sizes['column'], n_profiles)
    for idx in positions:
        fig, axs = plt.subplots(1, 2, figsize=(5*2, 4))
        ds_in['temperature_fl'].isel(column=idx).plot(ax=axs[0], c='r')
        ds_out.isel(column=idx).plot(ax=axs[1], c='k')
        plt.show()

In [None]:
# Show two randomly chosen real profiles: temperature input next to the computed flux.
plot_profile(ds_true_in, ds_true_out, 2)

## Machine learning: baseline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

In [None]:
def compute_norm_stats(ds):
    """Collect the mean and standard deviation of every variable in ``ds``.

    Args:
        ds: Mapping-like container (e.g. a dataset) that yields variable
            names when iterated and whose items offer ``mean()`` and ``std()``.

    Returns:
        Dict mapping each variable name to ``{'mean': ..., 'std': ...}``,
        both taken over all elements of that variable.
    """
    stats = {}
    for name in ds:
        variable = ds[name]
        stats[name] = {'mean': variable.mean(), 'std': variable.std()}
    return stats

def normalize_inputs(ds, norm_stats):
    """Return a dataset shaped like ``ds`` with z-score normalized variables.

    Every variable is replaced by ``(value - mean) / std`` using the entry of
    ``norm_stats`` with the same name. ``lw_emissivity`` is copied unchanged
    and a message is printed for it.

    Args:
        ds: Dataset to normalize.
        norm_stats: Dict as returned by ``compute_norm_stats``, keyed by
            variable name with ``'mean'`` and ``'std'`` entries.

    Returns:
        A new dataset; ``ds`` itself is not modified.
    """
    # Variables that are already on a reasonable O(1) scale.
    passthrough = ['lw_emissivity']

    normalized = xr.zeros_like(ds)
    for quantity in list(normalized):
        if quantity not in passthrough:
            values = ds[quantity]
            stats = norm_stats[quantity]
            normalized[quantity] = (values - stats['mean']) / stats['std']
            continue
        normalized[quantity] = ds[quantity]
        print(f'Skipping normalization for: {quantity}')
    return normalized

In [None]:
def plot_normalized_inputs(ds):
    """Plot an overview of the (normalized) input variables.

    One-dimensional variables are drawn as overlaid histograms on the left
    axes; two-dimensional variables are drawn as their mean over ``column``
    on the right axes.

    Args:
        ds: Dataset whose variables are 1-D or 2-D; the 2-D ones must have a
            ``column`` dimension.

    Raises:
        RuntimeError: If a variable has neither one nor two dimensions.
    """
    fig, ax = plt.subplots(1, 2, figsize=(15, 5))
    hist_ax, profile_ax = ax
    for quantity in ds:
        n_dims = ds[quantity].ndim
        if n_dims == 1:  # scalars (one value per column)
            ds[quantity].plot.hist(ax=hist_ax, label=quantity, alpha=0.3)
            hist_ax.set_ylabel('Count')
            hist_ax.set_xlabel('Range')
            hist_ax.legend()
        elif n_dims == 2:  # profiles
            ds[quantity].mean('column').plot(ax=profile_ax, label=quantity)
            profile_ax.set_ylabel('Normilized range (Z score)')
            profile_ax.set_xlabel('Vertical level')
            profile_ax.set_title('Mean profiles')
            profile_ax.legend()
        else:
            raise RuntimeError('Number of dims not supported')

In [None]:
# Normalization statistics are taken over the complete real dataset, i.e.
# before any train/test split is made.
# NOTE(review): the held-out test columns therefore contribute to the
# statistics -- confirm this is acceptable.
norm_stats = compute_norm_stats(ds_true_in)
X_true_norm = normalize_inputs(ds_true_in, norm_stats)
plot_normalized_inputs(X_true_norm)

In [None]:
# Flatten the normalized inputs into one stacked array; `stack_info` records
# the layout needed by to_unstacked_dataset to convert back.
X_true_stacked_norm, stack_info = to_stacked_array(X_true_norm)

# Train/test split: 30 % held out, fixed seed for reproducibility.
# Note: the targets come straight from `ds_true_out`, so despite the `_norm`
# suffix the y_* arrays are NOT normalized.
X_true_train_norm, X_true_test_norm, y_true_train_norm, y_true_test_norm = train_test_split(X_true_stacked_norm, 
                                                                        ds_true_out, test_size=0.3, random_state=42)

In [None]:
# Baseline: ridge regression with default settings, fitted on the real
# training profiles only.
model_baseline = Ridge()
model_baseline.fit(X_true_train_norm, y_true_train_norm)
# Wrap the raw prediction array with dimension names so it can be plotted
# like the true fluxes; no coordinates are attached.
y_pred_test_norm = xr.DataArray(model_baseline.predict(X_true_test_norm), dims=['column', 'half_level'])

In [None]:
# Evaluate the baseline on the held-out real profiles
# (for scikit-learn regressors, score() reports R^2).
score = model_baseline.score(X_true_test_norm, y_true_test_norm)
print(score);

In [None]:
# Compare true and predicted flux profiles for 10 test columns drawn at
# random (np.random.choice samples with replacement by default, so the same
# column may be shown more than once).
for column in np.random.choice(y_true_test_norm.column, 10):
    y_true_test_norm.sel(column=column).plot(label='true')
    y_pred_test_norm.sel(column=column).plot(label='pred')
    plt.legend()
    plt.show()

# Synthetic samples

In [None]:
# Here we use a copula to generate more samples to try to improve the results:
# 1. Split the data into train and test sets.
# 2. Fit a copula and generate synthetic samples for the model inputs only.
# 3. Run the physical model on the synthetic inputs.
# 4. Evaluate with the same ML model as used before.

In [None]:
# Split into train and test sets first, so the copula is fitted on training
# data only. The y arrays of this split are not used for fitting the copula.

# Flatten the (un-normalized) inputs; `stack_info` records the layout needed
# to convert stacked samples back into a dataset.
X_true_stacked, stack_info = to_stacked_array(ds_true_in)

# Train/test split: 30 % held out, fixed seed for reproducibility.
X_true_train, X_true_test, y_true_train, y_true_test = train_test_split(
    X_true_stacked, ds_true_out, test_size=0.3, random_state=42)

In [None]:
# NOTE(review): `parameterizer` is created but never passed on -- fit() below
# is called with parameterize_by=None. Confirm whether it should be used.
parameterizer = syn.QuantileParameterizer(n_quantiles=100)
generator = syn.CopulaDataGenerator(verbose=True)
# Fit a Gaussian copula to the real *training* inputs only.
generator.fit(X_true_train, copula=syn.GaussianCopula(), parameterize_by=None)

In [None]:
n_samples = X_true_train.shape[0] * 2 # Twice as many as there are real training samples
X_synth_train = generator.generate(n_samples=n_samples, uniformization_ratio=0, stretch_factor=1)
# Convert the stacked samples back into a dataset, using the layout recorded
# by to_stacked_array. The name is rebound from stacked array to dataset.
X_synth_train = to_unstacked_dataset(X_synth_train, stack_info)

In [None]:
# Display the synthetic input dataset in the notebook.
X_synth_train

In [None]:
# Run the physical model on the synthetic inputs to obtain their targets.
ds_synth_out = compute_lw_up(
    temperature_fl=X_synth_train['temperature_fl'],
    delta_pressure_fl=X_synth_train['delta_pressure_fl'],
    ext_coeff_fl=X_synth_train['ext_coeff_fl'],
    skin_temperature=X_synth_train['skin_temperature'],
    lw_emissivity=X_synth_train['lw_emissivity'],
)

# Overlay the column-mean flux profiles: synthetic first, then real.
ds_synth_out.mean('column').plot();
ds_true_out.mean('column').plot();

In [None]:
# Now we normalize the synthetic samples and train on them instead.
# NOTE(review): this overwrites `norm_stats`, which previously held the
# statistics of the real dataset -- confirm that is intended.

norm_stats = compute_norm_stats(X_synth_train)
X_synth_norm = normalize_inputs(X_synth_train, norm_stats)
plot_normalized_inputs(X_synth_norm)

In [None]:
# Flatten the normalized synthetic inputs (this rebinds `stack_info`).
X_synth_stacked, stack_info = to_stacked_array(X_synth_norm)

# Train/test split: only 0.1 % is held out, so practically all synthetic
# samples are used for training.
# NOTE(review): `X_synth_train` is rebound here from the unstacked dataset to
# a stacked training array, so earlier cells that read it cannot simply be
# re-run afterwards.
X_synth_train, X_synth_test, y_synth_train, y_synth_test = train_test_split(X_synth_stacked, 
                                                                        ds_synth_out, test_size=0.001, random_state=42)

In [None]:
# Same model as the baseline, but fitted on the synthetic samples.
model_synth = Ridge()
model_synth.fit(X_synth_train, y_synth_train)
# Predict on the held-out REAL profiles.
# NOTE(review): X_true_test_norm was normalized with statistics of the real
# dataset, whereas the training inputs were normalized with statistics of the
# synthetic samples -- confirm this mismatch is intended.
y_synth_pred_test = xr.DataArray(model_synth.predict(X_true_test_norm), dims=['column', 'half_level'])

In [None]:
# Evaluate the synthetic-data model on the same held-out real profiles as
# the baseline (for scikit-learn regressors, score() reports R^2).
score = model_synth.score(X_true_test_norm, y_true_test_norm)
print(score);

In [None]:
# Compare true and predicted flux profiles for 10 test columns drawn at
# random. Columns are sampled from the same array that is plotted, and the
# legend is shown so the labelled 'true' and 'pred' curves can be told apart.
for column in np.random.choice(y_true_test_norm.column, 10):
    y_true_test_norm.sel(column=column).plot(label='true')
    y_synth_pred_test.sel(column=column).plot(label='pred')
    plt.legend()
    plt.show()