# fPCA

In [None]:
%load_ext autoreload
%autoreload


%load_ext autoreload
%autoreload 2

from pathlib import Path
from multiprocessing import cpu_count

import numpy as np
import xarray as xr
import pyvinecopulib as pv
import matplotlib.pyplot as plt

import synthia as syn

from examples_util import plot_random_columns, plot_ds_hist, plot_summary_stat_column

In [None]:
# Constants
# The input file lives in a `data` folder located two levels above the
# current working directory.
this_dir = Path.cwd()
data_dir = this_dir.parents[1].joinpath('data')
path_to_data = data_dir.joinpath('nwp_saf_profiles_in.nc')

# Names of the variables selected from the dataset, grouped as scalars ...
scalar_vars = [
    'skin_temperature',
    'sw_albedo',
    'lw_emissivity',
    'cos_solar_zenith_angle',
    'cloud_fraction',
]

# ... and as vectors.
vector_vars = [
    'temperature_fl',
    'q',
    'q_liquid',
    'q_ice',
    're_liquid',
    're_ice',
]

In [None]:
# Open the input file and keep only the 'relevant' variables, i.e. the
# scalar and vector variables named in the constants cell.
ds_true = xr.open_dataset(path_to_data)[scalar_vars + vector_vars]
ds_true

In [None]:
# Variables passed to the Box-Cox transformer and to the manual log
# transform (named for having a boundary at zero).
zero_boundary_vars = ['q', 'q_liquid', 'q_ice', 're_liquid']

# Variables passed to the arctanh transformer (named for having boundaries
# at zero and one).
zero_one_boundary_vars = [
    'sw_albedo',
    'lw_emissivity',
    'cos_solar_zenith_angle',
    'cloud_fraction',
]

In [None]:
# Plot each zero-boundary variable of the untransformed dataset with the
# examples_util summary-statistics helper.
for var_name in zero_boundary_vars:
    plot_summary_stat_column(ds_true[var_name])

In [None]:
# Manual transformation: log-transform the zero-boundary variables by hand
# (the inverse would be exp) and plot the result.
eps = 1e-6  # small offset so that exact zeros do not yield log(0) = -inf

for var_name in zero_boundary_vars:
    # Transform only the variable being plotted, rather than re-computing the
    # log of the whole subset on every iteration.
    plot_summary_stat_column(np.log(ds_true[var_name] + eps))

In [None]:
# Transform data that has hard bounds: one transformer for the
# zero-boundary variables, another for the variables bounded by zero and one.
box_cox = syn.BoxCoxTransformer(zero_boundary_vars, 0)
arc_tanh = syn.ArcTanhTransformer(zero_one_boundary_vars)
transformer = syn.CombinedTransformer([box_cox, arc_tanh])

ds_true_transformed = transformer.apply(ds_true)
ds_true_transformed

In [None]:
# Fit the fPCA data generator on the transformed dataset, requesting
# 200 fPCA components.
generator = syn.FPCADataGenerator()
generator.fit(ds_true_transformed, n_fpca_components=200)

In [None]:
# Generate the same number of samples as in the input.
# `Dataset.sizes` is the mapping from dimension name to length; looking up a
# length through `Dataset.dims[...]` is deprecated in recent xarray releases.
n_samples = ds_true_transformed.sizes['column']
ds_synth_transformed = generator.generate(n_samples=n_samples)
ds_synth_transformed

In [None]:
# Revert the transformer that was applied to the input, this time on the
# synthetic samples.
ds_synth = transformer.revert(ds_synth_transformed)
ds_synth

In [None]:
# Plot true against synthetic data for each zero-boundary variable.
for var_name in zero_boundary_vars:
    plot_summary_stat_column(ds_true[var_name], ds_synth[var_name])

In [None]:
# FIXME: the synthetic columns are perhaps too smooth.
plot_random_columns(ds_true, ds_synth)

In [None]:
# These are very well modelled because they are uncorrelated with other features.
plot_ds_hist(ds_true, ds_synth)

In [None]:
# Evaluation -- histogram of the per-sample norms (currently disabled)
# FIXME: adapt to operate on datasets instead of stacked arrays
#ds_true_norm = np.linalg.norm(ds_true_transformed_stacked, axis=1)
#ds_synth_norm = np.linalg.norm(arr_synth_transformed_stacked, axis=1)
#plt.hist(ds_true_norm, bins=100, alpha=1, label='True')
#plt.hist(ds_synth_norm, bins=100, alpha=0.5, label='Synthetic')
#plt.legend()
#plt.show()