# Copula

In [None]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
from multiprocessing import cpu_count

import numpy as np
import xarray as xr
import pyvinecopulib as pv
import matplotlib.pyplot as plt

import synthia as syn

from examples_util import plot_random_columns, plot_ds_hist

In [None]:
# Constants
# Input file location, resolved relative to the current working directory:
# the 'data' folder sits two levels above it.
this_dir = Path.cwd()
data_dir = this_dir.parents[1].joinpath('data')
path_to_data = data_dir.joinpath('nwp_saf_profiles_in.nc')

# Names of the dataset variables to keep, split into two groups that are
# concatenated when the dataset is subset.
scalar_vars = [
    'skin_temperature',
    'sw_albedo',
    'lw_emissivity',
    'cos_solar_zenith_angle',
    'cloud_fraction',
]

vector_vars = [
    'temperature_fl',
    'q',
    'q_liquid',
    'q_ice',
    're_liquid',
    're_ice',
]

In [None]:
# Open the input file, keep only the 'relevant' variables and take a subset
# of the columns.
ds_true = (
    xr.open_dataset(path_to_data)[scalar_vars + vector_vars]
    # FIXME: for now subset to reduce CPU time
    .sel(column=slice(0, 5000))
)
# Bare expression so that the notebook displays the resulting dataset.
ds_true

In [None]:
# Fitting the model
generator = syn.CopulaDataGenerator()
parameterizer = syn.QuantileParameterizer(n_quantiles=100)

# FIXME: the vine copula is currently not used: even when the family is
# specified explicitly and the truncation level is low, the CPU time is
# infeasibly high on average consumer hardware. The controls are kept here
# for the disabled vine-copula fit below.
ctrl = pv.FitControlsVinecop(
    family_set=[pv.BicopFamily.tll],
    trunc_lvl=2,
    select_trunc_lvl=False,
    show_trace=False,
    num_threads=cpu_count(),
)
#generator.fit(ds_true, copula=syn.VineCopula(controls=ctrl), parameterize_by=parameterizer)

# Fit with a Gaussian copula instead.
generator.fit(
    ds_true,
    copula=syn.GaussianCopula(),
    parameterize_by=parameterizer,
)

In [None]:
# Generate the same number of samples as there are columns in the input.
# `Dataset.sizes` is the supported mapping from dimension name to length;
# indexing `Dataset.dims` like a mapping is deprecated in recent xarray releases.
n_samples = ds_true.sizes['column']
# NOTE(review): uniformization_ratio=0 and stretch_factor=1 presumably leave the
# generated samples unaltered -- confirm against the synthia documentation.
ds_synth = generator.generate(n_samples=n_samples, uniformization_ratio=0, stretch_factor=1)

In [None]:
# FIXME: the problem with using (Gaussian) copula(s) is that the synthetic profiles are much
# more wobbly than the true ones -- this will become a problem when trying to fit a regression model.
# Generally, the jerk, as shown by the second derivative, is much higher.
plot_random_columns(ds_true, ds_synth)

In [None]:
# These are very well modelled because they are uncorrelated with other features.
plot_ds_hist(ds_true, ds_synth)

In [None]:
# TODO: adapt the commented-out evaluation below so that it works on a Dataset

# Evaluation -- histogram
#ds_true_norm = np.linalg.norm(ds_true_stacked, axis=1)
#ds_synth_norm = np.linalg.norm(ds_synth_stacked, axis=1)
#plt.hist(ds_true_norm, bins=100, alpha=1, label='True')
#plt.hist(ds_synth_norm, bins=100, alpha=0.5, label='Synthetic')
#plt.legend()
#plt.show()