In [None]:
%load_ext autoreload
%autoreload

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm
import numpy as np
import xarray as xr
import pandas as pd
import psychrolib as psy

import synthia as syn
from synthia import util

import pyvinecopulib as pv

import matplotlib.pyplot as plt
import seaborn as sns

# Papermill parameters

In [None]:
# This cell is tagged `parameters` (see View -> Cell Toolbar -> Tags).
# Number of samples held out as test data in the train / test split.
test_size = 9000
# Number of real samples kept for training (sliced from the non-test remainder).
train_size = 100
# Number of synthetic samples generated per real training sample.
factor_synthetic = 15

# Task

Given:
- Real data with multiple quantities
- Normalization method
- Train / test split of real data
- Regression problem
- ML model trained on real data
- Error metrics for ML model

Question:
- Can the ML error metrics be improved by training with additional synthetic data?

Method:
- Fit Copula on training data
- Generate synthetic training data
- Re-train ML model on real + synthetic data
- Compute error metrics for ML model
- Compare against original error metrics

# Real data with multiple quantities

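Instead of using an existing dataset, we compute our own from a simple physical formula: the dew point temperature (tdp) as a function of dry bulb temperature (tdb) and relative humidity (rh).
The contour plot below shows all three quantities in a single figure.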

In [None]:
# Select the SI unit system for all psychrolib calls.
psy.SetUnitSystem(psy.SI)

# Regular grid of the two input quantities.
n_points = 100
tdb = np.linspace(1, 150, n_points)
rh = np.linspace(1e-2, 1, n_points)
grid = np.meshgrid(tdb, rh)
tdb_grid, rh_grid = grid

# Dew point temperature evaluated at every grid node.
tdp = psy.GetTDewPointFromRelHum(tdb_grid, rh_grid)

# All three quantities share the same two grid dimensions.
grid_dims = ['i', 'j']
ds_grid = xr.Dataset({
    'tdb': (grid_dims, tdb_grid),
    'rh': (grid_dims, rh_grid),
    'tdp': (grid_dims, tdp),
})

# Collapse the two grid dimensions into one `sample` dimension, which is the
# layout used by the ML models further down.
ds_samples = ds_grid.stack(sample=('i', 'j'))
ds_samples

In [None]:
fig, ax = plt.subplots(1, 1)
# Filled contours of the dew point over the (tdb, rh) grid, with a colorbar.
cp = ax.contourf(ds_grid['tdb'], ds_grid['rh'], ds_grid['tdp'])
fig.colorbar(cp)
ax.set(
    title='Dew point temperature in °C',
    xlabel='Dry bulb temperature in °C',
    ylabel='Relative Humidity',
)
plt.show()

# Data normalization

In [None]:
# Normalize the samples. `norm_stats` is kept so that data can be mapped back
# to physical units later with `util.to_unnormalized_dataset`.
# NOTE(review): the exact normalization scheme lives in `synthia.util` — confirm there.
ds_samples_norm, norm_stats = util.to_normalized_dataset(ds_samples)
# One histogram per normalized variable; the trailing `;` suppresses the repr output.
ds_samples_norm.to_dataframe().hist();

# Train / test split of real data

In this example, we split the data as follows:

- Test data = 9000 samples
- Train data = a subset of `train_size` samples (e.g. 100, 500 or 1000) taken from the remaining 1000 samples

See parameters cell at the top of the notebook.

In [None]:
# Hold out `test_size` shuffled samples as test data.
# NOTE(review): no seed is passed here, so the split may differ between runs —
# confirm whether `train_test_split_dataset` supports seeding.
ds_train, ds_test = util.train_test_split_dataset(
    ds_samples_norm,
    test_size=test_size,
    dim='sample',
    shuffle=True,
)
# Keep only the first `train_size` samples of the remainder for training.
ds_train = ds_train.isel({'sample': slice(None, train_size)})

In [None]:
def plot_train_test_data(ds_train, ds_test, train_label='train', test_label='test'):
    """Scatter ``tdp`` against each input quantity for the train and test sets.

    Draws a two-panel figure: ``tdb`` vs ``tdp`` on the left and ``rh`` vs
    ``tdp`` on the right. In each panel the train points are drawn first and
    the test points are drawn semi-transparent on top.

    Parameters
    ----------
    ds_train, ds_test
        Objects indexable by ``'tdb'``, ``'rh'`` and ``'tdp'``.
    train_label, test_label : str
        Legend labels for the two point sets.
    """
    fig, axs = plt.subplots(1, 2, figsize=(15, 5))

    # One panel per input quantity: (variable name, x-axis label).
    panels = (
        ('tdb', 'Dry bulb temperature in °C'),
        ('rh', 'Relative Humidity'),
    )
    for ax, (x_var, x_label) in zip(axs, panels):
        ax.scatter(ds_train[x_var], ds_train['tdp'], label=train_label)
        ax.scatter(ds_test[x_var], ds_test['tdp'], label=test_label, alpha=0.1)
        ax.set_xlabel(x_label)
        ax.set_ylabel('Dew point temperature in °C')
        ax.legend()
    
# Undo the normalization so the axes show physical units.
plot_train_test_data(
    ds_train=util.to_unnormalized_dataset(ds_train, norm_stats),
    ds_test=util.to_unnormalized_dataset(ds_test, norm_stats),
)

# Regression problem

Given dry bulb temperature (tdb) and relative humidity (rh), predict dew point temperature (tdp).

In [None]:
# Input (feature) variables of the regression problem.
X_vars = ['tdb', 'rh']
# Target variable to predict.
y_var = 'tdp'

# ML model trained on real data
To establish a baseline for later comparison, we first train the ML model on the real data only to solve the regression problem stated above.
In this example, we use the standard MLP regressor (`MLPRegressor`) from scikit-learn.

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNet
from sklearn.neural_network import MLPRegressor

In [None]:
def train_ml_model(train, test, X_vars, y_var, epochs, iterations):
    """Repeatedly fit an MLP regressor and score every fit on the test set.

    A new ``MLPRegressor`` is created and fitted ``iterations`` times on the
    same training data. No ``random_state`` is passed to the model, so the
    repeated fits are not seeded here.

    Parameters
    ----------
    train, test
        Datasets indexable by ``X_vars`` (a list of names) and ``y_var``.
    X_vars : list of str
        Names of the input variables.
    y_var : str
        Name of the target variable.
    epochs : int
        Passed to ``MLPRegressor`` as ``max_iter``.
    iterations : int
        Number of independent fits.

    Returns
    -------
    results : pandas.DataFrame
        One row per fit with the columns ``'r2'`` and ``'mse'``, both computed
        on the test set. Empty (but with both columns) if no fit was run.
    y_pred_test
        Test-set predictions of the last fitted model only, or ``None`` if
        ``iterations`` is less than 1.
    """
    X_train, _ = util.to_stacked_array(train[X_vars])
    X_test, _ = util.to_stacked_array(test[X_vars])
    y_train = train[y_var]
    y_test = test[y_var]

    # Defined up front so the return below is valid even when the loop body
    # never executes (iterations < 1).
    y_pred_test = None
    results = []
    for _ in tqdm(range(iterations)):
        model = MLPRegressor(max_iter=epochs)
        model.fit(X_train, y_train)
        y_pred_test = model.predict(X_test)
        results.append({
            'r2': model.score(X_test, y_test),
            'mse': mean_squared_error(y_test, y_pred_test)
        })
    # Explicit columns keep the frame's schema stable for an empty result list.
    results = pd.DataFrame(results, columns=['r2', 'mse'])
    return results, y_pred_test

# Training budget, reused for every model trained in this notebook.
epochs = 1000     # forwarded to MLPRegressor as max_iter
iterations = 20   # number of independent fits per experiment

# Baseline: train on the real data only.
results, y_pred_test = train_ml_model(
    train=ds_train,
    test=ds_test,
    X_vars=X_vars,
    y_var=y_var,
    epochs=epochs,
    iterations=iterations,
)

In [None]:
# Diagnostic plot (predictions of last trained model only!)
def plot_true_vs_pred(true, pred):
    """Scatter true values against predictions on the current axes.

    Predictions go on the x-axis and true values on the y-axis. A black
    identity line spanning the range of ``true`` is drawn on top for reference.
    """
    plt.scatter(pred, true, alpha=0.1)
    plt.xlabel('predicted')
    plt.ylabel('true')
    # End points of the identity line, taken from the range of the true values.
    diagonal = [true.min(), true.max()]
    plt.plot(diagonal, diagonal, color='black')

plot_true_vs_pred(true=ds_test[y_var], pred=y_pred_test)

# Error metrics for ML model

In [None]:
def plot_error_metrics(df_metrics):
    """Draw box plots of the ``'mse'`` and ``'r2'`` columns side by side.

    Parameters
    ----------
    df_metrics : pandas.DataFrame
        Frame with an ``'mse'`` and an ``'r2'`` column, e.g. as returned by
        ``train_ml_model``.
    """
    fig, axs = plt.subplots(1, 2, figsize=(10, 5))
    # Left panel: MSE, right panel: R^2. Each panel has its own y-axis.
    for ax, metric in zip(axs, ('mse', 'r2')):
        df_metrics[[metric]].boxplot(ax=ax)

# Baseline metrics: distribution over the repeated fits on real data only.
plot_error_metrics(results)

# Create synthetic training data

In [None]:
def create_synthetic(data, n_samples, n_quantiles, uniformization_ratio, stretch_factor):
    """Fit a vine-copula data generator to ``data`` and sample from it.

    The generator is fitted with a ``syn.VineCopula`` (fit controls set to two
    threads) and a ``syn.QuantileParameterizer`` using ``n_quantiles``
    quantiles. ``n_samples``, ``uniformization_ratio`` and ``stretch_factor``
    are forwarded unchanged to ``generate``.

    Returns
    -------
    The synthetic data returned by ``CopulaDataGenerator.generate``.
    """
    copula_generator = syn.CopulaDataGenerator(verbose=True)
    quantile_parameterizer = syn.QuantileParameterizer(n_quantiles=n_quantiles)
    fit_controls = pv.FitControlsVinecop(num_threads=2)
    # Alternative: pass copula=syn.GaussianCopula() to fit a Gaussian copula instead.
    copula_generator.fit(
        data,
        copula=syn.VineCopula(controls=fit_controls),
        parameterize_by=quantile_parameterizer,
    )
    return copula_generator.generate(
        n_samples=n_samples,
        uniformization_ratio=uniformization_ratio,
        stretch_factor=stretch_factor,
    )

# Number of real training samples. `Dataset.sizes` is the supported mapping
# from dimension name to length; using `Dataset.dims` as a mapping is deprecated
# in xarray.
n_train = ds_train.sizes['sample']

# Generate `factor_synthetic` synthetic samples per real training sample, using
# as many quantiles as there are real training samples.
ds_synthetic = create_synthetic(ds_train, n_samples=n_train * factor_synthetic,
                                n_quantiles=n_train,
                                uniformization_ratio=0., stretch_factor=1)

# Drop the index on `sample` from the real data before concatenating.
# NOTE(review): presumably needed because `ds_synthetic` carries no such index — confirm.
ds_train_with_synthetic = xr.concat([ds_synthetic, ds_train.reset_index('sample', drop=True)], dim='sample')

In [None]:
# Undo the normalization so the axes show physical units.
plot_train_test_data(
    ds_train=util.to_unnormalized_dataset(ds_train_with_synthetic, norm_stats),
    ds_test=util.to_unnormalized_dataset(ds_test, norm_stats),
    train_label='train + synthetic',
)

# ML model trained on real + synthetic data

In [None]:
# Same model, training budget and test set as the baseline; only the training data differs.
results_synthetic, y_pred_test_synthetic = train_ml_model(
    train=ds_train_with_synthetic,
    test=ds_test,
    X_vars=X_vars,
    y_var=y_var,
    epochs=epochs,
    iterations=iterations,
)

In [None]:
# Diagnostic plot (predictions of last trained model only!)
plot_true_vs_pred(true=ds_test[y_var], pred=y_pred_test_synthetic)

# Error metrics for ML model trained on real + synthetic data

In [None]:
# Metrics over the repeated fits on real + synthetic training data.
plot_error_metrics(results_synthetic)

# Comparison of error metrics to original

In [None]:
# Re-plot the baseline (real data only) metrics for comparison with the
# real + synthetic metrics plotted in the previous section.
# NOTE(review): the two figures have independent y-axes, so compare the tick
# values rather than the box positions.
plot_error_metrics(results)