# Machine learning: correlated multivariate scalars

In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import xarray as xr
import pandas as pd

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

from multiprocessing import cpu_count

import synthia as syn

import pyvinecopulib as pv

import matplotlib.pyplot as plt

## Baseline
Use the Boston housing dataset, as it is small and contains several correlated features.

In [None]:
# Load the Boston housing data once: feature matrix X and target vector y.
X, y = load_boston(return_X_y=True)
# FIXME: feature columns 1, 3 and 8 are dropped without a documented reason
# (presumably because they are not continuous -- confirm).
X = np.delete(X, [1, 3, 8], axis=1)
# Hold out 20% of the rows for testing. No random_state is passed, so the
# split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: a linear regression fitted on the real training data, chosen
# because it is simple and quick to train.
model = LinearRegression().fit(X_train, y_train)
# Score the fitted model on the held-out test split.
score_baseline = model.score(X_test, y_test)
print(score_baseline)

## Data generation

Use a simple Gaussian copula to generate synthetic data for training the model. If the copula fits the data perfectly, we would expect the score to be equal to that of the baseline.

In [None]:
def create_synthetic(X_true, y_true, n_samples, uniformization_ratio, stretch_factor, use_pyvinecopulib=True):
    """
    Fit a copula to the joint (X, y) data and sample a synthetic dataset.

    `y_true` is appended to `X_true` as an extra last column, a
    `syn.CopulaDataGenerator` is fitted to the combined array, and
    `n_samples` synthetic rows are requested from it.

    Parameters
    ----------
    X_true : 2-D array of features, one row per sample.
    y_true : 1-D array of targets, one entry per row of `X_true`.
    n_samples : number of synthetic samples requested from the generator.
    uniformization_ratio, stretch_factor : forwarded unchanged to
        `generator.generate`.
    use_pyvinecopulib : if true, fit a `syn.VineCopula` whose fit controls
        use `cpu_count()` threads; otherwise fit a `syn.GaussianCopula`.

    Returns
    -------
    (X_synthetic, y_synthetic) : all columns but the last, and the last
        column, of the generated data.
    """
    # Turn y into a single column so it can be stacked next to X.
    target_column = np.expand_dims(y_true, 1)
    joint_true = xr.DataArray(np.concatenate([X_true, target_column], axis=1))

    generator = syn.CopulaDataGenerator(verbose=True)
    # The number of quantiles is set to the number of rows in the joint data.
    parameterizer = syn.QuantileParameterizer(n_quantiles=joint_true.shape[0])

    # Choose the copula family first so that fitting is a single call.
    if use_pyvinecopulib:
        fit_controls = pv.FitControlsVinecop(num_threads=cpu_count())
        copula = syn.VineCopula(controls=fit_controls)
    else:
        copula = syn.GaussianCopula()
    generator.fit(joint_true, copula=copula, parameterize_by=parameterizer)

    joint_synthetic = generator.generate(
        n_samples=n_samples,
        uniformization_ratio=uniformization_ratio,
        stretch_factor=stretch_factor,
    )
    # Split the generated data back into features and target.
    return joint_synthetic[:, :-1], joint_synthetic[:, -1]

In [None]:
# Gaussian copula: generate as many samples as there are training rows.
n_samples = X_train.shape[0] * 1
X_synthetic, y_synthetic = create_synthetic(
    X_train,
    y_train,
    n_samples,
    uniformization_ratio=0,
    stretch_factor=1,
    use_pyvinecopulib=False,
)

# Train on the synthetic data only, then score on the real test split.
model = LinearRegression().fit(X_synthetic, y_synthetic)
score_copula_gaussian = model.score(X_test, y_test)
print(score_copula_gaussian)

In [None]:
# Vine copula: generate five times as many samples as there are training rows.
n_samples = X_train.shape[0] * 5
X_synthetic, y_synthetic = create_synthetic(
    X_train,
    y_train,
    n_samples,
    uniformization_ratio=0,
    stretch_factor=1,
    use_pyvinecopulib=True,
)

# Train on the synthetic data only, then score on the real test split.
model = LinearRegression().fit(X_synthetic, y_synthetic)
score_copula_pyvinecopulib = model.score(X_test, y_test)
print(score_copula_pyvinecopulib)

In [None]:
# Gaussian copula with stretch_factor=1.2: generate ten times as many samples
# as there are training rows.
n_samples = X_train.shape[0] * 10
X_synthetic, y_synthetic = create_synthetic(
    X_train,
    y_train,
    n_samples,
    uniformization_ratio=0,
    stretch_factor=1.2,
    use_pyvinecopulib=False,
)

# Train on the synthetic data only, then score on the real test split.
model = LinearRegression().fit(X_synthetic, y_synthetic)
score_copula_augmented = model.score(X_test, y_test)
print(score_copula_augmented)

In [None]:
# Bar chart comparing the baseline score with the three copula-based scores.
bar_labels = ('baseline', 'gaussian', 'vinecopulib', 'gaussian_augmented')
bar_scores = (
    score_baseline,
    score_copula_gaussian,
    score_copula_pyvinecopulib,
    score_copula_augmented,
)
ind = (1, 2, 3, 4)  # x positions of the bars
plt.bar(ind, bar_scores)
plt.xticks(ind, bar_labels);

# Although the scores are similar, they are lower in both cases, and we fail to improve the results
# when generating more samples or increasing the range...