# Build a Gaussian Process Emulator of FaIR output

Why? We need an efficient way to sample from the FaIR inputs and build an MCMC, the results of which we can then pass back to the full model for a detailed run.

At the moment we are running deterministically; we could run stochastically with 10x the number of samples to get the RMSE for temperature down.

In [None]:
import numpy as np
import gp_emulator
import pandas as pd

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

from sklearn.decomposition import PCA

In [None]:
def rmse(obs, mod):
    """Return the root-mean-square error between observations and a model.

    Parameters
    ----------
    obs : array_like
        Observed values.
    mod : array_like
        Modelled values; must be broadcastable against ``obs``.

    Returns
    -------
    numpy.float64
        Square root of the mean squared difference over every element.
    """
    # Coerce to float arrays so plain lists/tuples work as well as ndarrays
    # (``list - list`` is a TypeError) and unsigned inputs cannot wrap around.
    obs = np.asarray(obs, dtype=float)
    mod = np.asarray(mod, dtype=float)
    # np.mean divides by the total element count, so the result is a true
    # mean for multi-dimensional input too; for 1-D input this is the same
    # as sum / len(obs).
    return np.sqrt(np.mean((obs - mod) ** 2))

In [None]:
# Load the pre-computed ensemble outputs (one .npy file per diagnostic) from
# ../data/ar6_ensemble_batches.
# NOTE(review): what each array holds (variable, period, units) is inferred
# from its file name only -- confirm against the script that wrote the files.
temp_in = np.load('../data/ar6_ensemble_batches/temperature_1850-2030.npy')
ohc_in = np.load('../data/ar6_ensemble_batches/ohc_2018_minus_1971.npy')
co2_in = np.load('../data/ar6_ensemble_batches/co2_2014.npy')
ari_in = np.load('../data/ar6_ensemble_batches/fari_2005-2014_mean.npy')
aci_in = np.load('../data/ar6_ensemble_batches/faci_2005-2014_mean.npy')
o3_in = np.load('../data/ar6_ensemble_batches/fo3_2019.npy')
ecs_in = np.load('../data/ar6_ensemble_batches/ecs.npy')
tcr_in = np.load('../data/ar6_ensemble_batches/tcr.npy')

In [None]:
# Mean of rows 145-164 minus the mean of rows 0-50, one value per column of
# temp_in.
# NOTE(review): presumably 1995-2014 warming relative to 1850-1900, if row 0
# is 1850 as the file and variable names suggest -- confirm.
temp_19952014 = np.mean(temp_in[145:165], axis=0) - np.mean(temp_in[:51], axis=0)

In [None]:
# Number of samples: one entry of temp_19952014 per column of temp_in.
samples = len(temp_19952014)

In [None]:
# Read the 'gmst' column of AR6_GMST.csv as a NumPy array; it is the `obs`
# series the temperature RMSE is computed against.
df_gmst = pd.read_csv('../data/forcing/AR6_GMST.csv')
gmst = df_gmst['gmst'].values

In [None]:
# RMSE of every sample's temperature series against gmst over the first 171
# time steps, after subtracting that sample's own mean over rows 0-50.
# NOTE(review): 171 steps is presumably 1850-2020 if row 0 is 1850 -- confirm.
temp_rmse = np.full(samples, np.nan)
for i in range(samples):
    temp_rmse[i] = rmse(
        gmst[:171],
        temp_in[:171, i] - np.mean(temp_in[:51, i]),
    )

In [None]:
# Stack the nine diagnostics into one array: one row per diagnostic, in the
# order listed, and one column per sample. Later cells transpose it so that
# samples become rows.
output_vector = np.array(
    [
        temp_19952014,  # row 0
        temp_rmse,      # row 1
        ohc_in,         # row 2
        co2_in,         # row 3
        ari_in,         # row 4
        aci_in,         # row 5
        o3_in,          # row 6
        ecs_in,         # row 7
        tcr_in,         # row 8
    ]
)

In [None]:
# Read the six parameter-set tables from ../data/parameter_sets.
df_cc = pd.read_csv('../data/parameter_sets/carbon_cycle.csv')
df_cr = pd.read_csv('../data/parameter_sets/climate_response.csv')
df_aci = pd.read_csv('../data/parameter_sets/erfaci.csv')
df_ari = pd.read_csv('../data/parameter_sets/erfari.csv')
df_ozone = pd.read_csv('../data/parameter_sets/ozone.csv')
df_scaling = pd.read_csv('../data/parameter_sets/forcing_scaling.csv')

In [None]:
# Assemble the emulator inputs: one row per sample, with the selected columns
# of every parameter table placed side by side.
#
# `.loc[:samples-1, ...]` is a label slice and therefore inclusive, so it keeps
# `samples` rows (assuming the default 0..N-1 integer index -- confirm).
#
# The blocks are deliberately left 2-D. Squeezing them would collapse any
# single-column (or single-row) selection to 1-D, after which np.hstack would
# either raise or concatenate along the wrong axis.
#
# NOTE(review): the third and fourth blocks both read from df_ari (columns up
# to 'VOC', then 'CH4' through 'CFC-11') -- confirm that this is intended.
input_vector = np.hstack(
    (
        df_cr.loc[:samples-1, 'c1':'epsilon'].values,
        df_cc.loc[:samples-1, :].values,
        df_ari.loc[:samples-1, :'VOC'].values,
        df_ari.loc[:samples-1, 'CH4':'CFC-11'].values,
        df_aci.loc[:samples-1, :].values,
        df_ozone.loc[:samples-1, :].values,
        df_scaling.loc[:samples-1, :'solar_amplitude'].values,
    )
)

In [None]:
# Display both shapes; the fit pairs rows of input_vector with columns of
# output_vector, so those two dimensions must agree.
input_vector.shape, output_vector.shape

In [None]:
# Display the number of samples.
samples

In [None]:
# Fit a scikit-learn Gaussian process regressor on the first 4000 samples:
# rows of input_vector are the features, the transposed columns of
# output_vector are the multi-output targets.
#
# A gp_emulator.MultivariateEmulator variant (thresh=0.99, n_tries=25) and the
# normalize_y=True option were tried here and are currently disabled.
# NOTE(review): RBF() is created with its default arguments and the inputs are
# passed unscaled -- confirm this is intended.
kernel = RBF()
gpr = GaussianProcessRegressor(kernel=kernel).fit(
    input_vector[:4000, :],
    output_vector[:, :4000].T,
)

In [None]:
# Display the regressor's parameters as reported by get_params().
gpr.get_params()

In [None]:
# Score the fitted regressor on the same 4000 rows it was trained on.
# NOTE(review): this is an in-sample score; scoring a held-out slice (rows
# from 4000 onward) would say more about predictive skill -- consider.
gpr.score(input_vector[:4000,:], output_vector[:,:4000].T)

In [None]:
# Predict the outputs for the single sample at index 11000, which lies outside
# the 4000 rows used for fitting.
gpr.predict(input_vector[11000:11001, :])

In [None]:
# Display the stored outputs for sample 11000, for comparison with the
# emulator's prediction at the same index.
output_vector[:, 11000:11001]

In [None]:
# Display the input parameter row for sample 11000.
input_vector[11000:11001, :]

In [None]:
# Draw from the fitted Gaussian process at the sample-11000 inputs, using
# sample_y with its default arguments.
gpr.sample_y(input_vector[11000:11001, :])

In [None]:
# Two-component PCA, applied below to the input parameter matrix.
pca = PCA(n_components=2)

In [None]:
# Fit the PCA on the full input matrix (all samples, passed as-is).
pca.fit(input_vector)

In [None]:
# Print the summed explained_variance_ratio_ of the retained components.
print(pca.explained_variance_ratio_.sum())

In [None]:
# Print the number of components the fitted PCA reports.
print(pca.n_components_)

In [None]:
# Evaluate PCA.score on the same data the PCA was fitted to.
pca.score(input_vector)

In [None]:
# Display the inputs projected onto the fitted components.
pca.transform(input_vector)

In [None]:
import matplotlib.pyplot as pl

In [None]:
# Histogram of the first projected component; the bin edges come from
# np.linspace(-3e6, 1e6) with its default number of points.
pl.hist(pca.transform(input_vector)[:,0], bins=np.linspace(-3e6,1e6))

In [None]:
# Display the fitted pca.components_ array.
pca.components_