## Spatial prediction of soil pollutants with multi-output Gaussian processes
Source: https://nextjournal.com/essicolo/spatial-prediction-of-soil-pollutants-with-multi-output-gaussian-processes?version=latest

In [1]:
import numpy as np
import pandas as pd
import pymc3 as pm
import matplotlib.pyplot as plt
np.random.seed(123)



In [2]:
import GPy
GPy.__version__, pm.__version__

('1.10.0', '3.11.3')

In [3]:
meuse_df = pd.read_csv("../data/meuse.csv")
meuse_df.head()

Unnamed: 0,x,y,cadmium,copper,lead,zinc,elev,dist,om,ffreq,soil,lime,landuse,dist.m
0,181072,333611,11.7,85,299,1022,7.909,0.001358,13.6,1,1,1,Ah,50
1,181025,333558,8.6,81,277,1141,6.983,0.012224,14.0,1,1,1,Ah,30
2,181165,333537,6.5,68,199,640,7.8,0.103029,13.0,1,1,1,Ah,150
3,181298,333484,2.6,81,116,257,7.655,0.190094,8.0,1,2,0,Ga,270
4,181307,333330,2.8,48,117,269,7.48,0.27709,8.7,1,2,0,Ah,380


In [4]:
obs_id = meuse_df.index
n_train = np.round(obs_id.shape[0] * 0.7, 0).astype("int")
id_train = np.random.choice(obs_id, size = n_train, replace = False)
id_test = obs_id[~obs_id.isin(id_train)].values

In [5]:
len(id_train), len(id_test)

(108, 47)

In [6]:
targets = ["cadmium", "copper", "lead", "zinc"]
features = ["x", "y", "dist"]

In [7]:
XY = meuse_df[targets + features]

In [8]:
mean_sc = XY.loc[XY.index.isin(id_train), :].mean(axis = 0)
std_sc = XY.loc[XY.index.isin(id_train), :].std(axis = 0)
XY_sc = XY.apply(lambda x: (x-mean_sc)/std_sc, axis = 1)

In [9]:
XY.shape, XY_sc.shape

((155, 7), (155, 7))

In [10]:
XY_sc

Unnamed: 0,cadmium,copper,lead,zinc,x,y,dist
0,2.341026,1.897420,1.265781,1.432640,1.335194,1.794602,-1.173337
1,1.467739,1.724563,1.071949,1.749590,1.274062,1.744386,-1.116452
2,0.876157,1.162780,0.384727,0.415202,1.456158,1.724489,-0.641089
3,-0.222495,1.724563,-0.346548,-0.604899,1.629150,1.674274,-0.185304
4,-0.166154,0.298498,-0.337737,-0.572937,1.640856,1.528363,0.270120
...,...,...,...,...,...,...,...
150,-0.729566,-0.911498,-0.936854,-0.847273,-1.120508,-1.454268,1.038340
151,-0.081642,-0.090430,0.155653,0.031665,-1.249276,-1.350046,1.038340
152,-0.363348,-0.436143,-0.320116,-0.378506,-1.522421,-1.332044,0.270120
153,-0.729566,-0.868284,-0.919233,-0.857926,-0.753714,-1.265722,0.696856


### Intrinsic model of coregionalization (ICM)

In [11]:
XY_sc[features].values.shape, XY_sc[targets].values.shape

((155, 3), (155, 4))

In [12]:
kern = GPy.kern.RBF(1, lengthscale=80)**GPy.kern.Coregionalize(1,output_dim=4, rank=3)
display(kern)

mul.,value,constraints,priors
rbf.variance,1.0,+ve,
rbf.lengthscale,80.0,+ve,
coregion.W,"(4, 3)",,
coregion.kappa,"(4,)",+ve,


In [18]:
XY_sc["dist"].values[:].shape

(155,)

In [20]:
model = GPy.models.GPRegression(X=XY_sc["dist"].values, Y=XY_sc[targets].values, kernel=kern)
model.optimize()

AssertionError: 

In [14]:
model

GP_regression.,value,constraints,priors
mul.rbf.variance,0.011532298282157745,+ve,
mul.rbf.lengthscale,80.1357520629657,+ve,
mul.coregion.W,"(4, 3)",,
mul.coregion.kappa,"(4,)",+ve,
Gaussian_noise.variance,0.9999999999999887,+ve,


In [None]:
fig, ax = plt.subplots()
for i in range(4):
    model.plot(fignum=1,fixed_inputs=[(1, i)],ax=ax,legend=i==0)
plt.xlabel('years')
plt.ylabel('time/s')

In [15]:
K = GPy.kern.Matern32(1)
icm = GPy.util.multioutput.ICM(input_dim=1, num_outputs=4, kernel=K)

m = GPy.models.GPCoregionalizedRegression(XY_sc[features].values, 
                                          XY_sc[targets].values, kernel=icm)
m['.*Mat32.var'].constrain_fixed(1.) #For this kernel, B.kappa encodes the variance now.
print(m)

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 155 and the array at index 1 has size 465