/
coKriging.py
1056 lines (830 loc) · 39.2 KB
/
coKriging.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""
Module with classes and methods to perform kriging of elements (and at some point exploit the potential field to
choose the directions of the variograms)
Tested on Ubuntu 16
Created on 1/5/2017
@author: Miguel de la Varga
"""
import theano
import theano.tensor as T
import matplotlib.pyplot as plt
import pymc3 as pm
import numpy as np
import pandas as pn
from bokeh.io import show
import bokeh.layouts as bl
import bokeh.plotting as bp
def choose_lithology_elements(df, litho, elem=None, coord=True):
    """
    Select the rows of *df* that belong to one lithology domain.

    Args:
        df (pandas.core.frame.DataFrame): input data with a 'Lithology' column
        litho (str): Name of the lithology-domain
        elem (list): List of strings with elements you want to analyze
        coord (bool): If True include the X, Y, Z coordinate columns

    Returns:
        pandas.core.frame.DataFrame: selected rows/columns
    """
    in_domain = df['Lithology'] == litho

    # No elements requested: just return the coordinates of the domain
    if elem is None:
        return df[in_domain][['X', 'Y', 'Z']]

    columns = np.append(['X', 'Y', 'Z'], elem) if coord else elem
    domain = df[in_domain][columns]
    # Keep only rows where every requested element is strictly positive
    positive_rows = (domain[elem] > 0).all(1)
    return domain[positive_rows]
def select_segmented_grid(df, litho, grid, block):
    """
    Return the grid points whose gempy block value matches the given lithology.

    Args:
        df (pandas.core.frame.DataFrame): data with 'Lithology' and 'Lithology Number' columns
        litho (str): Name of the lithology-domain
        grid (numpy.array): XYZ coordinates of the full model grid
        block (numpy.array): gempy block model (one lithology number per grid point)

    Returns:
        numpy.array: the subset of grid points inside the domain
    """
    block = np.squeeze(block)
    assert grid.shape[0] == block.shape[0], 'The grid you want to use for kriging and the grid used for the layers ' \
                                            'segmentation are not the same'
    # Map the lithology name to the numeric id used inside the block model
    target_id = df['Lithology Number'][df["Lithology"] == litho].iloc[0]
    return grid[block == target_id]
def transform_data(df_o, n_comp=1, log10=False):
    """
    Improve the normality of the input data before performing kriging.

    Args:
        df_o: Dataframe with the data to interpolate (not modified in place)
        n_comp: Number of components in case of multimodal data
        log10 (bool): If true take the log in base 10 of the properties

    Returns:
        pandas.core.frame.DataFrame: Data frame with the transformed data
        (a 'cluster' column is appended when n_comp > 1)
    """
    # Work on a copy so the caller's frame is untouched
    df = df_o.copy(deep=True)
    value_cols = df.columns.difference(['X', 'Y', 'Z'])

    # log10 brings heavy-tailed concentrations closer to a normal distribution
    if log10:
        print('computing log')
        df[value_cols] = np.log10(df[value_cols])

    # For multimodal data, label each sample with its Gaussian-mixture mode
    if n_comp > 1:
        from sklearn import mixture
        gmm = mixture.GaussianMixture(n_components=n_comp,
                                      covariance_type='full').fit(df[value_cols])
        df['cluster'] = gmm.predict(df[value_cols])

    return df
def theano_sed():
    """
    Compile a theano function that computes the pairwise euclidean distances
    between the rows of two coordinate matrices.

    Returns:
        theano.compile.function_module.Function: compiled function taking two
        (n, 3) matrices and returning the (n1, n2) distance matrix
    """
    theano.config.compute_test_value = "ignore"

    # Symbolic inputs: two matrices of XYZ coordinates
    coords_a = T.dmatrix()
    coords_b = T.dmatrix()

    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b ; clamp at 0 before the sqrt
    # to avoid NaNs produced by floating-point round-off.
    sq_norm_a = (coords_a ** 2).sum(1).reshape((coords_a.shape[0], 1))
    sq_norm_b = (coords_b ** 2).sum(1).reshape((1, coords_b.shape[0]))
    distances = T.sqrt(T.maximum(sq_norm_a + sq_norm_b - 2 * coords_a.dot(coords_b.T), 0))

    # NOTE(review): despite the historical "sed" (squared euclidean distance)
    # name, the sqrt above means this returns plain euclidean distances.
    return theano.function([coords_a, coords_b], distances,
                           allow_input_downcast=False)
# This is extremely inefficient. Try to vectorize it in theano, it is possible to gain X100
def compute_variogram(df, properties, euclidian_distances, tol=10, lags=np.logspace(0, 2.5, 100), plot=()):
    """
    Compute the experimental variogram or cross-variogram for a pair of properties.

    Args:
        df (pandas.core.frame.DataFrame): Dataframe with the properties and coordinates used in the experimental
            variogram computation
        properties (list): List of the two properties to compute the semivariogram.
        euclidian_distances (numpy.array): Precomputed euclidian distances between the samples
        tol (float): Tolerance around each lag distance
        lags (list): List of lags to compute the experimental variogram
        plot (sequence): plot the experimental variogram if it contains "experimental"
            (changed from a mutable list default to a tuple; behavior is unchanged)

    Returns:
        numpy.array: semivariance or cross-semivariance per lag
    """
    # Squared differences of the two properties for every pair of samples.
    # BUG FIX: .as_matrix() was removed in pandas 1.0; .values works on both
    # old and new pandas versions.
    element = (df[properties[0]].values.reshape(-1, 1) -
               np.tile(df[properties[1]], (df[properties[1]].shape[0], 1))) ** 2

    semivariance_lag = []
    # Loop over every lag to compute the semivariance
    for lag in lags:
        # Pairs whose distance falls inside the lag +/- tolerance window
        points_at_lag = ((euclidian_distances > lag - tol) * (euclidian_distances < lag + tol))
        # Squared property differences of the selected pairs
        var_points = element[points_at_lag]
        # Semivariance = half of the mean squared difference
        semivariance_lag = np.append(semivariance_lag, np.mean(var_points) / 2)

    if "experimental" in plot:
        # Visualization of the experimental variogram
        plt.plot(lags, semivariance_lag, 'o')

    return semivariance_lag
def exp_lags(max_range, exp=2, n_lags=100):
    """
    Create lags spaced by a power law, for when log10 spacing gives too much
    weight to the smaller lags.

    Args:
        max_range (float): Maximum distance (the last lag equals this value)
        exp (float): Exponential degree
        n_lags (int): Number of lags

    Returns:
        numpy.array: lags (first entry is always 0)
    """
    # Vectorized replacement of the original append loop: i ** exp for
    # i = 0 .. n_lags-1, rescaled so the largest lag equals max_range.
    lags = np.arange(n_lags, dtype=float) ** exp
    return lags / lags.max() * max_range
def compute_crossvariogram(df, properties_names, euclidian_distances=None, **kwargs):
    """
    Compute the experimental cross-variogram for every ordered pair of the
    given properties.

    Args:
        df (pandas.core.frame.DataFrame): Dataframe with the properties and coordinates used in the experimental
            variogram computation
        properties_names (list str): List of strings with the properties to compute
        euclidian_distances (numpy.array): Precomputed euclidian distances. If None they are computed inplace

    Keyword Args:
        - lag_exp: Degree of the exponential. If None log10
        - lag_range: Maximum distance to compute a lag
        - n_lags: Number of lags

    Returns:
        pandas.core.frame.DataFrame: Every experimental cross-variogram
        (one column per property pair, plus a 'lags' column)
    """
    lag_exp = kwargs.get('lag_exp', None)
    lag_range = kwargs.get('lag_range', 500)
    n_lags = kwargs.get('n_lags', 100)

    # Lag spacing: power law when an exponent is given, log10 otherwise
    if lag_exp is not None:
        lags = exp_lags(lag_range, lag_exp, n_lags)
    else:
        lags = np.logspace(0, np.log10(lag_range), n_lags)

    # BUG FIX: `if not euclidian_distances:` raises "truth value of an array
    # is ambiguous" for any precomputed numpy array. Test against None.
    if euclidian_distances is None:
        euclidian_distances = theano_sed()(df[['X', 'Y', 'Z']], df[['X', 'Y', 'Z']])

    # Frame holding one experimental (cross-)variogram per column
    experimental_variograms_frame = pn.DataFrame()

    # This is extremely inefficient. Try to vectorize it in theano, it is possible to gain X100
    # NOTE: both (i, j) and (j, i) are computed even though the cross-variogram
    # is symmetric, so roughly half of this work is redundant.
    for prop_a in properties_names:
        for prop_b in properties_names:
            col_name = prop_a + '-' + prop_b
            experimental_variograms_frame[col_name] = compute_variogram(
                df, [prop_a, prop_b], euclidian_distances, lags=lags)

    # Add lags column, mainly for plotting
    experimental_variograms_frame['lags'] = lags
    return experimental_variograms_frame
class SGS(object):
    """
    Sequential Gaussian Simulation driver: fits analytical covariances to
    experimental variograms (via PyMC3/theano) and performs co-kriging on a
    gempy grid.
    """
    def __init__(self, exp_var, properties=None, n_exp=2, n_gauss=2,
                 data=None, grid=None, grid_to_inter=None, lithology=None, geomodel=None):
        """
        Args:
            exp_var (pandas.core.frame.DataFrame): Dataframe with the experimental variograms. Columns are the
                variables while rows are the lags
            properties (list strings): Variables to be used. If None, every column whose
                name contains 'ppm' is selected
            n_exp (int): number of exponential basic functions
            n_gauss (int): number of gaussian basic functions
            data (pandas.core.frame.DataFrame): input data with coordinates and property values
            grid (numpy.array): full model grid
            grid_to_inter (numpy.array): subset of the grid to interpolate
            lithology (str): name of the domain of interest
            geomodel (numpy.array): gempy block model
        """
        # Analytical covariance
        # ---------------------
        self.exp_var_raw = exp_var
        if not properties:
            # Default heuristic: any column with 'ppm' in its name is a property
            self.properties = exp_var.columns[['ppm' in i for i in exp_var.columns]]
        else:
            self.properties = properties
        self.n_properties = len(self.properties)
        self.lags = self.exp_var_raw['lags']
        # Normalized experimental variograms and the nuggets subtracted from them
        self.exp_var, self.nuggets = self.preprocess()
        self.n_exp = n_exp
        self.n_gauss = n_gauss
        # MCMC trace with sill/range/weights; set later via set_trace()
        self.trace = None
        # Kriging
        # -------
        self.data = data
        self.data_to_inter = None
        self.grid = grid
        self.grid_to_inter = grid_to_inter
        self.lithology = lithology
        self.geomodel = geomodel
        # Theano functions
        # ----------------
        # Compiled euclidean-distance function (see theano_sed)
        self.SED_f = theano_sed()
        # Guard counter used by solve_kriging to bound its retry recursion
        self._recursion_check = 0
def select_segmented_grid(self, grid=None, df=None, litho=None, block=None):
"""
Extract from the full model the points belonging to a given domain (litho)
Args:
grid (numpy.arrau): Full model grid
df (pandas.core.frame.DataFrame): with all data
litho (str): Name of the lithology to interpolate
block (numpy.array): 3D block model generated by gempy
Returns:
"""
# If we do not pass the attributes but are already a property of the object extract it
if grid is None:
grid = self.grid
if block is not None:
block = np.squeeze(block)
else:
block = self.geomodel
if df is None:
df = self.data
if litho is None:
litho = self.lithology
# Check that the attributes have value
assert all([any(block), ~df.empty, litho, np.any(grid)]), "Some of the attributes are not provided"
assert grid.shape[0] == block.shape[0], 'The grid you want to use for kriging and the grid used for the layers ' \
'segmentation are not the same'
# Translate lithology name to lithology number, i.e. the value of the lithology in the gempy block
litho_num = df['Lithology Number'][df["Lithology"] == litho].iloc[0]
# Extract from the grid the XYZ coordinates where the gempy block model is the given lithology
segm_grid = grid[block == litho_num]
# Set the segmented grid
self.grid_to_inter = segm_grid
return segm_grid
def choose_lithology_elements(self, df=None, litho=None, elem=None, coord=True):
"""
Choose from the whole input data, those points which fall into the domain of interest
litho(str): Name of the lithology-domain
elem(list): List of strings with elements you want to analyze
"""
# If we do not pass the attributes but are already a property of the object extract it
if not df:
df = self.data
if not litho:
litho = self.lithology
# Check that the attributes have value
assert all([~df.empty, litho]), "Some of the attributes are not provided"
# Choosing just one lithology
if elem is not None:
if coord:
domain = df[df['Lithology'] == litho][np.append(['X', 'Y', 'Z'], elem)]
else:
domain = df[df['Lithology'] == litho][elem]
# Drop negative values
domain = domain[(domain[elem] > 0).all(1)]
else:
domain = df[df['Lithology'] == litho][['X', 'Y', 'Z']]
# Set the segmented data
self.data_to_inter = domain
return domain
    def set_data(self, data):
        """
        Data setter.

        Args:
            data (pandas.core.frame.DataFrame): Dataframe with coordinates and values of the properties of interest
        """
        self.data = data

    def set_grid_to_inter(self, grid):
        """
        Grid setter.

        Args:
            grid (numpy.array): Grid object from gempy with the points of interest of the model
        """
        self.grid_to_inter = grid

    def set_lithology(self, lithology):
        """
        Lithology setter.

        Args:
            lithology (str): Name of the domain or lithology of interest
        """
        self.lithology = lithology

    def set_geomodel(self, geomodel):
        """
        gempy model setter.

        Args:
            geomodel (numpy.array): gempy block model
        """
        self.geomodel = geomodel

    def set_trace(self, trace):
        """
        Analytical covariance trace setter.

        Args:
            trace (pymc3.trace): trace with the sill, range and weights of each property
        """
        self.trace = trace
def preprocess(self):
"""
Normalization of data between 0 and 1 and subtraction of the nuggets
Returns:
pandas.core.frame.DataFrame: Dataframe containing the transformed data
pandas.core.frame.DataFrame: Containing the substracted nuggets
"""
import sklearn.preprocessing as skp
# Normalization
scaled_data = pn.DataFrame(skp.minmax_scale(self.exp_var_raw[self.properties]), columns=self.properties)
# Nuggets
nuggets = scaled_data[self.properties].iloc[0]
processed_data = scaled_data - nuggets
return processed_data, nuggets
def plot_experimental(self, transformed=False):
"""
Plotting the experimental data either transformed (see preprocess doc) or not
Args:
transformed (bool): if true plot the transformed data
Returns:
matplotlib.plot: plot of experimental variogram
"""
if transformed:
plot = self.exp_var.plot(x=self.lags, y=self.properties, subplots=True, kind ='line',
style='.',
layout=(int(np.sqrt(self.n_properties)), int(np.sqrt(self.n_properties))),
figsize=(16, 8))
else:
plot = self.exp_var_raw.plot(x=self.lags, y=self.properties, subplots=True, kind='line',
style='.',
layout=(int(np.sqrt(self.n_properties)), int(np.sqrt(self.n_properties))),
figsize=(16, 8))
return plot
    def fit_cross_cov(self, n_exp=2, n_gauss=2, range_mu=None):
        """
        Fit an analytical covariance to the experimental data.

        Args:
            n_exp (int): number of exponential basic functions
            n_gauss (int): number of gaussian basic functions
            range_mu: prior mean of the range. Default mean of the lags

        Returns:
            pymc.Model: PyMC3 model to be sampled using MCMC
        """
        self.n_exp = n_exp
        self.n_gauss = n_gauss
        n_var = self.n_properties
        df = self.exp_var
        lags = self.lags
        # Prior standard deviation for the error of the regression
        prior_std_reg = df.std(0).max() * 10
        # Prior value for the mean of the ranges
        if not range_mu:
            range_mu = lags.mean()
        # pymc3 Model
        with pm.Model() as model:  # model specifications in PyMC3 are wrapped in a with-statement
            # Define priors
            sigma = pm.HalfCauchy('sigma', beta=prior_std_reg, testval=1., shape=n_var)
            psill = pm.Normal('sill', prior_std_reg, sd=.5 * prior_std_reg, shape=(n_exp + n_gauss))
            range_ = pm.Normal('range', range_mu, sd=range_mu * .3, shape=(n_exp + n_gauss))
            lambda_ = pm.Uniform('weights', 0, 1, shape=(n_var * (n_exp + n_gauss)))
            # Exponential basic functions: sill * (1 - exp(-lag / (range/3)))
            # NOTE(review): .as_matrix() was removed in pandas 1.0 -- this
            # method requires an old pandas version.
            exp = pm.Deterministic('exp',
                                   # (lambda_[:n_exp*n_var]*
                                   psill[:n_exp] *
                                   (1. - T.exp(T.dot(-lags.as_matrix().reshape((len(lags), 1)),
                                                     (range_[:n_exp].reshape((1, n_exp)) / 3.) ** -1))))
            # Gaussian basic functions: sill * (1 - exp(-(lag / (range*4/7))**2))
            gauss = pm.Deterministic('gaus',
                                     psill[n_exp:] *
                                     (1. - T.exp(T.dot(-lags.as_matrix().reshape((len(lags), 1)) ** 2,
                                                       (range_[n_exp:].reshape((1, n_gauss)) * 4 / 7.) ** -2))))
            # We stack the basic functions in the same matrix and tile it to match the number of properties we have
            func = pm.Deterministic('func', T.tile(T.horizontal_stack(exp, gauss), (n_var, 1, 1)))
            # We weight each basic function and sum them
            func_w = pm.Deterministic("func_w", T.sum(func * lambda_.reshape((n_var, 1, (n_exp + n_gauss))), axis=2))
            for e, cross in enumerate(df.columns):
                # One Gaussian likelihood per experimental (cross-)variogram column
                pm.Normal(cross + "_like", mu=func_w[e], sd=sigma[e], observed=df[cross].as_matrix())
        return model
    def plot_cross_variograms(self, iter_plot=200, trace=None, experimental=None):
        """
        Plot the analytical cross-variograms of a given MCMC inference.

        Args:
            iter_plot (int): Number of traces to plot
            trace (pymc3.trace): trace with the sill, range and weights of each property.
                Defaults to self.trace
            experimental (bool): if True plot the experimental variogram as well

        Returns:
            None (opens a bokeh grid plot)
        """
        if not trace:
            trace = self.trace
        assert trace, 'set the trace to the object'
        n_exp = self.n_exp
        n_gauss = self.n_gauss
        lags = self.lags
        # DEP- n_equations = trace['weights'].shape[1]
        n_iter = trace['weights'].shape[0]
        # One row of lags per plotted trace, for bokeh's multi_line
        lags_tiled = np.tile(lags, (iter_plot, 1))
        b_var = []
        for i in range(0, self.n_properties):  # DEP- n_equations, (n_exp+n_gaus)):
            # Init tensor: (lags, iterations, basic functions)
            b = np.zeros((len(lags), n_iter, 0))
            # NOTE(review): exp_vario/gaus_vario here are the module-level
            # functions (no self.), not the methods of this class.
            for i_exp in range(0, n_exp):
                b = np.dstack((b, trace['weights'][:, i_exp + i * (n_exp + n_gauss)] *
                               exp_vario(lags, trace['sill'][:, i_exp], trace['range'][:, i_exp])))
            for i_gauss in range(n_exp, n_gauss + n_exp):
                b = np.dstack((b, trace['weights'][:, i_gauss + i * (n_exp + n_gauss)] *
                               gaus_vario(lags, trace['sill'][:, i_gauss], trace['range'][:, i_gauss])))
            # Sum the contributions of each basic function
            b_all = b.sum(axis=2)
            # Keep only the last iter_plot iterations for each variable
            b_var.append(b_all[:, -iter_plot:].T)
        # Bokeh code to plot this
        p_all = []
        for e, el in enumerate(self.properties):
            p = bp.figure()  # x_axis_type="log")
            p.multi_line(list(lags_tiled), list(b_var[e]), color='olive', alpha=0.08)
            if experimental:
                p.scatter(self.lags, y=self.exp_var[el], color='navy', size=2)
            p.title.text = el
            p.xaxis.axis_label = "lags"
            p.yaxis.axis_label = "Semivariance"
            p_all = np.append(p_all, p)
        grid = bl.gridplot(list(p_all), ncols=5, plot_width=250, plot_height=150)
        show(grid)
def exp_vario(self, lags, sill, range_):
"""
Vectorial computation of exponential variogram
Args:
lags (numpy.array): Distances
sill (numpy.array): Array of sills. The shape will be number of properties by number of exponential
basic functions
range_ (numpy.array): Array of ranges. The shape will be number of properties by number of exponential
basic functions
Returns:
numpy.array: Exponential variogram for every lag and every sill and range.
"""
return sill * (1 - np.exp(np.dot(-lags.reshape(-1, 1) * 3, range_.reshape(1, -1) ** -1)))
def gaus_vario(self, lags, sill, range_):
"""
Vectorial computation of gauss variogram
Args:
lags (numpy.array): Distances
sill (numpy.array): Array of sills. The shape will be number of properties by number of gauss
basic functions
range_ (numpy.array): Array of ranges. The shape will be number of properties by number of gauss
basic functions
Returns:
numpy.array: gauss variogram for every lag and every sill and range.
"""
return sill * (1 - np.exp(np.dot(-lags.reshape(-1, 1) ** 2, (range_.reshape(1, -1) * 4 / 7) ** -2)))
    def plot_cross_covariance(self, nuggets=False, iter_plot=200):
        """
        Plot the cross-covariance (1 - variogram) for the given properties.

        Args:
            nuggets (bool): if truthy, add the subtracted nuggets back before
                converting to covariance
            iter_plot (int): number of traces to plot

        Returns:
            None (opens a bokeh grid plot)
        """
        n_exp = self.n_exp
        n_gauss = self.n_gauss
        trace = self.trace
        lags = self.lags
        # NOTE(review): n_equations is computed but never used
        n_equations = trace['weights'].shape[1]
        n_iter = trace['weights'].shape[0]
        # One row of lags per plotted trace, for bokeh's multi_line
        lags_tiled = np.tile(lags, (iter_plot, 1))
        b_var = []
        for i in range(0, self.n_properties):  # n_equations, (n_exp+n_gaus)):
            # Init tensor: (lags, iterations, basic functions)
            b = np.zeros((len(lags), n_iter, 0))
            # NOTE(review): exp_vario/gaus_vario here are the module-level
            # functions (no self.), not the methods of this class.
            for i_exp in range(0, n_exp):
                # print(i_exp, "exp")
                b = np.dstack((b, trace['weights'][:, i_exp + i * (n_exp + n_gauss)] *
                               exp_vario(lags, trace['sill'][:, i_exp], trace['range'][:, i_exp])))
            for i_gaus in range(n_exp, n_gauss + n_exp):
                # print(i_gaus)
                b = np.dstack((b, trace['weights'][:, i_gaus + i * (n_exp + n_gauss)] *
                               gaus_vario(lags, trace['sill'][:, i_gaus], trace['range'][:, i_gaus])))
            # Sum the contributions of each function and convert variogram -> covariance
            if nuggets:
                b_all = 1 - (b.sum(axis=2) + self.nuggets[i])
            else:
                b_all = 1 - (b.sum(axis=2))
            # Keep only the last iter_plot iterations for each variable
            b_var.append(b_all[:, -iter_plot:].T)
        p_all = []
        for e, el in enumerate(self.properties):
            p = bp.figure(x_axis_type="log")
            p.multi_line(list(lags_tiled), list(b_var[e]), color='olive', alpha=0.08)
            p.title.text = el
            p.xaxis.axis_label = "lags"
            p.yaxis.axis_label = "Semivariance"
            p_all = np.append(p_all, p)
        grid = bl.gridplot(list(p_all), ncols=5, plot_width=250, plot_height=150)
        show(grid)
def solve_kriging(self, selection_A, selection_b, #selected_coord_data, selected_values_data, selected_grid_to_inter, trace,
nuggets=None, n_var=1, n_exp=2, n_gauss=2):
"""
Solve the kriging system for n given point of the grid_to_inter property selected by selection_b using n
points of data_to_inter given by selection_A
Args:
selection_A (bool): input points used in A matrix
selection_b (bool): points to interpolate from the grid
nuggets (numpy.array): Nugget effect of each cross variogram. If None take property
n_var: number of variables
n_exp (int): number of exponential basic functions
n_gauss (int): number of gaussian basic functions
Returns:
"""
# Select input data and compute its euclidean distances
selected_coord_data = self.data_to_inter[selection_A][['X', 'Y', 'Z']]
selected_values_data = self.data_to_inter[selection_A][self.data_to_inter.columns.difference(['X', 'Y', 'Z'])].as_matrix()
h_A = self.SED_f(selected_coord_data, selected_coord_data)
# Select points of grid to interpolate and compute the euclidean distances respect the input data
selected_grid_to_inter = self.grid_to_inter[selection_b]
h_b = self.SED_f(selected_coord_data, selected_grid_to_inter)
# Parameters for the covariances
trace = self.trace
nuggets = self.nuggets
n_var = int(np.sqrt(self.n_properties))
n_exp = self.n_exp
n_gauss = self.n_gauss
# Choose a random trace to conserve the covariance uncertainty of the regression
sample = np.random.randint(trace['weights'].shape[0] - 1000, trace['weights'].shape[0])
# Compute cross-covariances
cov_h = cross_covariance(trace, h_A, sample=sample,
nuggets=nuggets, n_var=n_var, n_exp=n_exp, n_gaus=n_gauss, ordinary=True)
cov_b = cross_covariance(trace, h_b, sample=sample, nuggets=nuggets, n_var=n_var, n_exp=n_exp,
n_gaus=n_gauss,
ordinary=True)
# Solve kriging system
k_weights = np.linalg.solve(cov_h, cov_b)
# Number of points to interpolate
npti = selected_grid_to_inter.shape[0]
# Repeat the input data for every point to interpolate
svd_tmp = np.tile(np.repeat(selected_values_data, npti, axis=1), (n_var, 1))
# Sol ordinary kriging mean
k_mean = (svd_tmp * k_weights[:-n_var]).sum(axis=0)
# Sol ordinary kriging std
k_std = svd_tmp.std(axis=0) - (k_weights * cov_b)[:-n_var].sum(axis=0) +\
(k_weights * cov_b)[-n_var:].sum(axis=0)
assert all(k_std) > -10, "A standard deviation of kringing is really off. Check nothing is wrong"
# Set negatives to 0
k_std[k_std < 0] = 0.1
# Check the results make sense else take another sample and recompute
import scipy
l_low, l_high = scipy.stats.norm.interval(.95, loc=np.mean(selected_values_data, axis=0),
scale=np.std(selected_values_data, axis=0))
if not np.all((k_mean > l_low) * (k_mean < l_high)):
k_mean, k_std, cov_h, cov_b, k_weights, sample = self.solve_kriging(selection_A, selection_b)
self._recursion_check += 1
assert self._recursion_check<500, 'Too many recursions. Probably something goes wrong'
else:
self._recursion_check = 0
values_interp = np.random.normal(k_mean, k_std)
return k_mean, k_std, cov_h, cov_b, k_weights, sample # , k_std# - svd_tmp.std(axis=0)
def fit_cross_cov(df, lags, n_exp=2, n_gaus=2, range_mu=None):
    """
    Module-level variant of SGS.fit_cross_cov: build a PyMC3 model fitting an
    analytical covariance (exponential + gaussian basic functions) to the
    experimental (cross-)variograms in *df*.

    Args:
        df (pandas.core.frame.DataFrame): experimental variograms, one column per pair
        lags (pandas.Series): lag distances
        n_exp (int): number of exponential basic functions
        n_gaus (int): number of gaussian basic functions
        range_mu: prior mean of the range. Default mean of the lags

    Returns:
        pymc.Model: PyMC3 model to be sampled using MCMC
    """
    n_var = df.columns.shape[0]
    # NOTE(review): n_basis_f and nugget_max below are computed but unused
    n_basis_f = n_var * (n_exp + n_gaus)
    # Prior standard deviation for the error of the regression
    prior_std_reg = df.std(0).max() * 10
    #
    if not range_mu:
        range_mu = lags.mean()
    # Because is a experimental variogram I am not going to have outliers
    nugget_max = df.values.max()
    # print(n_basis_f, n_var*n_exp, nugget_max, range_mu, prior_std_reg)
    # pymc3 Model
    with pm.Model() as model:  # model specifications in PyMC3 are wrapped in a with-statement
        # Define priors
        sigma = pm.HalfCauchy('sigma', beta=prior_std_reg, testval=1., shape=n_var)
        psill = pm.Normal('sill', prior_std_reg, sd=.5 * prior_std_reg, shape=(n_exp + n_gaus))
        range_ = pm.Normal('range', range_mu, sd=range_mu * .3, shape=(n_exp + n_gaus))
        # nugget = pm.Uniform('nugget', 0, nugget_max, shape=n_var)
        lambda_ = pm.Uniform('weights', 0, 1, shape=(n_var * (n_exp + n_gaus)))
        # Exponential covariance
        # NOTE(review): .as_matrix() was removed in pandas 1.0 -- requires an old pandas
        exp = pm.Deterministic('exp',
                               # (lambda_[:n_exp*n_var]*
                               psill[:n_exp] *
                               (1. - T.exp(T.dot(-lags.as_matrix().reshape((len(lags), 1)),
                                                 (range_[:n_exp].reshape((1, n_exp)) / 3.) ** -1))))
        # Gaussian covariance
        gaus = pm.Deterministic('gaus',
                                psill[n_exp:] *
                                (1. - T.exp(T.dot(-lags.as_matrix().reshape((len(lags), 1)) ** 2,
                                                  (range_[n_exp:].reshape((1, n_gaus)) * 4 / 7.) ** -2))))
        # Stack the basic functions and tile to match the number of properties
        func = pm.Deterministic('func', T.tile(T.horizontal_stack(exp, gaus), (n_var, 1, 1)))
        # Weight each basic function and sum them
        func_w = pm.Deterministic("func_w", T.sum(func * lambda_.reshape((n_var, 1, (n_exp + n_gaus))), axis=2))
        # nugget.reshape((n_var,1)))
        for e, cross in enumerate(df.columns):
            # Likelihoods: one per experimental (cross-)variogram column
            pm.Normal(cross + "_like", mu=func_w[e], sd=sigma[e], observed=df[cross].as_matrix())
    return model
def exp_vario(lags, sill, range_):
    """Exponential variogram sill * (1 - exp(-3h/range)) for every lag/function pair."""
    lag_col = -3 * lags.reshape(-1, 1)
    inv_range_row = range_.reshape(1, -1) ** -1
    return sill * (1 - np.exp(np.dot(lag_col, inv_range_row)))
def gaus_vario(lags, sill, range_):
    """Gaussian variogram sill * (1 - exp(-(h/(range*4/7))**2)) for every lag/function pair."""
    neg_sq_lag_col = -(lags.reshape(-1, 1) ** 2)
    inv_sq_range_row = (range_.reshape(1, -1) * 4 / 7) ** -2
    return sill * (1 - np.exp(np.dot(neg_sq_lag_col, inv_sq_range_row)))
def plot_cross_variograms(trace, lags, df, n_exp=2, n_gaus=2, iter_plot=200, experimental=None):
    """
    Module-level variant of SGS.plot_cross_variograms: plot the analytical
    cross-variograms reconstructed from an MCMC trace, one bokeh figure per
    column of *df*.

    Args:
        trace: pymc3 trace (or dict-like) with 'weights', 'sill' and 'range'
        lags (numpy.array): lag distances
        df (pandas.core.frame.DataFrame): experimental variograms (column names used as titles)
        n_exp (int): number of exponential basic functions
        n_gaus (int): number of gaussian basic functions
        iter_plot (int): number of trace iterations to plot
        experimental (pandas.core.frame.DataFrame): if given, scatter its
            experimental values (needs a 'lags' column)

    Returns:
        None (opens a bokeh grid plot)
    """
    # NOTE(review): n_equations is computed but never used
    n_equations = trace['weights'].shape[1]
    n_iter = trace['weights'].shape[0]
    # One row of lags per plotted trace, for bokeh's multi_line
    lags_tiled = np.tile(lags, (iter_plot, 1))
    b_var = []
    for i in range(0, df.shape[1]):  # n_equations, (n_exp+n_gaus)):
        # Init tensor: (lags, iterations, basic functions)
        b = np.zeros((len(lags), n_iter, 0))
        for i_exp in range(0, n_exp):
            # print(i_exp, "exp")
            b = np.dstack((b, trace['weights'][:, i_exp + i * (n_exp + n_gaus)] *
                           exp_vario(lags, trace['sill'][:, i_exp], trace['range'][:, i_exp])))
        for i_gaus in range(n_exp, n_gaus + n_exp):
            # print(i_gaus)
            b = np.dstack((b, trace['weights'][:, i_gaus + i * (n_exp + n_gaus)] *
                           gaus_vario(lags, trace['sill'][:, i_gaus], trace['range'][:, i_gaus])))
        # Sum the contributions of each basic function
        b_all = b.sum(axis=2)
        # Keep only the last iter_plot iterations for each variable
        b_var.append(b_all[:, -iter_plot:].T)
    p_all = []
    for e, el in enumerate(df.columns):
        p = bp.figure(x_axis_type="log")
        p.multi_line(list(lags_tiled), list(b_var[e]), color='olive', alpha=0.08)
        if experimental is not None:
            p.scatter(experimental['lags'], y=experimental[el], color='navy', size=2)
        p.title.text = el
        p.xaxis.axis_label = "lags"
        p.yaxis.axis_label = "Semivariance"
        p_all = np.append(p_all, p)
    grid = bl.gridplot(list(p_all), ncols=5, plot_width=200, plot_height=150)
    show(grid)
def plot_cross_covariance(trace, lags, df, n_exp=2, n_gaus=2, nuggets=None, iter_plot=200):
    """
    Module-level variant of SGS.plot_cross_covariance: plot the analytical
    cross-covariance (1 - variogram) reconstructed from an MCMC trace.

    Args:
        trace: pymc3 trace (or dict-like) with 'weights', 'sill' and 'range'
        lags (numpy.array): lag distances
        df (pandas.core.frame.DataFrame): experimental variograms (column names used as titles)
        n_exp (int): number of exponential basic functions
        n_gaus (int): number of gaussian basic functions
        nuggets (numpy.array): if given, nugget effect added back per variable
        iter_plot (int): number of trace iterations to plot

    Returns:
        None (opens a bokeh grid plot)
    """
    # NOTE(review): n_equations is computed but never used
    n_equations = trace['weights'].shape[1]
    n_iter = trace['weights'].shape[0]
    # One row of lags per plotted trace, for bokeh's multi_line
    lags_tiled = np.tile(lags, (iter_plot, 1))
    b_var = []
    for i in range(0, df.shape[1]):  # n_equations, (n_exp+n_gaus)):
        # Init tensor: (lags, iterations, basic functions)
        b = np.zeros((len(lags), n_iter, 0))
        for i_exp in range(0, n_exp):
            # print(i_exp, "exp")
            b = np.dstack((b, trace['weights'][:, i_exp + i * (n_exp + n_gaus)] *
                           exp_vario(lags, trace['sill'][:, i_exp], trace['range'][:, i_exp])))
        for i_gaus in range(n_exp, n_gaus + n_exp):
            # print(i_gaus)
            b = np.dstack((b, trace['weights'][:, i_gaus + i * (n_exp + n_gaus)] *
                           gaus_vario(lags, trace['sill'][:, i_gaus], trace['range'][:, i_gaus])))
        # Sum the contributions of each function and convert variogram -> covariance
        if nuggets is not None:
            b_all = 1 - (b.sum(axis=2) + nuggets[i])
        else:
            b_all = 1 - (b.sum(axis=2))
        # Keep only the last iter_plot iterations for each variable
        b_var.append(b_all[:, -iter_plot:].T)
    p_all = []
    for e, el in enumerate(df.columns):
        p = bp.figure(x_axis_type="log")
        p.multi_line(list(lags_tiled), list(b_var[e]), color='olive', alpha=0.08)
        p.title.text = el
        p.xaxis.axis_label = "lags"
        p.yaxis.axis_label = "Semivariance"
        p_all = np.append(p_all, p)
    grid = bl.gridplot(list(p_all), ncols=5, plot_width=250, plot_height=150)
    show(grid)
def cross_covariance(trace, sed, sample, nuggets=None, n_var=1, n_exp=2, n_gaus=2, ordinary=True):
    """
    Build the (cross-)covariance matrix C(h) = 1 - gamma(h) for a matrix of
    distances, optionally augmented with the ordinary-kriging unbiasedness
    constraints.

    Args:
        trace: pymc3 trace (or dict-like) with 'weights', 'sill' and 'range' arrays
        sed (numpy.array): matrix of euclidean distances
            (rows: conditioning points, columns: target points)
        sample (int): index of the trace sample whose parameters are used
        nuggets (numpy.array): nugget effects added to the variogram contributions
        n_var (int): number of variables
        n_exp (int): number of exponential basic functions
        n_gaus (int): number of gaussian basic functions
        ordinary (bool): if True append the ordinary-kriging constraint rows
            (and, for a square system, the matching columns)

    Returns:
        numpy.array: covariance matrix of the co-kriging system
    """
    # Flatten the distance matrix; each variogram is evaluated per distance
    h = np.ravel(sed)
    n_points = len(h)
    n_points_r = sed.shape[0]
    n_points_c = sed.shape[1]
    #sample = np.random.randint(trace['weights'].shape[0]-200, trace['weights'].shape[0])
    n_eq = trace['weights'].shape[1]
    # Exponential contribution: tile the basic functions across every variable
    # pair (n_var**2 blocks) and weight them with the trace weights whose
    # position modulo (n_exp + n_gaus) selects the exponential slots.
    exp_cont = (np.tile(
        exp_vario(h, trace['sill'][sample][:n_exp], trace['range'][sample][:n_exp]),
        n_var**2
    ) * trace['weights'][sample][np.linspace(0, n_eq-1, n_eq) % (n_exp + n_gaus) < n_exp]).reshape(n_points, n_exp, n_var**2, order="F")
    # Gaussian contribution, same scheme with the gaussian weight slots
    gaus_cont = (np.tile(
        gaus_vario(h, trace['sill'][sample][n_exp:], trace['range'][sample][n_exp:]),
        n_var**2
    ) * trace['weights'][sample][np.linspace(0, n_eq-1, n_eq) % (n_exp + n_gaus) >= n_exp]).reshape(n_points, n_gaus, n_var**2, order="F")
    # Stack the basic functions and sum their contributions per variable pair
    conts = np.hstack((exp_cont, gaus_cont)).sum(axis=1)
    if nuggets is not None:
        conts += nuggets
    # Variogram -> covariance (assumes a unit total sill after normalization)
    cov = 1 - conts
    cov_str = np.zeros((0, n_points_c*n_var))
    cov_aux = cov.reshape(n_points_r, n_points_c, n_var**2, order='F')
    # This is a incredibly convoluted way to reshape the cross covariance but I did not find anything better
    # (stacks the n_var x n_var block structure row-band by row-band)
    for i in range(n_var):
        cov_str = np.vstack((cov_str,
                             cov_aux[:, :, i*n_var:(i+1)*n_var].reshape(n_points_r, n_points_c*n_var, order='F')))
    if ordinary:
        # Ordinary-kriging unbiasedness rows: one per variable, with ones over
        # that variable's columns. NOTE(review): `ord` shadows the builtin.
        ord = np.zeros((n_var, n_var*n_points_c+n_var))
        for i in range(n_var):
            ord[i, n_points_c * i:n_points_c * (i + 1)] = np.ones(n_points_c)
        cov_str = np.vstack((cov_str, ord[:, :-n_var]))
        # Stack the ordinary values to the C(h): only for the square A matrix
        if n_points_r == n_points_c:
            cov_str = np.hstack((cov_str, ord.T))
    return cov_str
def clustering_grid(grid_to_inter, n_clusters=50, plot=False):
    """
    Partition the grid points into spatial clusters with KMeans.

    Args:
        grid_to_inter (numpy.array): XYZ coordinates of the grid points
        n_clusters (int): number of clusters
        plot (bool): if True scatter the cluster centers in 3D
            # NOTE(review): the 3D projection relies on mpl_toolkits being
            # registered by the matplotlib install -- confirm in this env

    Returns:
        sklearn.cluster.KMeans: the fitted clustering object
    """
    from sklearn.cluster import KMeans

    fitted = KMeans(n_clusters=n_clusters).fit(grid_to_inter)

    if plot:
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        centers = fitted.cluster_centers_
        ax.scatter(centers[:, 0], centers[:, 1], centers[:, 2])
        ax.set_xlabel('X Label')
        ax.set_ylabel('Y Label')
        ax.set_zlabel('Z Label')
        plt.show()

    return fitted
def select_points(df, grid_to_inter, cluster, SED_f=None, n_rep=10):
    """
    Generator that walks the grid cluster by cluster in n_rep passes, yielding
    for each chunk the distances to the input data, a conditioning-point mask
    and the chunk of grid points.

    Args:
        df: input data coordinates (first argument to SED_f)
        grid_to_inter (numpy.array): grid points to interpolate
        cluster (sklearn.cluster.KMeans): fitted clustering of the grid
        SED_f: compiled distance function; compiled lazily if None.
            BUG FIX: the original default `SED_f=theano_sed()` compiled a
            theano function at import time (slow, and fails outright when
            theano is unavailable).
        n_rep (int): number of passes over the clusters

    Yields:
        tuple: (h_x0 distance matrix, conditioning-point mask, grid chunk)
    """
    if SED_f is None:
        SED_f = theano_sed()

    points_cluster = np.bincount(cluster.labels_)
    for rep in range(n_rep):
        for i_clust in range(cluster.n_clusters):
            cluster_bool = cluster.labels_ == i_clust
            cluster_grid = grid_to_inter[cluster_bool]
            # Shuffle each cluster on the first pass so repetitions sample
            # different points. BUG FIX: the original `if i is 0` compared
            # identity instead of equality.
            if rep == 0:
                np.random.shuffle(cluster_grid)
            size_range = int(points_cluster[i_clust] / n_rep)
            selected_cluster_grid = cluster_grid[rep * size_range:(rep + 1) * size_range]
            dist = SED_f(df, selected_cluster_grid)
            # Grow the simulation radius until enough (>50) conditioning
            # points fall inside it
            for r in range(100, 1000, 100):
                select = (dist < r).any(axis=1)
                if select.sum() > 50:
                    break
            h_x0 = dist
            yield (h_x0, select, selected_cluster_grid)
def SGS_compute_points(selected_coord_data, selected_grid_to_inter, selected_values_data,
                       trace, nuggets=None, n_var=1, n_exp=2, n_gaus=2):
    """
    Solve one ordinary co-kriging system and draw simulated values.

    Args:
        selected_coord_data: distances between the conditioning points
            # NOTE(review): despite the name, this is passed straight to
            # cross_covariance as a distance (sed) matrix -- the commented-out
            # SED computation below suggests distances are expected here.
        selected_grid_to_inter: distances from conditioning points to targets
            (targets along axis 1, hence shape[1] below)
        selected_values_data (numpy.array): property values at the conditioning points
        trace: pymc3 trace (or dict-like) with 'weights', 'sill' and 'range'
        nuggets (numpy.array): nugget effects
        n_var (int): number of variables
        n_exp (int): number of exponential basic functions
        n_gaus (int): number of gaussian basic functions

    Returns:
        numpy.array: values simulated at the target points
    """
    #SED_f = theano_sed()
    #SED = SED_f(selected_coord_data, selected_coord_data)
    npti = selected_grid_to_inter.shape[1]
    # BUG FIX: cross_covariance requires a `sample` index as its third
    # argument; the original calls omitted it and raised TypeError on every
    # invocation. Pick a random late trace sample (mirrors SGS.solve_kriging)
    # to propagate the covariance-regression uncertainty.
    sample = np.random.randint(trace['weights'].shape[0] - 1000, trace['weights'].shape[0])
    cov_h = cross_covariance(trace, selected_coord_data, sample,
                             nuggets=nuggets, n_var=n_var, n_exp=n_exp, n_gaus=n_gaus, ordinary=True)
    cov_b = cross_covariance(trace, selected_grid_to_inter, sample,
                             nuggets=nuggets, n_var=n_var, n_exp=n_exp, n_gaus=n_gaus,
                             ordinary=True)
    # Solve the kriging system
    k_weights = np.linalg.solve(cov_h, cov_b)
    # Repeat the input data for every point to interpolate
    svd_tmp = np.tile(np.repeat(selected_values_data, npti, axis=1), (n_var, 1))
    # Sol ordinary kriging mean
    k_mean = (svd_tmp * k_weights[:-n_var]).sum(axis=0)
    # Sol ordinary kriging std
    k_std = svd_tmp.std(axis=0) - (k_weights * cov_b)[:-n_var].sum(axis=0) + (k_weights * cov_b)[-n_var:].sum(axis=0)
    # BUG FIX: `assert all(k_std) > -10` compared a bool to -10 (always True)
    assert (k_std > -10).all(), "A standard deviation of kringing is really off. Check nothing is wrong"
    # Set slightly negative variances to a small positive value
    k_std[k_std < 0] = 0.1
    # Sequential Gaussian Simulation draw at each target point
    values_interp = np.random.normal(k_mean, k_std)
    # for point in range(npti-1):
    #     values_data = np.vstack((values_data, values_interp[point::npti]))
    #coord_data = np.vstack((coord_data, grid_interpolating))
    return values_interp  #, k_std# - svd_tmp.std(axis=0)
def SGS_run(df, grid_to_inter, cluster,
trace, nuggets=None, n_var=1, n_exp=2, n_gaus=2,
n_rep=10, verbose = 0):
points_cluster = np.bincount(cluster.labels_)
coord_data = df[['X', 'Y', 'Z']].as_matrix()
values_data = df[df.columns.difference(['X', 'Y', 'Z'])].as_matrix()
SED_f = theano_sed()
for i in range(n_rep):
for i_clust in range(cluster.n_clusters):
cluster_bool = cluster.labels_ == i_clust
cluster_grid = grid_to_inter[cluster_bool]
# Mix the values of each cluster
# if i is 0:
# np.random.shuffle(cluster_grid)
size_range = int(points_cluster[i_clust]/n_rep)