This script shows how to use an sklearn estimator (in this case simply k-nearest neighbors) to interpolate point data into continuous gridded data within the basin region of the model (the area between the surface from the DEM and the basin elevation surface derived from Abbott and Louie).
            



In [None]:
#my library
import points2Rfile
from points2Rfile import grid
#sklearn
from sklearn import linear_model as SK_linear_model
from sklearn.model_selection import GridSearchCV
#dask dependencies 
import dask.array as da
from dask_ml.linear_model import LinearRegression as dask_linear_regression
from dask_ml.wrappers import ParallelPostFit
#for numpy data type enforcement
import numpy as np




In [None]:
# Toggle read by the regressor cell: when True, a blanket warnings filter is
# installed before the estimators are fitted.
silenceWarnings = True

In [None]:
# Open the HDF5 container that holds the grid and its input points.
gridFname = "renoRemi100m.hdf5"
gridFile = grid.grid(gridFname)

# Prepare the input points to predict across: flatten each coordinate array.
x, y, z = gridFile.x.flatten(), gridFile.y.flatten(), gridFile.z.flatten()

# Stack the flattened coordinates along axis 1, columns ordered (x, y, z).
stackedPoints = da.stack([x, y, z], axis=1)

# Rechunk so that axis 1 is kept whole (-1) and only axis 0 is split; this has
# to do with the directionality of chunks in memory. Leaving axis 0 on "auto"
# is probably inefficient, so a better chunking scheme may be worth finding.
# Note that the block size limit has been reduced here.
print("warning this particular rechunking method may be slow!")
stackedPoints = stackedPoints.rechunk(
    {0: "auto", 1: -1},
    block_size_limit=1e6,
)

### Assign all input points to a simple nearest-neighbor regressor without any weighting (note: the regressor cell below currently sets `weights='distance'`)

In [None]:
# Build one points object for every entry stored under "points" in the grid
# file; these feed the interpolation below.
points = [
    points2Rfile.grid.points(gridFile, key)
    for key in gridFile.data["points"].keys()
]
# Collect the values of every points object, join them along axis 1 and
# transpose the result.
pointValues = da.concatenate(
    [point.getPoints() for point in points],
    axis=1,
).T

### Run a k-nearest-neighbors regressor for each material property, using all available cores (`n_jobs=-1`)


In [None]:
# Optionally silence all warnings before the estimators are imported and run.
if silenceWarnings:
    import warnings
    warnings.filterwarnings('ignore')
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor

# Columns 0-2 of pointValues are passed to fit() as X (the variable is named
# "targets", but it holds the regressor inputs).
targets = da.stack([pointValues[:, column] for column in range(3)], axis=1)
# Columns 3-7 are the values to interpolate, one regressor per column.
pVp, pVs, pP, pQp, pQs = (pointValues[:, column] for column in range(3, 8))
nNeighbors = 3
# Influence of a point is equal to its inverse distance (closer ones matter more).
weight = 'distance'


def _fitKnn(values):
    """Fit a KNN regressor on (targets, values) and wrap it in ParallelPostFit."""
    regressor = KNeighborsRegressor(
        n_neighbors=nNeighbors,
        n_jobs=-1,
        weights=weight,
    )
    regressor.fit(X=targets, y=values)
    return ParallelPostFit(estimator=regressor)


# vp
modelVp = _fitKnn(pVp)
vp = modelVp.predict(stackedPoints)
print("Done with vp computing all of the other ones!")
# vs
modelVs = _fitKnn(pVs)
vs = modelVs.predict(stackedPoints)
# p
modelp = _fitKnn(pP)
p = modelp.predict(stackedPoints)
# qp
modelqp = _fitKnn(pQp)
qp = modelqp.predict(stackedPoints)
# qs
modelqs = _fitKnn(pQs)
qs = modelqs.predict(stackedPoints)

# Hand all five predictions to the grid file so they are computed and saved to
# the hdf5 file. The parallel_backend('dask') context and the
# gridFile.clearGrid() call remain intentionally disabled.
print("preparing to overwrite grid!")

gridFile.assignNewGridProperties(vp, vs, p, qp, qs)

### Make sure that everything is cut off at the DEM, so that no values remain above the surface (there obviously isn't data in the air)

In [None]:
# Reload the grid file. This should not be necessary, but skipping it sometimes
# breaks the following steps and the cause has not been determined.
gridFile = grid.grid(gridFname)
# Material-properties helper for the reloaded grid, configured via "mprops.ini".
gridMaterials = grid.materialProperties(gridFile,mpropsINI="mprops.ini")
# Apply the DEM cutoff (see cutOffAtDEM in points2Rfile.grid for the exact rule).
gridMaterials.cutOffAtDEM()


In [None]:
#save the result as an sw4 compatible rfile

In [None]:
import points2Rfile
from points2Rfile import grid #maybe dont need this, find out why I would need to any way
from points2Rfile import rfile
import dask.array as da 
import numpy as np
# Top of the rfile: this value is passed as the third argument of the data
# block header written further down.
# NOTE(review): sign convention (depth vs. elevation) is not visible here - confirm.
maxElevation = -4000


In [None]:
# Reuse the grid file name chosen in the loading cell for the rfile export.
gridFileName = gridFname


In [None]:
# Load the grid again so the export below reads from a freshly opened grid object.
gridFile = grid.grid(gridFileName)

In [None]:
# Instantiate the rfile file I/O object.
# The rfile is named after the grid file with its extension swapped for ".r".
# os.path.splitext strips only the final extension, so dots elsewhere in the
# path (e.g. "./run.1/model.hdf5") no longer truncate the name the way
# split('.')[0] did.
import os
f = os.path.splitext(gridFile.fname)[0] + ".r"
# Binary write mode; this handle is the one passed to the rfile.write_* calls
# in the following cells.
fileObject = open(f,"wb")

In [None]:
# Write the rfile header. Azimuth and origin come from the grid metadata;
# nb=2 matches the two block headers written next (topography and data).
# NOTE(review): the projection string hard-codes UTM zone 36 - confirm that it
# matches the LON0/LAT0 of this grid.
rfile.write_hdr(
    fileObject,
    magic=1,
    precision=4,
    attenuation=1,
    az=gridFile.mdata["AZIMUTH"],
    lon0=gridFile.mdata["LON0"],
    lat0=gridFile.mdata["LAT0"],
    proj_str="+proj=utm +zone=36 +datum=WGS84 +units=m +no_defs",
    nb=2,
)

In [None]:
# Write the block header for the topography block: one component on an
# NX x NY x 1 grid.
# NOTE(review): positional meaning presumed to be
# (spacing, spacing, z0, components, ni, nj, nk) - confirm against
# rfile.write_block_hdr.
rfile.write_block_hdr(
    fileObject,
    gridFile.mdata["DX"],
    gridFile.mdata["DY"],
    0.0,
    1,
    gridFile.mdata["NX"],
    gridFile.mdata["NY"],
    1,
)


In [None]:
# Write the block header for the material-property block: five components on
# an NX x NY x NZ grid, starting at maxElevation.
# NOTE(review): positional meaning presumed to be
# (spacing, spacing, z0, components, ni, nj, nk) - confirm against
# rfile.write_block_hdr.
rfile.write_block_hdr(
    fileObject,
    gridFile.mdata["DX"],
    gridFile.mdata["DZ"],
    maxElevation,
    5,
    gridFile.mdata["NX"],
    gridFile.mdata["NY"],
    gridFile.mdata["NZ"],
)


In [None]:
# Write the topography block; .compute() is called on gridFile.topo first so
# the writer receives the evaluated result (presumably a dask array - confirm).
rfile.write_topo_block(fileObject,gridFile.topo.compute())

In [None]:
# Write the material properties. Each property is evaluated with .compute()
# before being passed on; the literal 5 is the same count given in the data
# block header (presumably the number of components - confirm), and the shape
# of gridFile.x is passed as the final argument.
rfile.write_properties(
    fileObject,
    gridFile.vp.compute(),
    5,
    gridFile.vs.compute(),
    gridFile.p.compute(),
    gridFile.qp.compute(),
    gridFile.qs.compute(),
    gridFile.x.shape,
)

In [None]:
# IPython shell shortcut (not Python syntax): long listing of the working
# directory, including hidden files, with human-readable sizes, newest first.
ls -laht

In [None]:
# Close the rfile handle so everything written to it is flushed to disk; it is
# opened with open(f,"wb") and not closed in any of the preceding cells.
# The finally clause still closes the hdf5 container if that close fails.
try:
    fileObject.close()
finally:
    # Verify that the hdf5 file is closed correctly.
    gridFile.data.close()