# Measuring the Confidence of the k-Nearest Neighbors Algorithm developed in Python for Visualizing the 3D Stratigraphic Architecture of the Llobregat River Delta

In this notebook, we draw maps that measure, in some way, how reliable the predictions used for the horizontal sections in https://www.mdpi.com/2077-1312/10/7/986 are. For this purpose, we take a grid of points over the Llobregat River Delta and assign to each point a confidence metric satisfying some desirable properties.

In [1]:
# Input and output locations, relative to the notebook's working directory.
DATADIR = 'data/'        # folder the input files (spreadsheet and contour CSV) are read from
FIGURESDIR = 'figures/'  # folder the generated confidence maps are saved to

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

import os

import shapely.geometry as geometry

from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors._base import _get_weights

In [3]:
# Heights (in m) at which a confidence map is produced: 0, -5, -10, ..., -100.
Heights = [-depth for depth in range(0, 101, 5)]

# Margin added on every side of the data extent when framing the plots
# (same units as the UTM coordinates).
FRAME = 500

The metric we propose takes the weight of the nearest neighbor and divides it by the sum of the weights of four neighbors, one per class: the nearest neighbor of each of the four classes. This set includes the nearest neighbor used to infer the granulometry, so the confidence degree always lies in the interval $[0.25,1]$.

A confidence bonus is applied when several neighbors of the basement class are closer than the nearest non-basement neighbor.

In [4]:
# One neighbour index per class. For gravels, sands and clays only the single
# closest sample is queried; for basements up to four are queried, because the
# confidence bonus looks at several basement neighbours.
knn_grav, knn_sand, knn_clay = (NearestNeighbors(n_neighbors=1) for _ in range(3))
knn_base = NearestNeighbors(n_neighbors=4)

def confidence(X):
    """Return the confidence, in percent, of the prediction at every point of X.

    The module-level models ``knn_grav``, ``knn_sand``, ``knn_clay`` and
    ``knn_base`` must already be fitted.

    For each point, the distances to the nearest gravel, sand, clay and
    basement samples are turned into weights with scikit-learn's 'distance'
    weighting. The base score is the largest weight divided by the sum of the
    four weights, so it is never below 0.25.

    A bonus is then added for every further basement neighbour (up to the
    number of basement neighbours queried) that is closer than the nearest
    non-basement sample.

    Parameters
    ----------
    X : sequence of query points, in the form accepted by ``kneighbors``.

    Returns
    -------
    numpy.ndarray of shape (len(X),) with the confidence scaled by 100.
    """
    n_points = len(X)

    def nearest_distance(model):
        # Distance from each query point to the closest sample known to `model`.
        return model.kneighbors(X)[0].reshape((n_points))

    dist_grav = nearest_distance(knn_grav)
    dist_sand = nearest_distance(knn_sand)
    dist_clay = nearest_distance(knn_clay)

    # There may be fewer than four basement samples at a given height.
    m = min(knn_base.n_samples_fit_, 4)
    dist_base = knn_base.kneighbors(X, m)[0].reshape((n_points, m))

    cert = np.zeros(n_points)

    for i, base_row in enumerate(dist_base):
        bas = sorted(base_row)
        others = [dist_grav[i], dist_sand[i], dist_clay[i]]
        closest_other = min(others)

        # Base score: share of the total weight held by the strongest class.
        w = _get_weights(np.array([[*others, bas[0]]]), 'distance')[0]
        score = np.max(w) / np.sum(w)

        # Bonus: each extra basement neighbour that beats every non-basement
        # neighbour closes part of the remaining gap to full confidence.
        for d_base in bas[1:]:
            if not d_base < closest_other:
                break
            w = _get_weights(np.array([[*others, d_base]]), 'distance')[0]
            score = score + (1 - score) * (w[3] - max(w[:3])) / np.sum(w)

        cert[i] = score

    return 100 * cert

Although the confidence function is continuous, we discretize it into several intervals.

In [5]:
# Sample every colour of matplotlib's "rainbow" palette and rebuild it as a
# segmented colormap with the same number of entries.
cmap = plt.cm.rainbow
cmaplist = list(map(cmap, range(cmap.N)))
cmap = mpl.colors.LinearSegmentedColormap.from_list('Custom cmap', cmaplist, cmap.N)

# Edges (in %) of the intervals the confidence is binned into for plotting.
bounds = np.array([25, 30, 35, 40, 50, 60, 70, 80, 90, 95, 100])
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

As in the horizontal sections, we create a grid over the delta. Then, for each point in the grid, we estimate the certainty of the knn prediction with the previous function. Finally, we plot only the points inside the delta contour.

In [6]:
# Load the samples; the 'Codi' column is discarded.
data = pd.read_excel(DATADIR + 'hsd new basements.xls').drop(columns=['Codi'])

# Plot window [xmin, xmax, ymin, ymax]: the data extent enlarged by FRAME.
ax = [
    data.UTM_X.min() - FRAME,
    data.UTM_X.max() + FRAME,
    data.UTM_Y.min() - FRAME,
    data.UTM_Y.max() + FRAME,
]
# X limits rounded to the nearest thousand, used to place the x ticks.
minx_rounded, maxx_rounded = (1000 * round(limit / 1000) for limit in ax[:2])

# 300 x 300 grid covering the plot window, and the array receiving the
# confidence value of each grid point.
xx, yy = np.meshgrid(np.linspace(ax[0], ax[1], 300), np.linspace(ax[2], ax[3], 300))
C = np.zeros(xx.shape, dtype=float)

# Delta contour as a polygon; the 'Cota' column is discarded.
contorno = pd.read_csv(DATADIR + 'deltacontour.csv').drop(columns=['Cota'])
poligono = geometry.Polygon(zip(contorno['UTM_X'], contorno['UTM_Y']))

# Grid indices of the points lying more than one coordinate unit away from
# the polygon, i.e. outside the contour with a small tolerance.
outside = [
    (i, j)
    for i in range(xx.shape[0])
    for j in range(xx.shape[1])
    if poligono.distance(geometry.Point(xx[i, j], yy[i, j])) > 1
]

# Contour coordinates, used to draw the outline on every map.
xpol, ypol = poligono.exterior.xy

# The output folder is the same for every height, so create it once.
# Nothing is created when FIGURESDIR has no directory component.
figures_dir = os.path.dirname(FIGURESDIR)
if figures_dir:
    os.makedirs(figures_dir, exist_ok=True)

# Draw and save one confidence map per height.
for H in Heights:

    # Samples located at the current height, split by class.
    df = data[data.Cota == H]

    gravels = df[df.Clase == 'gravillas y gravas']
    sands = df[df.Clase == 'arenas']
    clays = df[df.Clase == 'arcillas y limos']
    basements = df[df.Valor == 'S']  # basements are selected through the 'Valor' column

    # Refit the neighbour indexes read by confidence() with this height's samples.
    knn_grav.fit(list(zip(gravels['UTM_X'], gravels['UTM_Y'])))
    knn_sand.fit(list(zip(sands['UTM_X'], sands['UTM_Y'])))
    knn_clay.fit(list(zip(clays['UTM_X'], clays['UTM_Y'])))
    knn_base.fit(list(zip(basements['UTM_X'], basements['UTM_Y'])))

    # Confidence over the whole grid, one grid row at a time.
    for i in range(xx.shape[0]):
        d = list(zip(xx[i], yy[i]))
        C[i] = confidence(d)

    # Blank out the grid points outside the delta contour so they are not drawn.
    for (i, j) in outside:
        C[i, j] = np.nan

    # vmin/vmax are deliberately not passed: the BoundaryNorm already pins the
    # colour range to its first and last bounds (25 and 100), and matplotlib
    # raises an error when vmin/vmax are combined with a norm instance.
    plt.imshow(C, cmap=cmap, norm=norm, origin='lower', alpha=0.6,
               extent=[xx.min(), xx.max(), yy.min(), yy.max()])

    plt.colorbar(spacing='proportional').set_label(' %', rotation=0)

    # Overlay the samples of each class.
    plt.scatter(gravels['UTM_X'], gravels['UTM_Y'], c='blue', s=1, alpha=0.9, label='gravels')
    plt.scatter(sands['UTM_X'], sands['UTM_Y'], c='orange', s=1, alpha=0.8, label='sands')
    plt.scatter(clays['UTM_X'], clays['UTM_Y'], c='black', s=1, alpha=0.7, label='clays')
    plt.scatter(basements['UTM_X'], basements['UTM_Y'], c='brown', s=1, alpha=0.7, label='basements')

    # Delta outline.
    plt.plot(xpol, ypol, alpha=0.6, color='black', linewidth=1.5)

    plt.xlabel('UTM_X')
    plt.ylabel('UTM_Y')

    plt.xticks(np.arange(minx_rounded, maxx_rounded, step=3000))

    plt.title("Height " + str(H) + ' m')
    plt.axis(ax)

    filename = FIGURESDIR + 'height' + str(H) + '.png'
    plt.savefig(filename, transparent=False, dpi=164, bbox_inches='tight')

    # Clear the current figure so the next height starts from an empty canvas.
    plt.clf()

<Figure size 432x288 with 0 Axes>