In [1]:
%%capture
!pip install utm
!pip install openpyxl

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import cm
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.gaussian_process import GaussianProcessRegressor
import matplotlib.pyplot as plt
import utm

import warnings
warnings.filterwarnings("ignore")

In [53]:
data = pd.read_excel("../Dataset/Maharashtra_Soil_Nutrients_Data.xlsx")
data.head()

Unnamed: 0,lon,lat,OC,N,P,K
0,73.401111,17.894722,1.08,756.0,9.43,834.37
1,73.401389,17.894722,1.12,781.2,9.21,265.1
2,73.402222,17.894722,0.68,478.8,8.99,318.96
3,73.403056,17.894722,1.76,1234.8,9.65,954.77
4,73.403333,17.894722,1.78,1247.4,8.77,371.77


In [54]:
def scaled_coord(x,y):
    """
    parameters
    ----------
    x : numpy array, float64
        list of longitude cordinates
    y : numpy array, float64
        list of latitude cordinates
        
    return
    ------
    scaled(0-1) x and y
    """
    x = (x-x.min())/(x.max()-x.min())
    y = (y-y.min())/(y.max()-y.min())
    return x,y

In [55]:
%%time

val_col = ['OC','N','P','K']
values = data[val_col]
coordinates = data[['lon','lat']]
#lat,lon to utm projection

x,y,zone,ut = utm.from_latlon(coordinates['lat'].values,coordinates['lon'].values)

lon,lat = y/1000,x/1000 #in km

# lon, lat = scaled_coord(lon,lat)
#normalize values of OC, N, K, P

#standardise lon and lat
lon = (lon-np.mean(lon))/np.std(lon)
lat = (lat-np.mean(lat))/np.std(lat)

test_k = MinMaxScaler().fit_transform(values)
values = test_k

Wall time: 15 ms


In [57]:
data['lon'] = lon
data['lat'] = lat
for i,col in enumerate(val_col):
    data[col] = values[:,i]

In [88]:
data.head()

Unnamed: 0,lon,lat,OC,N,P,K
0,-1.425536,-2.402803,0.011632,0.079661,0.001163,0.012262
1,-1.42554,-2.402303,0.012067,0.082316,0.001136,0.003893
2,-1.42555,-2.400805,0.007283,0.05045,0.001109,0.004684
3,-1.425561,-2.399307,0.019024,0.130115,0.00119,0.014032
4,-1.425565,-2.398808,0.019241,0.131443,0.001082,0.005461


In [94]:
#split dataset into train and test
# split the dataset into train and test dataset
ix = np.random.choice(data.shape[0],int(data.shape[0]*0.2),replace = False)
data_train = data.iloc[[int(i) for i in range(data.shape[0]) if i not in ix]].reset_index(drop = True)
data_test = data.iloc[ix].reset_index(drop = True)

In [95]:
data_train.shape, data_test.shape

((20837, 6), (5209, 6))

In [96]:
data_train.head()

Unnamed: 0,lon,lat,OC,N,P,K
0,-1.425536,-2.402803,0.011632,0.079661,0.001163,0.012262
1,-1.42554,-2.402303,0.012067,0.082316,0.001136,0.003893
2,-1.42555,-2.400805,0.007283,0.05045,0.001109,0.004684
3,-1.425565,-2.398808,0.019241,0.131443,0.001082,0.005461
4,-1.425575,-2.39731,0.016306,0.111527,0.001136,0.007821


In [97]:
data_test.head()

Unnamed: 0,lon,lat,OC,N,P,K
0,-0.29388,-0.602949,0.011849,0.029912,0.001579,0.001492
1,-0.568214,0.959502,0.011958,0.014854,0.000451,0.002744
2,-0.829316,-0.745546,0.002174,0.004211,0.000716,0.00417
3,1.591848,0.984585,0.005435,0.037615,0.001821,0.002374
4,0.621615,-0.65224,0.006196,0.006668,0.009286,0.013992


In [107]:
torch.repeat_interleave?

## Data loading in torch.Dataloader

In [98]:
class NutrientsDataset(Dataset):
    def __init__(self, df, num_context=40, num_extra_target=10):
        self.df = df
        self.num_context = num_context
        self.num_extra_target = num_extra_target

    def get_rows(self, i):
        rows = self.df.iloc[i : i + (self.num_context + self.num_extra_target)].copy()
        x = rows.iloc[:,:2].copy()
        y = rows.iloc[:,2:].copy()
        return x, y


    def __getitem__(self, i):
        x, y = self.get_rows(i)
        return x.values, y.values
        
    def __len__(self):
        return len(self.df) - (self.num_context + self.num_extra_target)

In [99]:
def npsample_batch(x, y, size=None, sort=False):
    
    """Sample from numpy arrays along 2nd dim."""
    inds = np.random.choice(range(x.shape[1]), size=size, replace=False)
    return x[:, inds], y[:, inds]

def collate_fns(max_num_context, max_num_extra_target, sample, sort=True, context_in_target=True):
    def collate_fn(batch, sample=sample):
        # Collate
        x = np.stack([x for x, y in batch], 0)
        y = np.stack([y for x, y in batch], 0)

        # Sample a subset of random size
        num_context = np.random.randint(4, max_num_context)
        num_extra_target = np.random.randint(4, max_num_extra_target)

        x = torch.from_numpy(x).float()
        y = torch.from_numpy(y).float()

        
        x_context = x[:, :max_num_context]
        y_context = y[:, :max_num_context]
    
        x_target_extra = x[:, max_num_context:]
        y_target_extra = y[:, max_num_context:]
        
        if sample:

            x_context, y_context = npsample_batch(
                x_context, y_context, size=num_context
            )

            x_target_extra, y_target_extra = npsample_batch(
                x_target_extra, y_target_extra, size=num_extra_target, sort=sort
            )

        # do we want to compute loss over context+target_extra, or focus in on only target_extra?
        if context_in_target:
            x_target = torch.cat([x_context, x_target_extra], 1)
            y_target = torch.cat([y_context, y_target_extra], 1)
        else:
            x_target = x_target_extra
            y_target = y_target_extra

        
        return x_context, y_context, x_target, y_target

    return collate_fn

In [106]:
hparamas = dict(num_context = 15,
               num_extra_target = 16,
               batch_size = 40,
               context_in_target = False)
train_df = NutrientsDataset(data_train,hparamas['num_context'],hparamas['num_extra_target'])

train_loader = DataLoader(data_train,
                          batch_size=hparamas['batch_size'],
                         shuffle = True,
                         collate_fn=collate_fns(
                             hparamas['num_context'],hparamas['num_extra_target'], True,hparamas['context_in_target']))

In [63]:
class baseNPBlock(nn.Module):
    """relu non-linearities for NP block"""
    def __init__(self, inp_size,op_size, norm, bias, p = 0):
        """init function for linear2d class
        
        parameters
        ----------
        inp_size : int
                input dimension for the Encoder part (d_in)
        op_size : int
                output dimension for Encoder part(d_out)
        norm : str
                normalization to be applied on linear output
                pass norm == 'batch' to apply batch normalization
                else dropout normalization is applied
        bias : bool
                if True, bias is included for linear layer else discarded
        p : float
                probality to be considered while applying Dropout regularization
                
        """
        super().__init__()
        self.norm = norm
        self.linear = nn.Linear(inp_size,op_size,bias = bias)
        self.relu  = nn.ReLU()
        self.batch_norm = nn.BatchNorm2d(op_size)
        self.dropout = nn.Dropout2d(p)
        
    def forward(self,x):
        x = self.linear(x)
        x = self.batch_norm(x.permute(0,2,1)[:,:,:,None]) if self.norm == 'batch' else self.dropout(x.permute(0,2,1)[:,:,:,None])
        
        x = self.relu(x[:,:,:,0].permute(0,2,1))
        return x

In [62]:
class batch_MLP(nn.Module):
    """ Batch MLP layer for NP-Encoder"""
    def __init__(self, in_size, op_size, num_layers, norm, p = 0):
        """init function for linear2d class
        
        parameters
        ----------
        inp_size : int
                input dimension for the Encoder part (d_in)
        op_size : int
                output dimension for Encoder part(d_out)
        norm : str
                normalization to be applied on linear output
                pass norm == 'batch' to apply batch normalization
                else dropout normalization is applied
                
        return torch.tensor of size (B,num_context_points,d_out)
        """
        super().__init__()
        self.in_size = in_size
        self.op_size = op_size
        self.num_layers = num_layers
        self.norm  = norm
        
        self.first_layer = baseNPBlock(in_size, op_size, self.norm, False,p)
        self.encoder = nn.Sequential(*[batch_MLP(op_size, op_size, self.norm, False, p) for layer in range(self.num_layers-2)])
        self.last_layer = nn.ReLU()
        
    def forward(self, x):
        x = self.first_layer(x)
        x = self.encoder(x)
        x = self.last_layer(x)
        
        return x

In [None]:
class LinearAttention(nn.Module):
    def __init__(self,in_ch, out_ch):
        super().__init__()
        self.linear = nn.linear(in_ch, out_ch, bias = False)
        torch.nn.init.normal_(self.linear.weight,std = in_ch**0.5) #initilize weight matrix
        
    def forward(self,x):
        return self.linear(x)
    
    
class AttentionModule(nn.Module):
    def __init__(
        self,
        hidden_dim, 
        attn_type, 
        attn_layers,
        n_multiheads,
        x_dim, 
        rep='mlp',
        norm = 'dropout',
        p = 0):
        
        super().__init__()
        self._rep = rep
        
        if self._rep ='mlp':
            
            #Both Key and Value needs to have same dimension
            self.batch_mlpk = batch_MLP(x_dim, hidden_dim, attn_layers, norm )
            self.batch_mlpq = batch_MLP(x_dim, hidden_dim, attn_layers, norm, )
        
        
        if attn_type == 'uniform':
            self.attn_func = self.uniform_attn
        if attn_type=='laplace':
            self.attn_func = self.laplace_attn
        if attn_type == 'dot':
            self.attn_func = self.dot_attn
        elif attn_type == 'multihead':
            self.W_k = nn.ModuleList([LinearAttention(hidden_dim,hidden_dim) for head in range(n_multiheads)])
            self.W_v = nn.ModuleList([LinearAttention(hidden_dim,hidden_dim) for head in range(n_multiheads)])
            self.W_q = nn.ModuleList([LinearAttention(hidden_dim,hidden_dim) for head in range(n_multiheads)])
            
            self.w = LinearAttention(hidden_dim*n_multiheads,hidden_dim)
            self.attn_func = self.multihead_attn
            self.num_heads = n_multiheads
            
            
            
    def forward(self, k, q, v):
        if self.rep =='mlp':
            k = self.batch_mlpk(k)
            q = self.batch_mlpq(q)
        
        rep = self.attn_func(k,q,v)
        
        return rep
    
    
    def uniform_attn(self,k,q,v):
        num_points = q.shape[1]
        rep = torch.mean(v, axis = 1, keepdim = True)
        rep = rep.repeat(1,num_points,1)
        
        return rep
    
    def laplace_attn(self, k, q, v):
        
        
        