In [1]:
# https://machinelearningmastery.com/pytorch-tutorial-develop-deep-learning-models/
# pytorch mlp for regression
from numpy import vstack
from numpy import sqrt
from pandas import read_csv
from sklearn.metrics import mean_squared_error
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch import Tensor
from torch.nn import Linear
from torch.nn import Sigmoid
from torch.nn import Module
from torch.optim import SGD
from torch.nn import MSELoss
from torch.nn.init import xavier_uniform_

In [5]:
import torch
print(torch.__version__)

1.4.0


In [None]:
# dataset definition
class CSVDataset(Dataset):
    # load the dataset
    def __init__(self, path):
        # store the inputs and outputs
        self.X = ...
        self.y = ...
 
    # number of rows in the dataset
    def __len__(self):
        return len(self.X)
 
    # get a row at an index
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]
    
# create the dataset
dataset = CSVDataset(...)
# select rows from the dataset
train, test = random_split(dataset, [[...], [...]])
# create a data loader for train and test sets
train_dl = DataLoader(train, batch_size=32, shuffle=True)
test_dl = DataLoader(test, batch_size=1024, shuffle=False)    

In [2]:
# dataset definition
class CSVDataset(Dataset):
    # load the dataset
    def __init__(self, path):
        # load the csv file as a dataframe
        df = read_csv(path, header=None)
        # store the inputs and outputs
        self.X = df.values[:, :-1].astype('float32')
        self.y = df.values[:, -1].astype('float32')
        # ensure target has the right shape
        self.y = self.y.reshape((len(self.y), 1))
 
    # number of rows in the dataset
    def __len__(self):
        return len(self.X)
 
    # get a row at an index
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]
 
    # get indexes for train and test rows
    def get_splits(self, n_test=0.33):
        # determine sizes
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        # calculate the split
        return random_split(self, [train_size, test_size])
 
# model definition
class MLP(Module):
    # define model elements
    def __init__(self, n_inputs):
        super(MLP, self).__init__()
        # input to first hidden layer
        self.hidden1 = Linear(n_inputs, 10)
        xavier_uniform_(self.hidden1.weight)
        self.act1 = Sigmoid()
        # second hidden layer
        self.hidden2 = Linear(10, 8)
        xavier_uniform_(self.hidden2.weight)
        self.act2 = Sigmoid()
        # third hidden layer and output
        self.hidden3 = Linear(8, 1)
        xavier_uniform_(self.hidden3.weight)
 
    # forward propagate input
    def forward(self, X):
        # input to first hidden layer
        X = self.hidden1(X)
        X = self.act1(X)
         # second hidden layer
        X = self.hidden2(X)
        X = self.act2(X)
        # third hidden layer and output
        X = self.hidden3(X)
        return X
 
# prepare the dataset
def prepare_data(path):
    # load the dataset
    dataset = CSVDataset(path)
    # calculate split
    train, test = dataset.get_splits()
    # prepare data loaders
    train_dl = DataLoader(train, batch_size=32, shuffle=True)
    test_dl = DataLoader(test, batch_size=1024, shuffle=False)
    return train_dl, test_dl
 
# train the model
def train_model(train_dl, model):
    # define the optimization
    criterion = MSELoss()
    optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
    # enumerate epochs
    for epoch in range(100):
        # enumerate mini batches
        for i, (inputs, targets) in enumerate(train_dl):
            # clear the gradients
            optimizer.zero_grad()
            # compute the model output
            yhat = model(inputs)
            # calculate loss
            loss = criterion(yhat, targets)
            # credit assignment
            loss.backward()
            # update model weights
            optimizer.step()
 
# evaluate the model
def evaluate_model(test_dl, model):
    predictions, actuals = list(), list()
    for i, (inputs, targets) in enumerate(test_dl):
        # evaluate the model on the test set
        yhat = model(inputs)
        # retrieve numpy array
        yhat = yhat.detach().numpy()
        actual = targets.numpy()
        actual = actual.reshape((len(actual), 1))
        # store
        predictions.append(yhat)
        actuals.append(actual)
    predictions, actuals = vstack(predictions), vstack(actuals)
    # calculate mse
    mse = mean_squared_error(actuals, predictions)
    return mse
 
# make a class prediction for one row of data
def predict(row, model):
    # convert row to data
    row = Tensor([row])
    # make prediction
    yhat = model(row)
    # retrieve numpy array
    yhat = yhat.detach().numpy()
    return yhat
 


In [3]:
# prepare the data
path = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
train_dl, test_dl = prepare_data(path)
print(len(train_dl.dataset), len(test_dl.dataset))
# define the network
model = MLP(13)
# train the model
train_model(train_dl, model)
# evaluate the model
mse = evaluate_model(test_dl, model)
print('MSE: %.3f, RMSE: %.3f' % (mse, sqrt(mse)))
# make a single prediction (expect class=1)
row = [0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,396.90,4.98]
yhat = predict(row, model)
print('Predicted: %.3f' % yhat)

339 167
MSE: 81.341, RMSE: 9.019
Predicted: 21.890


In [4]:
def evaluate_model(test_dl, model):
    predictions, actuals = list(), list()
    for i, (inputs, targets) in enumerate(test_dl):
        # evaluate the model on the test set
        yhat = model(inputs)
        # retrieve numpy array
        yhat = yhat.detach().numpy()
        actual = targets.numpy()
        actual = actual.reshape((len(actual), 1))
        # store
        predictions.append(yhat)
        actuals.append(actual)
    predictions, actuals = vstack(predictions), vstack(actuals)
    return predictions
evaluate_model(test_dl, model)


array([[21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],
       [21.89015 ],


In [None]:
# ADAPTATION FOR EONR PREDICTION

In [None]:
high_var = ['rain_30', 'rain_60','rain_90', 't_max_30', 't_max_60', 't_max_90', 't_min_30', 't_min_60', 't_min_90', 'Y_prev',
 'Y_corn_lt_avg', 'day_sow', 'day_v5', 'lai_v5', 'whc', 'oc_20cm_v5', 'sw_dep_v5', 'n_0_60cm_v5', 'surfaceom_wt_v5', 'sand_40cm', 'clay_40cm']

In [None]:
import pyreadr

# dataset definition
class CSVDataset(Dataset):
    # load the dataset
    def __init__(self, path):
        # load the csv file as a dataframe
        # df = read_csv(path, header=None)
        df = pyreadr.read_r(path)[None] # also works for RData
        df = df[high_var+['eonr']]
        # store the inputs and outputs
        self.X = df.values[:, :-1].astype('float32')
        self.y = df.values[:, -1].astype('float32')
        # ensure target has the right shape
        self.y = self.y.reshape((len(self.y), 1))
 
    # number of rows in the dataset
    def __len__(self):
        return len(self.X)
 
    # get a row at an index
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]
 
    # get indexes for train and test rows
    def get_splits(self, n_test=0.33):
        # determine sizes
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        # calculate the split
        return random_split(self, [train_size, test_size])

In [None]:
# prepare the data
path = "/home/germanm2/n_policy_box/Data/files_rds/TrainSet_eonr2.rds"
train_dl, test_dl = prepare_data(path)
print(len(train_dl.dataset), len(test_dl.dataset))
model = MLP(21)
# train the model
train_model(train_dl, model)
# evaluate the model
mse = evaluate_model(test_dl, model)
print('MSE: %.3f, RMSE: %.3f' % (mse, sqrt(mse)))


In [None]:
def evaluate_model(test_dl, model):
    predictions, actuals = list(), list()
    for i, (inputs, targets) in enumerate(test_dl):
        # evaluate the model on the test set
        yhat = model(inputs)
        # retrieve numpy array
        yhat = yhat.detach().numpy()
        actual = targets.numpy()
        actual = actual.reshape((len(actual), 1))
        # store
        predictions.append(yhat)
        actuals.append(actual)
    predictions, actuals = vstack(predictions), vstack(actuals)
    return predictions
    # calculate mse
    #mse = mean_squared_error(actuals, predictions)
    #return mse

In [None]:
evaluate_model(test_dl, model)

In [None]:
#Make predictions for one row

row = [ 1.25000000e+02,  2.29000000e+02,  1.70000000e+01,  2.20500000e+01, 1.52166667e+01, 8.25000000e+00, 9.90000000e+00, 3.53333333e+00, -4.63333333e+00, 4.57363044e+03,
       1.38846430e+04, 1.12000000e+02, 1.52000000e+02, 4.08000000e-01, 2.94075688e+02, 1.62058756e+00, 6.29483784e+02, 4.50482040e+01, 4.65378891e+02, 9.07137482e+00, 2.10242336e+01]
yhat = predict(row, model)

In [None]:
#Make predictions
prediction_set_aggregated_df1 = pyreadr.read_r("/home/germanm2/n_policy_box/Data/files_rds/prediction_set_aggregated_dt.rds")[None] # also works for RData
prediction_set_aggregated_df2 = prediction_set_aggregated_df1[high_var][:200]
prediction_set_aggregated_df2
print(prediction_set_aggregated_df2[0:1].values)
class RDSdataset(Dataset):
    # load the dataset
    def __init__(self, path):
        # load the csv file as a dataframe
        df = pyreadr.read_r(path)[None] # also works for RData
        df = df[high_var]
        # store the inputs and outputs
        self.X = df.values[:, :-1].astype('float32')
        self.y = df.values[:, -1].astype('float32')
        # ensure target has the right shape
        self.y = self.y.reshape((len(self.y), 1))
 
    # number of rows in the dataset
    def __len__(self):
        return len(self.X)
 
    # get a row at an index
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

In [None]:
path = "/home/germanm2/n_policy_box/Data/files_rds/prediction_set_aggregated_dt.rds"
dataset = RDSdataset(path)
pred_dl = DataLoader(dataset, batch_size=1024, shuffle=False)
row = [ 1.25000000e+02,  2.29000000e+02,  1.70000000e+01,  2.20500000e+01, 1.52166667e+01, 8.25000000e+00, 9.90000000e+00, 3.53333333e+00, -4.63333333e+00, 4.57363044e+03,
       1.38846430e+04, 1.12000000e+02, 1.52000000e+02, 4.08000000e-01, 2.94075688e+02, 1.62058756e+00, 6.29483784e+02, 4.50482040e+01, 4.65378891e+02, 9.07137482e+00, 2.10242336e+01]
yhat = predict(row, model)
yhat

In [None]:
predictions, actuals = list(), list()
    for i, (inputs, targets) in enumerate(test_dl):
        # evaluate the model on the test set
        yhat = model(inputs)
        # retrieve numpy array
        yhat = yhat.detach().numpy()
        actual = targets.numpy()
        actual = actual.reshape((len(actual), 1))
        # store
        predictions.append(yhat)
        actuals.append(actual)
    predictions, actuals = vstack(predictions), vstack(actuals)
    # calculate mse
    mse = mean_squared_error(actuals, predictions)