In [3]:
import os, sys
sys.path.append("/content")
from src.utils import fit_mlp, plot_obs_predict

with open("/content/credentials") as f:
    env_vars = f.read().split("\n")

for var in env_vars:
    key, value = var.split(" = ")
    os.environ[key] = value

storage_options = {"account_name":os.environ["ACCOUNT_NAME"],
                   "account_key":os.environ["BLOB_KEY"]}


In [88]:
# Specify hyperparameters and fit model
buffer_distance = 500
day_tolerance = 8
cloud_thr = 80
min_water_pixels = 10
features = [
    "sentinel-2-l2a_AOT", 
    "sentinel-2-l2a_B02", "sentinel-2-l2a_B03", "sentinel-2-l2a_B04", # RGB bands
    "sentinel-2-l2a_B08", # NIR
    #"sentinel-2-l2a_WVP", 
    "sentinel-2-l2a_B05", "sentinel-2-l2a_B06", "sentinel-2-l2a_B07", "sentinel-2-l2a_B8A",  # Red edge bands
    "is_brazil", #"sine_julian", 
    "sentinel-2-l2a_B11", "sentinel-2-l2a_B12", # SWIR
    "mean_viewing_azimuth", "mean_viewing_zenith",
    "mean_solar_azimuth", "mean_solar_zenith"
]
epochs = 1000
batch_size = 32
learning_rate = 0.005

layer_out_neurons = [12, 24, 6]

buffer_distance = 500
day_tolerance = 8
cloud_thr = 80
mask_method1 = "lulc"
mask_method2 = "mndwi"
min_water_pixels = 1
layer_out_neurons = [24, 12, 6]
learn_sched_step_size = 200 
learn_sched_gamma = 0.2

In [90]:
import numpy as np
import random
import pandas as pd
import sys
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score

n_layers = len(layer_out_neurons)

# Read the data
fp = f"/content/local/partitioned_feature_data_buffer500m_daytol8_cloudthr80percent_lulcmndwi_masking_tmp.csv"
data = pd.read_csv(fp)

In [96]:

data["Log SSC (mg/L)"] = np.log(data["SSC (mg/L)"])
response = "Log SSC (mg/L)"

lnssc_0 = data["Log SSC (mg/L)"] == 0
data.drop(lnssc_0[lnssc_0].index, inplace=True)
not_enough_water = data["n_water_pixels"] < 10
data.drop(not_enough_water[not_enough_water].index, inplace=True)
data["fold"] = random.choices([1,2,3,4,5], k=data.shape[0])
data["partition"]
data.drop(data[data["partition"] == "testing"].index, inplace=True)


In [97]:

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(data[features])
val_mse = []
fold_n_sites = []

for fold in [1,2,3,4,5]:
    X_train = X_scaled[data["fold"] != fold]
    y_train = data[data["fold"] != fold][response]
    X_val = X_scaled[data["fold"] == fold]
    y_val = data[data["fold"] == fold][response]
    site_val = data[data["fold"] == fold]["site_no"]
    X_train, y_train = np.array(X_train), np.array(y_train)
    X_val, y_val = np.array(X_val), np.array(y_val)

    class RegressionDataset(Dataset):

        def __init__(self, X_data, y_data):
            self.X_data = X_data
            self.y_data = y_data

        def __getitem__(self, index):
            return self.X_data[index], self.y_data[index]

        def __len__ (self):
            return len(self.X_data)

    train_dataset = RegressionDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())
    val_dataset = RegressionDataset(torch.from_numpy(X_val).float(), torch.from_numpy(y_val).float())

    num_features = X_train.shape[1]

    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    train_loader_all = DataLoader(dataset=train_dataset, batch_size=1)
    val_loader = DataLoader(dataset=val_dataset, batch_size=1)


    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    global MultipleRegression
    class MultipleRegression(nn.Module):
        def __init__(self, num_features, n_layers, layer_out_neurons):
            super(MultipleRegression, self).__init__()
            self.n_layers = n_layers
            self.layer_out_neurons = layer_out_neurons

            most_recent_n_neurons = layer_out_neurons[0]
            self.layer_1 = nn.Linear(num_features, layer_out_neurons[0])

            for i in range(2, n_layers + 1):
                setattr(
                    self,
                    f"layer_{i}",
                    nn.Linear(layer_out_neurons[i-2], layer_out_neurons[i-1])
                )
                most_recent_n_neurons = layer_out_neurons[i-1]

            self.layer_out = nn.Linear(most_recent_n_neurons, 1)
            self.relu = nn.ReLU()


        def forward(self, inputs):
            x = self.relu(self.layer_1(inputs))
            for i in range(2, self.n_layers + 1):
                x = self.relu(getattr(self, f"layer_{i}")(x))

            x = self.layer_out(x)

            return (x)


    model = MultipleRegression(num_features, n_layers, layer_out_neurons)
    model.to(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=learn_sched_step_size,
        gamma=learn_sched_gamma
    )

    loss_stats = {
        "train": [],
        "val": []
    }

    # Train the model
    print("Begin training.")
    for e in range(1, epochs+1):
        # TRAINING
        train_epoch_loss = 0
        model.train()

        for X_train_batch, y_train_batch in train_loader:
            # grab data to iteration and send to CPU
            X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)

            def closure():
                # Zero gradients
                optimizer.zero_grad()
                # Forward pass
                y_train_pred = model(X_train_batch)
                # Compute loss
                train_loss = criterion(y_train_pred, y_train_batch.unsqueeze(1))
                # Backward pass
                train_loss.backward()

                return train_loss

            # Update weights
            optimizer.step(closure)

            # Update the running loss
            train_loss = closure()
            train_epoch_loss += train_loss.item()

        loss_stats["train"].append(train_epoch_loss/len(train_loader))

        scheduler.step()

        if (e % 50 == 0):
            print(f"Epoch {e}/{epochs} | Train Loss: {train_epoch_loss/len(train_loader):.5f}")

    val_pred_list = []

    with torch.no_grad():
        model.eval()
        for X_batch, _ in val_loader:
            X_batch = X_batch.to(device)
            y_pred = model(X_batch).cpu().squeeze().tolist()#.numpy()
            val_pred_list.append(y_pred)

    val_pred = np.array(val_pred_list)
    val_se = list((val_pred - y_val)**2)

    se_df = pd.DataFrame({
        "se": val_se,
        "site": site_val
    })

    val_mse.append(np.mean(np.array(se_df.groupby("site").mean())))
    fold_n_sites.append(len(np.unique(site_val)))



output = {
    "buffer_distance": buffer_distance,
    "day_tolerance": day_tolerance,
    "cloud_thr": cloud_thr,
    "min_water_pixels": min_water_pixels,
    "features": features,
    "learning_rate": learning_rate,
    "batch_size": batch_size,
    "layer_out_neurons": layer_out_neurons,
    "epochs": epochs,
    "val_mse": np.average(val_mse, weights=fold_n_sites)
}


Begin training.
Epoch 50/1000 | Train Loss: 0.52757
Epoch 100/1000 | Train Loss: 0.44099
Epoch 150/1000 | Train Loss: 0.30658
Epoch 200/1000 | Train Loss: 0.28053
Epoch 250/1000 | Train Loss: 0.26680
Epoch 300/1000 | Train Loss: 0.26705
Epoch 350/1000 | Train Loss: 0.25177
Epoch 400/1000 | Train Loss: 0.25805
Epoch 450/1000 | Train Loss: 0.24883
Epoch 500/1000 | Train Loss: 0.24426
Epoch 550/1000 | Train Loss: 0.25567
Epoch 600/1000 | Train Loss: 0.24904
Epoch 650/1000 | Train Loss: 0.24231
Epoch 700/1000 | Train Loss: 0.24394
Epoch 750/1000 | Train Loss: 0.24064
Epoch 800/1000 | Train Loss: 0.25758
Epoch 850/1000 | Train Loss: 0.24109
Epoch 900/1000 | Train Loss: 0.24642
Epoch 950/1000 | Train Loss: 0.24290
Epoch 1000/1000 | Train Loss: 0.24305
Begin training.
Epoch 50/1000 | Train Loss: 0.33953
Epoch 100/1000 | Train Loss: 0.29922
Epoch 150/1000 | Train Loss: 0.25759
Epoch 200/1000 | Train Loss: 0.25191
Epoch 250/1000 | Train Loss: 0.23287
Epoch 300/1000 | Train Loss: 0.22995
Epoch 3

In [100]:
output["val_mse"]

0.6938127389359943

[19, 19, 17, 18, 19]