In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd

import time
import os
import random

from pathlib import Path

from datetime import datetime, date, timedelta

import xarray as xr

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as D

from torch.utils.tensorboard import SummaryWriter

from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold
from sklearn.metrics import mean_squared_error
import seaborn as sns

from layers import *
from sam import SAM

In [2]:
EPOCHES = 30
BATCH_SIZE = 128
N_SPLITS = 5
SEED = 42

LR = 1e-3
WEIGHT_DECAY = 1e-3
WARMUP_STEPS = 200
MOMENTUM = 0.9
CLIP_NORM = 15.0

TIMELAG = 92
H_DIM = 64
WIDTH = 92

STAGE = 'development'

subfolder = Path("weights")
subfolder.mkdir(exist_ok=True)

MODEL_PATH = subfolder.as_posix()
logfile = None
print(f"Model save path: {MODEL_PATH}")

Model save path: weights


In [3]:
def set_seeds(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

set_seeds(SEED);

In [4]:
# Numpy print option
np.set_printoptions(precision=4, suppress=True, threshold=1000);
# Seaborn figuresize
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_style("whitegrid")

In [5]:
### Loading date

In [6]:
start_data = datetime(2014,9,1)

labels_2013_2019 = pd.read_csv("development/train_labels.csv", index_col=[0])
labels_2020_2021 = pd.read_csv("development/labels_2020_2021.csv", index_col=[0])

labels_2013_2019.columns = pd.to_datetime(labels_2013_2019.columns)
labels_2020_2021.columns = pd.to_datetime(labels_2020_2021.columns)

targets_df = pd.merge(
    labels_2013_2019, labels_2020_2021, how = "outer", left_index = True, right_index = True)

# Targets:
# Get datas only with features
targets_df = targets_df.loc[:, start_data:]
targets_df = targets_df.astype('f4')

In [7]:
# Get cell_idx, data_idx for known SWE
cell_idx, data_idx = np.nonzero((~np.isnan(targets_df.values)).astype(int))
# Convert to cell_id and times
target_values = targets_df.values[cell_idx, data_idx].astype(np.float32)
cell_idx = targets_df.index[cell_idx]
data_idx = pd.to_datetime(targets_df.columns[data_idx])

# bin targets
binc = np.arange(10, 120, 10)
# y, binc = pd.cut(target_values, bins=num_bins, labels=False, retbins=True)
y = np.digitize(target_values, bins=binc)

In [8]:
ds = xr.open_dataset(f"{STAGE}/train_dataset.nc", engine='netcdf4')
ds.load();
ds.close();

In [9]:
band = xr.concat([
        (ds.t00 - 273.15) / 10,
        (ds.t12 - 273.15) / 10,
        (ds.sdwe**0.25 - 1),
        (ds.pwat - 8) / 7,
        ds.refc / 10,
        ds.u / 20,
        ds.v / 20,
        ds.sdwea,
        ds.NDSI,
        (ds.sd / 200) - 3.6,
    ], dim = 'feature'
)

band_values = band.ffill('time').fillna(0).data

In [10]:
FEATURES = ['t00', 't12', 'sdwe', 'pwat', 'refc', 'u80', 'v80', 'sdwes', 'NDSI', 'sd']

xds = xr.Dataset(
    data_vars = dict(
            band = (["feature", "time", "cell_id"], band_values),
            target = (["cell_id", "days"], targets_df.values),
            dem = (["cell_id", "x", "y"], ds.dem.data / 1000 - 2.25),
            soil = (["cell_id", "x", "y"], ds.soil.data),
    ),
    coords = dict(
            feature = FEATURES,
            cell_id = ds.coords['cell_id'].data,
            days = targets_df.columns,
            time = ds.coords['time'].data,
        ),
)

In [11]:
class SnowDataset(D.Dataset):

    def __init__(self, cells, datas, xds, timelag : int = TIMELAG):
        
        self.xds = xds
        self.datas = datas
        self.cells = cells
        self.tlag = timedelta(days=timelag)
        self.days = timelag
        
    def __len__(self):
        return len(self.cells)

    def __getitem__(self, idx):

        cell_id = self.cells[idx]
        data_idx = self.datas[idx]
          
        features = self.xds.band.loc[
            :, data_idx-self.tlag : data_idx-timedelta(1), cell_id].data
        
        dem_value = self.xds.dem.loc[cell_id].data[None]
        soil_value = self.xds.soil.loc[cell_id].data
        target = self.xds.target.loc[cell_id, data_idx].data
        
        return features, dem_value, soil_value, target[None]

In [12]:
### Table for results
header = r'''
                  Train | Valid
Epoch |  Loss  | Metric |  Loss  | Metric | Time, m
'''
#          Epoch         metrics            time
raw_line = '{:6d}' + '\u2502{:8.3f}'*4 + '\u2502{:6.2f}'

@torch.no_grad()
def validation(model, loader, loss_fn):
    losses = []
    yt, yp = [], []
    model.eval()
    for features, dem, soil, targets in loader:
        features = features.float()
        dem = dem.float()
        soil = soil.long()
        targets = targets.float()
        outputs = model(features, dem, soil).clamp(0)
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())
        
        yt.append(targets)
        yp.append(outputs)
        
    yt = torch.cat(yt)
    yp = torch.cat(yp)
    
    return np.array(losses).mean(), yt.numpy(), yp.numpy()

def get_figure(yt, yp, binc=binc):
    errors = np.abs(yt - yp).squeeze()
    xbin = np.digitize(yt, binc).squeeze()
    # xbin = np.floor(yt / 10).astype(np.int64)
    figure = sns.boxplot(x=xbin, y=errors)
    figure.set(ylim=(0, 100), xlim=(-1, 12))
    medians = [np.median(errors[xbin == i]) for i in np.unique(xbin)]
    vertical_offset = np.median(errors) * 0.4 # offset from median for display

    for xtick in figure.get_xticks():
        figure.text(xtick, medians[xtick] + vertical_offset, f"{medians[xtick]:.3f}", 
            horizontalalignment='center',size='x-small',color='w',weight='semibold')
        
    rmse_score = mean_squared_error(yt, yp, squared=False)
    figure.annotate(f"RMSE: {rmse_score:.3f}", xytext=(0.1, 0.9),
            xy=(0.1, 0.9), xycoords="axes fraction", fontsize=12);
    
    return figure

def train_loop(model, optimizer, scheduler, loader, vloader, epoches, fold_idx=0, loss_fn=nn.MSELoss()):
    
    print(header, file=logfile)

    best_metric = np.inf
    
    for epoch in range(1, epoches+1):
        losses = []
        yt, yp = [], []
        start_time = time.time()
        model.train()
        for features, dem, soil, targets in loader:
            soil = soil.long()
            optimizer.zero_grad()
            
            def closure():
                loss = loss_fn(targets, model(features, dem, soil).clamp(0))
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)
                return loss
            outputs = model(features, dem, soil).clamp(0)
            loss = loss_fn(targets, outputs)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)
            optimizer.step(closure)
            
            scheduler.step()
            losses.append(loss.detach().cpu().numpy())
            
            yt.append(targets.detach().cpu().numpy())
            yp.append(outputs.detach().cpu().numpy())
            
        yt = np.concatenate(yt)
        yp = np.concatenate(yp)
        rmse_score = mean_squared_error(yt, yp, squared=False)
        
        losses = np.array(losses).mean()
        vloss, yt, yp = validation(model, vloader, loss_fn)
        vrmse_score = mean_squared_error(yt, yp, squared=False)
        
        figure = get_figure(yt, yp)
        writer.add_figure("boxplot", figure.get_figure(), global_step=epoch)
        writer.add_scalars("Loss", {"train" : losses, "validation" : vloss}, epoch)
        writer.add_scalars("RMSE", {"train" : rmse_score, "validation" : vrmse_score}, epoch)
        
        if best_metric > vrmse_score:
            best_metric = vrmse_score
            torch.save({
                'model' : model.state_dict(),
                'epoch' : epoch,
                'metric' : best_metric,
            },  f'{MODEL_PATH}/SnowNet_fold_{fold_idx}_best.pt')

        print(raw_line.format(epoch, losses, rmse_score,
                              vloss, vrmse_score, (time.time()-start_time)/60**1), file=logfile)    
    torch.save({
        'model' : model.state_dict(),
        'epoch' : epoch,
        'metric' : best_metric,
    },  f'{MODEL_PATH}/SnowNet_fold_{fold_idx}_last.pt')
    
    return losses**0.5, best_metric

In [13]:
params = {'X': target_values,
          'y' : y }

skf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
params['groups'] = cell_idx

In [14]:
for fold_idx, (train_idx, valid_idx) in enumerate(skf.split(**params)):
        
    train_ds = SnowDataset(cell_idx[train_idx], data_idx[train_idx], xds)
    valid_ds = SnowDataset(cell_idx[valid_idx], data_idx[valid_idx], xds)
    
    # define training and validation data loaders
    loader = D.DataLoader(train_ds, batch_size=BATCH_SIZE,
                          shuffle=True, num_workers=8, drop_last=True)

    vloader = D.DataLoader(
                valid_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=8)
    
    writer = SummaryWriter(log_dir=f"runs/SnowNet_Fold#{fold_idx:02d}")
    model = SnowNet(features=len(FEATURES), h_dim=H_DIM, width=WIDTH, timelag=TIMELAG)

    base_optimizer = torch.optim.SGD  # define an optimizer for the "sharpness-aware" update
    optimizer = SAM(model.parameters(), base_optimizer, lr=LR, momentum=MOMENTUM)
        
    scheduler = get_cosine_schedule_with_warmup( #get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=len(loader)*EPOCHES, min_coef=0., last_epoch=-1)
    
    print(f'\n Start training model with fold #{fold_idx}', file=logfile)

    loss, vloss = train_loop(model, optimizer, scheduler, loader, vloader, EPOCHES, fold_idx)
    
    inputs = train_ds[0]
    inputs = [torch.from_numpy(i[None]) for i in inputs[:-1]]
    writer.add_graph(model, input_to_model=inputs)
    
    writer.add_hparams({'lr': LR,
                        'momentum': MOMENTUM,
                        'bsize': BATCH_SIZE,
                        'wdecay' : WEIGHT_DECAY,
                        'warmup': WARMUP_STEPS,
                        'clip_norm' : CLIP_NORM,
                        'seed' : SEED,
                        'epoches': EPOCHES,
                        'hdim' : H_DIM,
                        'width': WIDTH,
                        'timelag' : TIMELAG,
                        'fold_idx' : fold_idx,
                        'features' : " ".join(FEATURES),
                       },
                      {'hparam/loss': loss, 'hparam/vloss': vloss})
    writer.close()

    del model, optimizer, scheduler, loader, vloader


 Start training model with fold #0

                  Train | Valid
Epoch |  Loss  | Metric |  Loss  | Metric | Time, m

     1│  95.554│   9.775│  70.354│   8.386│  1.74
     2│  53.858│   7.339│  53.485│   7.314│  1.76
     3│  42.605│   6.527│  43.874│   6.624│  1.82
     4│  34.945│   5.911│  39.993│   6.327│  1.83
     5│  29.412│   5.423│  39.808│   6.312│  1.83
     6│  25.763│   5.076│  37.275│   6.108│  1.84
     7│  22.492│   4.743│  31.789│   5.642│  1.83
     8│  20.130│   4.487│  38.980│   6.248│  1.85
     9│  17.901│   4.231│  30.952│   5.568│  1.85
    10│  16.206│   4.026│  32.748│   5.728│  1.84
    11│  14.781│   3.845│  30.886│   5.563│  1.85
    12│  13.571│   3.684│  29.749│   5.459│  1.85
    13│  12.525│   3.539│  31.110│   5.584│  1.85
    14│  11.434│   3.381│  27.705│   5.268│  1.85
    15│  10.752│   3.279│  27.589│   5.258│  1.86
    16│   9.887│   3.144│  27.457│   5.245│  1.86
    17│   9.250│   3.041│  28.015│   5.298│  1.87
    18│   8.641│   2.940│  2