STGCN trained on simulated streamflow.

In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('..')
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 
from datetime import datetime, timedelta
import netCDF4 as nc
import torch
from torch import nn, utils
from torch.utils.tensorboard import SummaryWriter
from src import load_data, evaluate, conv_lstm, datasets, utils, stgcn
import random
import pickle
import json
import networkx as nx

time_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
time_stamp

'20190830-182512'

In [2]:
import logging
logger = logging.getLogger()
fhandler = logging.FileHandler(filename='../log.out', mode='a')
chandler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter('%(asctime)s - {} - %(message)s'.format(time_stamp))
fhandler.setFormatter(formatter)
chandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.addHandler(chandler)
logger.setLevel(logging.INFO)

In [3]:
USE_CUDA = False
if torch.cuda.is_available():
    print('CUDA Available')
    USE_CUDA = True
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
device = torch.device('cuda:0' if USE_CUDA else 'cpu')
num_devices = torch.cuda.device_count() if USE_CUDA else 0
logger.warning('cuda devices: {}'.format(list(torch.cuda.get_device_name(i) for i in range(num_devices))))
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

CUDA Available
2019-08-30 18:25:12,688 - 20190830-182512 - cuda devices: ['Tesla V100-SXM2-16GB']


In [4]:
partitioning_strategy = 'distance'  # 'distance' or 'unilabel', see https://arxiv.org/abs/1801.07455
rdrs_vars = [4,5]
max_hops = 2
agg = ['sum','minmax']
include_month = False
dem, landcover, soil, groundwater = False, True, False, False
landcover_types = [5, 14, 15, 17, 18]
seq_len = 14
seq_steps = 1

train_start = datetime.strptime('2010-01-01', '%Y-%m-%d') + timedelta(days=seq_len * seq_steps)  # first day for which to make a prediction in train set
train_end = '2012-12-31'
test_start = '2013-01-01'
test_end = '2014-12-31'
spatial_val = True  # Whether the val set does spatial or temporal validation
val_fraction = 0.1

In [5]:
with open('../data/simulations_shervan/subbasins.geojson', 'r') as f:
     subbasin_shapes = json.loads(f.read())

subbasin_graph = utils.create_subbasin_graph()
component_graph = subbasin_graph.copy()
component_graph.remove_nodes_from(['sub-1', 'sub1', 'sub474', 'sub475'])  # remove Lake Erie, St. CLair and sink to get connected components
connected_components = sorted(sorted(c) for c in nx.connected_components(nx.Graph(component_graph)))

# Split into train/test/val regions
test_subbasins = []  # Do not include 474 (Lake Erie), 475 (Lake St. Clair), because they have _very_ different properties (e.g. huge cell_count)
train_subbasins = []
val_subbasins = []
train_components = []
np.random.seed(1)
for component in connected_components:
    max_y = -999
    for node in component:
        subbasin = list(s['properties'] for s in subbasin_shapes['features'] if 'sub' + str(s['properties']['SubId']) == node)[0]
        max_y = max(max_y, subbasin['INSIDE_Y'])
    if max_y < 42.6:
        if spatial_val and np.random.choice([True, False], size=1, p=[val_fraction, 1-val_fraction]):
            val_subbasins += list(int(c[3:]) for c in component)
        else:
            train_components.append(list(int(c[3:]) for c in component))
            train_subbasins += train_components[-1]
    else:
        test_subbasins += list(int(c[3:]) for c in component)

if not spatial_val: # if no spatial validation, use same graph but different samples
    val_subbasins = train_subbasins
train_subbasins, val_subbasins, test_subbasins = sorted(train_subbasins), sorted(val_subbasins), sorted(test_subbasins)

train_subgraph = subbasin_graph.subgraph(list('sub' + str(t) for t in train_subbasins))
val_subgraph = subbasin_graph.subgraph(list('sub' + str(t) for t in val_subbasins))
test_subgraph = subbasin_graph.subgraph(list('sub' + str(t) for t in test_subbasins))
if partitioning_strategy == 'unilabel':
    train_adjacency = torch.unsqueeze(torch.from_numpy(nx.to_numpy_array(train_subgraph, nodelist=list('sub' + str(t) for t in train_subbasins))), 0).float().to(device)
    val_adjacency = torch.unsqueeze(torch.from_numpy(nx.to_numpy_array(val_subgraph, nodelist=list('sub' + str(t) for t in val_subbasins))), 0).float().to(device)
    test_adjacency = torch.unsqueeze(torch.from_numpy(nx.to_numpy_array(test_subraph, nodelist=list('sub' + str(t) for t in test_subbasins))), 0).float().to(device)
elif partitioning_strategy == 'distance':  # use distances in upstream-graph, i.e. in reversed downstream-graph
    train_adjacency = utils.normalize_hop_matrix(utils.create_hop_matrix(train_subgraph.reverse(), max_hops, 
                                                                         list('sub' + str(t) for t in train_subbasins))).to(device)
    val_adjacency = utils.normalize_hop_matrix(utils.create_hop_matrix(val_subgraph.reverse(), max_hops, 
                                                                       list('sub' + str(t) for t in val_subbasins))).to(device)
    test_adjacency = utils.normalize_hop_matrix(utils.create_hop_matrix(test_subgraph.reverse(), max_hops, 
                                                                        list('sub' + str(t) for t in test_subbasins))).to(device)
else:
    raise Exception('Unsupported partitioning strategy')

train_longest_path = int(nx.dag_longest_path_length(train_subgraph))
val_longest_path = int(nx.dag_longest_path_length(val_subgraph))
test_longest_path = int(nx.dag_longest_path_length(test_subgraph))
subbasins = sorted(set(train_subbasins + test_subbasins + val_subbasins))

In [6]:
train_dataset = datasets.SubbasinAggregatedDataset(rdrs_vars, train_subbasins, seq_len, seq_steps, train_start, train_end, aggregate_daily=agg, include_months=include_month, 
                                                   dem=dem, landcover=landcover, soil=soil, groundwater=groundwater, landcover_types=landcover_types)
if spatial_val:
    val_dataset = datasets.SubbasinAggregatedDataset(rdrs_vars, val_subbasins, seq_len, seq_steps, train_start, train_end, aggregate_daily=agg, include_months=include_month, 
                                                     conv_scalers=train_dataset.scalers, dem=dem, landcover=landcover, soil=soil, groundwater=groundwater, landcover_types=landcover_types)
else:
    val_dataset = train_dataset

# Two test datasets: one with spatial and temporal validation (i.e., different graph, different time), and one with only temporal validation (i.e. different time period only)
spatial_test_dataset = None
if test_subbasins != train_subbasins:
    spatial_test_dataset = datasets.SubbasinAggregatedDataset(rdrs_vars, test_subbasins, seq_len, seq_steps, test_start, test_end, aggregate_daily=agg, include_months=include_month, 
                                                              conv_scalers=train_dataset.scalers, dem=dem, landcover=landcover, soil=soil, groundwater=groundwater, landcover_types=landcover_types)
temporal_test_dataset = datasets.SubbasinAggregatedDataset(rdrs_vars, train_subbasins, seq_len, seq_steps, test_start, test_end, aggregate_daily=agg, include_months=include_month, 
                                                           conv_scalers=train_dataset.scalers, dem=dem, landcover=landcover, soil=soil, groundwater=groundwater, landcover_types=landcover_types)

station_subbasins = train_dataset.simulated_streamflow[~pd.isna(train_dataset.simulated_streamflow['StationID'])]['subbasin'].unique()

Using saved dataset in file /home/mgauch/runoff-nn/src/../data/train_test/SubbasinAggregatedDataset_4-5_14-1_2010-01-15000000-2012-12-31_landcover_5-14-15-17-18_sum-minmax_noMonth_58736d029db6b15e8c67bf8dee4f91bd.pkl
Using saved dataset in file /home/mgauch/runoff-nn/src/../data/train_test/SubbasinAggregatedDataset_4-5_14-1_2010-01-15000000-2012-12-31_landcover_5-14-15-17-18_sum-minmax_noMonth_2761b1b36f5684b26f32e46be9470e27.pkl
Using saved dataset in file /home/mgauch/runoff-nn/src/../data/train_test/SubbasinAggregatedDataset_4-5_14-1_2013-01-01-2014-12-31_landcover_5-14-15-17-18_sum-minmax_noMonth_dc790fa91e4d325fece5485c8f55b8d7.pkl
Using saved dataset in file /home/mgauch/runoff-nn/src/../data/train_test/SubbasinAggregatedDataset_4-5_14-1_2013-01-01-2014-12-31_landcover_5-14-15-17-18_sum-minmax_noMonth_58736d029db6b15e8c67bf8dee4f91bd.pkl


In [7]:
# Train model
num_epochs = 500
learning_rate = 2e-3
patience = 500
min_improvement = 0.01
best_loss_model = (-1, np.inf, None)
dropout = 0.3
weight_decay = 1e-5
comp_subsample_fraction = 0.3
p_node_subsample = 0.1
scheduler_t_max = 80

batch_size = 8
temp_kernel_size = 3
model = stgcn.Model(train_dataset.x.shape[2], train_adjacency.shape[0], temp_kernel_size, dropout=dropout).to(device)
if num_devices > 1:
    model = torch.nn.DataParallel(model, device_ids=list(range(num_devices)))
loss_fn = evaluate.NSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = None#torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, scheduler_t_max)

writer = SummaryWriter(comment='STGCN_simulationTraining')
param_description = {'time_stamp': time_stamp, 'batch_size': batch_size, 'loss': loss_fn, 'include_month': include_month, 'aggregate_daily': agg, 'rdrs_vars': rdrs_vars, 'dropout': dropout, 'spatial_validation': spatial_val, 'val_fraction': val_fraction, 'temp_kernel': temp_kernel_size,
                     'optimizer': optimizer, 'lr': learning_rate, 'scheduler': scheduler, 'patience': patience, 'min_improvement': min_improvement, 'x_train_shape': train_dataset.x.shape, 'x_val_shape': val_dataset.x.shape, 'partitioning_strategy': partitioning_strategy, 'max_hops': max_hops, 'comp_subsample_fraction': comp_subsample_fraction, 'p_node_subsample': p_node_subsample,
                     'spatial_x_test_shape': spatial_test_dataset.x.shape if spatial_test_dataset is not None else '', 'temporal_x_test_shape': temporal_test_dataset.x.shape, 'num_epochs': num_epochs, 'seq_len': seq_len, 'seq_steps': seq_steps, 'train_start': train_start, 'train_end': train_end, 'weight_decay': weight_decay, 
                     'landcover_types': landcover_types, 'test_start': test_start, 'test_end': test_end, 'model': str(model).replace('\n','').replace(' ', ''),}
writer.add_text('Parameter Description', str(param_description))
str(param_description)

"{'time_stamp': '20190830-182512', 'batch_size': 8, 'loss': NSELoss(), 'include_month': False, 'aggregate_daily': ['sum', 'minmax'], 'rdrs_vars': [4, 5], 'dropout': 0.3, 'spatial_validation': True, 'val_fraction': 0.1, 'temp_kernel': 3, 'optimizer': Adam (\nParameter Group 0\n    amsgrad: False\n    betas: (0.9, 0.999)\n    eps: 1e-08\n    lr: 0.002\n    weight_decay: 1e-05\n), 'lr': 0.002, 'scheduler': None, 'patience': 500, 'min_improvement': 0.01, 'x_train_shape': torch.Size([1082, 14, 25, 348]), 'x_val_shape': torch.Size([1082, 14, 25, 56]), 'partitioning_strategy': 'distance', 'max_hops': 2, 'comp_subsample_fraction': 0.3, 'p_node_subsample': 0.1, 'spatial_x_test_shape': torch.Size([730, 14, 25, 317]), 'temporal_x_test_shape': torch.Size([730, 14, 25, 348]), 'num_epochs': 500, 'seq_len': 14, 'seq_steps': 1, 'train_start': datetime.datetime(2010, 1, 15, 0, 0), 'train_end': '2012-12-31', 'weight_decay': 1e-05, 'landcover_types': [5, 14, 15, 17, 18], 'test_start': '2013-01-01', 'test

In [8]:
if spatial_val:
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size, shuffle=True, pin_memory=True, drop_last=False)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size, shuffle=True, pin_memory=True, drop_last=False)
else:
    val_indices = np.random.choice(len(train_dataset), size=int(val_fraction * len(train_dataset)), replace=False)
    train_indices = list(i for i in range(len(train_dataset)) if i not in val_indices)
    train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
    val_sampler = torch.utils.data.SubsetRandomSampler(val_indices)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size, sampler=train_sampler, pin_memory=True, drop_last=False)
    val_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size, sampler=val_sampler, pin_memory=True, drop_last=False)
    
if spatial_test_dataset is not None:
    spatial_test_dataloader = torch.utils.data.DataLoader(spatial_test_dataset, batch_size, shuffle=False, pin_memory=True, drop_last=False)
temporal_test_dataloader = torch.utils.data.DataLoader(temporal_test_dataset, batch_size, shuffle=False, pin_memory=True, drop_last=False)

In [9]:
# Subbasins with constant streamflow will divide by zero in loss calculation. Calculate loss without them.
train_non_constant_subbasin_mask = ((train_dataset.y_sim.min(dim=0)[0] - train_dataset.y_sim.max(dim=0)[0]) != 0).to(device)
val_non_constant_subbasin_mask = ((val_dataset.y_sim.min(dim=0)[0] - val_dataset.y_sim.max(dim=0)[0]) != 0).to(device)

y_train_means = train_dataset.y_sim_means.to(device)
y_val_means = val_dataset.y_sim_means[val_non_constant_subbasin_mask].to(device)

In [10]:
torch.manual_seed(0)
np.random.seed(0)
for epoch in range(num_epochs):
    model.train()

    train_losses = torch.tensor(0.0)
    for i, train_batch in enumerate(train_dataloader):
        # In each batch, train on a random subset of connected components to foster generalization.
        # With probability p_subsample_comp, further subset this iteration's components to a 
        #   subgraph starting with some of the source nodes
        train_nodes, max_path_len = utils.random_graph_subsample_with_sources(train_subgraph, train_components, 
                                                                              comp_subsample_fraction, p_node_subsample)
        train_node_idxs = [train_subbasins.index(n) for n in train_nodes]
        
        mask = train_non_constant_subbasin_mask[train_node_idxs]
        adj = train_adjacency[:,train_node_idxs][:,:,train_node_idxs]
        y_pred = model(train_batch['x'].permute(0,2,1,3)[...,train_node_idxs].to(device), adj, max_path_len)
        train_loss = loss_fn(y_pred[:,mask], train_batch['y_sim'][:,train_node_idxs][:,mask].to(device), means=y_train_means[train_node_idxs][mask])
        
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        
        train_losses += train_loss.detach()
        
    #scheduler.step()
    #if epoch % scheduler_t_max == 0:
    #    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, scheduler_t_max)
    train_loss = (train_losses / len(train_dataloader)).item()
    print('Epoch', epoch, 'mean train loss:\t{}'.format(train_loss))
    writer.add_scalar('loss_nse', train_loss, epoch)
    
    model.eval()
    val_losses = torch.tensor(0.0)
    for i, val_batch in enumerate(val_dataloader):
        y_pred = model(val_batch['x'].permute(0,2,1,3).to(device), val_adjacency, val_longest_path).detach()
        val_losses += loss_fn(y_pred[:,val_non_constant_subbasin_mask], val_batch['y_sim'][:,val_non_constant_subbasin_mask].to(device), means=y_val_means).detach()
        
    val_loss = (val_losses / len(val_dataloader)).item()
    print('Epoch', epoch, 'mean val loss:\t{}'.format(val_loss))
    writer.add_scalar('loss_nse_val', val_loss, epoch)
    
    if val_loss < best_loss_model[1] - min_improvement:
        best_loss_model = (epoch, val_loss, model.state_dict())  # new best model
        load_data.pickle_model('STGCN_simulationTraining', model, 'allStations', time_stamp, model_type='torch.dill')
    elif epoch > best_loss_model[0] + patience:
        print('Patience exhausted in epoch {}. Best val-loss was {}'.format(epoch, best_loss_model[1]))
        break

print('Using best model from epoch', str(best_loss_model[0]), 'which had loss', str(best_loss_model[1]))
model.load_state_dict(best_loss_model[2])
load_data.save_model_with_state('STGCN_simulationTraining', best_loss_model[0], model, optimizer, time_stamp, use_dill=True)

Epoch 0 mean train loss:	111.12616729736328
Epoch 0 mean val loss:	1.668614387512207
Saved model as /home/mgauch/runoff-nn/src/../pickle/models/STGCN_simulationTraining_allStations_20190830-182512.pkl
Epoch 1 mean train loss:	25.47852897644043
Epoch 1 mean val loss:	5.207401275634766
Epoch 2 mean train loss:	8.881752967834473
Epoch 2 mean val loss:	17.31187629699707
Epoch 3 mean train loss:	261.99505615234375
Epoch 3 mean val loss:	1.7288106679916382
Epoch 4 mean train loss:	15.464902877807617
Epoch 4 mean val loss:	2.312006711959839
Epoch 5 mean train loss:	5.211882591247559
Epoch 5 mean val loss:	4.38754940032959
Epoch 6 mean train loss:	4.149111270904541
Epoch 6 mean val loss:	7.825879096984863
Epoch 7 mean train loss:	4.207488536834717
Epoch 7 mean val loss:	5.442692756652832
Epoch 8 mean train loss:	2.4100985527038574
Epoch 8 mean val loss:	4.305781364440918
Epoch 9 mean train loss:	2.6773581504821777
Epoch 9 mean val loss:	6.562178134918213
Epoch 10 mean train loss:	2.45973849296

In [11]:
del y_train_means, y_val_means, y_pred, train_non_constant_subbasin_mask, val_non_constant_subbasin_mask
if USE_CUDA:
    torch.cuda.empty_cache()

In [12]:
logger.warning('predicting')
model.eval()

temporal_test_predictions = []  # test on train graph but different time
for i, test_batch in enumerate(temporal_test_dataloader):
    pred = model(test_batch['x'].permute(0,2,1,3).to(device), train_adjacency, train_longest_path).detach().cpu()
    temporal_test_predictions.append(pred)
predictions = torch.cat(temporal_test_predictions)

if spatial_test_dataset is not None:
    spatial_test_predictions = []  # test on different graph, different time
    for i, test_batch in enumerate(spatial_test_dataloader):
        pred = model(test_batch['x'].permute(0,2,1,3).to(device), test_adjacency, test_longest_path).detach().cpu()
        spatial_test_predictions.append(pred)
    predictions = torch.cat([torch.cat(spatial_test_predictions), predictions], dim=1)

2019-08-30 19:36:07,059 - 20190830-182512 - predicting


In [13]:
actuals = temporal_test_dataset.data_streamflow.copy()
if len(actuals['date'].unique()) != len(predictions):
    print('Warning: length of prediction {} and actuals {} does not match.'.format(len(predictions), len(actuals['date'].unique())))

nse_dict, nse_sim_dict = {}, {}
mse_dict, mse_sim_dict = {}, {}
predictions_df = pd.DataFrame(columns=actuals.columns)
predictions_df['is_test_subbasin'] = False
all_test_subbasins = test_subbasins + train_subbasins if train_subbasins != test_subbasins else train_subbasins
for i in range(len(all_test_subbasins)):
    subbasin = all_test_subbasins[i]
    station = None
    subbasin_sim = temporal_test_dataset.simulated_streamflow[temporal_test_dataset.simulated_streamflow['subbasin'] == subbasin].set_index('date')
    if subbasin in station_subbasins:
        station = subbasin_sim['StationID'].values[0]
        act = actuals[actuals['station'] == station].set_index('date')['runoff']
    if predictions.shape[0] != subbasin_sim.shape[0]:
        print('Warning: length of prediction {} and actuals {} does not match for subbasin {}. Ignoring excess actuals.'.format(len(predictions), len(subbasin_sim), subbasin))
        subbasin_sim = subbasin_sim.iloc[:predictions.shape[0]]
        if station is not None:
            act = act.iloc[:predictions.shape[0]]
            
    pred = pd.DataFrame({'runoff': predictions[:,i]}, index=subbasin_sim.index)
    pred['subbasin'] = subbasin
    pred['station'] = station
    pred['is_test_subbasin'] = subbasin in test_subbasins
    predictions_df = predictions_df.append(pred.reset_index(), sort=True)
    subbasin_type = 'test' if subbasin in test_subbasins else 'train'
    nse_sim, mse_sim = evaluate.evaluate_daily('Sub{}'.format(subbasin), pred['runoff'], subbasin_sim['simulated_streamflow'], writer=writer, group=subbasin_type)
    nse_sim_dict[subbasin] = nse_sim
    mse_sim_dict[subbasin] = mse_sim

    if station is not None:
        nse, mse = evaluate.evaluate_daily(station, pred['runoff'], act, writer=writer)
        nse_dict[subbasin] = nse
        mse_dict[subbasin] = mse
        print(station, subbasin, '\tNSE:', nse, '\tMSE:', mse, '(clipped to 0)')
    print(subbasin, '\tNSE sim:', nse_sim, '\tMSE sim:', mse_sim)


To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()


2 	NSE sim: -0.5929051795425517 	MSE sim: 135.20178879497985
3 	NSE sim: -0.640393690901012 	MSE sim: 52.20053886724018
4 	NSE sim: 0.0550877534630414 	MSE sim: 16.100846677882288
5 	NSE sim: -0.7763882384983642 	MSE sim: 43.085326425989834
6 	NSE sim: 0.22744678371164984 	MSE sim: 4.48300561871656
7 	NSE sim: 0.2970656171874988 	MSE sim: 1.9489700619748287
8 	NSE sim: 0.2855391070739358 	MSE sim: 5.438828032432143
9 	NSE sim: 0.25463663766935873 	MSE sim: 3.025520742584415
10 	NSE sim: 0.22811466404732195 	MSE sim: 4.399619245322067
11 	NSE sim: 0.027223465838725036 	MSE sim: 11.870930302902792
12 	NSE sim: 0.28089312642773856 	MSE sim: 2.1161843115495236
18 	NSE sim: -0.9276112093362388 	MSE sim: 13856.580183904445
21 	NSE sim: -0.8710353758547471 	MSE sim: 13180.446693342863
25 	NSE sim: -0.8422558344108992 	MSE sim: 11238.473607123893
26 	NSE sim: -0.07027308782363284 	MSE sim: 31.24238375293001
27 	NSE sim: 0.33321538780617166 	MSE sim: 4.161121894365635
28 	NSE sim: 0.29864593174

  np.sum((evaluation - np.mean(evaluation)) ** 2, dtype=np.float64))


499 	NSE sim: nan 	MSE sim: 0.003743881250567575
500 	NSE sim: -0.481068783159307 	MSE sim: 17.376346476595526
501 	NSE sim: 0.2669746037816616 	MSE sim: 0.029195242512829914
502 	NSE sim: 0.10950862830763863 	MSE sim: 0.09283599399122137
503 	NSE sim: -10.142413775475083 	MSE sim: 0.30070069198732746
504 	NSE sim: 0.17294729107889695 	MSE sim: 0.005566196358977983
505 	NSE sim: 0.4336763460001889 	MSE sim: 1.6104356155658928
506 	NSE sim: -0.19364663113638003 	MSE sim: 41.97360256239157
507 	NSE sim: 0.28378832825589784 	MSE sim: 0.009975901371080684
508 	NSE sim: 0.4225799026131132 	MSE sim: 1.5047752554631706
509 	NSE sim: -0.6290386104197967 	MSE sim: 93.74076580779212
511 	NSE sim: 0.27171931654120507 	MSE sim: 0.5304806171104709
512 	NSE sim: -0.6158299360547426 	MSE sim: 378.74565737266266
513 	NSE sim: -0.5433884580024158 	MSE sim: 339.0788961515784
552 	NSE sim: 0.35540835537191584 	MSE sim: 7.607598412534052
553 	NSE sim: -0.9301262261016128 	MSE sim: 13935.55903011992
554 	N

  np.sum((evaluation - np.mean(evaluation)) ** 2, dtype=np.float64))


527 	NSE sim: -1.5293900849478566 	MSE sim: 0.04147349531532837
528 	NSE sim: 0.14101899804378681 	MSE sim: 0.05801784355444779
529 	NSE sim: -0.11288355014247453 	MSE sim: 0.011471231785106334
530 	NSE sim: -0.040220928259113276 	MSE sim: 5.489905756168335
531 	NSE sim: 0.2604848489740489 	MSE sim: 0.8839093326296703
532 	NSE sim: -0.43841592038567256 	MSE sim: 0.47278914488309404
533 	NSE sim: 0.2782294743072381 	MSE sim: 0.0014179298297621412
534 	NSE sim: 0.4034146106482077 	MSE sim: 0.36473547559350555
535 	NSE sim: 0.3168891542737786 	MSE sim: 0.21232003521834678
536 	NSE sim: 0.3287405340628957 	MSE sim: 1.7982837974782915
537 	NSE sim: 0.3018131364990244 	MSE sim: 1.1275541488916023
538 	NSE sim: -0.06096160193836897 	MSE sim: 181.61994466148448
539 	NSE sim: 0.10554811802988018 	MSE sim: 0.017383670693125887
540 	NSE sim: -0.5215583330003966 	MSE sim: 7934.048099976682
541 	NSE sim: -0.3401522293700674 	MSE sim: 723.744530633196
542 	NSE sim: -0.3308664141207647 	MSE sim: 759.

In [14]:
def print_nse_mse(name, nse_dict, mse_dict, subbasins):
    nses = list(nse_dict[s] for s in subbasins)
    mses = list(mse_dict[s] for s in subbasins)
    print(name, 'Median NSE (clipped to 0)', np.nanmedian(nses), '/ Min', np.nanmin(nses), '/ Max', np.nanmax(nses))
    print(' ' * len(name), 'Median MSE (clipped to 0)', np.nanmedian(mses), '/ Min', np.nanmin(mses), '/ Max', np.nanmax(mses))
    
    return np.nanmedian(nses)

nse_median_sim_temporal = print_nse_mse('Temporal test sim', nse_sim_dict, mse_sim_dict, train_subbasins)
nse_median_stations_temporal = print_nse_mse('Stations temporal test', nse_dict, mse_dict, list(s for s in station_subbasins if s in train_subbasins))
if spatial_test_dataset is not None:
    nse_median_sim_spatial = print_nse_mse('Spatial test sim', nse_sim_dict, mse_sim_dict, test_subbasins)
    nse_median_stations_spatial = print_nse_mse('Stations spatial test', nse_dict, mse_dict, list(s for s in station_subbasins if s in test_subbasins))
    writer.add_scalar('nse_median_sim', nse_median_sim_spatial)
    writer.add_scalar('nse_median_stations_spatial', nse_median_stations_spatial)
writer.add_scalar('nse_median_sim_temporal', nse_median_sim_temporal)
writer.add_scalar('nse_median_stations_temporal', nse_median_stations_temporal)

Temporal test sim Median NSE (clipped to 0) 0.24055871988392474 / Min -1.5293900849478566 / Max 0.48146261148162683
                  Median MSE (clipped to 0) 6.689706021106744 / Min 0.0014179298297621412 / Max 55549.50545983026
Stations temporal test Median NSE (clipped to 0) -0.18151200475079876 / Min -1.3281745696891796 / Max 0.047253793217372664
                       Median MSE (clipped to 0) 996.3002729538321 / Min 25.06613935507493 / Max 100212.74099017325
Spatial test sim Median NSE (clipped to 0) -0.0265026923059537 / Min -10.142413775475083 / Max 0.5065368858646782
                 Median MSE (clipped to 0) 19.56531149288915 / Min 0.002483982920373252 / Max 91440.16325081288
Stations spatial test Median NSE (clipped to 0) -0.13075686021582233 / Min -1.7809618175291453 / Max 0.2687206191667614
                      Median MSE (clipped to 0) 128.66557540905774 / Min 5.362696704167209 / Max 14449.84960620828


In [15]:
nse_dict

{676: -0.13371546865579398,
 677: -0.594847620210742,
 678: -0.05421450945125739,
 680: 0.2522590381348411,
 681: -0.002786381111234526,
 682: 0.1185213717822009,
 683: -0.011069989945642789,
 684: -0.1813426403592746,
 685: -0.4580045337266683,
 686: 0.08451561136406471,
 687: -0.9241131475656528,
 688: -0.2135147301820981,
 689: 0.2687206191667614,
 690: -0.3142995927724508,
 691: -1.0799314426631654,
 692: -0.801789555633645,
 693: -0.8183226382682012,
 694: 0.037841108678362545,
 695: 0.11195078391603652,
 696: -0.37342390877532683,
 697: 0.05372826695778976,
 698: -0.3220140721893565,
 699: -0.03175716489469482,
 700: -0.08184549063230606,
 701: -0.1277982517758507,
 702: -0.49493542655679645,
 703: -1.4709771150604771,
 705: -0.09767881210731866,
 706: -0.047319719444244734,
 707: -1.7809618175291453,
 709: -0.37156758218993025,
 710: -0.25891050364636436,
 712: 0.0037875095953449067,
 713: -0.41541921556608874,
 714: -0.07831784626740546,
 716: -0.0323828357697975,
 717: 0.04725

In [16]:
writer.close()

In [17]:
save_df = pd.merge(predictions_df.rename({'runoff': 'prediction'}, axis=1), 
                   temporal_test_dataset.simulated_streamflow, on=['date', 'subbasin'])
save_df = pd.merge(save_df, actuals.rename({'runoff': 'actual'}, axis=1), how='left', on=['date', 'station'])\
            [['date', 'subbasin', 'station', 'prediction', 'actual', 'simulated_streamflow', 'is_test_subbasin']]
load_data.pickle_results('STGCN_simulationTraining', save_df, time_stamp)

'STGCN_simulationTraining_20190830-182512.pkl'

In [18]:
_ = print(train_subbasins), print(val_subbasins), print(test_subbasins)

[41, 45, 51, 57, 60, 62, 69, 70, 85, 88, 89, 90, 91, 96, 97, 98, 99, 101, 102, 103, 104, 106, 107, 108, 109, 112, 113, 115, 116, 117, 118, 119, 120, 122, 123, 124, 125, 126, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 155, 156, 158, 159, 160, 163, 164, 165, 166, 169, 170, 171, 172, 173, 178, 179, 180, 181, 182, 186, 187, 188, 189, 191, 194, 195, 198, 199, 200, 205, 207, 210, 212, 213, 214, 217, 218, 221, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 237, 238, 239, 240, 241, 242, 245, 255, 256, 257, 266, 267, 274, 296, 297, 298, 299, 300, 301, 306, 308, 310, 311, 313, 316, 317, 318, 319, 320, 321, 322, 327, 328, 329, 330, 331, 332, 334, 336, 339, 340, 341, 342, 343, 344, 347, 348, 349, 352, 353, 355, 356, 357, 358, 359, 361, 362, 363, 364, 365, 366, 371, 372, 373, 374, 375, 376, 377, 380, 381, 382, 386, 387, 390, 393, 394, 395, 399, 402, 403, 404, 405, 407, 408, 409, 411, 414, 415, 416, 417, 4

In [19]:
list(k for k in list(nse_dict.keys()) if k in test_subbasins)

[676,
 677,
 678,
 680,
 681,
 682,
 683,
 684,
 685,
 686,
 687,
 688,
 689,
 690,
 691,
 692,
 693,
 694,
 695,
 696,
 697,
 698,
 699,
 700,
 701,
 702,
 703,
 705,
 706,
 707]

In [20]:
datetime.now().strftime('%Y%m%d-%H%M%S')

'20190830-193747'