STGCN trained on measured streamflow at gauge stations.

In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('..')
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 
from datetime import datetime, timedelta
import netCDF4 as nc
import torch
from torch import nn, utils
from torch.utils.tensorboard import SummaryWriter
from src import load_data, evaluate, conv_lstm, datasets, utils, stgcn
import random
import pickle
import json
import networkx as nx

time_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
time_stamp

'20190830-214200'

In [2]:
import logging
logger = logging.getLogger()
fhandler = logging.FileHandler(filename='../log.out', mode='a')
chandler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter('%(asctime)s - {} - %(message)s'.format(time_stamp))
fhandler.setFormatter(formatter)
chandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.addHandler(chandler)
logger.setLevel(logging.INFO)

In [3]:
USE_CUDA = False
if torch.cuda.is_available():
    print('CUDA Available')
    USE_CUDA = True
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
device = torch.device('cuda:0' if USE_CUDA else 'cpu')
num_devices = torch.cuda.device_count() if USE_CUDA else 0
logger.warning('cuda devices: {}'.format(list(torch.cuda.get_device_name(i) for i in range(num_devices))))
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

CUDA Available
2019-08-30 21:42:00,991 - 20190830-214200 - cuda devices: ['Tesla V100-SXM2-16GB']


In [4]:
partitioning_strategy = 'distance'  # 'distance' or 'unilabel', see https://arxiv.org/abs/1801.07455
max_hops = 3 if partitioning_strategy == 'distance' else None
rdrs_vars = list(range(8))
agg = ['minmax','minmax','minmax','minmax','sum','minmax','minmax','minmax']
include_month = True
dem, landcover, soil, groundwater = False,False,False,False
landcover_types = []
seq_len = 14
seq_steps = 1

train_start = datetime.strptime('2010-01-01', '%Y-%m-%d') + timedelta(days=seq_len * seq_steps)  # first day for which to make a prediction in train set
train_end = '2012-12-31'
test_start = '2013-01-01'
test_end = '2014-12-31'
val_fraction = 0.1

In [5]:
with open('../data/simulations_shervan/subbasins.geojson', 'r') as f:
     subbasin_shapes = json.loads(f.read())

subbasin_graph = utils.create_subbasin_graph()
subbasin_graph.remove_nodes_from(['sub-1'])
subbasins = sorted(int(n[3:]) for n in subbasin_graph.nodes)

if partitioning_strategy == 'unilabel':
    adjacency = torch.unsqueeze(torch.from_numpy(nx.to_numpy_array(subbasin_graph, nodelist=subbasins)), 0).float().to(device)
elif partitioning_strategy == 'distance':  # use distances in upstream-graph, i.e. in reversed downstream-graph
    adjacency = utils.create_hop_matrix(subbasin_graph.reverse(), max_hops, list('sub' + str(s) for s in subbasins)).float().to(device)
else:
    raise Exception('Unsupported partitioning strategy')
    
max_path_len = int(nx.dag_longest_path_length(subbasin_graph))

In [6]:
train_dataset = datasets.SubbasinAggregatedDataset(rdrs_vars, subbasins, seq_len, seq_steps, train_start, train_end, aggregate_daily=agg, include_months=include_month, 
                                                   dem=dem, landcover=landcover, soil=soil, groundwater=groundwater, landcover_types=landcover_types)
test_dataset = datasets.SubbasinAggregatedDataset(rdrs_vars, subbasins, seq_len, seq_steps, test_start, test_end, aggregate_daily=agg, include_months=include_month, 
                                                  conv_scalers=train_dataset.scalers, dem=dem, landcover=landcover, soil=soil, groundwater=groundwater, landcover_types=landcover_types)

station_subbasins = list(train_dataset.subbasin_to_station.keys())

Using saved dataset in file /home/mgauch/runoff-nn/src/../data/train_test/SubbasinAggregatedDataset_0-1-2-3-4-5-6-7_14-1_2010-01-15000000-2012-12-31___minmax-minmax-minmax-minmax-sum-minmax-minmax-minmax_month_2ebc6a3d272e5604ce930efd83f50bb5.pkl
Using saved dataset in file /home/mgauch/runoff-nn/src/../data/train_test/SubbasinAggregatedDataset_0-1-2-3-4-5-6-7_14-1_2013-01-01-2014-12-31___minmax-minmax-minmax-minmax-sum-minmax-minmax-minmax_month_2ebc6a3d272e5604ce930efd83f50bb5.pkl


In [7]:
# Train model
num_epochs = 800
learning_rate = 2e-3
patience = 300
min_improvement = 0.01
best_loss_model = (-1, np.inf, None)
dropout = 0.3
weight_decay = 1e-5
edge_weighting = True

batch_size = 16
temp_kernel_size = 3

#model = load_data.load_model_and_state('STGCN_simulationTraining', '20190824-162736', device, use_dill=True)[0]
#for param in model.parameters():
#    param.requires_grad = False  # Freeze layers

# un-freeze last layer, add edge importance weighting
#for param in model.fcn.parameters():
#    param.requires_grad = True
#model.edge_importance = nn.ParameterList([nn.Parameter(torch.ones(adjacency.shape, device=device)) for i in model.st_gcn_networks])

model = stgcn.Model(train_dataset.x.shape[2], adjacency.shape[0], temp_kernel_size, edge_importance_weighting=edge_weighting, 
                    adjacency_shape=adjacency.shape, dropout=dropout).to(device)

if num_devices > 1:
    model = torch.nn.DataParallel(model, device_ids=list(range(num_devices)))
loss_fn = evaluate.NSELoss()
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(params, lr=learning_rate, weight_decay=weight_decay)
scheduler = None#torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=1)

writer = SummaryWriter(comment='STGCN_stationTraining')
param_description = {'time_stamp': time_stamp, 'batch_size': batch_size, 'loss': loss_fn, 'include_month': include_month, 'aggregate_daily': agg, 'rdrs_vars': rdrs_vars, 'dropout': dropout, 'val_fraction': val_fraction, 'temp_kernel': temp_kernel_size,
                     'optimizer': optimizer, 'lr': learning_rate, 'patience': patience, 'min_improvement': min_improvement, 'x_train_shape': train_dataset.x.shape, 'partitioning_strategy': partitioning_strategy, 'max_hops': max_hops, 'scheduler': scheduler,
                     'x_test_shape': test_dataset.x.shape, 'num_epochs': num_epochs, 'seq_len': seq_len, 'seq_steps': seq_steps, 'train_start': train_start, 'train_end': train_end, 'weight_decay': weight_decay, 'edge_weighting': edge_weighting, 'max_path_len': max_path_len,
                     'landcover_types': landcover_types, 'test_start': test_start, 'test_end': test_end, 'model': str(model).replace('\n','').replace(' ', ''),}
writer.add_text('Parameter Description', str(param_description))
str(param_description)

"{'time_stamp': '20190830-214200', 'batch_size': 16, 'loss': NSELoss(), 'include_month': True, 'aggregate_daily': ['minmax', 'minmax', 'minmax', 'minmax', 'sum', 'minmax', 'minmax', 'minmax'], 'rdrs_vars': [0, 1, 2, 3, 4, 5, 6, 7], 'dropout': 0.3, 'val_fraction': 0.1, 'temp_kernel': 3, 'optimizer': Adam (\nParameter Group 0\n    amsgrad: False\n    betas: (0.9, 0.999)\n    eps: 1e-08\n    lr: 0.002\n    weight_decay: 1e-05\n), 'lr': 0.002, 'patience': 300, 'min_improvement': 0.01, 'x_train_shape': torch.Size([1082, 14, 82, 724]), 'partitioning_strategy': 'distance', 'max_hops': 3, 'scheduler': None, 'x_test_shape': torch.Size([730, 14, 82, 724]), 'num_epochs': 800, 'seq_len': 14, 'seq_steps': 1, 'train_start': datetime.datetime(2010, 1, 15, 0, 0), 'train_end': '2012-12-31', 'weight_decay': 1e-05, 'edge_weighting': True, 'max_path_len': 33, 'landcover_types': [], 'test_start': '2013-01-01', 'test_end': '2014-12-31', 'model': 'Model((data_bn):BatchNorm1d(59368,eps=1e-05,momentum=0.1,affi

In [8]:
val_indices = np.random.choice(len(train_dataset), size=int(val_fraction * len(train_dataset)), replace=False)
train_indices = list(i for i in range(len(train_dataset)) if i not in val_indices)
train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
val_sampler = torch.utils.data.SubsetRandomSampler(val_indices)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size, sampler=train_sampler, pin_memory=True, drop_last=False)
val_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size, sampler=val_sampler, pin_memory=True, drop_last=False)
    
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size, shuffle=False, pin_memory=True, drop_last=False)

In [9]:
y_train_means = train_dataset.y_means.to(device)

In [10]:
torch.manual_seed(0)
np.random.seed(0)
for epoch in range(num_epochs):
    model.train()

    train_losses = torch.tensor(0.0)
    for i, train_batch in enumerate(train_dataloader):
        y_mask = train_batch['y_mask'].all(dim=0).to(device)  # only backprop for stations where we have non-NA values for the whole batch
        if not torch.any(y_mask):
            continue
            
        y_pred = model(train_batch['x'].permute(0,2,1,3).to(device), adjacency, max_path_len)
        train_loss = loss_fn(y_pred[:,y_mask], train_batch['y'][:,y_mask].to(device), means=y_train_means[y_mask])
        
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        
        train_losses += train_loss.detach()
    
    #scheduler.step()    
    train_loss = (train_losses / len(train_dataloader)).item()
    print('Epoch', epoch, 'mean train loss:\t{}'.format(train_loss))
    writer.add_scalar('loss_nse', train_loss, epoch)
    
    model.eval()
    val_losses = torch.tensor(0.0)
    for i, val_batch in enumerate(val_dataloader):
        y_mask = val_batch['y_mask'].all(dim=0).to(device)
        if not torch.any(y_mask):
            continue
            
        y_pred = model(val_batch['x'].permute(0,2,1,3).to(device), adjacency, max_path_len).detach()
        val_losses += loss_fn(y_pred[:,y_mask], val_batch['y'][:,y_mask].to(device), means=y_train_means[y_mask]).detach()
            
    val_loss = (val_losses / len(val_dataloader)).item()
    print('Epoch', epoch, 'mean val loss:\t{}'.format(val_loss))
    writer.add_scalar('loss_nse_val', val_loss, epoch)
    
    if val_loss < best_loss_model[1] - min_improvement:
        best_loss_model = (epoch, val_loss, model.state_dict())  # new best model
        load_data.pickle_model('STGCN_stationTraining', model, 'allStations', time_stamp, model_type='torch.dill')
    elif epoch > best_loss_model[0] + patience:
        print('Patience exhausted in epoch {}. Best val-loss was {}'.format(epoch, best_loss_model[1]))
        break
    
print('Using best model from epoch', str(best_loss_model[0]), 'which had loss', str(best_loss_model[1]))
model.load_state_dict(best_loss_model[2])
load_data.save_model_with_state('STGCN_stationTraining', best_loss_model[0], model, optimizer, time_stamp, use_dill=True)

Epoch 0 mean train loss:	1.3888498544692993
Epoch 0 mean val loss:	24.389188766479492
Saved model as /home/mgauch/runoff-nn/src/../pickle/models/STGCN_stationTraining_allStations_20190830-214200.pkl
Epoch 1 mean train loss:	0.9695371985435486
Epoch 1 mean val loss:	1.2352184057235718
Saved model as /home/mgauch/runoff-nn/src/../pickle/models/STGCN_stationTraining_allStations_20190830-214200.pkl
Epoch 2 mean train loss:	0.8049817085266113
Epoch 2 mean val loss:	1.1139267683029175
Saved model as /home/mgauch/runoff-nn/src/../pickle/models/STGCN_stationTraining_allStations_20190830-214200.pkl
Epoch 3 mean train loss:	0.7375428676605225
Epoch 3 mean val loss:	1.0600476264953613
Saved model as /home/mgauch/runoff-nn/src/../pickle/models/STGCN_stationTraining_allStations_20190830-214200.pkl
Epoch 4 mean train loss:	0.6575727462768555
Epoch 4 mean val loss:	0.9820159673690796
Saved model as /home/mgauch/runoff-nn/src/../pickle/models/STGCN_stationTraining_allStations_20190830-214200.pkl
Epoch

In [11]:
del y_train_means, y_pred, y_mask
if USE_CUDA:
    torch.cuda.empty_cache()

In [12]:
logger.warning('predicting')
model.eval()

predictions = []  # test on same graph but different time
for i, test_batch in enumerate(test_dataloader):
    pred = model(test_batch['x'].permute(0,2,1,3).to(device), adjacency, max_path_len).detach().cpu()
    predictions.append(pred)
    
predictions = torch.cat(predictions, dim=0)

2019-08-31 00:08:44,369 - 20190830-214200 - predicting


In [13]:
actuals = test_dataset.data_streamflow.copy()
if len(actuals['date'].unique()) != len(predictions):
    print('Warning: length of prediction {} and actuals {} does not match.'.format(len(predictions), len(actuals['date'].unique())))

nse_dict = {}
mse_dict = {}
predictions_df = pd.DataFrame(columns=actuals.columns)
for i in range(len(subbasins)):
    subbasin = subbasins[i]
    if subbasin not in station_subbasins:
        continue
    station = test_dataset.subbasin_to_station[subbasin]
    act = actuals[actuals['station'] == station].set_index('date')['runoff']
    if predictions.shape[0] != act.shape[0]:
        print('Warning: length of prediction {} and actuals {} does not match for subbasin {}. Ignoring excess actuals.'.format(len(predictions), len(act), subbasin))
        act = act.iloc[:predictions.shape[0]]
            
    pred = pd.DataFrame({'runoff': predictions[:,i]}, index=act.index)
    pred['subbasin'] = subbasin
    pred['station'] = station
    predictions_df = predictions_df.append(pred.reset_index(), sort=True)
    
    nse, mse = evaluate.evaluate_daily(station, pred['runoff'], act, writer=writer)
    nse_dict[subbasin] = nse
    mse_dict[subbasin] = mse
    print(station, subbasin, '\tNSE:', nse, '\tMSE:', mse, '(clipped to 0)')


To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()


02GA038 676 	NSE: 0.17319233149954527 	MSE: 136.04165795209028 (clipped to 0)
02GA047 677 	NSE: 0.2235027749417322 	MSE: 60.8696366243551 (clipped to 0)
04159492 678 	NSE: 0.33049676215087354 	MSE: 310.8462042531906 (clipped to 0)
04159900 680 	NSE: 0.4813748421598446 	MSE: 22.532688895010995 (clipped to 0)
04160600 681 	NSE: 0.43636008630943146 	MSE: 16.30661868596716 (clipped to 0)
02GG006 682 	NSE: 0.3276828105167251 	MSE: 36.04107528241876 (clipped to 0)
02GD004 683 	NSE: 0.4101024978191402 	MSE: 32.481569468712394 (clipped to 0)
02GA018 684 	NSE: 0.24168929665094518 	MSE: 190.45214495485243 (clipped to 0)
02GA010 685 	NSE: 0.23099568960229477 	MSE: 366.6409137024299 (clipped to 0)
02GB007 686 	NSE: 0.48760451786094605 	MSE: 16.02363896190412 (clipped to 0)
02GB001 687 	NSE: 0.20404269186125557 	MSE: 5977.540047537561 (clipped to 0)
04215000 688 	NSE: 0.26463168809725945 	MSE: 70.63419481681463 (clipped to 0)
04166100 689 	NSE: 0.21369968829322517 	MSE: 5.766182119439721 (clipped t

In [14]:
def print_nse_mse(name, nse_dict, mse_dict, subbasins):
    nses = list(nse_dict[s] for s in subbasins)
    mses = list(mse_dict[s] for s in subbasins)
    print(name, 'Median NSE (clipped to 0)', np.nanmedian(nses), '/ Min', np.nanmin(nses), '/ Max', np.nanmax(nses))
    print(' ' * len(name), 'Median MSE (clipped to 0)', np.nanmedian(mses), '/ Min', np.nanmin(mses), '/ Max', np.nanmax(mses))
    
    return np.nanmedian(nses)

nse_median_stations = print_nse_mse('Stations test', nse_dict, mse_dict, list(s for s in station_subbasins))

writer.add_scalar('nse_median_stations_temporal', nse_median_stations)

Stations test Median NSE (clipped to 0) 0.3441675021707803 / Min -0.258851868239365 / Max 0.4995215103160273
              Median MSE (clipped to 0) 113.4223465283722 / Min 5.766182119439721 / Max 35434.25206206072


In [15]:
nse_dict

{676: 0.17319233149954527,
 677: 0.2235027749417322,
 678: 0.33049676215087354,
 680: 0.4813748421598446,
 681: 0.43636008630943146,
 682: 0.3276828105167251,
 683: 0.4101024978191402,
 684: 0.24168929665094518,
 685: 0.23099568960229477,
 686: 0.48760451786094605,
 687: 0.20404269186125557,
 688: 0.26463168809725945,
 689: 0.21369968829322517,
 690: 0.2879089540604256,
 691: 0.18182046991763812,
 692: 0.3443659533850155,
 693: 0.2031484423390303,
 694: 0.29121166599660053,
 695: 0.38454252725568616,
 696: 0.45666460992524216,
 697: 0.3179438983897951,
 698: 0.43856329434296015,
 699: 0.38814696145784433,
 700: 0.48262400668202654,
 701: 0.44568004659959937,
 702: 0.3848845944762448,
 703: 0.47763934761187965,
 704: 0.4011102343563099,
 705: -0.258851868239365,
 706: 0.37030863228812205,
 707: 0.24905901257738416,
 709: 0.2323279187437628,
 710: 0.36963049825859495,
 712: 0.34396905095654506,
 713: 0.4995215103160273,
 714: 0.3142404263384352,
 715: 0.39068104003780246,
 716: 0.2590203

In [16]:
writer.close()

In [17]:
save_df = pd.merge(predictions_df.rename({'runoff': 'prediction'}, axis=1), 
                   test_dataset.simulated_streamflow, on=['date', 'subbasin'])
save_df = pd.merge(save_df, actuals.rename({'runoff': 'actual'}, axis=1), how='left', on=['date', 'station'])\
            [['date', 'subbasin', 'station', 'prediction', 'actual', 'simulated_streamflow']]
load_data.pickle_results('STGCN_stationTraining', save_df, time_stamp)

'STGCN_stationTraining_20190830-214200.pkl'

In [18]:
datetime.now().strftime('%Y%m%d-%H%M%S')

'20190831-000902'